# Clustering Crypto

In [4]:
# Initial imports
import requests
import pandas as pd
import altair as alt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [5]:
# Fetch the full coin list from the CryptoCompare endpoint and parse the JSON body.
import json
import pprint

url = "https://min-api.cryptocompare.com/data/all/coinlist"

# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of trying to parse an error page as JSON.
r.raise_for_status()

content = r.content
content_d = content.decode('utf-8')
data = json.loads(content_d)

In [6]:
# Build the coin table from the 'Data' key of the JSON response.
# The raw frame has one column per coin, so it is transposed to get one row per coin.
crypto_df = pd.DataFrame(data['Data']).transpose()

crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Algorithm,ProofType,...,SortOrder,Sponsored,Taxonomy,Rating,IsTrading,TotalCoinsMined,BlockNumber,NetHashesPerSecond,BlockReward,BlockTime
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Scrypt,PoW/PoS,...,34,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,42.0,195131.0,0.0,0.0,0.0
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),,,...,2212,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,300.0,0.0,0.0,0.0,0.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),X11,PoW/PoS,...,916,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),Scrypt,PoW/PoS,...,602,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,1222730.0,45736.0,0.0,16.0475,60.0
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),,,...,3505,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",False,112518000.0,10821438.0,231060807658772.0,2.0,0.0


In [7]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame

### Data Preprocessing

In [8]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
#
# The wanted columns are selected directly instead of listing every unwanted
# column in an in-place drop. This way the cell can be re-run without a KeyError
# (the dropped columns would already be gone) and it does not break when the
# API response gains or loses unrelated columns.
# .copy() makes the result an independent frame rather than a selection of the raw one.
crypto_df = crypto_df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType',
                       'TotalCoinsMined', 'TotalCoinSupply']].copy()

crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
300,300 token,,True,,300.0,300
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1222730.0,532000000
433,433 Token,,False,,112518000.0,1000000000


In [9]:
# Keep only cryptocurrencies that are trading.
# (Author's note: a value count of 'IsTrading' reported 1094 False rows.)
# Collect the index labels of the non-trading rows, then remove them in place.
crypto_df.drop(
    index=crypto_df.index[crypto_df['IsTrading'] == False],
    inplace=True,
)

In [10]:
# Keep only cryptocurrencies with a working algorithm:
# rows whose Algorithm is the literal text 'N/A' are removed in place.
crypto_df.drop(
    index=crypto_df.index[crypto_df['Algorithm'].eq('N/A')],
    inplace=True,
)

In [11]:
# Remove the "IsTrading" column (the non-trading rows were already filtered out above)
crypto_df.drop('IsTrading', axis='columns', inplace=True)

In [12]:
# Remove every row that contains at least one null value
crypto_df.dropna(axis='index', how='any', inplace=True)

In [13]:
# Remove rows for cryptocurrencies with no coins mined (TotalCoinsMined equal to 0)
crypto_df.drop(
    index=crypto_df.loc[crypto_df['TotalCoinsMined'] == 0].index,
    inplace=True,
)

In [14]:
# Drop rows where any cell holds the text 'N/A':
# blank those cells out to NaN, then discard every row that now has a NaN.
crypto_df = crypto_df.where(crypto_df != 'N/A').dropna()

In [15]:
# Keep the coin names in a separate one-column DataFrame before the column is dropped from crypto_df
CoinName_df = pd.DataFrame(crypto_df['CoinName'])

In [16]:
# 'CoinName' is not a clustering feature, so remove it from the feature frame
crypto_df.drop('CoinName', axis='columns', inplace=True)

In [17]:
# One-hot encode the text features; drop_first=True omits the first level of each feature
X = pd.get_dummies(
    data=crypto_df,
    columns=['Algorithm', 'ProofType'],
    drop_first=True,
)

In [18]:
# Standardize the feature matrix: fit the scaler on X, then transform the same X
X_scaled = StandardScaler().fit(X).transform(X)

### Reducing Dimensions Using PCA

In [19]:
# Use PCA to reduce dimensions to 3 principal components.
# random_state is fixed so the projection is reproducible across kernel restarts:
# with the default svd_solver='auto', scikit-learn may choose a randomized solver
# depending on the shape of the data, and that solver is seeded by random_state.
pca = PCA(n_components=3, random_state=0)
crypto_pca = pca.fit_transform(X_scaled)

In [20]:
# Wrap the principal components in a DataFrame that shares crypto_df's index,
# so each row can still be matched back to its coin.
pca_df = pd.DataFrame(
    crypto_pca,
    index=crypto_df.index,
    columns=['PC 1', 'PC 2', 'PC 3'],
)
pca_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.371708,0.795624,-0.699285
404,-0.367801,0.795571,-0.699288
1337,2.353984,1.639391,-0.79327
BTCD,-0.343226,0.723671,-0.464557
XPY,-0.290492,0.561191,-0.108147


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [23]:
# Fit only on the principal-component columns. A later cell adds a 'class'
# column to pca_df; selecting the PC columns explicitly keeps this cell
# re-runnable without the cluster labels leaking in as a feature.
pc_features = pca_df[['PC 1', 'PC 2', 'PC 3']]

inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=6)
    km.fit(pc_features)
    inertia.append(km.inertia_)

# Create the Elbow Curve using Altair
elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(elbow_data)
alt.Chart(elbow_df).mark_line().encode(x='k', y='inertia').properties(title='Elbow Curve')

Running K-Means with `k=4`

In [24]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Cluster on the principal components only. This cell writes a 'class' column
# into pca_df below, so selecting the PC columns explicitly stops a re-run
# from feeding the previous labels back in as a fourth feature.
pc_columns = ['PC 1', 'PC 2', 'PC 3']

# Fit the model
model.fit(pca_df[pc_columns])

# Predict clusters
predictions = model.predict(pca_df[pc_columns])
pca_df['class'] = model.labels_

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, CoinName_df, pca_df], axis=1)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,CoinName,PC 1,PC 2,PC 3,class
42,Scrypt,PoW/PoS,42.0,42,42 Coin,-0.371708,0.795624,-0.699285,1
404,Scrypt,PoW/PoS,1222730.0,532000000,404Coin,-0.367801,0.795571,-0.699288,1
1337,X13,PoW/PoS,29480300000.0,314159265359,EliteCoin,2.353984,1.639391,-0.79327,1
BTCD,SHA-256,PoW/PoS,1288862.0,22000000,BitcoinDark,-0.343226,0.723671,-0.464557,1
XPY,SHA-256,PoS,11995300.0,12500000,PayCoin,-0.290492,0.561191,-0.108147,1


### Visualizing Results

#### Scatter Plot of Clusters (PC 1 vs. PC 2)

In [26]:
# Scatter plot of the PCA data coloured by cluster.
# Only the first two principal components are drawn (PC 3 is not plotted).
# 'class:N' marks the cluster id as nominal so each cluster gets a distinct
# colour instead of a position on a continuous colour ramp.
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC 1',
    y='PC 2',
    color='class:N',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply']
).interactive()

#### Table of Tradable Cryptocurrencies

In [50]:
# Table of tradable cryptos: the clustered frame without the principal-component columns
clustered_df_no_pc = clustered_df.drop(['PC 1', 'PC 2', 'PC 3'], axis='columns')
display(clustered_df_no_pc)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,CoinName,class
42,Scrypt,PoW/PoS,42,42,42 Coin,1
404,Scrypt,PoW/PoS,1.22273e+06,532000000,404Coin,1
1337,X13,PoW/PoS,2.94803e+10,314159265359,EliteCoin,1
BTCD,SHA-256,PoW/PoS,1288862,22000000,BitcoinDark,1
XPY,SHA-256,PoS,1.19953e+07,12500000,PayCoin,1
...,...,...,...,...,...,...
BTC,SHA-256,PoW,18485262,21000000,Bitcoin,0
ETH,Ethash,PoW,1.12518e+08,0,Ethereum,0
WAVES,Leased POS,LPoS,100000000,100000000,Waves,3
ADA,Ouroboros,PoS,25927070538,45000000000,Cardano,1


In [51]:
# Print the total number of tradable cryptocurrencies (one row per coin in the table)
print(f'The total number of tradable cryptocurrencies is: {len(clustered_df_no_pc)}')

The total number of tradable cryptocurrenices is: 541


#### Scatter Plot with Tradable Cryptocurrencies

In [52]:
# Min-max scale the two supply columns for the scatter plot
normalize_columns = ['TotalCoinsMined', 'TotalCoinSupply']

scaler = MinMaxScaler()
x = crypto_df[normalize_columns].to_numpy()
x_scaled = scaler.fit_transform(x)

# Rebuild a frame on crypto_df's index, then move that index into a regular column
df_tradable = pd.DataFrame(
    data=x_scaled,
    index=crypto_df.index,
    columns=normalize_columns,
)
df_tradable = df_tradable.reset_index()


In [53]:
# Scatter of the scaled values: TotalCoinsMined on the x axis, TotalCoinSupply on the y axis
alt.Chart(df_tradable).mark_circle(size=60).encode(
    alt.X('TotalCoinsMined'),
    alt.Y('TotalCoinSupply'),
)