# Clustering Crypto

In [1]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Fetching Cryptocurrency Data

In [2]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response_data = requests.get(url)
data = response_data.json()

In [3]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame.from_dict(data['Data']).transpose()
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Algorithm,ProofType,...,SortOrder,Sponsored,Taxonomy,Rating,IsTrading,TotalCoinsMined,BlockNumber,NetHashesPerSecond,BlockReward,BlockTime
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Scrypt,PoW/PoS,...,34,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,42.0,200520.0,0.0,0.0,0.0
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),,,...,2212,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,300.0,0.0,0.0,0.0,0.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),X11,PoW/PoS,...,916,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),Scrypt,PoW/PoS,...,602,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,1567090.0,55698.0,0.0,6.48552,60.0
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),,,...,3505,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",False,112518000.0,10821438.0,231060807658772.0,2.0,0.0


### Data Preprocessing

In [4]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
300,300 token,,True,,300.0,300
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1567090.0,532000000
433,433 Token,,False,,112518000.0,1000000000


In [5]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df.loc[crypto_df['IsTrading'] == True]
crypto_df.IsTrading.value_counts()

True    4325
Name: IsTrading, dtype: int64

In [6]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df.loc[crypto_df['Algorithm'] != 'N/A']
crypto_df.Algorithm.value_counts()

Scrypt                 451
X11                    219
SHA-256                154
PoS                     70
X13                     69
                      ... 
HybridScryptHash256      1
vBlake                   1
IMesh                    1
BMW512 / Echo512         1
ECC 256K1                1
Name: Algorithm, Length: 122, dtype: int64

In [7]:
# Remove the 'IsTrading' column
crypto_df = crypto_df.drop(columns=['IsTrading'])
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1567090.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [8]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
crypto_df.head()


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42
404,404Coin,Scrypt,PoW/PoS,1567090.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29494900000.0,314159265359
BTCD,BitcoinDark,SHA-256,PoW/PoS,1288862.0,22000000


In [9]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'] > 0]
crypto_df.TotalCoinsMined.min()

41.9999522

In [10]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df.replace('N/A', float("NaN"))
crypto_df =  crypto_df.dropna()

In [11]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df 
coinName_df = crypto_df[['CoinName']]
coinName_df.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTCD,BitcoinDark
XPY,PayCoin


In [12]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop(columns=['CoinName'])
crypto_df.head()


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1567089.0,532000000
1337,X13,PoW/PoS,29494870000.0,314159265359
BTCD,SHA-256,PoW/PoS,1288862.0,22000000
XPY,SHA-256,PoS,11995330.0,12500000


In [13]:
# Create dummy variables for text features
crypto_df_encoded = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])
crypto_df_encoded.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1567089.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29494870000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTCD,1288862.0,22000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XPY,11995330.0,12500000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Standardize data
scaler = StandardScaler()
scaler.fit(crypto_df_encoded[['TotalCoinsMined','TotalCoinSupply']])
crypto_df_encoded[['TotalCoinsMined','TotalCoinSupply']] = scaler.transform(crypto_df_encoded[['TotalCoinsMined','TotalCoinSupply']])
crypto_df_encoded.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,-0.12005,-0.1576,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,-0.120016,-0.149727,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,0.526435,4.491515,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTCD,-0.120022,-0.157274,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XPY,-0.119787,-0.157415,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Reducing Dimensions Using PCA

In [15]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(crypto_df_encoded)

In [16]:
# Create a DataFrame with the principal components data
pca_columns = ["principal component 1", "principal component 2", "principal component 3"]
df_crypto_pca = pd.DataFrame(
    data=crypto_pca, columns=pca_columns
)
df_crypto_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.213987,-0.775264,0.165527
1,-0.208403,-0.77454,0.170972
2,3.532286,-0.399539,2.895206
3,-0.216237,-0.678541,0.073884
4,-0.205221,-0.133942,-0.120997


### Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [17]:
import altair as alt

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
Installing collected packages: altair
Successfully installed altair-4.1.0


In [18]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_crypto_pca)
    inertia.append(km.inertia_)


# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia'
)



Running K-Means with `k=<your best value for k here>`

In [19]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
# Fit the model
model.fit(df_crypto_pca)
# Predict clusters
predictions = model.predict(df_crypto_pca)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([coinName_df,crypto_df],axis=1)
clustered_df["class"] = model.labels_
clustered_df = pd.concat([clustered_df.reset_index(),df_crypto_pca.reset_index().drop(columns='index')],axis=1)
clustered_df = clustered_df.set_index('index')
clustered_df.head()

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,class,principal component 1,principal component 2,principal component 3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,42 Coin,Scrypt,PoW/PoS,41.99995,42,3,-0.213987,-0.775264,0.165527
404,404Coin,Scrypt,PoW/PoS,1567089.0,532000000,3,-0.208403,-0.77454,0.170972
1337,EliteCoin,X13,PoW/PoS,29494870000.0,314159265359,3,3.532286,-0.399539,2.895206
BTCD,BitcoinDark,SHA-256,PoW/PoS,1288862.0,22000000,3,-0.216237,-0.678541,0.073884
XPY,PayCoin,SHA-256,PoS,11995330.0,12500000,3,-0.205221,-0.133942,-0.120997


### Visualizing Results

#### 3D-Scatter with Clusters

In [20]:
# Create a 3D-Scatter with the PCA data and the clusters
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='principal component 1',
    y='principal component 2',
    color='class',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()





#### Scatter Plot with Tradable Cryptocurrencies

In [21]:
# Scale data to create the scatter plot
scaler = StandardScaler()
scaler.fit(clustered_df[['TotalCoinsMined','TotalCoinSupply']])
clustered_df[['TotalCoinsMined','TotalCoinSupply']] = scaler.transform(clustered_df[['TotalCoinsMined','TotalCoinSupply']])
clustered_df.head()


Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,class,principal component 1,principal component 2,principal component 3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,42 Coin,Scrypt,PoW/PoS,-0.12005,-0.1576,3,-0.213987,-0.775264,0.165527
404,404Coin,Scrypt,PoW/PoS,-0.120016,-0.149727,3,-0.208403,-0.77454,0.170972
1337,EliteCoin,X13,PoW/PoS,0.526435,4.491515,3,3.532286,-0.399539,2.895206
BTCD,BitcoinDark,SHA-256,PoW/PoS,-0.120022,-0.157274,3,-0.216237,-0.678541,0.073884
XPY,PayCoin,SHA-256,PoS,-0.119787,-0.157415,3,-0.205221,-0.133942,-0.120997


In [22]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"

alt.Chart(clustered_df).mark_circle(size=60).encode(x="TotalCoinsMined", y="TotalCoinSupply", color='class')

In [23]:
display(clustered_df)

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,class,principal component 1,principal component 2,principal component 3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,42 Coin,Scrypt,PoW/PoS,-0.120050,-0.157600,3,-0.213987,-0.775264,0.165527
404,404Coin,Scrypt,PoW/PoS,-0.120016,-0.149727,3,-0.208403,-0.774540,0.170972
1337,EliteCoin,X13,PoW/PoS,0.526435,4.491515,3,3.532286,-0.399539,2.895206
BTCD,BitcoinDark,SHA-256,PoW/PoS,-0.120022,-0.157274,3,-0.216237,-0.678541,0.073884
XPY,PayCoin,SHA-256,PoS,-0.119787,-0.157415,3,-0.205221,-0.133942,-0.120997
...,...,...,...,...,...,...,...,...,...
STEEM,Steem,PoS,PoW,-0.117865,-0.156125,0,-0.180961,0.656419,-0.143377
BTS,Bitshares,SHA-512,PoS,-0.041131,-0.104317,3,-0.105107,-0.166367,-0.117110
BTC,Bitcoin,SHA-256,PoW,-0.119645,-0.157289,0,-0.186793,0.692684,-0.152810
ETH,Ethereum,Ethash,PoW,-0.117584,-0.157600,0,-0.179547,0.693345,-0.138284
