# Clustering Crypto

In [41]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [42]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [43]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame(response['Data']).T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,61.905089,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [44]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame

### Data Preprocessing

In [45]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','MaxSupply']]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
433,433 Token,,False,,,
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
888,Octocoin,,True,PoW,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0


In [46]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df['IsTrading'] == True]
print(crypto_df.shape)
crypto_df.head(10)

(6514, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
888,Octocoin,,True,PoW,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,True,PoW/PoS,,


In [47]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df['Algorithm'] != "N/A"]
print(crypto_df.shape)
crypto_df.head(10)


(1648, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,True,PoW/PoS,,
XPY,PayCoin,SHA-256,True,PoS,,
PRC,ProsperCoin,Scrypt,True,PoW,,


In [48]:
# Remove the "IsTrading" column

crypto_df.drop("IsTrading", axis=1, inplace=True)
print(crypto_df.shape)
crypto_df.head(10)


(1648, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42.0
365,365Coin,X11,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,PoW,0.0,0.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,0.0,0.0
2015,2015 coin,X11,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,PoW/PoS,,
XPY,PayCoin,SHA-256,PoS,,
PRC,ProsperCoin,Scrypt,PoW,,


In [49]:
# Remove rows with at least 1 null value

crypto_df = crypto_df.dropna(axis=0, how="any")
print(crypto_df.shape)
crypto_df.head(10)

(700, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
365,365Coin,X11,PoW/PoS,0.0,-1
404,404Coin,Scrypt,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,PoW,0.0,0
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,0.0,0
2015,2015 coin,X11,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,,0.0,-1
ACOIN,ACoin,SHA-256,PoW,0.0,0
XMY,MyriadCoin,Multiple,PoW,0.0,2000000000


In [50]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]
print(crypto_df.shape)
crypto_df.head(10)


(308, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
NSR,NuShares,PoS,PoS,6174617037.9692,0
TRI,Triangles Coin,X13,PoW/PoS,191620.917403,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1
QRL,Quantum Resistant Ledger,RandomX,PoW,76084225.620462,105000000
PURA,Pura,X11,PoW,188358976.839698,-1
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
FOIN,Foin,SHA-256,,92631000.8161,100000000


In [51]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.iloc[:] != 'N/A'].dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
NSR,NuShares,PoS,PoS,6174617037.9692,0
TRI,Triangles Coin,X13,PoW/PoS,191620.917403,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1
QRL,Quantum Resistant Ledger,RandomX,PoW,76084225.620462,105000000
PURA,Pura,X11,PoW,188358976.839698,-1
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
ZANO,Zano,ProgPowZ,PoW/PoS,13110116.929714,-1


In [52]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = pd.DataFrame(crypto_df["CoinName"], index=crypto_df.index)
print(coins_name.shape)
coins_name.head()

(136, 1)


Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat


In [53]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop("CoinName", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(136, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.999952,42
NSR,PoS,PoS,6174617037.9692,0
TRI,X13,PoW/PoS,191620.917403,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1
QRL,RandomX,PoW,76084225.620462,105000000
PURA,X11,PoW,188358976.839698,-1
ADK,IMesh,PoW,25000000.0,0
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
ZANO,ProgPowZ,PoW/PoS,13110116.929714,-1


In [56]:
# Create dummy variables for text features
X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
print(X.shape)
X.head(10)

(136, 82)


Unnamed: 0,TotalCoinsMined,MaxSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,41.999952,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6174617037.9692,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,191620.917403,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000.0,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QRL,76084225.620462,105000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PURA,188358976.839698,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADK,25000000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DAPS,62319462900.0,70000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZANO,13110116.929714,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Standardize data
X = StandardScaler().fit_transform(X)
X[:5]

array([[-0.08786764, -0.09220965, -0.0860663 , -0.0860663 , -0.0860663 ,
        -0.12216944, -0.0860663 , -0.0860663 , -0.12216944, -0.12216944,
        -0.15018785, -0.0860663 , -0.0860663 , -0.0860663 , -0.23294541,
        -0.12216944, -0.0860663 , -0.0860663 , -0.0860663 , -0.29664794,
        -0.0860663 , -0.0860663 , -0.23294541, -0.0860663 , -0.0860663 ,
        -0.12216944, -0.0860663 , -0.0860663 , -0.0860663 , -0.0860663 ,
        -0.0860663 , -0.0860663 , -0.15018785, -0.0860663 , -0.0860663 ,
        -0.12216944, -0.19536617, -0.0860663 , -0.0860663 , -0.15018785,
        -0.12216944, -0.29664794, -0.12216944, -0.0860663 , -0.0860663 ,
        -0.0860663 ,  2.1602469 , -0.0860663 , -0.0860663 , -0.0860663 ,
        -0.0860663 , -0.19536617, -0.0860663 , -0.19536617, -0.12216944,
        -0.0860663 , -0.0860663 , -0.0860663 , -0.0860663 , -0.0860663 ,
        -0.25      , -0.0860663 , -0.0860663 , -0.12216944, -0.12216944,
        -0.0860663 , -0.32510161, -0.0860663 , -0.0

### Reducing Dimensions Using PCA

In [58]:
# Use PCA to reduce dimensions to 3 principal components
n_comp = 3
pca = PCA(n_components=n_comp)
principal_components = pca.fit_transform(X)
principal_components

array([[ 1.79972183e-01, -1.30438824e+00, -1.41176331e+00],
       [ 6.58961362e-01, -1.19374411e+00, -2.61226173e-01],
       [ 5.89767529e-01, -1.96652170e+00, -1.69917158e+00],
       [-8.25946539e-01,  4.93824547e-01, -3.80201444e-01],
       [ 1.79977153e-01, -1.30438458e+00, -1.41176352e+00],
       [-1.16308973e+00,  1.30136181e+00,  2.47364912e-01],
       [-5.66690630e-01,  2.62676019e-01, -1.65570407e-01],
       [-8.89019752e-01,  9.30151024e-01,  3.08774405e-01],
       [ 8.05383218e-01, -2.01109430e+00,  6.20223877e+00],
       [ 5.53423621e-01, -1.96710418e+00, -1.79635474e+00],
       [ 5.53404541e-01, -1.96707248e+00, -1.79634857e+00],
       [-1.16308554e+00,  1.30135473e+00,  2.47363595e-01],
       [-1.16413088e+00,  1.30310161e+00,  2.47689712e-01],
       [-8.25941746e-01,  4.93828079e-01, -3.80201650e-01],
       [ 5.53423584e-01, -1.96710420e+00, -1.79635474e+00],
       [-1.24212364e+00,  1.40380712e+00,  1.92597502e-01],
       [ 8.37354764e-01, -1.44131089e+00

In [59]:
# Create a DataFrame with the principal components data
col_names = [f"PC {i}" for i in range (1, n_comp + 1)]
pcs_df = pd.DataFrame(principal_components, columns=col_names, index=crypto_df.index)
print(pcs_df.shape)
pcs_df.head(10)

(136, 3)


Unnamed: 0,PC 1,PC 2,PC 3
42,0.179972,-1.304388,-1.411763
NSR,0.658961,-1.193744,-0.261226
TRI,0.589768,-1.966522,-1.699172
CMTC,-0.825947,0.493825,-0.380201
CHAT,0.179977,-1.304385,-1.411764
QRL,-1.16309,1.301362,0.247365
PURA,-0.566691,0.262676,-0.16557
ADK,-0.88902,0.930151,0.308774
DAPS,0.805383,-2.011094,6.202239
ZANO,0.553424,-1.967104,-1.796355


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [62]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)
    

# Create the Elbow Curve using hvPlot

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [64]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predictions = model.predict(pcs_df)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df], axis=1, sort=False)
clustered_df["CoinName"] = coins_name["CoinName"]
clustered_df["Class"] = model.labels_
print(clustered_df.shape)
clustered_df.head(10)

(136, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.999952,42,0.179972,-1.304388,-1.411763,42 Coin,0
NSR,PoS,PoS,6174617037.9692,0,0.658961,-1.193744,-0.261226,NuShares,0
TRI,X13,PoW/PoS,191620.917403,0,0.589768,-1.966522,-1.699172,Triangles Coin,0
CMTC,Scrypt,PoW,872830.0,0,-0.825947,0.493825,-0.380201,CometCoin,1
CHAT,Scrypt,PoW/PoS,1000000000.0,-1,0.179977,-1.304385,-1.411764,OpenChat,0
QRL,RandomX,PoW,76084225.620462,105000000,-1.16309,1.301362,0.247365,Quantum Resistant Ledger,1
PURA,X11,PoW,188358976.839698,-1,-0.566691,0.262676,-0.16557,Pura,1
ADK,IMesh,PoW,25000000.0,0,-0.88902,0.930151,0.308774,Aidos Kuneen,1
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,70000000000,0.805383,-2.011094,6.202239,DAPS Coin,2
ZANO,ProgPowZ,PoW/PoS,13110116.929714,-1,0.553424,-1.967104,-1.796355,Zano,0


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [65]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df[["MaxSupply", "TotalCoinsMined"]]
)
plot_df = pd.DataFrame(
    plot_data, columns=["MaxSupply", "TotalCoinsMined"], index=clustered_df.index
)
plot_df["CoinName"] = clustered_df["CoinName"]
plot_df["Class"] = clustered_df["Class"]
plot_df.head()

Unnamed: 0,MaxSupply,TotalCoinsMined,CoinName,Class
42,2.047619e-12,0.0,42 Coin,0
NSR,4.761905e-14,6.236987e-06,NuShares,0
TRI,4.761905e-14,1.935141e-10,Triangles Coin,0
CMTC,4.761905e-14,8.81604e-10,CometCoin,1
CHAT,0.0,1.010101e-06,OpenChat,0


In [66]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
plot_df.hvplot.scatter(
    x="TotalCoinsMined", y="MaxSupply", hover_cols=["CoinName"], by="Class"
)

#### Table of Tradable Cryptocurrencies

In [67]:
# Table with tradable cryptos
clustered_df[["CoinName", "Algorithm", "ProofType", "MaxSupply", "TotalCoinsMined", "Class"]].hvplot.table() 

In [68]:
# Print the total number of tradable cryptocurrencies
print(f"There are {clustered_df.shape[0]} tradable cryptocurrencies.")

There are 136 tradable cryptocurrencies.
