# Clustering Crypto

In [28]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [29]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [30]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame(response["Data"]).T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,BuiltOn,SmartContractAddress,DecimalPoints,Difficulty,AlgorithmType
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,0.0,0.0,0.0,0.0,,,,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0,,
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [31]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
# crypto_df = pd.read_csv(file_path, index_col=0)
# crypto_df.head(10)

### Data Preprocessing

### Data Preprocessing

In [32]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df=crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','MaxSupply']]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0.0,0.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
433,433 Token,,False,,,
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
888,Octocoin,,True,PoW,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0


In [33]:
# Keep only cryptocurrencies that are trading
crypto_df=crypto_df[crypto_df["IsTrading"]==True]
print(crypto_df.shape)
crypto_df.head(10)

(6102, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0.0,0.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
888,Octocoin,,True,PoW,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0
CRAIG,CraigsCoin,X11,True,PoS,,


In [34]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df["Algorithm"] != "N/A"]
print(crypto_df.shape)
crypto_df.head(10)
      

(1637, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,0.0,0.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0
CRAIG,CraigsCoin,X11,True,PoS,,
XBS,Bitstake,X11,True,PoW/PoS,,
XPY,PayCoin,SHA-256,True,PoS,,


In [35]:
# Remove the "IsTrading" column
crypto_df.drop("IsTrading", axis=1, inplace=True)
print(crypto_df.shape)
crypto_df.head(10)

(1637, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,0.0,0.0
365,365Coin,X11,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,PoW,0.0,0.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,0.0,0.0
2015,2015 coin,X11,PoW/PoS,0.0,0.0
CRAIG,CraigsCoin,X11,PoS,,
XBS,Bitstake,X11,PoW/PoS,,
XPY,PayCoin,SHA-256,PoS,,


In [36]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna(axis=0, how="any")
print(crypto_df.shape)
crypto_df.head(10)


(690, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,0,0
365,365Coin,X11,PoW/PoS,0,-1
404,404Coin,Scrypt,PoW/PoS,0,-1
611,SixEleven,SHA-256,PoW,0,0
808,808,SHA-256,PoW/PoS,0,0
1337,EliteCoin,X13,PoW/PoS,0,0
2015,2015 coin,X11,PoW/PoS,0,0
XPD,PetroDollar,SHA-256D,,0,-1
ACOIN,ACoin,SHA-256,PoW,0,0
XMY,MyriadCoin,Multiple,PoW,0,2000000000


In [37]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]
print(crypto_df.shape)
crypto_df.head(10)

(297, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NSR,NuShares,PoS,PoS,6171700777.8311,0
TRI,Triangles Coin,X13,PoW/PoS,191617.845172,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1
QRL,Quantum Resistant Ledger,RandomX,PoW,75801530.184273,105000000
PURA,Pura,X11,PoW,188358976.839698,-1
BTCP,Bitcoin Private,Equihash,PoW,3818878.387802,22873588
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
FOIN,Foin,SHA-256,,92631000.8161,100000000


In [38]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.iloc[:] != "N/A"].dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NSR,NuShares,PoS,PoS,6171700777.8311,0
TRI,Triangles Coin,X13,PoW/PoS,191617.845172,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1
QRL,Quantum Resistant Ledger,RandomX,PoW,75801530.184273,105000000
PURA,Pura,X11,PoW,188358976.839698,-1
BTCP,Bitcoin Private,Equihash,PoW,3818878.387802,22873588
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
ZANO,Zano,ProgPowZ,PoW/PoS,13033953.536804,-1


In [39]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = pd.DataFrame(crypto_df["CoinName"], index=crypto_df.index)
print(coins_name.shape)
coins_name.head()

(131, 1)


Unnamed: 0,CoinName
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat
QRL,Quantum Resistant Ledger


In [40]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop("CoinName", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(131, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
NSR,PoS,PoS,6171700777.8311,0
TRI,X13,PoW/PoS,191617.845172,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1
QRL,RandomX,PoW,75801530.184273,105000000
PURA,X11,PoW,188358976.839698,-1
BTCP,Equihash,PoW,3818878.387802,22873588
ADK,IMesh,PoW,25000000.0,0
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
ZANO,ProgPowZ,PoW/PoS,13033953.536804,-1


In [43]:
# Create dummy variables for text features
X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
print(X.shape)
X.head(10)

(131, 80)


Unnamed: 0,TotalCoinsMined,MaxSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
NSR,6171700777.8311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,191617.845172,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000.0,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
QRL,75801530.184273,105000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PURA,188358976.839698,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BTCP,3818878.387802,22873588,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADK,25000000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DAPS,62319462900.0,70000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZANO,13033953.536804,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Standardize data
X = StandardScaler().fit_transform(X)
X[:5]

array([[-0.09640865, -0.09392626, -0.0877058 , -0.0877058 , -0.0877058 ,
        -0.12451456, -0.0877058 , -0.0877058 , -0.12451456, -0.12451456,
        -0.15309311, -0.0877058 , -0.0877058 , -0.23759548, -0.12451456,
        -0.0877058 , -0.0877058 , -0.0877058 , -0.30276504, -0.0877058 ,
        -0.0877058 , -0.23759548, -0.0877058 , -0.0877058 , -0.12451456,
        -0.0877058 , -0.0877058 , -0.0877058 , -0.0877058 , -0.0877058 ,
        -0.0877058 , -0.15309311, -0.0877058 , -0.0877058 , -0.12451456,
         5.01996016, -0.0877058 , -0.0877058 , -0.15309311, -0.12451456,
        -0.30276504, -0.12451456, -0.0877058 , -0.0877058 , -0.0877058 ,
        -0.44926035, -0.0877058 , -0.0877058 , -0.0877058 , -0.1774713 ,
        -0.0877058 , -0.19920477, -0.12451456, -0.0877058 , -0.0877058 ,
        -0.0877058 , -0.0877058 , -0.0877058 , -0.25503069, -0.0877058 ,
        -0.0877058 , -0.12451456, -0.0877058 , -0.0877058 ,  3.3028913 ,
        -0.0877058 , -0.0877058 , -0.0877058 , -0.9

### Reducing Dimensions Using PCA

In [45]:
# Use PCA to reduce dimensions to 3 principal components
n_comp = 3
pca = PCA(n_components=n_comp)
principal_components = pca.fit_transform(X)
principal_components
        

array([[-1.37161435,  0.67728946, -0.71255182],
       [-1.54657889, -0.75651557, -0.33606363],
       [ 0.73048319, -0.80611017, -0.24990159],
       [-0.91167688, -0.94016151, -0.30655066],
       [ 1.27026124, -0.65250638, -0.16249343],
       [ 0.48134712, -0.64583219, -0.13510663],
       [ 1.0764133 , -0.70583714, -0.17045616],
       [ 0.86476965, -0.57221032, -0.10414605],
       [-1.52418628,  1.25556972,  6.61469627],
       [-1.50421282, -0.90698287, -0.27356198],
       [-1.50407361, -0.90681174, -0.27352216],
       [ 1.27022455, -0.65254806, -0.16250292],
       [ 1.27891584, -0.64232797, -0.16015796],
       [ 0.73069403, -0.80587925, -0.2498521 ],
       [-1.50421437, -0.9069848 , -0.27356241],
       [ 2.26122052,  0.66693514,  0.13878937],
       [-1.61694819,  0.8370511 , -0.78903722],
       [ 0.88829663, -0.67589394, -0.15809091],
       [-1.52041703, -0.92026363, -0.277799  ],
       [ 1.84540354,  0.21748911,  0.03879529],
       [-1.56944052,  0.82566236,  3.014

In [46]:
# Create a DataFrame with the principal components data
col_names = [f"PC {i}" for i in range(1, n_comp + 1)]
pcs_df = pd.DataFrame(principal_components, columns=col_names, index=crypto_df.index)
print(pcs_df.shape)
pcs_df.head(10)

(131, 3)


Unnamed: 0,PC 1,PC 2,PC 3
NSR,-1.371614,0.677289,-0.712552
TRI,-1.546579,-0.756516,-0.336064
CMTC,0.730483,-0.80611,-0.249902
CHAT,-0.911677,-0.940162,-0.306551
QRL,1.270261,-0.652506,-0.162493
PURA,0.481347,-0.645832,-0.135107
BTCP,1.076413,-0.705837,-0.170456
ADK,0.86477,-0.57221,-0.104146
DAPS,-1.524186,1.25557,6.614696
ZANO,-1.504213,-0.906983,-0.273562


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [47]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

  "KMeans is known to have a memory leak on Windows "


Running K-Means with `k=<your best value for k here>`

In [48]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df], axis=1, sort=False)
clustered_df["CoinName"] = coins_name["CoinName"]
clustered_df["Class"] = model.labels_
print(clustered_df.shape)
clustered_df.head(10)

(131, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
NSR,PoS,PoS,6171700777.8311,0,-1.371614,0.677289,-0.712552,NuShares,1
TRI,X13,PoW/PoS,191617.845172,0,-1.546579,-0.756516,-0.336064,Triangles Coin,0
CMTC,Scrypt,PoW,872830.0,0,0.730483,-0.80611,-0.249902,CometCoin,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1,-0.911677,-0.940162,-0.306551,OpenChat,0
QRL,RandomX,PoW,75801530.184273,105000000,1.270261,-0.652506,-0.162493,Quantum Resistant Ledger,0
PURA,X11,PoW,188358976.839698,-1,0.481347,-0.645832,-0.135107,Pura,0
BTCP,Equihash,PoW,3818878.387802,22873588,1.076413,-0.705837,-0.170456,Bitcoin Private,0
ADK,IMesh,PoW,25000000.0,0,0.86477,-0.57221,-0.104146,Aidos Kuneen,0
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,70000000000,-1.524186,1.25557,6.614696,DAPS Coin,3
ZANO,ProgPowZ,PoW/PoS,13033953.536804,-1,-1.504213,-0.906983,-0.273562,Zano,0


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [49]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df[["MaxSupply", "TotalCoinsMined"]]
)
plot_df = pd.DataFrame(
    plot_data, columns=["MaxSupply", "TotalCoinsMined"], index=clustered_df.index
)
plot_df["CoinName"] = clustered_df["CoinName"]
plot_df["Class"] = clustered_df["Class"]
plot_df.head()

Unnamed: 0,MaxSupply,TotalCoinsMined,CoinName,Class
NSR,4.761905e-14,0.0003255124,NuShares,1
TRI,4.761905e-14,9.97588e-09,Triangles Coin,0
CMTC,4.761905e-14,4.590489e-08,CometCoin,0
CHAT,0.0,5.274264e-05,OpenChat,0
QRL,5e-06,3.997852e-06,Quantum Resistant Ledger,0


In [50]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
plot_df.hvplot.scatter(
    x="TotalCoinsMined", y="MaxSupply", hover_cols=["CoinName"], by="Class"
)

#### Table of Tradable Cryptocurrencies

In [51]:
# Table with tradable cryptos
# Table with tradable cryptos
clustered_df[
    [
        "CoinName",
        "Algorithm",
        "ProofType",
        "MaxSupply",
        "TotalCoinsMined",
        "Class",
    ]
].hvplot.table()

In [52]:
# Print the total number of tradable cryptocurrencies
print(f"There are {clustered_df.shape[0]} tradable cryptocurrencies.")

There are 131 tradable cryptocurrencies.
