# Clustering Crypto

In [1]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
#import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path

### Read in Cryptocurrency Data

In [85]:
# Read in crypto data for all coins into a DataFrame
# csv data generated from the the CryptoCompare URL API (https://min-api.cryptocompare.com/data/all/coinlist)
data_path = Path("crypto_data.csv")
crypto_df = pd.read_csv(data_path, index_col="Unnamed: 0")
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [86]:
# Keep only necessary columns which are already present in the csv file:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

In [87]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df["IsTrading"] != False]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [88]:
# Keep only cryptocurrencies with a working algorithm (i.e, ProofType has to contain "PoW")
crypto_df = crypto_df[crypto_df["ProofType"].str.contains("PoW")]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
VPRC,VapersCoin,Scrypt,True,PoW,,42750000000
GAP,Gapcoin,Scrypt,True,PoW/PoS,1.493105e+07,250000000
SERO,Super Zero,Ethash,True,PoW,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610


In [89]:
# Remove the "IsTrading" column
crypto_df = crypto_df.drop(columns="IsTrading")
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
VPRC,VapersCoin,Scrypt,PoW,,42750000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
SERO,Super Zero,Ethash,PoW,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [90]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
RCC,Reality Clash,Ethash,PoW,2.448794e+07,24487944
ILT,iOlite,Ethash,PoW,0.000000e+00,1000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [91]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] != 0]
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
VOLLAR,Vollar,Equihash+Scrypt,PoW,1.000000e+08,2100000000
RCC,Reality Clash,Ethash,PoW,2.448794e+07,24487944
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [92]:
# There are no other values in the DataFrame that are'N/A' text values, so no further dropping of rows is needed

In [93]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = pd.DataFrame(index=crypto_df.index)
coins_name["CoinName"] = crypto_df["CoinName"]
coins_name

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
VOLLAR,Vollar
RCC,Reality Clash
GAP,Gapcoin
BDX,Beldex


In [94]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop(columns="CoinName")
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
VOLLAR,Equihash+Scrypt,PoW,1.000000e+08,2100000000
RCC,Ethash,PoW,2.448794e+07,24487944
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610


In [95]:
# Create dummy variables for text features
X = pd.get_dummies(crypto_df, columns=["Algorithm", "ProofType"])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoS/PoW,ProofType_PoS/PoW/PoT,ProofType_PoW,ProofType_PoW + Hive,ProofType_PoW and PoS,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_dPoW/PoW
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VOLLAR,1.000000e+08,2100000000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
RCC,2.448794e+07,24487944,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [96]:
# Standardize data using Standard Scaler
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-0.2150223 , -0.16172741, -0.04867924, ..., -0.04867924,
        -0.04867924, -0.04867924],
       [-0.15406818, -0.15261707, -0.04867924, ..., -0.04867924,
        -0.04867924, -0.04867924],
       [ 1.47634157,  5.21815545, -0.04867924, ..., -0.04867924,
        -0.04867924, -0.04867924],
       ...,
       [-0.21415979, -0.15744624, -0.04867924, ..., -0.04867924,
        -0.04867924, -0.04867924],
       [-0.15839847, -0.13774902, -0.04867924, ..., -0.04867924,
        -0.04867924, -0.04867924],
       [-0.2146008 , -0.16136779, -0.04867924, ..., -0.04867924,
        -0.04867924, -0.04867924]])

### Reducing Dimensions Using PCA

In [97]:
# Use PCA to reduce dimensions to 3 principal components
# Initiate the PCA model
pca = PCA(n_components=3)

# Get 3 components from the crypto data in X_scaled
crypto_pca = pca.fit_transform(X_scaled)

In [98]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(
    data=crypto_pca,
    columns=["PC 1", "PC 2", "PC 3"],
    index=crypto_df.index)

pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-1.606744,-0.160799,-0.404964
404,-1.599349,-0.162452,-0.428818
1337,-1.412785,-0.284939,-2.743130
BTC,1.162957,0.010059,0.504444
ETH,1.822038,0.045365,0.660612
...,...,...,...
VOLLAR,1.804029,0.043965,0.635581
RCC,1.821594,0.045472,0.662140
GAP,-1.606133,-0.160905,-0.406590
BDX,2.065260,-0.007798,-0.456527


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [74]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values


# Create the Elbow Curve using hvPlot


Running K-Means with `k=<your best value for k here>`

In [75]:
# Initialize the K-Means model

# Fit the model

# Predict clusters

# Create a new DataFrame including predicted clusters and cryptocurrencies features


### Visualizing Results

#### 3D-Scatter with Clusters

In [76]:
# Create a 3D-Scatter with the PCA data and the clusters


#### Table of Tradable Cryptocurrencies

In [77]:
# Table with tradable cryptos


In [78]:
# Print the total number of tradable cryptocurrencies


#### Scatter Plot with Tradable Cryptocurrencies

In [79]:
# Scale data to create the scatter plot


In [80]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
