# Clustering Crypto

In [2]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [1]:
!pip install -U altair

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
[K     |████████████████████████████████| 727 kB 31.7 MB/s eta 0:00:01
Installing collected packages: altair
Successfully installed altair-4.1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [18]:
import altair as alt

### Fetching Cryptocurrency Data

In [4]:
# Create a DataFrame
crypto_df = pd.read_csv('crypto_data.csv', index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [5]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']]

In [6]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df.loc[crypto_df['IsTrading'] == True, :]

In [7]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df.loc[crypto_df['Algorithm'] != 'N/A', :]

In [8]:
# Remove the "IsTrading" column
crypto_df = crypto_df.drop(columns='IsTrading')

In [9]:
# Remove rows with at least 1 null value
crypto_df.dropna(inplace=True)

In [10]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df.loc[crypto_df['TotalCoinsMined'] != 0, :]

In [11]:
# Drop rows where there are 'N/A' text values
for i, r in crypto_df.iterrows():
    for j, c in r.iteritems():
        if c == 'N/A':
            crypto_df.drop(index=i, inplace=True)

In [12]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coin_name = crypto_df['CoinName'].to_frame()

In [13]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop(columns='CoinName')

In [14]:
# Create dummy variables for text features
X = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'], drop_first=True)

In [15]:
# Standardize data
X_standardized = StandardScaler().fit_transform(X)

### Reducing Dimensions Using PCA

In [16]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X_standardized)

In [17]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(crypto_pca, columns=['PC1', 'PC2', 'PC3'], index=crypto_df.index)
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.338559,1.040405,-0.576957
404,-0.321881,1.040631,-0.57737
1337,2.313831,1.640439,-0.658424
BTC,-0.145014,-1.322517,0.201948
ETH,-0.153123,-2.006441,0.402941


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [25]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(x="k", y="inertia").properties(title="Elbow Curb")

Running K-Means with `k=4`

In [26]:
# Initialize the K-Means model
km = KMeans(n_clusters=4, random_state=0) 
# Fit the model
km.fit(pcs_df)
# Predict clusters
pred = km.predict(pcs_df)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df, coin_name], axis=1, join='outer')
clustered_df['Class'] = pred
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.338559,1.040405,-0.576957,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.321881,1.040631,-0.57737,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.313831,1.640439,-0.658424,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.145014,-1.322517,0.201948,Bitcoin,3
ETH,Ethash,PoW,107684200.0,0,-0.153123,-2.006441,0.402941,Ethereum,3


### Visualizing Results

#### 3D-Scatter with Clusters

In [31]:
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC1',
    y='PC2',
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive().properties(width=400, height=400)

#### Table of Tradable Cryptocurrencies

In [28]:
# Print the total number of tradable cryptocurrencies
len(clustered_df)

533

#### Scatter Plot with Tradable Cryptocurrencies

In [33]:
# Scale data to create the scatter plot
total_coins_minded_scaled = StandardScaler().fit_transform(clustered_df['TotalCoinsMined'].values.reshape(-1, 1))
total_coin_supply_scaled = StandardScaler().fit_transform(clustered_df['TotalCoinSupply'].values.reshape(-1, 1))
clustered_df_scaled = clustered_df.copy()
clustered_df_scaled['TotalCoinsMinded'] = total_coins_minded_scaled
clustered_df_scaled['TotalCoinSupply'] = total_coin_supply_scaled

In [35]:
alt.Chart(clustered_df_scaled).mark_circle(size=60).encode(
    x="TotalCoinsMined", y="TotalCoinSupply", tooltip=['CoinName']
).interactive().properties(width=400, height=400)

In [36]:
display(clustered_df_scaled)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class,TotalCoinsMinded
42,Scrypt,PoW/PoS,4.199995e+01,-0.152865,-0.338559,1.040405,-0.576957,42 Coin,0,-0.116748
404,Scrypt,PoW/PoS,1.055185e+09,-0.144996,-0.321881,1.040631,-0.577370,404Coin,0,-0.093589
1337,X13,PoW/PoS,2.927942e+10,4.493764,2.313831,1.640439,-0.658424,EliteCoin,0,0.525872
BTC,SHA-256,PoW,1.792718e+07,-0.152554,-0.145014,-1.322517,0.201948,Bitcoin,3,-0.116354
ETH,Ethash,PoW,1.076842e+08,-0.152865,-0.153123,-2.006441,0.402941,Ethereum,3,-0.114384
...,...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,-0.123283,2.470871,0.758166,-0.131220,ZEPHYR,0,-0.072852
GAP,Scrypt,PoW/PoS,1.493105e+07,-0.149167,-0.336602,1.040300,-0.576973,Gapcoin,0,-0.116420
BDX,CryptoNight,PoW,9.802226e+08,-0.132154,0.334761,-2.306674,0.419676,Beldex,3,-0.095234
ZEN,Equihash,PoW,7.296538e+06,-0.152554,-0.135797,-1.978359,0.507056,Horizen,3,-0.116588
