# Clustering Crypto


In [1]:
# Install the altair library for plotting
!pip install -U altair

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
     |████████████████████████████████| 727 kB 6.8 MB/s            
Installing collected packages: altair
Successfully installed altair-4.1.0


In [26]:
# Initial imports
import requests
import pandas as pd
import numpy as np
import altair as alt
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Set DataFrame options for consistent formatting

In [3]:
# Render every float shown in DataFrame output with five decimal places
pd.options.display.float_format = "{:.5f}".format

### Read Cryptocurrency Data CSV

In [7]:
# Location of the cryptocurrency dataset CSV file
file_path = Path("crypto_data.csv")

# Load the CSV, using its first column as the row index, and preview ten rows
crypto_df = pd.read_csv(filepath_or_buffer=file_path, index_col=0)
crypto_df.head(n=10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055184902.04,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279424622.5027,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927175.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684222.6865,0
LTC,Litecoin,Scrypt,True,PoW,63039243.3,84000000


### Data Preprocessing

In [8]:
# Keep only the columns needed for the analysis:
# 'CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply'
columns_to_keep = [
    'CoinName',
    'Algorithm',
    'IsTrading',
    'ProofType',
    'TotalCoinsMined',
    'TotalCoinSupply',
]
crypto_df = crypto_df.filter(items=columns_to_keep)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055184902.04,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [9]:
# Restrict the data to coins whose IsTrading flag equals True
is_trading = crypto_df['IsTrading'] == True
crypto_df = crypto_df.loc[is_trading]


In [10]:
# Discard coins whose Algorithm is recorded as the literal text 'N/A'.
# NOTE(review): pd.read_csv parses 'N/A' as missing (NaN) by default, so this
# comparison may match nothing here — confirm against the raw CSV.
has_algorithm = crypto_df['Algorithm'].ne('N/A')
crypto_df = crypto_df.loc[has_algorithm]


In [11]:
# Drop the "IsTrading" column now that the rows have been filtered on it
crypto_df = crypto_df.drop(columns='IsTrading')


In [12]:
# Tally the missing values in each column
crypto_df.isna().sum()


CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64

In [13]:
# Discard every row that has at least one missing value
crypto_df = crypto_df.dropna()

# Confirm that no missing values remain
crypto_df.isna().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [14]:
# Keep only coins with a strictly positive TotalCoinsMined
has_mined_coins = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df.loc[has_mined_coins]


In [15]:
# List the columns that contain the literal text 'N/A' in any row
contains_na_text = crypto_df.isin(['N/A']).any()
crypto_df.columns[contains_na_text]


Index([], dtype='object')

In [16]:
# Drop rows whose ProofType is the literal text 'N/A'.
# The column check above returned an empty Index, i.e. no column holds 'N/A'
# text, so this filter is a safeguard that removes nothing from this dataset.
has_proof_type = crypto_df['ProofType'].ne('N/A')
crypto_df = crypto_df.loc[has_proof_type]


In [17]:
# Preserve the 'CoinName' column in its own DataFrame before it is dropped from crypto_df
crypto_coin_name = crypto_df['CoinName'].to_frame()
crypto_coin_name.head(3)

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin


In [18]:
# Coin names are not used by the clustering algorithm, so remove the column
crypto_df = crypto_df.drop(columns="CoinName")
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055184902.04,532000000
1337,X13,PoW/PoS,29279424622.5027,314159265359
BTC,SHA-256,PoW,17927175.0,21000000
ETH,Ethash,PoW,107684222.6865,0


In [19]:
# Inspect the object-dtype columns to see which still need numeric conversion
object_columns = crypto_df.select_dtypes(include='object')
object_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Algorithm        532 non-null    object
 1   ProofType        532 non-null    object
 2   TotalCoinSupply  532 non-null    object
dtypes: object(3)
memory usage: 16.6+ KB


In [20]:
# Cast the TotalCoinSupply column from object dtype to float
supply_as_float = crypto_df["TotalCoinSupply"].astype(float)
crypto_df = crypto_df.assign(TotalCoinSupply=supply_as_float)

In [21]:
# One-hot encode the remaining text features into dummy columns
categorical_columns = ['Algorithm', 'ProofType']
X = pd.get_dummies(data=crypto_df, columns=categorical_columns)

In [22]:
# Standardize every feature column and preview the first five rows
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled[:5])

[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Data has been cleaned and scaled and is ready for PCA and Clustering

### Reducing Dimensions Using PCA

In [23]:
# Project the standardized features onto three principal components
N_COMPONENTS = 3
pca = PCA(n_components=N_COMPONENTS)
crypto_pca = pca.fit_transform(X_scaled)


In [24]:
# Wrap the principal components in a DataFrame that shares crypto_df's index
pc_columns = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(crypto_pca, index=crypto_df.index, columns=pc_columns)
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.33524,1.05476,-0.55487
404,-0.31858,1.05488,-0.55537
1337,2.30511,1.63952,-0.68246
BTC,-0.14742,-1.36125,0.20306
ETH,-0.16077,-1.99004,0.39249
LTC,-0.16539,-1.10785,0.01086
DASH,-0.40002,1.24036,-0.57522
XMR,-0.15702,-2.16893,0.45673
ETC,-0.15921,-1.99014,0.39246
ZEC,-0.129,-2.02117,0.3617


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [53]:
# Candidate cluster counts to evaluate
k = list(range(1, 11))

# Fit one K-Means model per candidate and record its inertia
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Collect the results into a DataFrame for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Draw the elbow curve as an Altair scatter plot, colored by inertia
inertia_color = alt.Color('inertia', scale=alt.Scale(scheme='viridis'))
alt.Chart(df_elbow).mark_circle(size=60).encode(
    x="k",
    y="inertia",
    color=inertia_color,
    tooltip=['inertia', 'k'],
)

Running K-Means with `k=4`

In [40]:
# Initialize the K-Means model with four clusters and a fixed random seed
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and obtain each coin's cluster label in a single pass.
# fit_predict returns the labels that the fitted model also stores in labels_,
# so `predictions` is now the value used for the "Class" column below instead
# of being computed by a separate predict() call and then left unused.
predictions = model.fit_predict(pcs_df)

# Combine the original features, the principal components, the coin names and
# the predicted cluster of each cryptocurrency into one DataFrame
clustered_df = pd.DataFrame(
    {
        "Algorithm": crypto_df["Algorithm"],
        "ProofType": crypto_df["ProofType"],
        "TotalCoinsMined": crypto_df["TotalCoinsMined"],
        "TotalCoinSupply": crypto_df["TotalCoinSupply"],
        "PC 1": pcs_df["PC 1"],
        "PC 2": pcs_df["PC 2"],
        "PC 3": pcs_df["PC 3"],
        "CoinName": crypto_coin_name["CoinName"],
        "Class": predictions,
    },
    index=crypto_df.index,
)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42.0,-0.33524,1.05476,-0.55487,42 Coin,0
404,Scrypt,PoW/PoS,1055184902.04,532000000.0,-0.31858,1.05488,-0.55537,404Coin,0
1337,X13,PoW/PoS,29279424622.5027,314159265359.0,2.30511,1.63952,-0.68246,EliteCoin,0
BTC,SHA-256,PoW,17927175.0,21000000.0,-0.14742,-1.36125,0.20306,Bitcoin,2
ETH,Ethash,PoW,107684222.6865,0.0,-0.16077,-1.99004,0.39249,Ethereum,2


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [41]:
# Standardize the two supply columns for the scatter plot; the resulting
# DataFrame keeps clustered_df's index and has integer column labels 0 and 1
supply_features = clustered_df[["TotalCoinsMined", "TotalCoinSupply"]]
supply_scaled = StandardScaler().fit_transform(supply_features)
clustered_scaled_df = pd.DataFrame(supply_scaled, index=clustered_df.index)
clustered_scaled_df.head()

Unnamed: 0,0,1
42,-0.11711,-0.15287
404,-0.09397,-0.14501
1337,0.52495,4.48942
BTC,-0.11672,-0.15256
ETH,-0.11475,-0.15287


In [42]:
# Build the plotting DataFrame: the same columns as clustered_df, with the two
# supply columns replaced by their standardized counterparts
df = clustered_df.assign(
    TotalCoinsMined=clustered_scaled_df[0],
    TotalCoinSupply=clustered_scaled_df[1],
)
df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,-0.11711,-0.15287,-0.33524,1.05476,-0.55487,42 Coin,0
404,Scrypt,PoW/PoS,-0.09397,-0.14501,-0.31858,1.05488,-0.55537,404Coin,0
1337,X13,PoW/PoS,0.52495,4.48942,2.30511,1.63952,-0.68246,EliteCoin,0
BTC,SHA-256,PoW,-0.11672,-0.15256,-0.14742,-1.36125,0.20306,Bitcoin,2
ETH,Ethash,PoW,-0.11475,-0.15287,-0.16077,-1.99004,0.39249,Ethereum,2


In [54]:
# Use the altair scatter plot to visualize the clusters, x="PC 1" and y="PC 2" for the axes.
# "Class" holds integer cluster labels; the ":N" suffix encodes it as nominal so
# each cluster gets its own discrete color instead of a continuous gradient.
alt.Chart(df).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    color=alt.Color('Class:N', scale=alt.Scale(scheme='viridis')),
    # Add the required tool tips "CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
)

In [56]:
# Use the altair scatter plot to visualize the tradable cryptocurrencies,
# x="TotalCoinsMined" and y="TotalCoinSupply" for the axes.
# "Class" holds integer cluster labels; the ":N" suffix encodes it as nominal so
# each cluster gets its own discrete color instead of a continuous gradient.
alt.Chart(df).mark_circle(size=60).encode(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    color=alt.Color('Class:N', scale=alt.Scale(scheme='viridis')),
    # Add the required tool tips "CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
)

#### Table of Tradable Cryptocurrencies

In [70]:
# Show the table of tradable cryptocurrencies with their cluster assignment using display()
table_columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class']
display(clustered_df[table_columns])

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinSupply,TotalCoinsMined,Class
42,42 Coin,Scrypt,PoW/PoS,42.00000,41.99995,0
404,404Coin,Scrypt,PoW/PoS,532000000.00000,1055184902.04000,0
1337,EliteCoin,X13,PoW/PoS,314159265359.00000,29279424622.50270,0
BTC,Bitcoin,SHA-256,PoW,21000000.00000,17927175.00000,2
ETH,Ethereum,Ethash,PoW,0.00000,107684222.68650,2
...,...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2000000000.00000,1999999995.30560,0
GAP,Gapcoin,Scrypt,PoW/PoS,250000000.00000,14931046.15466,0
BDX,Beldex,CryptoNight,PoW,1400222610.00000,980222595.00000,2
ZEN,Horizen,Equihash,PoW,21000000.00000,7296537.50000,2
