# Clustering Crypto

## IMPORTANT NOTE: The API has been updated since this homework assignment was created and now returns "TotalCoinSupply" as "MaxSupply". When examining the code, results and visualisations, please note that "MaxSupply" can be read interchangeably with "TotalCoinSupply" wherever it appears, because of this change in the key names returned by the API.

In [3]:
# Install the altair library for plotting
!pip install -U altair

Collecting altair
  Downloading altair-4.1.0-py3-none-any.whl (727 kB)
     |████████████████████████████████| 727 kB 8.4 MB/s            
Installing collected packages: altair
Successfully installed altair-4.1.0


In [72]:
# Initial imports
import requests
import pandas as pd
import numpy as np
import altair as alt
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Set DataFrame options for consistent formatting

In [73]:
# Display options for DataFrames: floats are rendered with five decimal places
# and up to 1000 rows are shown before pandas truncates the output
pd.set_option('display.float_format', '{:.5f}'.format)
pd.set_option('display.max_rows', 1000)

### Fetching Cryptocurrency Data

In [74]:
# CryptoCompare endpoint used to fetch the coin list as JSON data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [75]:
# Create a DataFrame from the CryptoCompare coin list
# Call the CryptoCompare API endpoint; the timeout stops the cell from hanging
# indefinitely if the service does not respond
crypto_response_data = requests.get(url, timeout=30)

# Fail fast on an HTTP error status (4xx/5xx) instead of parsing an error body
crypto_response_data.raise_for_status()

# Convert response from the API call to JSON format
crypto_response_data_json = crypto_response_data.json()

# Reach into the JSON response content, select the "Data" entry and create a
# DataFrame from it (one column per coin at this point)
crypto_data = pd.DataFrame(crypto_response_data_json["Data"])

# Transpose the DataFrame so that each row is one coin, then inspect
crypto_df = crypto_data.T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,3.10799,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


### Data Preprocessing

In [76]:
# Keep only the columns needed for the analysis.
# 'MaxSupply' is the key the API now returns in place of 'TotalCoinSupply'.
columns_to_keep = ['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined', 'MaxSupply']
crypto_df = crypto_df.filter(items=columns_to_keep)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
433,433 Token,,False,,,


In [77]:
# Retain only the rows whose 'IsTrading' flag equals True
is_trading = crypto_df['IsTrading'] == True
crypto_df = crypto_df[is_trading]


In [78]:
# Discard rows whose 'Algorithm' is the placeholder text 'N/A'
has_algorithm = crypto_df['Algorithm'].ne('N/A')
crypto_df = crypto_df[has_algorithm]


In [79]:
# Remove the "IsTrading" column.
# Reassign instead of using inplace=True: crypto_df was produced by boolean
# filtering, and mutating such a frame in place can trigger pandas'
# SettingWithCopyWarning. errors='ignore' keeps the cell safe to re-run after
# the column has already been removed.
crypto_df = crypto_df.drop(columns='IsTrading', errors='ignore')


In [80]:
# Number of missing (null) values in each column
crypto_df.isna().sum()


CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    949
MaxSupply          949
dtype: int64

In [81]:
# Remove rows with at least 1 null value.
# Reassign instead of using inplace=True: crypto_df is the result of earlier
# boolean filtering, and in-place mutation of such a frame can trigger pandas'
# SettingWithCopyWarning.
crypto_df = crypto_df.dropna()

# Check null count post dropna (every column should now report 0)
crypto_df.isnull().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
MaxSupply          0
dtype: int64

In [82]:
# Keep only cryptocurrencies with a positive number of coins mined
has_mined_coins = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df[has_mined_coins]


In [83]:
# List the columns that still contain the literal text 'N/A' in any row
contains_na_text = crypto_df.isin(['N/A']).any()
crypto_df.columns[contains_na_text]


Index(['ProofType'], dtype='object')

In [84]:
# Drop the rows holding 'N/A' text values.
# The check in the previous cell reported ProofType as the only column containing them.
has_proof_type = crypto_df['ProofType'].ne('N/A')
crypto_df = crypto_df[has_proof_type]


In [85]:
# Keep the 'CoinName' column in its own single-column DataFrame before it is
# dropped from crypto_df
crypto_coin_name = crypto_df['CoinName'].to_frame()
crypto_coin_name.head(3)

Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin


In [86]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# Reassign instead of using inplace=True: crypto_df is the result of earlier
# boolean filtering, and in-place mutation of such a frame can trigger pandas'
# SettingWithCopyWarning. errors='ignore' keeps the cell safe to re-run after
# the column has already been removed.
crypto_df = crypto_df.drop(columns="CoinName", errors="ignore")
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.99995,42
NSR,PoS,PoS,6172691537.8311,0
TRI,X13,PoW/PoS,191620.8424,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1


In [87]:
# Summarise the object-typed columns to see which ones still need converting
# to numeric values (by casting or with get_dummies)
object_columns = crypto_df.select_dtypes(include='object')
object_columns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 133 entries, 42 to SIGNA
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Algorithm        133 non-null    object
 1   ProofType        133 non-null    object
 2   TotalCoinsMined  133 non-null    object
 3   MaxSupply        133 non-null    object
dtypes: object(4)
memory usage: 5.2+ KB


In [88]:
# Cast the two supply columns, TotalCoinsMined and MaxSupply, to float
float_columns = dict.fromkeys(["TotalCoinsMined", "MaxSupply"], float)
crypto_df = crypto_df.astype(float_columns)

In [89]:
# One-hot encode the remaining text features into dummy (indicator) columns
text_features = ['Algorithm', 'ProofType']
X = pd.get_dummies(data=crypto_df, columns=text_features)

In [90]:
# Standardize every feature column, then preview the first five scaled rows
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)
print(X_scaled[:5])

[[-0.09938151 -0.0932114  -0.08703883 -0.08703883 -0.08703883 -0.12356041
  -0.08703883 -0.08703883 -0.12356041 -0.12356041 -0.15191091 -0.08703883
  -0.08703883 -0.08703883 -0.23570226 -0.12356041 -0.08703883 -0.08703883
  -0.08703883 -0.3002731  -0.08703883 -0.08703883 -0.23570226 -0.08703883
  -0.08703883 -0.12356041 -0.08703883 -0.08703883 -0.08703883 -0.08703883
  -0.08703883 -0.08703883 -0.15191091 -0.08703883 -0.08703883 -0.12356041
  -0.19764235 -0.08703883 -0.08703883 -0.12356041 -0.12356041 -0.3002731
  -0.12356041 -0.08703883 -0.08703883 -0.08703883  2.18691762 -0.08703883
  -0.08703883 -0.08703883 -0.08703883 -0.17609018 -0.08703883 -0.19764235
  -0.12356041 -0.08703883 -0.08703883 -0.08703883 -0.08703883 -0.08703883
  -0.25298221 -0.08703883 -0.08703883 -0.12356041 -0.12356041 -0.08703883
  -0.31491833 -0.08703883 -0.08703883 -0.08703883 -0.96308682  2.07846097
  -0.08703883 -0.08703883 -0.08703883 -0.08703883 -0.08703883 -0.08703883
  -0.08703883 -0.08703883 -0.08703883 -

### Data has been cleaned and scaled and is ready for PCA and Clustering

### Reducing Dimensions Using PCA

In [91]:
# Project the scaled feature matrix onto 3 principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(X=X_scaled)


In [92]:
# Wrap the principal components in a DataFrame that shares crypto_df's index,
# so each row stays labelled with its coin
pc_labels = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(crypto_pca, index=crypto_df.index, columns=pc_labels)
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.9047,-0.98218,-0.28106
NSR,-1.2852,0.57579,-0.64769
TRI,-1.48123,-0.80279,-0.30291
CMTC,0.68832,-0.82446,-0.23626
CHAT,-0.90447,-0.98196,-0.28101
QRL,1.26054,-0.64431,-0.15855
PURA,0.4798,-0.64375,-0.12856
BTCP,1.07035,-0.69789,-0.16525
ADK,0.85857,-0.55982,-0.10366
DAPS,-1.52323,1.40868,6.71409


### Clustering Cryptocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [93]:
# Inertia recorded for each candidate number of clusters
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values.
# n_init is passed explicitly because its default changed across scikit-learn
# releases (FutureWarning in 1.2/1.3, "auto" from 1.4); pinning it to 10 keeps
# the curve the same regardless of the installed version.
for i in k:
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the DataFrame for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Plot the Elbow Curve using an Altair scatter plot
alt.Chart(df_elbow).mark_circle(size=60).encode(
    x="k",
    y="inertia",
    color=alt.Color('inertia', scale=alt.Scale(scheme='viridis')),
    tooltip=['inertia', 'k']
).properties(title="Elbow Curve: K-Means inertia by k")

Running K-Means with `k=5`

In [99]:
# Initialize the K-Means model.
# n_init is passed explicitly because its default changed across scikit-learn
# releases (FutureWarning in 1.2/1.3, "auto" from 1.4); pinning it to 10 keeps
# the clustering the same regardless of the installed version.
model = KMeans(n_clusters=5, n_init=10, random_state=0)

# Fit the model and obtain the cluster label of every coin in a single step
predictions = model.fit_predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.DataFrame({
    "Algorithm": crypto_df.Algorithm,
    "ProofType": crypto_df.ProofType,
    "TotalCoinsMined": crypto_df.TotalCoinsMined,
    "MaxSupply": crypto_df.MaxSupply,
    "PC 1": pcs_df['PC 1'],
    "PC 2": pcs_df['PC 2'],
    "PC 3": pcs_df['PC 3'],
    "CoinName": crypto_coin_name.CoinName,
    "Class": predictions,
    },
    index=crypto_df.index
)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42.0,-0.9047,-0.98218,-0.28106,42 Coin,1
NSR,PoS,PoS,6172691537.8311,0.0,-1.2852,0.57579,-0.64769,NuShares,1
TRI,X13,PoW/PoS,191620.8424,0.0,-1.48123,-0.80279,-0.30291,Triangles Coin,1
CMTC,Scrypt,PoW,872830.0,0.0,0.68832,-0.82446,-0.23626,CometCoin,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1.0,-0.90447,-0.98196,-0.28101,OpenChat,1


### Visualizing Results

#### Scatter Plot with Tradable Cryptocurrencies

In [100]:
# Standardize the two supply columns for the scatter plot.
# No column names are passed, so the resulting frame has integer column labels 0 and 1.
supply_features = clustered_df[["TotalCoinsMined", "MaxSupply"]]
supply_scaled = StandardScaler().fit_transform(supply_features)
clustered_scaled_df = pd.DataFrame(supply_scaled, index=clustered_df.index)
clustered_scaled_df.head()

Unnamed: 0,0,1
42,-0.09938,-0.09321
NSR,-0.09562,-0.09321
TRI,-0.09938,-0.09321
CMTC,-0.09938,-0.09321
CHAT,-0.09877,-0.09321


In [101]:
# Assemble the plotting DataFrame: the clustered features, with the two supply
# columns replaced by their standardized counterparts (columns 0 and 1 of
# clustered_scaled_df)
plot_columns = {
    "Algorithm": clustered_df["Algorithm"],
    "ProofType": clustered_df["ProofType"],
    "TotalCoinsMined": clustered_scaled_df[0],
    "MaxSupply": clustered_scaled_df[1],
    "PC 1": clustered_df["PC 1"],
    "PC 2": clustered_df["PC 2"],
    "PC 3": clustered_df["PC 3"],
    "CoinName": clustered_df["CoinName"],
    "Class": clustered_df["Class"],
}
df = pd.DataFrame(plot_columns, index=crypto_df.index)
df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,-0.09938,-0.09321,-0.9047,-0.98218,-0.28106,42 Coin,1
NSR,PoS,PoS,-0.09562,-0.09321,-1.2852,0.57579,-0.64769,NuShares,1
TRI,X13,PoW/PoS,-0.09938,-0.09321,-1.48123,-0.80279,-0.30291,Triangles Coin,1
CMTC,Scrypt,PoW,-0.09938,-0.09321,0.68832,-0.82446,-0.23626,CometCoin,0
CHAT,Scrypt,PoW/PoS,-0.09877,-0.09321,-0.90447,-0.98196,-0.28101,OpenChat,1


In [102]:
# Use the altair scatter plot to visualize the clusters, x="PC 1" and y="PC 2" for the axes
alt.Chart(df).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    # 'Class' holds cluster labels, so encode it as nominal (:N) to get one
    # discrete legend entry per cluster instead of a continuous gradient
    color=alt.Color('Class:N', scale=alt.Scale(scheme='viridis')),
    # Tool tips: "CoinName", "Algorithm", "TotalCoinsMined", "MaxSupply"
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "MaxSupply"]
)

In [104]:
# Use the altair scatter plot to visualize the tradable cryptocurrencies, x="TotalCoinsMined" and y="MaxSupply" for the axes
alt.Chart(df).mark_circle(size=60).encode(
    x="TotalCoinsMined",
    y="MaxSupply",
    # 'Class' holds cluster labels, so encode it as nominal (:N) to get one
    # discrete legend entry per cluster instead of a continuous gradient
    color=alt.Color('Class:N', scale=alt.Scale(scheme='viridis')),
    # Tool tips: "CoinName", "Algorithm", "TotalCoinsMined", "MaxSupply"
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "MaxSupply"]
)

#### Table of Tradable Cryptocurrencies

In [107]:
# Sort the clustered coins alphabetically by name
clustered_df = clustered_df.sort_values(by="CoinName")

# Show the table of tradable cryptocurrencies using the display() command
table_columns = ['CoinName', 'Algorithm', 'ProofType', 'MaxSupply', 'TotalCoinsMined', 'Class']
display(clustered_df[table_columns])

Unnamed: 0,CoinName,Algorithm,ProofType,MaxSupply,TotalCoinsMined,Class
42,42 Coin,Scrypt,PoW/PoS,42.0,41.99995,1
AEON,AEON,CryptoNight-Lite,PoW,-1.0,18019772.72546,0
ARK,ARK,DPoS,DPoS,-1.0,163233684.0,3
ACT,Achain,DPoS,DPoS,0.0,1000000000.0,3
AAC,Acute Angle Cloud,ECC 256K1,DPOS,-1.0,1000000000.0,4
ADK,Aidos Kuneen,IMesh,PoW,0.0,25000000.0,0
AION,Aion,"Equihash210,9",PoW/PoS,-1.0,501800769.0,1
AMB,Amber,Dagger,PoA,-1.0,906024641.0,4
ANC,Anchor Protocol,PoS,PoS,-1.0,1000000000.0,1
AR,Arweave,SHA-256,PoW,66000000.0,64598643.0,0
