In [30]:
#import dependencies for PCA and KMeans clustering
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [66]:
#load the cyptocurrencies data from csv file
file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col=0)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [67]:
#check the count of rows before filtering
crypto_df.count()

CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64

In [68]:
#filter out cryptocurrencies which are not trading currently
crypto_filtered_df = crypto_df[crypto_df['IsTrading']==True]
crypto_filtered_df.count()

CoinName           1144
Algorithm          1144
IsTrading          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [69]:
#filter out cryptocurrencies that have algorithm defined
crypto_filtered_df = crypto_filtered_df[crypto_filtered_df['Algorithm'].notnull()]
crypto_filtered_df.count()

CoinName           1144
Algorithm          1144
IsTrading          1144
ProofType          1144
TotalCoinsMined     685
TotalCoinSupply    1144
dtype: int64

In [70]:
#drop the IsTrading column from the dataframe
crypto_filtered_df = crypto_filtered_df.drop(columns=['IsTrading'], axis=1)
crypto_filtered_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [71]:
#remove cryptocurrencies with at least one null value
crypto_filtered_df = crypto_filtered_df.dropna()
crypto_filtered_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [72]:
#remove cryptocurrencies which doesn't have any coins mined
crypto_filtered_df = crypto_filtered_df[crypto_filtered_df['TotalCoinsMined'] != 0]
crypto_filtered_df.count()

CoinName           533
Algorithm          533
ProofType          533
TotalCoinsMined    533
TotalCoinSupply    533
dtype: int64

In [73]:
#get all the coin name to a dataframe
coin_name_df = pd.DataFrame(crypto_filtered_df['CoinName'], index=crypto_filtered_df.index)
coin_name_df.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [74]:
#remove the column coin name
crypto_filtered_df = crypto_filtered_df.drop(columns=['CoinName'], axis=1)
crypto_filtered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [75]:
textual_columns = ['Algorithm', 'ProofType']
#encode using pandas get_dummies
X = pd.get_dummies(crypto_filtered_df, columns = textual_columns)
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
#standardize the features before using the feature
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:10])

[[-0.11674788 -0.15286468 -0.0433555  -0.0433555  -0.0433555  -0.06137164
  -0.07523548 -0.0433555  -0.06137164 -0.06137164 -0.0433555  -0.0433555
  -0.19226279 -0.06137164 -0.09731237 -0.0433555  -0.11536024 -0.07523548
  -0.0433555  -0.0433555  -0.15176505 -0.0433555  -0.13105561 -0.0433555
  -0.0433555  -0.08695652 -0.0433555  -0.0433555  -0.0433555  -0.0433555
  -0.06137164 -0.0433555  -0.08695652 -0.08695652 -0.08695652 -0.0433555
  -0.13105561 -0.13827675 -0.13827675 -0.0433555  -0.06137164 -0.0433555
  -0.07523548 -0.1815096  -0.0433555  -0.0433555  -0.0433555  -0.07523548
  -0.15811388 -0.3145935  -0.0433555  -0.08695652 -0.07523548 -0.06137164
  -0.0433555   1.38873015 -0.0433555  -0.0433555  -0.06137164 -0.0433555
  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555
  -0.0433555  -0.39836623 -0.0433555  -0.1815096  -0.0433555  -0.08695652
  -0.08695652 -0.10670145 -0.0433555  -0.0433555  -0.13105561 -0.0433555
  -0.0433555  -0.0433555  -0.0433555  -0.07523

In [87]:
#Initialize PCA model
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

In [88]:
#create new dataframe with PCA
pcs_df = pd.DataFrame(
                       data=X_pca,
                       index = X.index,
                       columns=['PC 1', 'PC 2', 'PC 3'])
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.341455,1.094847,-0.553961
404,-0.324842,1.095431,-0.554604
1337,2.281838,1.770139,-0.659218
BTC,-0.154407,-1.346324,0.177309
ETH,-0.15497,-2.044623,0.388269


In [89]:
pca.explained_variance_ratio_

array([0.02736711, 0.02090763, 0.02006771])

In [90]:
#do elbow method to get the number of clusters for KMeans
k = list(range(1,12))
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

#create a dictionary with the k and inertia
elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df.hvplot.line(x='k', y='inertia', xticks=k, title='Elbow curve')



In [92]:
#where k=4 the line becomes almost straight so lets pick the cluster size as 4 for KMeans
model = KMeans(n_clusters=4, random_state=0)
model.fit(pcs_df)

X_predictions = model.predict(pcs_df)
pcs_df['Class'] = model.labels_
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3,Class
42,-0.341455,1.094847,-0.553961,0
404,-0.324842,1.095431,-0.554604,0
1337,2.281838,1.770139,-0.659218,0
BTC,-0.154407,-1.346324,0.177309,1
ETH,-0.15497,-2.044623,0.388269,1


In [114]:
#create new clusterd dataframe
clustered_df = crypto_filtered_df.join(coin_name_df)
clustered_df = clustered_df.join(pcs_df)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,CoinName,PC 1,PC 2,PC 3,Class
42,Scrypt,PoW/PoS,41.99995,42,42 Coin,-0.341455,1.094847,-0.553961,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,404Coin,-0.324842,1.095431,-0.554604,0
1337,X13,PoW/PoS,29279420000.0,314159265359,EliteCoin,2.281838,1.770139,-0.659218,0
BTC,SHA-256,PoW,17927180.0,21000000,Bitcoin,-0.154407,-1.346324,0.177309,1
ETH,Ethash,PoW,107684200.0,0,Ethereum,-0.15497,-2.044623,0.388269,1


In [115]:
clustered_df = clustered_df[['Algorithm','ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'PC 1', 'PC 2', 'PC 3', 'CoinName', 'Class']]
clustered_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.341455,1.094847,-0.553961,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.324842,1.095431,-0.554604,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.281838,1.770139,-0.659218,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.154407,-1.346324,0.177309,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.15497,-2.044623,0.388269,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.168474,-1.087515,0.011749,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.418147,1.310355,-0.540897,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.160513,-2.229015,0.398104,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.153418,-2.044673,0.388241,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.150276,-1.97856,0.547972,ZCash,1


In [116]:
#import plotly for plotting 3d scatter
import plotly.express as px

In [117]:
fig = px.scatter_3d(
    clustered_df,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='Class',
    symbol='Class',
    hover_name='CoinName',
    hover_data=['Algorithm']
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()  

In [118]:
#create a table of cryptocurrencies
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class'])

In [121]:
#scale only TotalCoinsMined and TotalCoinSupply
extracted_column_df = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']]
scaled_extracted_column = StandardScaler().fit_transform(extracted_column_df)
scaled_extracted_column_df = pd.DataFrame(
                                    data=scaled_extracted_column,
                                    columns=['TotalCoinSupply', 'TotalCoinsMined'],
                                    index = clustered_df.index)
scaled_extracted_column_df.head()


Unnamed: 0,TotalCoinSupply,TotalCoinsMined
42,-0.152865,-0.116748
404,-0.144996,-0.093589
1337,4.493764,0.525872
BTC,-0.152554,-0.116354
ETH,-0.152865,-0.114384


In [122]:
#merged dataframe with scaled data
merged_df = clustered_df.copy();
merged_df['TotalCoinSupply'] = scaled_extracted_column_df['TotalCoinSupply']
merged_df['TotalCoinsMined'] = scaled_extracted_column_df['TotalCoinsMined']
merged_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,-0.116748,-0.152865,-0.341455,1.094847,-0.553961,42 Coin,0
404,Scrypt,PoW/PoS,-0.093589,-0.144996,-0.324842,1.095431,-0.554604,404Coin,0
1337,X13,PoW/PoS,0.525872,4.493764,2.281838,1.770139,-0.659218,EliteCoin,0
BTC,SHA-256,PoW,-0.116354,-0.152554,-0.154407,-1.346324,0.177309,Bitcoin,1
ETH,Ethash,PoW,-0.114384,-0.152865,-0.15497,-2.044623,0.388269,Ethereum,1


In [119]:
clustered_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
    by="Class"
    )

In [124]:
merged_df.hvplot.scatter(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    hover_cols=["CoinName"],
    by="Class"
    )