In [428]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.cluster import AgglomerativeClustering
import plotly.figure_factory as ff

In [429]:
def initial_data_info(df):
    print('DATA TYPES:')
    display(df.dtypes)
    print('\nNULL VALUES:')
    for column in df.columns:
        print(f"Column '{column}' has {df[column].isnull().sum()} null values.")
    print('\nDUPLICATE ENTRIES:')
    print(f"{df.duplicated().sum()}\n")
    display(df.head())
    
def drop_column(df, column):
    df.drop(columns=[column], inplace=True)
    display(df.head())

def load_csv(file_path, df='dataframe'):
    df = pd.read_csv(file_path)
    return df

def save_csv(file_path, df):
    df.to_csv(file_path, index=False)
    
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model.fit(df)
    df["class"] = model.labels_
    
def elbow_curve(df, xstart=1, xend=11):
    inertia = []
    k = list(range(xstart, xend))
    for i in k:
        km = KMeans(n_clusters=i, random_state=0)
        km.fit(df)
        inertia.append(km.inertia_)   
    elbow_data = {"k": k, "inertia": inertia}
    df_elbow = pd.DataFrame(elbow_data)
    display(df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k))
    
def get_clusters(k, data):
    data = data.copy()
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(data)
    predictions = model.predict(data)
    data["class"] = model.labels_
    return data

# Data Preprocessing

In [430]:
crypto_df = pd.read_csv("Resources/crypto_data.csv")
initial_data_info(crypto_df)

DATA TYPES:


Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object


NULL VALUES:
Column 'Unnamed: 0' has 0 null values.
Column 'CoinName' has 0 null values.
Column 'Algorithm' has 0 null values.
Column 'IsTrading' has 0 null values.
Column 'ProofType' has 0 null values.
Column 'TotalCoinsMined' has 508 null values.
Column 'TotalCoinSupply' has 0 null values.

DUPLICATE ENTRIES:
0



Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [431]:
def change_bool(status):
    if status == True:
        return 1
    else:
        return 0
    
crypto_df['IsTrading'] = crypto_df['IsTrading'].apply(change_bool)

In [432]:
crypto_df = crypto_df[crypto_df['IsTrading'] == 1]

In [433]:
crypto_df = crypto_df.dropna(subset=['Algorithm'])

In [434]:
crypto_df = crypto_df.drop(columns=['IsTrading'])

In [435]:
crypto_df = crypto_df.dropna()

In [436]:
crypto_df.drop(crypto_df[crypto_df['TotalCoinsMined'] == 0].index, inplace=True)
crypto_df = crypto_df.set_index('Unnamed: 0')
crypto_df = crypto_df.sort_index()
crypto_df.index.name = None
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
1CR,1Credit,Scrypt,PoW,8.821300e+04,92000000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
8BIT,8BIT Coin,Scrypt,PoW/PoS,1.467841e+06,0
...,...,...,...,...,...
ZET,ZetaCoin,SHA-256,PoW,1.702044e+08,169795588
ZNE,ZoneCoin,Scrypt,PoW/PoS,2.581970e+06,21000000
ZNY,BitZeny,Scrypt,PoW,7.561450e+07,250000000
ZOI,Zoin,Lyra2RE,PoW,1.854576e+07,21000000


In [437]:
coins_name = crypto_df.sort_values(['CoinName'], ascending=True)

In [438]:
coins_name = coins_name.drop(columns=['CoinName'])
coins_name

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
CLUB,Scrypt,PoW/PoS,1.036219e+08,160000000
MCT,Ethash,PoS,1.618033e+06,1618033
1CR,Scrypt,PoW,8.821300e+04,92000000000
404,Scrypt,PoW/PoS,1.055185e+09,532000000
42,Scrypt,PoW/PoS,4.199995e+01,42
...,...,...,...,...
ZNE,Scrypt,PoW/PoS,2.581970e+06,21000000
EBST,Scrypt,PoW,9.999000e+07,100000000
GCN,Scrypt,PoW,1.630551e+11,200000000000
IBANK,Scrypt,PoW/PoS,4.526324e+06,44333333


In [439]:
X = coins_name.copy()
X = pd.get_dummies(X, columns=["Algorithm", "ProofType"])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
CLUB,1.036219e+08,160000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
MCT,1.618033e+06,1618033,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1CR,8.821300e+04,92000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNE,2.581970e+06,21000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
EBST,9.999000e+07,100000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCN,1.630551e+11,200000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
IBANK,4.526324e+06,44333333,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [440]:
X = StandardScaler().fit_transform(X)

# Reducing Data Dimensions Using PCA

In [441]:
pca = PCA(n_components=3)

In [442]:
X = pca.fit_transform(X)

In [443]:
pcs_df = pd.DataFrame(data=X, columns=['PC 1', 'PC 2', 'PC 3'], index=crypto_df.index)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
1337,-0.328912,1.040205,-0.542624
1CR,-0.282877,-0.055105,0.099605
404,0.489309,-1.177712,-0.004183
42,-0.314633,1.040201,-0.543088
8BIT,-0.331316,1.040277,-0.542557
...,...,...,...
ZET,-0.331135,1.040265,-0.542561
ZNE,-0.161954,-1.122441,0.008722
ZNY,3.250403,-1.205038,-0.089980
ZOI,-0.330946,1.040252,-0.542565


# Clustering Cryptocurrencies Using K-means

In [444]:
elbow_curve(pcs_df)

In [445]:
clustered_df = get_clusters(4, pcs_df)

In [446]:
clustered_df = clustered_df.join(crypto_df, how='outer')

In [447]:
clustered_df = clustered_df[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'PC 1', 'PC 2', 'PC 3', 'CoinName', 'class']]
clustered_df['Class'] = clustered_df['class']
clustered_df = clustered_df.drop(columns=['class'])
clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
1337,X13,PoW/PoS,2.927942e+10,314159265359,-0.328912,1.040205,-0.542624,EliteCoin,2
1CR,Scrypt,PoW,8.821300e+04,92000000000,-0.282877,-0.055105,0.099605,1Credit,2
404,Scrypt,PoW/PoS,1.055185e+09,532000000,0.489309,-1.177712,-0.004183,404Coin,0
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.314633,1.040201,-0.543088,42 Coin,2
8BIT,Scrypt,PoW/PoS,1.467841e+06,0,-0.331316,1.040277,-0.542557,8BIT Coin,2
...,...,...,...,...,...,...,...,...,...
ZET,SHA-256,PoW,1.702044e+08,169795588,-0.331135,1.040265,-0.542561,ZetaCoin,2
ZNE,Scrypt,PoW/PoS,2.581970e+06,21000000,-0.161954,-1.122441,0.008722,ZoneCoin,0
ZNY,Scrypt,PoW,7.561450e+07,250000000,3.250403,-1.205038,-0.089980,BitZeny,0
ZOI,Lyra2RE,PoW,1.854576e+07,21000000,-0.330946,1.040252,-0.542565,Zoin,2


# Visualizing Results

#### Create a 3D scatter plot using Plotly Express to plot the clusters using the clustered_df DataFrame. You should include the following parameters on the plot: hover_name="CoinName" and hover_data=["Algorithm"] to show this additional info on each data point.

In [448]:
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Use hvplot.table to create a data table with all the current tradable cryptocurrencies. The table should have the following columns: CoinName, Algorithm, ProofType, TotalCoinSupply, TotalCoinsMined, and Class.

In [449]:
clustered_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinSupply', 'TotalCoinsMined', 'Class'], 
                          width=800)

#### Create a scatter plot using hvplot.scatter to present the clustered data about cryptocurrencies having x="TotalCoinsMined" and y="TotalCoinSupply" to contrast the number of available coins versus the total number of mined coins. Use the hover_cols=["CoinName"] parameter to include the cryptocurrency name on each data point.

In [450]:
clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="Class", hover_cols=["CoinName"])