In [124]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [125]:
# Load data
crypto_df = pd.read_csv("../crypto_data.csv")
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [126]:
crypto_df.rename(columns={"Unnamed: 0" :'Symbol'}, inplace=True)
crypto_df.head()

Unnamed: 0,Symbol,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [127]:
crypto_df = crypto_df[(crypto_df["IsTrading"] == True)]
crypto_df.head()

Unnamed: 0,Symbol,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [128]:
# Validate that data frame only has rows with isTrading == True
crypto_df[(crypto_df["IsTrading"] != True)].head()

Unnamed: 0,Symbol,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply


In [129]:
# Drop isTrading column
crypto_df = crypto_df.drop(["IsTrading"], axis=1)
crypto_df.head()

Unnamed: 0,Symbol,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [130]:
crypto_df = crypto_df.dropna()
crypto_df.head()

Unnamed: 0,Symbol,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,808,SHA-256,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [131]:
crypto_df = crypto_df[(crypto_df["TotalCoinsMined"] != 0)]
crypto_df.head()

Unnamed: 0,Symbol,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [132]:
crypto_df[(crypto_df["TotalCoinsMined"] == 0)]

Unnamed: 0,Symbol,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply


In [133]:
coin_name_df = crypto_df["CoinName"]
coin_name_df.head()

0      42 Coin
2      404Coin
5    EliteCoin
7      Bitcoin
8     Ethereum
Name: CoinName, dtype: object

In [134]:
# Drop isTrading column
crypto_df = crypto_df.drop(["CoinName"], axis=1)
crypto_df.head()

Unnamed: 0,Symbol,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42
2,404,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethash,PoW,107684200.0,0


In [135]:
crypto_df["Symbol"] = LabelEncoder().fit_transform(crypto_df["Symbol"])
crypto_df["Algorithm"] = LabelEncoder().fit_transform(crypto_df["Algorithm"])
crypto_df["ProofType"] = LabelEncoder().fit_transform(crypto_df["ProofType"])
X = crypto_df
X.head()

Unnamed: 0,Symbol,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,3,53,16,41.99995,42
2,2,53,16,1055185000.0,532000000
5,0,67,16,29279420000.0,314159265359
7,61,47,13,17927180.0,21000000
8,160,20,13,107684200.0,0


In [136]:
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-1.70930643,  0.39335561,  0.8871567 , -0.11674788, -0.15286468],
       [-1.7158057 ,  0.39335561,  0.8871567 , -0.09358885, -0.14499604],
       [-1.72880422,  1.21439604,  0.8871567 ,  0.52587231,  4.4937636 ],
       ...,
       [-1.48183219, -2.12841143,  0.00878917, -0.09523411, -0.13215444],
       [ 1.68330938, -1.65924547,  0.00878917, -0.11658774, -0.15255408],
       [ 1.40384102,  0.39335561, -1.45515672, -0.11674507, -0.15284989]])

In [137]:
# Initialize PCA model
pca = PCA(n_components=3)

In [138]:
# Get two principal components for the iris data.
X_pca = pca.fit_transform(X_scaled)

In [139]:
X_pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2", "PC3"])
X_pca_df.head()

Unnamed: 0,PC1,PC2,PC3
0,-0.317753,-1.421685,1.32692
1,-0.296043,-1.428113,1.330705
2,3.239852,-2.571439,0.580838
3,-0.120947,-0.498242,1.222628
4,-0.020842,0.790171,1.219174


In [140]:
pca.explained_variance_ratio_

array([0.34804464, 0.23506183, 0.194829  ])

In [8]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_df)
    inertia.append(km.inertia_)
    
print(inertia)

[574.8058521688919, 198.70683592637369, 116.1092402140154, 89.72510241804252, 66.31069890113902, 57.26671348460513, 47.964565364690465, 40.242988880344654, 32.18519537310345, 28.40013955717787]


In [9]:
# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df

Unnamed: 0,k,inertia
0,1,574.805852
1,2,198.706836
2,3,116.10924
3,4,89.725102
4,5,66.310699
5,6,57.266713
6,7,47.964565
7,8,40.242989
8,9,32.185195
9,10,28.40014


In [10]:
elbow_df.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [13]:
# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pca_df)

# Predict clusters
predictions = model.predict(iris_pca_df)

# Add the predicted class columns
iris_pca_df["class"] = model.labels_
iris_pca_df.head()

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,1
1,-2.086426,-0.655405,1
2,-2.36795,-0.318477,1
3,-2.304197,-0.575368,1
4,-2.388777,0.674767,1


In [14]:
pca_df.hvplot.scatter(x="principal component 1", y="principal component 2", hover_cols=["class"], by="class", )