In [220]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


# Deliverable 1: Preprocessing the Data for PCA

In [221]:
# Read the raw cryptocurrency dataset from the CSV file and preview its first rows.
file_path = "crypto_data.csv"
crypto_data_df = pd.read_csv(filepath_or_buffer=file_path)
crypto_data_df.head(5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [222]:
# Restrict the dataset to the coins whose "IsTrading" flag equals True.
is_trading_mask = crypto_data_df["IsTrading"].eq(True)
trading_crypto_df = crypto_data_df.loc[is_trading_mask]
trading_crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,UOS,SHA-256,True,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [223]:
# Retain only the rows whose "Algorithm" entry is not null.
has_algorithm = trading_crypto_df["Algorithm"].notna()
trading_crypto_df = trading_crypto_df[has_algorithm]
trading_crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,UOS,SHA-256,True,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [224]:
# Promote the "Unnamed: 0" column to the index (under an empty-string name),
# then discard the "IsTrading" column.
trading_crypto_df = (
    trading_crypto_df
    .rename(columns={'Unnamed: 0': ''})
    .set_index('')
    .drop(columns=["IsTrading"])
)

trading_crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42.0,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
365.0,365Coin,X11,PoW/PoS,,2300000000.0
404.0,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
611.0,SixEleven,SHA-256,PoW,,611000.0
808.0,808,SHA-256,PoW/PoS,0.0,0.0


In [225]:
# Report how many null values each column contains (nothing is removed here;
# this only shows which columns need cleaning).
null_counts = trading_crypto_df.isnull().sum()  # one pass over the frame instead of one per column
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

Column CoinName has 0 null vlaues
Column Algorithm has 0 null vlaues
Column ProofType has 0 null vlaues
Column TotalCoinsMined has 459 null vlaues
Column TotalCoinSupply has 0 null vlaues


In [226]:
# Remove rows that have at least 1 null value.
# No `subset` is given, so a null in ANY column removes the row, which is what
# the step asks for. Restricting the check to "TotalCoinsMined" would silently
# keep rows with nulls in other columns if the input data ever changed.
trading_crypto_df = trading_crypto_df.dropna()
trading_crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [227]:
# Retain only the coins with a strictly positive "TotalCoinsMined".
mined_mask = trading_crypto_df["TotalCoinsMined"].gt(0)
trading_crypto_df = trading_crypto_df.loc[mined_mask]
trading_crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,,
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [228]:
# Independent one-column DataFrame holding just the coin names (same index as the source).
Crypto_names_df = trading_crypto_df.loc[:, ["CoinName"]].copy()
Crypto_names_df


Unnamed: 0,CoinName
,
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex


In [232]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# The column is named through the `columns=` keyword: passing the axis
# positionally (`drop("CoinName", 1)`) triggers a pandas deprecation warning
# and is rejected by newer pandas releases, where those arguments are keyword-only.
clean_crypto_df = trading_crypto_df.drop(columns=["CoinName"])
clean_crypto_df


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
,,,,
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610


In [233]:
# One-hot encode the two text features; the remaining columns pass through unchanged.
text_feature_columns = ["Algorithm", "ProofType"]
X = pd.get_dummies(data=clean_crypto_df, columns=text_feature_columns)
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
,,,,,,,,,,,,,,,,,,,,,
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [234]:
# Fit a StandardScaler on the encoded features and apply it in one step.
scaler = StandardScaler()
crypto_scaled = scaler.fit_transform(X)
crypto_scaled


array([[-0.11710817, -0.1528703 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.145009  , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561,  4.48942416, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.13217937, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.15255998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.15285552, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

# Deliverable 2: Reducing Data Dimensions Using PCA

In [235]:
# Initialize PCA model with three components.
# random_state is pinned so repeated runs give the same components: with the
# default svd_solver="auto", scikit-learn may pick a randomized solver depending
# on the data shape, and that solver is seeded by random_state.
pca = PCA(n_components=3, random_state=0)

In [236]:
# Fit the PCA model on the scaled crypto data and project the data onto its principal components.
crypto_pcs = pca.fit_transform(X=crypto_scaled)

In [238]:
# Wrap the principal components in a DataFrame that reuses the index of clean_crypto_df.
pc_columns = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(crypto_pcs, index=clean_crypto_df.index, columns=pc_columns)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
,,,
42,-0.332584,1.018814,-0.600857
404,-0.315919,1.018930,-0.601348
1337,2.302691,1.633260,-0.684562
BTC,-0.146152,-1.303180,0.221455
ETH,-0.146801,-2.028011,0.424566
...,...,...,...
ZEPH,2.471661,0.869966,-0.090527
GAP,-0.330631,1.018688,-0.600885
BDX,0.311741,-2.359925,0.483691


# Deliverable 3: Clustering Cryptocurrencies Using K-means

In [239]:
# Finding best value for K
# Only the principal-component columns are used as features. A later cell adds
# a "class" label column to pcs_df; selecting the columns by name keeps this
# cell correct if it is re-run after that point, so cluster labels are never
# fed back in as a feature.
pc_features = pcs_df[["PC 1", "PC 2", "PC 3"]]

inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pc_features)
    inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [240]:
# Elbow curve: inertia plotted against each candidate K, drawn with hvplot.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.hvplot.line(title="Elbow Curve", x="k", y="inertia", xticks=k)

In [241]:
# K-means estimator with 5 clusters (the K picked from the elbow curve) and a fixed seed.
kmeans_params = {"n_clusters": 5, "random_state": 5}
model = KMeans(**kmeans_params)
model

KMeans(n_clusters=5, random_state=5)

In [242]:
# Fitting model on the three principal-component columns only.
# pcs_df later receives a "class" label column; selecting the features by name
# keeps the fit unchanged if this cell is re-run after that column exists.
model.fit(pcs_df[["PC 1", "PC 2", "PC 3"]])

KMeans(n_clusters=5, random_state=5)

In [243]:
# Get the predictions for the three principal-component columns.
# The features are selected by name so that the "class" column, which a later
# cell adds to pcs_df, is never passed to the model when this cell is re-run.
predictions = model.predict(pcs_df[["PC 1", "PC 2", "PC 3"]])
print(predictions)

[3 3 0 1 1 1 3 1 1 1 3 1 3 3 1 3 1 1 3 3 1 0 1 1 1 3 1 1 1 3 1 3 1 1 3 3 1
 1 1 1 1 1 3 3 1 1 1 1 1 3 3 1 3 1 1 1 1 3 1 1 3 1 3 3 3 1 1 1 3 3 3 3 3 1
 1 1 3 3 1 3 1 3 3 1 1 1 1 3 3 1 3 1 1 3 3 1 3 3 0 1 3 3 1 3 3 1 3 1 3 1 3
 1 3 3 1 1 3 1 1 1 3 1 1 1 1 1 3 3 1 1 1 3 1 3 1 1 3 1 3 1 3 3 1 1 3 1 1 3
 3 1 3 1 3 3 3 1 1 1 1 3 3 3 3 3 1 1 3 3 3 3 3 1 3 3 3 3 3 1 3 1 3 3 1 3 1
 3 3 1 3 1 3 1 3 1 3 3 3 3 1 3 3 3 3 3 1 1 0 3 1 1 3 3 3 3 3 1 3 3 3 3 3 3
 3 3 1 3 3 3 3 3 3 1 1 1 3 3 3 3 1 3 1 3 3 1 3 1 1 3 1 1 3 1 3 3 3 1 3 3 1
 3 3 3 3 3 3 3 1 3 1 3 3 3 3 1 3 1 3 1 1 1 1 3 1 3 3 1 3 1 1 1 3 1 3 1 0 1
 3 1 3 1 3 3 2 1 3 1 1 1 1 1 3 3 1 3 3 3 1 3 1 3 1 3 1 3 3 3 3 1 3 3 1 3 3
 3 1 1 1 1 3 0 3 3 1 3 1 1 1 3 3 1 1 3 3 1 3 1 1 1 3 1 1 3 3 3 1 1 1 3 3 3
 0 1 3 1 1 1 1 3 2 2 1 1 1 3 2 3 3 3 3 1 1 1 1 3 3 3 1 3 1 3 3 3 0 1 3 3 1
 3 3 1 1 3 1 3 1 1 1 1 3 3 1 3 1 3 3 0 3 3 3 1 1 1 3 3 3 3 3 3 1 3 1 1 1 0
 3 3 3 3 1 3 3 1 3 3 1 2 1 3 1 1 3 3 1 3 1 1 3 1 1 3 1 3 1 3 3 1 3 3 3 3 3
 1 1 1 3 0 3 1 3 1 3 1 3 

In [244]:
# Add a "class" column to pcs_df holding the fitted K-means model's labels_
# (one cluster label per row), then preview the first rows.
pcs_df["class"] = model.labels_
pcs_df.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
,,,,
42,-0.332584,1.018814,-0.600857,3.0
404,-0.315919,1.01893,-0.601348,3.0
1337,2.302691,1.63326,-0.684562,0.0
BTC,-0.146152,-1.30318,0.221455,1.0
ETH,-0.146801,-2.028011,0.424566,1.0


In [247]:
# Place the cleaned crypto features and the PCA/cluster columns side by side, aligned on the index.
frames_to_join = [clean_crypto_df, pcs_df]
clustered_df = pd.concat(objs=frames_to_join, axis="columns")
clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class
,,,,,,,,
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.332584,1.018814,-0.600857,3
404,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.315919,1.018930,-0.601348,3
1337,X13,PoW/PoS,2.927942e+10,314159265359,2.302691,1.633260,-0.684562,0
BTC,SHA-256,PoW,1.792718e+07,21000000,-0.146152,-1.303180,0.221455,1
ETH,Ethash,PoW,1.076842e+08,0,-0.146801,-2.028011,0.424566,1
...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000,2.471661,0.869966,-0.090527,0
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.330631,1.018688,-0.600885,3
BDX,CryptoNight,PoW,9.802226e+08,1400222610,0.311741,-2.359925,0.483691,1


In [248]:
# Append the coin names as a final column of the clustered data, aligned on the index.
frames_with_names = [clustered_df, Crypto_names_df]
clustered1_df = pd.concat(objs=frames_with_names, axis="columns")
clustered1_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
,,,,,,,,,
42,Scrypt,PoW/PoS,4.199995e+01,42,-0.332584,1.018814,-0.600857,3,42 Coin
404,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.315919,1.018930,-0.601348,3,404Coin
1337,X13,PoW/PoS,2.927942e+10,314159265359,2.302691,1.633260,-0.684562,0,EliteCoin
BTC,SHA-256,PoW,1.792718e+07,21000000,-0.146152,-1.303180,0.221455,1,Bitcoin
ETH,Ethash,PoW,1.076842e+08,0,-0.146801,-2.028011,0.424566,1,Ethereum
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2000000000,2.471661,0.869966,-0.090527,0,ZEPHYR
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.330631,1.018688,-0.600885,3,Gapcoin
BDX,CryptoNight,PoW,9.802226e+08,1400222610,0.311741,-2.359925,0.483691,1,Beldex


# Deliverable 4: Visualizing Cryptocurrencies Results

In [249]:
# 3D scatter of the three principal components, coloured and marked by "class";
# hovering a point shows the coin name and its algorithm.
scatter_3d_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)
fig = px.scatter_3d(clustered1_df, **scatter_3d_options)
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [250]:
# Sortable, selectable hvplot table of the descriptive columns plus the cluster class.
table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinSupply",
    "TotalCoinsMined",
    "class",
]
clusterd1_df = clustered1_df.loc[:, table_columns]
clusterd1_df.hvplot.table(sortable=True, selectable=True)



In [251]:
# Report how many cryptocurrencies (rows) the table DataFrame contains.
crypto_count = clusterd1_df.shape[0]
print(f"There are {crypto_count} cryptocurrencies in the dataframe.")

There are 532 cryptocurrencies in the dataframe.


In [252]:
# Rescale TotalCoinSupply and TotalCoinsMined with MinMaxScaler (default settings).
supply_and_mined = clustered1_df.loc[:, ['TotalCoinSupply', 'TotalCoinsMined']]
mms = MinMaxScaler()
clustered_df_scaled = mms.fit_transform(supply_and_mined)
clustered_df_scaled

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [253]:
# Wrap the scaled values in a DataFrame that carries the index of clustered_df.
plot_df = pd.DataFrame(
    clustered_df_scaled,
    index=clustered_df.index,
    columns=['TotalCoinSupply', 'TotalCoinsMined'],
)
plot_df

Unnamed: 0,TotalCoinSupply,TotalCoinsMined
,,
42,4.200000e-11,0.000000e+00
404,5.320000e-04,1.065855e-03
1337,3.141593e-01,2.957551e-02
BTC,2.100000e-05,1.810842e-05
ETH,0.000000e+00,1.087731e-04
...,...,...
ZEPH,2.000000e-03,2.020225e-03
GAP,2.500000e-04,1.508199e-05
BDX,1.400223e-03,9.901351e-04


In [254]:
# Copy the coin name and cluster class from clusterd1_df into plot_df
# (the class column is stored under the capitalised name "Class").
for target_column, source_column in (("CoinName", "CoinName"), ("Class", "class")):
    plot_df[target_column] = clusterd1_df[source_column]
plot_df

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
,,,,
42,4.200000e-11,0.000000e+00,42 Coin,3
404,5.320000e-04,1.065855e-03,404Coin,3
1337,3.141593e-01,2.957551e-02,EliteCoin,0
BTC,2.100000e-05,1.810842e-05,Bitcoin,1
ETH,0.000000e+00,1.087731e-04,Ethereum,1
...,...,...,...,...
ZEPH,2.000000e-03,2.020225e-03,ZEPHYR,0
GAP,2.500000e-04,1.508199e-05,Gapcoin,3
BDX,1.400223e-03,9.901351e-04,Beldex,1


In [255]:
# 2D scatter of scaled coins mined (x) against scaled coin supply (y),
# grouped by cluster class, with the coin name shown on hover.
scatter_options = {
    "x": "TotalCoinsMined",
    "y": "TotalCoinSupply",
    "by": "Class",
    "hover_cols": ["CoinName"],
}
plot_df.hvplot.scatter(**scatter_options)