# Clustering Crypto

In [79]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


### Deliverable 1: Preprocessing the Data for PCA

In [80]:
# Load the crypto_data.csv dataset.
# The CSV's first column has no header, so pandas labels it "Unnamed: 0";
# that column is used as the row index.
file_path = "./Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col="Unnamed: 0")
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [81]:
# Keep all the cryptocurrencies that are being traded.
# Build the row mask first, then select the matching rows by label.
is_traded = crypto_df["IsTrading"] == True
crypto_df_traded = crypto_df.loc[is_traded]
print(crypto_df_traded)
crypto_df_traded.head(10)

         CoinName    Algorithm  IsTrading ProofType  TotalCoinsMined  \
42        42 Coin       Scrypt       True   PoW/PoS     4.199995e+01   
365       365Coin          X11       True   PoW/PoS              NaN   
404       404Coin       Scrypt       True   PoW/PoS     1.055185e+09   
611     SixEleven      SHA-256       True       PoW              NaN   
808           808      SHA-256       True   PoW/PoS     0.000000e+00   
...           ...          ...        ...       ...              ...   
SERO   Super Zero       Ethash       True       PoW              NaN   
UOS           UOS      SHA-256       True      DPoI              NaN   
BDX        Beldex  CryptoNight       True       PoW     9.802226e+08   
ZEN       Horizen     Equihash       True       PoW     7.296538e+06   
XBC   BitcoinPlus       Scrypt       True       PoS     1.283270e+05   

     TotalCoinSupply  
42                42  
365       2300000000  
404        532000000  
611           611000  
808                0

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [None]:
# Keep all the cryptocurrencies that have a working algorithm.
# YOUR CODE HERE
# Per Jon Richards, we are to comment that there are no invalid or null Algorithm values, and move on.

In [82]:
# Remove the "IsTrading" column.
# drop() returns a new frame, so crypto_df_traded itself is left untouched.
crypto_df_drop = crypto_df_traded.drop(columns=["IsTrading"])
print(crypto_df_drop)
crypto_df_drop.head()

         CoinName    Algorithm ProofType  TotalCoinsMined TotalCoinSupply
42        42 Coin       Scrypt   PoW/PoS     4.199995e+01              42
365       365Coin          X11   PoW/PoS              NaN      2300000000
404       404Coin       Scrypt   PoW/PoS     1.055185e+09       532000000
611     SixEleven      SHA-256       PoW              NaN          611000
808           808      SHA-256   PoW/PoS     0.000000e+00               0
...           ...          ...       ...              ...             ...
SERO   Super Zero       Ethash       PoW              NaN      1000000000
UOS           UOS      SHA-256      DPoI              NaN      1000000000
BDX        Beldex  CryptoNight       PoW     9.802226e+08      1400222610
ZEN       Horizen     Equihash       PoW     7.296538e+06        21000000
XBC   BitcoinPlus       Scrypt       PoS     1.283270e+05         1000000

[1144 rows x 5 columns]


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [83]:
# Remove rows that have at least 1 null value.
# axis=0 / how="any" are the dropna() defaults, spelled out here for clarity.
crypto_df_drop = crypto_df_drop.dropna(axis=0, how="any")

In [84]:
# Print the current contents of crypto_df_drop to inspect the result of the null-row removal.
print(crypto_df_drop)

         CoinName    Algorithm ProofType  TotalCoinsMined TotalCoinSupply
42        42 Coin       Scrypt   PoW/PoS     4.199995e+01              42
404       404Coin       Scrypt   PoW/PoS     1.055185e+09       532000000
808           808      SHA-256   PoW/PoS     0.000000e+00               0
1337    EliteCoin          X13   PoW/PoS     2.927942e+10    314159265359
BTC       Bitcoin      SHA-256       PoW     1.792718e+07        21000000
...           ...          ...       ...              ...             ...
ZEPH       ZEPHYR      SHA-256      DPoS     2.000000e+09      2000000000
GAP       Gapcoin       Scrypt   PoW/PoS     1.493105e+07       250000000
BDX        Beldex  CryptoNight       PoW     9.802226e+08      1400222610
ZEN       Horizen     Equihash       PoW     7.296538e+06        21000000
XBC   BitcoinPlus       Scrypt       PoS     1.283270e+05         1000000

[685 rows x 5 columns]


In [85]:
# Keep the rows where coins are mined.
# The mask is built from the frame being filtered (crypto_df_drop), not from the
# raw crypto_df, so the boolean index always matches the rows it selects and
# does not depend on implicit index alignment between two different frames.
crypto_df_drop = crypto_df_drop.loc[crypto_df_drop["TotalCoinsMined"] > 0]

In [86]:
# Print crypto_df_drop to inspect the rows kept by the TotalCoinsMined filter.
print(crypto_df_drop)

         CoinName    Algorithm ProofType  TotalCoinsMined TotalCoinSupply
42        42 Coin       Scrypt   PoW/PoS     4.199995e+01              42
404       404Coin       Scrypt   PoW/PoS     1.055185e+09       532000000
1337    EliteCoin          X13   PoW/PoS     2.927942e+10    314159265359
BTC       Bitcoin      SHA-256       PoW     1.792718e+07        21000000
ETH      Ethereum       Ethash       PoW     1.076842e+08               0
...           ...          ...       ...              ...             ...
ZEPH       ZEPHYR      SHA-256      DPoS     2.000000e+09      2000000000
GAP       Gapcoin       Scrypt   PoW/PoS     1.493105e+07       250000000
BDX        Beldex  CryptoNight       PoW     9.802226e+08      1400222610
ZEN       Horizen     Equihash       PoW     7.296538e+06        21000000
XBC   BitcoinPlus       Scrypt       PoS     1.283270e+05         1000000

[532 rows x 5 columns]


In [87]:
# Create a new DataFrame that holds only the cryptocurrencies names.
# A one-element list selects the column as a DataFrame (not a Series).
# Indexing with a dict, as was done before, is rejected by pandas 2.x.
crypto_names = crypto_df_drop[["CoinName"]]
crypto_names.head()


Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [88]:
# Print the single-column CoinName frame.
print(crypto_names)

         CoinName
42        42 Coin
404       404Coin
1337    EliteCoin
BTC       Bitcoin
ETH      Ethereum
...           ...
ZEPH       ZEPHYR
GAP       Gapcoin
BDX        Beldex
ZEN       Horizen
XBC   BitcoinPlus

[532 rows x 1 columns]


In [89]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# The names stay available in crypto_df_drop; only the feature frame loses them.
crypto_df_dropc = crypto_df_drop.drop(columns=["CoinName"])
print(crypto_df_dropc)
crypto_df_dropc.head()

        Algorithm ProofType  TotalCoinsMined TotalCoinSupply
42         Scrypt   PoW/PoS     4.199995e+01              42
404        Scrypt   PoW/PoS     1.055185e+09       532000000
1337          X13   PoW/PoS     2.927942e+10    314159265359
BTC       SHA-256       PoW     1.792718e+07        21000000
ETH        Ethash       PoW     1.076842e+08               0
...           ...       ...              ...             ...
ZEPH      SHA-256      DPoS     2.000000e+09      2000000000
GAP        Scrypt   PoW/PoS     1.493105e+07       250000000
BDX   CryptoNight       PoW     9.802226e+08      1400222610
ZEN      Equihash       PoW     7.296538e+06        21000000
XBC        Scrypt       PoS     1.283270e+05         1000000

[532 rows x 4 columns]


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [90]:
# Preview the first rows of the feature frame (crypto_df_dropc).
crypto_df_dropc.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [94]:
# Use get_dummies() to create variables for text features. - convert to numerics
# Only the two categorical columns are expanded into indicator columns;
# the remaining columns pass through unchanged.
text_features = ["Algorithm", "ProofType"]
X = pd.get_dummies(data=crypto_df_dropc, columns=text_features)

In [95]:
# Display the encoded feature matrix produced by get_dummies().
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
# Standardize the data with StandardScaler().
# Fit the scaler on X and transform it in one step, then show the first five rows.
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)
print(X_scaled[:5])

[[-0.11710817 -0.1528703  -0.0433963  -0.0433963  -0.0433963  -0.06142951
  -0.07530656 -0.0433963  -0.06142951 -0.06142951 -0.0433963  -0.0433963
  -0.19245009 -0.06142951 -0.09740465 -0.0433963  -0.11547005 -0.07530656
  -0.0433963  -0.0433963  -0.15191091 -0.0433963  -0.13118084 -0.0433963
  -0.0433963  -0.08703883 -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.06142951 -0.0433963  -0.08703883 -0.08703883 -0.08703883 -0.0433963
  -0.13118084 -0.13840913 -0.13840913 -0.0433963  -0.06142951 -0.0433963
  -0.07530656 -0.18168574 -0.0433963  -0.0433963  -0.0433963  -0.07530656
  -0.15826614 -0.31491833 -0.0433963  -0.08703883 -0.07530656 -0.06142951
   1.38675049 -0.0433963  -0.0433963  -0.06142951 -0.0433963  -0.0433963
  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963  -0.0433963
  -0.39879994 -0.0433963  -0.18168574 -0.0433963  -0.08703883 -0.08703883
  -0.10680283 -0.0433963  -0.13118084 -0.0433963  -0.0433963  -0.0433963
  -0.0433963  -0.07530656 -0.43911856 -0.04339

### Deliverable 2: Reducing Data Dimensions Using PCA

In [97]:
# Using PCA to reduce dimension to three principal components.
# Only configures the estimator; fitting happens in a later cell.
pca = PCA(n_components=3)

In [134]:
# Get three principal components for the crypto
# Fits the PCA model on the standardized features and returns their projection.
crypto_pca = pca.fit_transform(X_scaled)

In [149]:
# Create a DataFrame with the three principal components.
# Reuse X's index so every row of components stays keyed by its coin label.
pc_columns = ["PC 1", "PC 2", "PC 3"]
pcs_df = pd.DataFrame(data=crypto_pca, index=X.index, columns=pc_columns)
pcs_df.head(10)


Unnamed: 0,PC 1,PC 2,PC 3
42,-0.334433,0.987835,-0.53045
404,-0.317772,0.987959,-0.530798
1337,2.298228,1.682959,-0.634642
BTC,-0.150709,-1.193328,0.185275
ETH,-0.14885,-1.835179,0.422394
LTC,-0.167527,-1.094317,0.021255
DASH,-0.399649,1.243193,-0.520345
XMR,-0.1591,-2.018217,0.428675
ETC,-0.147293,-1.835269,0.422374
ZEC,-0.135517,-2.076601,0.420927


In [150]:
# Print the principal-component frame.
print(pcs_df)

          PC 1      PC 2      PC 3
42   -0.334433  0.987835 -0.530450
404  -0.317772  0.987959 -0.530798
1337  2.298228  1.682959 -0.634642
BTC  -0.150709 -1.193328  0.185275
ETH  -0.148850 -1.835179  0.422394
...        ...       ...       ...
ZEPH  2.473388  0.694277  0.046302
GAP  -0.332479  0.987730 -0.530476
BDX   0.316397 -2.141628  0.392094
ZEN  -0.135518 -2.076601  0.420927
XBC  -0.287961  0.660974 -0.217260

[532 rows x 3 columns]


### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [141]:
# Create an elbow curve to find the best value for K.
# Fit one K-Means model per candidate cluster count (1 through 10) on the
# principal components and record each model's inertia.
k = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

In [142]:
# Create the Elbow Curve using hvPlot
# One row per candidate k, plotted as inertia against k.
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=5`

In [143]:
# Initialize the K-Means model.
# YOUR CODE HERE


# Fit the model
# YOUR CODE HERE

# Predict clusters
# YOUR CODE HERE

def get_clusters(k, data):
    """Cluster ``data`` with K-Means and return a labelled copy.

    Parameters
    ----------
    k : int
        Number of clusters, passed to ``KMeans(n_clusters=k)``.
    data : pandas.DataFrame
        Features to cluster. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an extra ``"class"`` column holding the
        cluster label assigned to each row.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    clustered = data.copy()

    # random_state is fixed so repeated runs produce the same labels.
    model = KMeans(n_clusters=k, random_state=0)

    # fit_predict fits the model and returns its labels in one pass, which
    # avoids the separate, unused predict() call.
    clustered["class"] = model.fit_predict(data)

    return clustered

In [145]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Group the three principal components into five clusters.
clusters_df = get_clusters(k=5, data=pcs_df)
clusters_df.head()


Unnamed: 0,PC 1,PC 2,PC 3,class
42,-0.334433,0.987835,-0.53045,0
404,-0.317772,0.987959,-0.530798,0
1337,2.298228,1.682959,-0.634642,4
BTC,-0.150709,-1.193328,0.185275,1
ETH,-0.14885,-1.835179,0.422394,1


In [159]:
# Concatenate the clustered principal components (clusters_df) with the cleaned
# coin data (crypto_df_drop) column-wise, keeping only the index labels that
# appear in both frames.
frames_to_join = [clusters_df, crypto_df_drop]
combo = pd.concat(frames_to_join, axis=1, join="inner")

In [160]:
# Preview the first rows of the combined frame.
combo.head()

Unnamed: 0,PC 1,PC 2,PC 3,class,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,-0.334433,0.987835,-0.53045,0,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,-0.317772,0.987959,-0.530798,0,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,2.298228,1.682959,-0.634642,4,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,-0.150709,-1.193328,0.185275,1,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,-0.14885,-1.835179,0.422394,1,Ethereum,Ethash,PoW,107684200.0,0


In [161]:
# Print the combined frame as plain text.
print(combo)

          PC 1      PC 2      PC 3  class     CoinName    Algorithm ProofType  \
42   -0.334433  0.987835 -0.530450      0      42 Coin       Scrypt   PoW/PoS   
404  -0.317772  0.987959 -0.530798      0      404Coin       Scrypt   PoW/PoS   
1337  2.298228  1.682959 -0.634642      4    EliteCoin          X13   PoW/PoS   
BTC  -0.150709 -1.193328  0.185275      1      Bitcoin      SHA-256       PoW   
ETH  -0.148850 -1.835179  0.422394      1     Ethereum       Ethash       PoW   
...        ...       ...       ...    ...          ...          ...       ...   
ZEPH  2.473388  0.694277  0.046302      4       ZEPHYR      SHA-256      DPoS   
GAP  -0.332479  0.987730 -0.530476      0      Gapcoin       Scrypt   PoW/PoS   
BDX   0.316397 -2.141628  0.392094      1       Beldex  CryptoNight       PoW   
ZEN  -0.135518 -2.076601  0.420927      1      Horizen     Equihash       PoW   
XBC  -0.287961  0.660974 -0.217260      0  BitcoinPlus       Scrypt       PoS   

      TotalCoinsMined Total

In [None]:
#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
# YOUR CODE HERE
# already done

In [None]:
#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE
# already done

In [162]:

# Print the shape of the combined DataFrame, then preview its first 10 rows.
# This notebook names the frame `combo`; the assignment template calls it `clustered_df`.

print(combo.shape)
combo.head(10)

(532, 9)


Unnamed: 0,PC 1,PC 2,PC 3,class,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,-0.334433,0.987835,-0.53045,0,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,-0.317772,0.987959,-0.530798,0,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,2.298228,1.682959,-0.634642,4,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,-0.150709,-1.193328,0.185275,1,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,-0.14885,-1.835179,0.422394,1,Ethereum,Ethash,PoW,107684200.0,0
LTC,-0.167527,-1.094317,0.021255,1,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,-0.399649,1.243193,-0.520345,0,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,-0.1591,-2.018217,0.428675,1,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,-0.147293,-1.835269,0.422374,1,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,-0.135517,-2.076601,0.420927,1,ZCash,Equihash,PoW,7383056.0,21000000


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [180]:
# Creating a 3D-Scatter with the PCA data and the clusters
# Marker colour and marker symbol are both keyed on the cluster label; hovering
# a point shows the coin name and its algorithm.
scatter_options = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)
fig = px.scatter_3d(combo, **scatter_options)
fig.update_layout(legend={"x": 0, "y": 1})  # place the legend at x=0, y=1
fig.show()

In [182]:
# Create a table with tradable cryptocurrencies.
# Columns are listed explicitly so the principal components are left out of the table.
table_columns = [
    'CoinName',
    'Algorithm',
    'ProofType',
    'TotalCoinSupply',
    'TotalCoinsMined',
    'class',
]
combo.hvplot.table(columns=table_columns)

In [191]:
# Print the total number of tradable cryptocurrencies.
# count() tallies the non-null CoinName entries in the combined frame.
tradable_count = combo["CoinName"].count()
print(f"There are {tradable_count} tradable cryptocurrencies.")


There are 532 tradable cryptocurrencies.


In [195]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# Only instantiates the scaler; nothing is fitted or transformed in this cell.
scaler = MinMaxScaler()

In [202]:
# Inspect the column dtypes of the combined frame.
# NOTE(review): the recorded output lists TotalCoinSupply as object rather than a numeric dtype.
combo.dtypes

PC 1               float64
PC 2               float64
PC 3               float64
class                int32
CoinName            object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [207]:
# combo_convert = pd.get_dummies(combo, columns = ["TotalCoinSupply", "TotalCoinsMined"])

In [208]:
# combo_convert.head()

In [209]:
# tcs = combo_convert.loc[combo_convert["TotalCoinsMined"] < 1]

In [None]:
# tcm = combo.loc[combo["TotalCoinsMined"] < 1]

In [210]:
# combo_scaled = scaler.fit_transform(combo['TotalCoinSupply','TotalCoinsMined'])


In [None]:
# Create a new DataFrame that has the scaled data with the combo DataFrame index.
# combo.dtypes reports TotalCoinSupply as object, so both columns are cast to
# float before being min-max scaled.
scaled_columns = ["TotalCoinSupply", "TotalCoinsMined"]
scaled_values = scaler.fit_transform(combo[scaled_columns].astype(float))
plot_df = pd.DataFrame(scaled_values, columns=scaled_columns, index=combo.index)

# Add the "CoinName" column from the combo DataFrame to the new DataFrame.
plot_df["CoinName"] = combo["CoinName"]

# Add the "Class" column (stored as "class" in combo) to the new DataFrame.
plot_df["Class"] = combo["class"]

plot_df.head(10)

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
