# Clustering Crypto

### Instructions

1. Read in the crypto_data.csv to the Pandas DataFrame named crypto_df.
2. Keep all the cryptocurrencies that are being traded.
3. Drop the IsTrading column.
4. Remove rows that have at least one null value.
5. Filter the crypto_df DataFrame so it only has rows where coins have been mined.
6. Create a new DataFrame that holds only the cryptocurrency names, and use the crypto_df DataFrame index as the index for this new DataFrame.
7. Remove the CoinName column from the crypto_df DataFrame since it's not going to be used on the clustering algorithm.

Take a moment to check that your crypto_df DataFrame looks like the image provided in the challenge
The crypto_df DataFrame shows four  columns: Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply. It contains ten rows with the following headings: 42, 404, 1337, BTC, ETH, LTC, DASH, XMR, ETC, and ZEC

8. Use the get_dummies() method to create variables for the two text features, Algorithm and ProofType, and store the resulting data in a new DataFrame named X.
9. Use the StandardScaler fit_transform() function to standardize the features from the X DataFrame.

In [38]:
# Initial imports
import pandas as pd
from pandas import DataFrame
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# from sklearn.compose import ColumnTransformer

pd.set_option('display.max_rows', None)




### Deliverable 1: Preprocessing the Data for PCA

In [39]:
# 1. Read in the crypto_data.csv to the Pandas DataFrame named crypto_df.
# Load the crypto_data.csv dataset.

file_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.columns.values[0] = "IndexValue"
crypto_df = crypto_df.set_index(keys='IndexValue', drop=True)
crypto_df.index.name = None
#crypto_df = crypto_df.set_index('IndexValue', inplace=True)
# crypto_df = crypto_df[(crypto_df[0] != 'Unnamed: 0')]
#crypto_df.head(20)
crypto_df.head(10)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [40]:
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [41]:
# 2. Keep all the cryptocurrencies that are being traded.

crypto_df = crypto_df[(crypto_df["IsTrading"] == True)]
crypto_df.IsTrading.value_counts() 

True    1144
Name: IsTrading, dtype: int64

In [42]:
# 3. Drop the IsTrading column.
crypto_df = crypto_df.drop(columns=["IsTrading"], axis=1)
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,PoW/PoS,,0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [43]:
# 4. Remove rows that have at least one null value.
crypto_df= crypto_df.dropna(axis=0)
crypto_df.count()

CoinName           685
Algorithm          685
ProofType          685
TotalCoinsMined    685
TotalCoinSupply    685
dtype: int64

In [44]:
# 5. Filter the crypto_df DataFrame so it only has rows where coins have been mined.

crypto_df = crypto_df[(crypto_df["TotalCoinsMined"] > 0)]
crypto_df.count()


CoinName           532
Algorithm          532
ProofType          532
TotalCoinsMined    532
TotalCoinSupply    532
dtype: int64

In [45]:
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [46]:
# 6. Create a new DataFrame that holds only the cryptocurrency names, and use the crypto_df DataFrame index as the index for this new DataFrame.
crypto_name_list_df = crypto_df.drop(columns=['Algorithm','ProofType','TotalCoinsMined','TotalCoinSupply'])
crypto_name_list_df.head()

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [47]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# YOUR CODE HERE
crypto_df = crypto_df.drop(columns=["CoinName"], axis=1)
crypto_df.head(10)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [48]:
crypto_df.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [49]:
# Use get_dummies() to create variables for text features.

# 8. Use the get_dummies() method to create variables for the two text features,
# Algorithm and ProofType, and store the resulting data in a new DataFrame
# named X.

# Convert TotalCoinSupply to a Float
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].astype(float)

# Create our features
colList = list(crypto_df.select_dtypes(include=['object']).columns)
X = pd.get_dummies(crypto_df, columns=colList)
X.head(10)


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# 9. Use the StandardScaler fit_transform() function to standardize the 
# features from the X DataFrame.
X = StandardScaler().fit_transform(X)
print(X)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [51]:
# Using PCA to reduce dimension to three principal components.

# Now that the data has been standardized, we can use 
# PCA to reduce the number of features. The PCA method
# takes an argument of n_components, which will pass in the
# value of 3 to reduce from large number of features

# Set number of components
numComponents = 3

# Initialize PCA model
pca = PCA(n_components=numComponents)

# Get principal components
crypto_pca = pca.fit_transform(X)

crypto_pca

array([[-0.32795165,  1.00319884, -0.53720678],
       [-0.31128285,  1.00338565, -0.53774042],
       [ 2.31017576,  1.59603496, -0.64462304],
       ...,
       [ 0.32144101, -2.31900398,  0.37304756],
       [-0.15508019, -1.97515049,  0.35480791],
       [-0.28475165,  0.78014513, -0.18802634]])

In [52]:
# Create a DataFrame with the three principal components.
# YOUR CODE HERE

pcs_df = pd.DataFrame(
    data=crypto_pca, columns=["PC 1", "PC 2", "PC 3"], index=crypto_df.index)
pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.327952,1.003199,-0.537207
404,-0.311283,1.003386,-0.53774
1337,2.310176,1.596035,-0.644623
BTC,-0.145182,-1.337789,0.13461
ETH,-0.163681,-1.932914,0.390676
LTC,-0.159546,-1.141187,0.00477
DASH,-0.404082,1.302546,-0.512422
XMR,-0.147785,-2.253601,0.374123
ETC,-0.162122,-1.933016,0.39065
ZEC,-0.155079,-1.97515,0.354808


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [53]:
# 2. Using the pcs_df DataFrame, create an elbow curve using hvPlot to find the
# best value for K.

# Create an elbow curve to find the best value for K.

# We'll start with creating an empty list to hold inertia values. 
# We'll also store a range of K values we want to test. Enter the code in a new cell:
inertia = []
k = list(range(1, 11))

# Next, we'll loop through each K value, 
# find the inertia, and store it into our list. Enter the code in the next cell:

# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# We'll create a DataFrame that stores our K values and their appropriate inertia values. 
# This will allow for an easy plot of the results withhvplot. In another new cell, enter the code:

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)



Running K-Means with `k=4`

In [54]:
# 3. Next, use the pcs_df DataFrame to run the K-means algorithm to make 
# predictions of the K clusters for the cryptocurrencies’ data.

# Initialize the K-Means model.
model = KMeans(n_clusters=4)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

predictions

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,

In [55]:
# 4. Create a new DataFrame named clustered_df by concatenating the 
# crypto_df and pcs_df DataFrames on the same columns. 
# The index should be the same as the crypto_df DataFrame.

clustered_df = pd.merge(crypto_df, pcs_df, left_index=True, right_index=True)

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df = clustered_df.merge(crypto_name_list_df[['CoinName']], left_index=True, right_index=True, how = 'left')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.

clustered_df["Class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42.0,-0.327952,1.003199,-0.537207,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.311283,1.003386,-0.53774,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159300000.0,2.310176,1.596035,-0.644623,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.145182,-1.337789,0.13461,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0.0,-0.163681,-1.932914,0.390676,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.159546,-1.141187,0.00477,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.404082,1.302546,-0.512422,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.147785,-2.253601,0.374123,Monero,1
ETC,Ethash,PoW,113359700.0,210000000.0,-0.162122,-1.933016,0.39065,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000.0,-0.155079,-1.97515,0.354808,ZCash,1


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

2. Create a 3D scatter plot using the Plotly Express scatter_3d() function to plot the three clusters from the clustered_df DataFrame.

3. Add the CoinName and Algorithm columns to the hover_name and hover_data parameters, respectively, so each data point shows the CoinName and Algorithm on hover.

4. Create a table with tradable cryptocurrencies using the hvplot.table() function.

5. Print the total number of tradable cryptocurrencies in the clustered_df DataFrame.

6. Use the MinMaxScaler().fit_transform method to scale the TotalCoinSupply and TotalCoinsMined columns between the given range of zero and one.

7. Create a new DataFrame using the clustered_df DataFrame index that contains the scaled data you created in Step 5.

8. Add the CoinName column from the clustered_df DataFrame to the new DataFrame.

9. Add the Class column from the clustered_df DataFrame to the new DataFrame.

10. Create an hvplot scatter plot with x="TotalCoinsMined", y="TotalCoinSupply", and by="Class", and have it show the CoinName when you hover over the the data point.

In [61]:
# Creating a 3D-Scatter with the PCA data and the clusters

# 2. Create a 3D scatter plot using the Plotly Express scatter_3d() function 
# to plot the three clusters from the clustered_df DataFrame.

# 3. Add the CoinName and Algorithm columns to the hover_name and hover_data parameters,
# respectively, so each data point shows the CoinName and Algorithm on hover.

#clustered_df["HoverName"] = f"{clustered_df.CoinName}, Algorithm: {clustered_df.Algorithm}"

fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    hover_name = "CoinName",
    hover_data = ['CoinName','Algorithm']
)

fig.show()

In [57]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(columns=['CoinName','Algorithm','ProofType','TotalCoinSupply','TotalCoinsMined','Class'], sortable=True, selectable=True)

In [73]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE
print(f"There are {clustered_df['CoinName'].count()} tradable currencies.")

There are 532 tradable currencies.


In [84]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

# Use the MinMaxScaler().fit_transform method to scale the TotalCoinSupply 
# and TotalCoinsMined columns between the given range of zero and one.
data = clustered_df[['TotalCoinSupply','TotalCoinsMined']].copy()
data

Unnamed: 0,TotalCoinSupply,TotalCoinsMined
42,42.0,41.99995
404,532000000.0,1055185000.0
1337,314159300000.0,29279420000.0
BTC,21000000.0,17927180.0
ETH,0.0,107684200.0
LTC,84000000.0,63039240.0
DASH,22000000.0,9031294.0
XMR,0.0,17201140.0
ETC,210000000.0,113359700.0
ZEC,21000000.0,7383056.0


In [85]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(data)
scaled


array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [88]:
# 7. Create a new DataFrame using the clustered_df DataFrame index 
# that contains the scaled data you created in Step 5.
plot_df = pd.DataFrame(
    data=clustered_df, columns=["TotalCoinSupply", "TotalCoinsMined"], index=clustered_df.index)

# 8. Add the CoinName column from the clustered_df DataFrame to 
# the new DataFrame.
plot_df = plot_df.merge(crypto_name_list_df[['CoinName']], left_index=True, right_index=True, how = 'left')

# 9. Add the Class column from the clustered_df DataFrame to the new DataFrame.
plot_df = plot_df.merge(clustered_df[['Class']], left_index=True, right_index=True, how = 'left')


plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,42.0,41.99995,42 Coin,0
404,532000000.0,1055185000.0,404Coin,0
1337,314159300000.0,29279420000.0,EliteCoin,0
BTC,21000000.0,17927180.0,Bitcoin,1
ETH,0.0,107684200.0,Ethereum,1
LTC,84000000.0,63039240.0,Litecoin,1
DASH,22000000.0,9031294.0,Dash,0
XMR,0.0,17201140.0,Monero,1
ETC,210000000.0,113359700.0,Ethereum Classic,1
ZEC,21000000.0,7383056.0,ZCash,1


In [91]:
#10. Create an hvplot scatter plot with x="TotalCoinsMined", 
# y="TotalCoinSupply", and by="Class", and have it show the 
# CoinName when you hover over the the data point.

plot_df.hvplot(kind="scatter", x='TotalCoinsMined', y='TotalCoinSupply', by='Class', hover_cols='CoinName')


