# Clustering Crypto

In [70]:
# my pyviz environment gave off a huge set of errors when I tried to run the code in it so I installed
# the needed libraries here.

! pip install hvplot
! pip install plotly

# Initial imports

import requests
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path



### Fetching Cryptocurrency Data

In [71]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [72]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [73]:
# Alternatively, use the provided csv file:
file_path = Path("/Users/aaronbsechler/Desktop/AWS_HW/02-Homework_13-AWS-Lex_Instructions_Starter_Files_Resources_crypto_data.csv")

# Create a DataFrame

crypto_df = pd.read_csv(file_path, parse_dates=True, infer_datetime_format=True)

crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


### Data Preprocessing

In [74]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'

crypto_df = crypto_df.drop(crypto_df.columns[[0]], axis=1)

crypto_df.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1247,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,21491210.0,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000
1251,SteamPunk,PoS,False,PoS,,40000000


In [75]:
# Keep only cryptocurrencies that are trading

non_trading = crypto_df[ crypto_df['IsTrading'] != True ].index

crypto_df.drop(non_trading , inplace=True)

crypto_df.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,980222600.0,1400222610
1246,Horizen,Equihash,True,PoW,7296538.0,21000000
1247,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000


In [76]:
# Keep only cryptocurrencies with a working algorithm

algo_types = crypto_df.groupby("Algorithm")["IsTrading"].count()

algo_types.head(50)
#algo_types.tail(39)

# I don't see something that says which ones are working. I used groupby to make all the kinds of
# algorithims visible and don't see an entry like "not working". apparently pos means something else
# in this context.

Algorithm
1GB AES Pattern Search     1
536                        2
Argon2                     2
Argon2d                    1
BLAKE256                   2
Blake                      6
Blake2S                    4
Blake2b                    2
C11                        2
Cloverhash                 1
Counterparty               1
CryptoNight               34
CryptoNight Heavy          2
CryptoNight Heavy X        1
CryptoNight-Lite           1
CryptoNight-V7             5
CryptoNight-lite           1
Cryptonight-GPU            1
Curve25519                 2
DPoS                      18
Dagger                     4
Dagger-Hashimoto           1
ECC 256K1                  1
Equihash                  23
Equihash+Scrypt            1
Equihash1927               1
Ethash                    16
Exosis                     1
Green Protocol             1
Groestl                    5
HMQ1725                    1
HybridScryptHash256        1
IMesh                      1
Jump Consistent Hash       1
Kecc

In [77]:
# Remove the "IsTrading" column

crypto_df = crypto_df.drop('IsTrading', axis=1)

crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [78]:
# Remove rows with at least 1 null value

crypto_df=crypto_df.dropna(axis=0)

crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [79]:
# Remove rows with cryptocurrencies having no coins mined 

no_coins = crypto_df[ crypto_df['TotalCoinsMined'] == 0 ].index

crypto_df.drop(no_coins, inplace=True)

crypto_df.head()


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0


In [80]:
# Drop rows where there are 'N/A' text values

crypto_df.drop(crypto_df[crypto_df.apply(lambda row: 'N/A' in row.to_string(header=False), axis=1)].index, inplace=True)

In [87]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
table_df = crypto_df

coin_name_df = crypto_df[['CoinName']]

coin_name_df.head()

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [88]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm

crypto_df = crypto_df.drop('CoinName', axis=1)

crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0


In [89]:
# Create dummy variables for text features

dummies_crypto_df=pd.get_dummies(data=crypto_df, columns=['Algorithm', 'ProofType'])

dummies_crypto_df.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Standardize data

crypto_scaled = StandardScaler().fit_transform(dummies_crypto_df)

print(crypto_scaled[0])


[-0.11674788 -0.15286468 -0.0433555  -0.0433555  -0.0433555  -0.06137164
 -0.07523548 -0.0433555  -0.06137164 -0.06137164 -0.0433555  -0.0433555
 -0.19226279 -0.06137164 -0.09731237 -0.0433555  -0.11536024 -0.07523548
 -0.0433555  -0.0433555  -0.15176505 -0.0433555  -0.13105561 -0.0433555
 -0.0433555  -0.08695652 -0.0433555  -0.0433555  -0.0433555  -0.0433555
 -0.06137164 -0.0433555  -0.08695652 -0.08695652 -0.08695652 -0.0433555
 -0.13105561 -0.13827675 -0.13827675 -0.0433555  -0.06137164 -0.0433555
 -0.07523548 -0.1815096  -0.0433555  -0.0433555  -0.0433555  -0.07523548
 -0.15811388 -0.3145935  -0.0433555  -0.08695652 -0.07523548 -0.06137164
 -0.0433555   1.38873015 -0.0433555  -0.0433555  -0.06137164 -0.0433555
 -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555  -0.0433555
 -0.0433555  -0.39836623 -0.0433555  -0.1815096  -0.0433555  -0.08695652
 -0.08695652 -0.10670145 -0.0433555  -0.0433555  -0.13105561 -0.0433555
 -0.0433555  -0.0433555  -0.0433555  -0.07523548 -0.4386271

### Reducing Dimensions Using PCA

In [91]:
# Use PCA to reduce dimensions to 3 principal components

# Initialize PCA model
pca = PCA(n_components=3)

# Get two principal components for the data.
crypto_pca = pca.fit_transform(crypto_scaled)


In [92]:
# Create a DataFrame with the principal components data

crypto_pca_df = pd.DataFrame(
    data=crypto_pca, columns=["principal component 1", "principal component 2", "principal component 3"]
)

crypto_pca_df.head()


Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.349422,1.10811,-0.519006
1,-0.332743,1.10816,-0.519473
2,2.32334,1.635497,-0.598225
3,-0.131878,-1.355064,0.168061
4,-0.147855,-2.065578,0.339075


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [93]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values

for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(dummies_crypto_df)
    inertia.append(km.inertia_)


# Create the Elbow Curve using hvPlot

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


Running K-Means with `k=<your best value for k here>`

In [96]:
# Initialize the K-Means model

model = KMeans(n_clusters=4, random_state=0)

# Fit the model

model.fit(crypto_pca_df)

# Predict clusters

predictions = model.predict(crypto_pca_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features

crypto_pca_df["Class"] = model.labels_
clustered_df = pd.concat([dummies_crypto_df, crypto_pca_df], axis=1, sort=False)
clustered_df["CoinName"] = coin_name_df["CoinName"]
print(clustered_df.shape)
clustered_df.head(10)

(861, 105)


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW,principal component 1,principal component 2,principal component 3,Class,CoinName
0,41.99995,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.349422,1.10811,-0.519006,3.0,42 Coin
1,,,,,,,,,,,...,,,,,,-0.332743,1.10816,-0.519473,3.0,
2,1055185000.0,532000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.32334,1.635497,-0.598225,3.0,404Coin
3,,,,,,,,,,,...,,,,,,-0.131878,-1.355064,0.168061,0.0,
4,,,,,,,,,,,...,,,,,,-0.147855,-2.065578,0.339075,0.0,
5,29279420000.0,314159265359.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.181743,-1.084497,-0.01283,0.0,EliteCoin
6,,,,,,,,,,,...,,,,,,-0.397772,1.255196,-0.445426,3.0,
7,17927180.0,21000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.141958,-2.223745,0.347572,0.0,Bitcoin
8,107684200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.146298,-2.065687,0.339057,0.0,Ethereum
9,63039240.0,84000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.10979,-1.930616,0.400504,0.0,Litecoin


### Visualizing Results

#### 3D-Scatter with Clusters

In [103]:
# Create a 3D-Scatter with the PCA data and the clusters - my machine won't display this but the code is taken
# from a class example so I think it should work on a different machine.

fig = px.scatter_3d(
    clustered_df,
    x="principal component 1",
    y="principal component 2",
    z="principal component 3",
    color="Class",
    symbol="Class",
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()



ValueError: nan is not in list

#### Table of Tradable Cryptocurrencies

In [98]:
# Table with tradable cryptos

table_df.hvplot.table(columns=['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply'], sortable=True, selectable=True)

In [99]:
# Print the total number of tradable cryptocurrencies

print(f'The total number of tradable cryptocurrencies is {len(crypto_df)}.')

The total number of tradable cryptocurrencies is 533.


#### Scatter Plot with Tradable Cryptocurrencies

In [100]:
# Scale data to create the scatter plot - I couldn't find a way to scale the data that hvplot could read


In [102]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply" - this is the largest size I
# can get it to display

legend = 'Tradable Cryptocurrencies'

clustered_df.hvplot.scatter(x='TotalCoinsMined', y='TotalCoinSupply', by='Class',
                   height=400, width=400)