## Imports

In [38]:
# Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

## Load Cryptocurrency Data

In [39]:
# Load data file
crypto_df = pd.read_csv("crypto_data.csv")
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,42.0,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [40]:
# Print count of cryptocurrencies loaded from the data file
count_crypto_file = crypto_df["CoinName"].count()
print(f"The count of crypto currencies from the file is: {count_crypto_file}")

The count of crypto currencies from the file is: 1252


## Data Preprocessing
0. Convert TotalCoinSupply and TotalCoinsMined to int64 
1. Remove all cryptocurrencies that aren’t trading
2. Remove all cryptocurrencies that don’t have an algorithm defined
3. Remove the IsTrading column
4. Remove all cryptocurrencies with at least one null value
   * Convert TotalCoinSupply and TotalCoinsMined to int64
5. Remove all cryptocurrencies wit1. Remove all cryptocurrencies that aren’t tradinghout coins mined
6. Store the names of all cryptocurrencies on a DataFramed named coins_name and use the crypto_df.index as the index
7. Remove the CoinName column
8. Use LableEncoder to encode text features ~~Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X~~
9. Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame

### 1. Remove all cryptocurrencies that aren’t trading

In [41]:
# Create a DataFrame including only cryptocurrencies that ARE being traded
crypto_df = crypto_df[(crypto_df["IsTrading"] == True)]

In [42]:
# Validate that DataFrame contains only cryptocurrencies that ARE being trading
crypto_df[(crypto_df["IsTrading"] != True)].head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply


In [43]:
# Print the count of cryptocurrencies that ARE being traded
count_1_crypto_trading = crypto_df["CoinName"].count()
print(f"The count of cryptocurrencies that are trading is: {count_1_crypto_trading}")

The count of cryptocurrencies that are trading is: 1144


### 2. Remove all cryptocurrencies that don’t have an algorithm defined

In [44]:
# Create a DataFrame including only cryptocurrencies that HAVE an algorithm defined
crypto_df = crypto_df[(crypto_df["Algorithm"].isna() != True) & (crypto_df["Algorithm"].str.len() != 0)]

In [45]:
# Validate that data frame contains only cryptocurrencies that HAVE an algorithm defined
crypto_df[(crypto_df["Algorithm"].isna() == True) & (crypto_df["Algorithm"].str.len() == 0)]

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply


In [46]:
# Print the count of cryptocurrencies that HAVE an algorithm defined
count_2_crypto_w_algorithm = crypto_df["CoinName"].count()
print(f"The count of cryptocurrencies that have a algorithm defined is: {count_2_crypto_w_algorithm}")

The count of cryptocurrencies that have a algorithm defined is: 1144


### 3. Remove the IsTrading column

In [47]:
# Drop IsTrading column
crypto_df.drop(columns=["IsTrading"], inplace=True)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,42.0,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


### 4. Remove all cryptocurrencies with at least one null value

In [48]:
# Select cryptocurrencies that HAVE values in all fields
pd.options.mode.use_inf_as_na = True # =True rows that that '' or numpy.inf values will be dropped also
crypto_df.dropna(inplace=True)

In [49]:
# Validate that data frame contains only cryptocurrencies that HAVE values in all fields
crypto_df[(crypto_df.isna() == True)].count()

Unnamed: 0         0
CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [50]:
# Print the count of cryptocurrencies that HAVE values in all fields
count_4_crypto_all_data = crypto_df["CoinName"].count()
print(f"The count of cryptocurrencies with no missing data is: {count_4_crypto_all_data}")

The count of cryptocurrencies with no missing data is: 685


### 4.1 Convert TotalCoinSupply and TotalCoinsMined to int64

In [51]:
# Convert TotalCoinSupply from string object field type to int64
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace("\s+", "")
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace("\.", "")
crypto_df["TotalCoinSupply"] = pd.to_numeric(crypto_df['TotalCoinSupply'])

# Convert TotalCoinsMined from float64 data type to int64
crypto_df['TotalCoinsMined'] = crypto_df['TotalCoinsMined'].apply(lambda x: '%.f' % x)
crypto_df["TotalCoinsMined"] = pd.to_numeric(crypto_df['TotalCoinsMined'])

# Get DataFrame data types
crypto_df.dtypes

Unnamed: 0         object
CoinName           object
Algorithm          object
ProofType          object
TotalCoinsMined     int64
TotalCoinSupply     int64
dtype: object

### 5. Remove all cryptocurrencies without coins mined

In [52]:
# Select cryptocurrencies with TotalCoinsMined GREATER THAN 0
crypto_df = crypto_df[(crypto_df["TotalCoinsMined"] > 0)]

In [53]:
# Validate that the DataFrame contains only cryptocurrencies with TotalCoinsMined GREATER THAN 0
crypto_df[(crypto_df["TotalCoinsMined"] == 0)]

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply


In [54]:
# Print the count of cryptocurrencies with TotalCoinsMined GREATER THAN 0
count_crypto_mined_ge_0 = crypto_df["CoinName"].count()
print(f"The count of cryptocurrencies where the TotalCoinsMined > 0: {count_crypto_mined_ge_0}")

The count of cryptocurrencies where the TotalCoinsMined > 0: 532


### 6. Store the names of all cryptocurrencies on a DataFramed named coins_name and use the crypto_df.index as the index

In [55]:
# Set the DataFrame index to the values IN Unnamed: 0
crypto_df.set_index(crypto_df["Unnamed: 0"], inplace=True, verify_integrity=True)
crypto_df.rename_axis(None, inplace=True)
crypto_df.drop(["Unnamed: 0"], axis=1, inplace=True)
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42,42
404,404Coin,Scrypt,PoW/PoS,1055184902,532000000
1337,EliteCoin,X13,PoW/PoS,29279424623,314159265359
BTC,Bitcoin,SHA-256,PoW,17927175,21000000
ETH,Ethereum,Ethash,PoW,107684223,0


In [56]:
# Create coin_name_df DataFrame
coin_name_df = crypto_df["CoinName"]
coin_name_df.head()

42        42 Coin
404       404Coin
1337    EliteCoin
BTC       Bitcoin
ETH      Ethereum
Name: CoinName, dtype: object

### 7. Remove the CoinName column

In [57]:
# Drop the CoinName column from the DataFrame
crypto_df = crypto_df.drop(["CoinName"], axis=1)
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,42,42
404,Scrypt,PoW/PoS,1055184902,532000000
1337,X13,PoW/PoS,29279424623,314159265359
BTC,SHA-256,PoW,17927175,21000000
ETH,Ethash,PoW,107684223,0


### 8. Use encoder to encode for all of the text features X

In [58]:
# Encode Algorithm and ProofType using LabelEncoder
lencoder = LabelEncoder()
crypto_df["Algorithm"] = lencoder.fit_transform(crypto_df["Algorithm"])
crypto_df["ProofType"] = lencoder.fit_transform(crypto_df["ProofType"])
X = crypto_df
X.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,52,15,42,42
404,52,15,1055184902,532000000
1337,66,15,29279424623,314159265359
BTC,47,12,17927175,21000000
ETH,20,12,107684223,0


### 9. Use the StandardScaler from sklearn to standardize all of the data from the X DataFrame

In [59]:
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[ 3.76459118e-01,  8.91356555e-01, -1.17108170e-01,
        -1.52870298e-01],
       [ 3.76459118e-01,  8.91356555e-01, -9.39695522e-02,
        -1.45008997e-01],
       [ 1.21543803e+00,  8.91356555e-01,  5.24945609e-01,
         4.48942416e+00],
       ...,
       [-2.14047761e+00,  1.67233875e-03, -9.56133629e-02,
        -1.32179374e-01],
       [-1.66106109e+00,  1.67233875e-03, -1.16948169e-01,
        -1.52559984e-01],
       [ 3.76459118e-01, -1.48113469e+00, -1.17105357e-01,
        -1.52855521e-01]])

## Reducing Data Dimensions Using PCA

In [60]:
# Initialize PCA model
pca = PCA(n_components=3)

In [61]:
# Get two principal components for the iris data.
X_pca = pca.fit_transform(X_scaled)

In [62]:
X_pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2", "PC3"])
X_pca_df.head()

Unnamed: 0,PC1,PC2,PC3
0,-0.417875,0.810296,0.372138
1,-0.396564,0.815135,0.373256
2,3.124076,2.20978,0.504335
3,-0.192083,0.016266,-0.07291
4,-0.044116,-1.167492,1.012525


In [63]:
pca.explained_variance_ratio_

array([0.4350467 , 0.28670411, 0.20538956])

In [64]:
X_pca_df = X_pca_df.set_index(crypto_df.index)
X_pca_df.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.417875,0.810296,0.372138
404,-0.396564,0.815135,0.373256
1337,3.124076,2.20978,0.504335
BTC,-0.192083,0.016266,-0.07291
ETH,-0.044116,-1.167492,1.012525


## Clustering Cryptocurrencies Using K-means

1. Create an elbow curve to find the best value for K, and use the pcs_df DataFrame.
2. Run the K-means algorithm to predict the K clusters for the cryptocurrencies’ data
3. Create a new DataFrame named clustered_df 
    1 Maintain the crypto_df DataFrames index
    2 Include the following columns: 
    * Algorithm
    * ProofType
    * TotalCoinsMined
    * TotalCoinSupply
    * PC 1
    * PC 2
    * PC 3
    * CoinName
    * Class

#### 1. Create an elbow curve to find the best value for K, and use the pcs_df DataFrame

In [65]:
# Find the best value for K
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(X_pca_df)
    inertia.append(km.inertia_)

In [66]:
# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df.head()

Unnamed: 0,k,inertia
0,1,1972.95473
1,2,1294.355598
2,3,848.024904
3,4,573.184648
4,5,422.226493


In [67]:
elbow_df.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

#### 2. Run the K-means algorithm to predict the K clusters for the cryptocurrencies’ data

In [68]:
# Initialize the K-means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(X_pca_df)

# Predict clusters
predictions = model.predict(X_pca_df)

In [69]:
# Add the predicted class columns
X_pca_df["Class"] = model.labels_
X_pca_df.head()

Unnamed: 0,PC1,PC2,PC3,Class
42,-0.417875,0.810296,0.372138,4
404,-0.396564,0.815135,0.373256,4
1337,3.124076,2.20978,0.504335,1
BTC,-0.192083,0.016266,-0.07291,4
ETH,-0.044116,-1.167492,1.012525,3


#### 3. Create a new DataFrame named clustered_df

In [70]:
clustered_df = crypto_df.join(X_pca_df, how="inner")
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,Class
42,52,15,42,42,-0.417875,0.810296,0.372138,4
404,52,15,1055184902,532000000,-0.396564,0.815135,0.373256,4
1337,66,15,29279424623,314159265359,3.124076,2.20978,0.504335,1
BTC,47,12,17927175,21000000,-0.192083,0.016266,-0.07291,4
ETH,20,12,107684223,0,-0.044116,-1.167492,1.012525,3


In [71]:
clustered_df = clustered_df.join(coin_name_df, how="inner")
clustered_df = clustered_df[["Algorithm","ProofType","TotalCoinsMined","TotalCoinSupply","PC1","PC2","PC3", "CoinName", "Class"]]
clustered_df["Class Name"] = clustered_df["Class"].apply(str)
clustered_df["Class Name"] = clustered_df["Class Name"].apply(lambda x: "Class " + x)
clustered_df.sort_values(["Class","TotalCoinsMined", "TotalCoinSupply"], inplace=True)
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class,Class Name
XBC,52,7,128327,1000000,0.110456,-0.7227,-1.350819,BitcoinPlus,0,Class 0
SQL,64,9,329200,0,-0.086886,0.186782,-1.40252,Squall Coin,0,Class 0
DASHP,64,7,715659,19700000,0.045399,-0.196423,-1.833216,Dash Platinum,0,Class 0
PNY,52,7,1042012,16880000000,0.281108,-0.685981,-1.313384,Peony Coin,0,Class 0
SEM,53,1,1231147,100000000,0.502275,-1.828376,-2.683022,Semux,0,Class 0


## Visualizing Results

In [72]:
# Plotting the clusters with three features
fig = px.scatter_3d(clustered_df, x="PC1", y="PC2", z="PC3", color="Class Name", symbol="Class Name", width=800, hover_name="CoinName", hover_data=["Algorithm"])
fig.update_layout(legend=dict(x=0,y=1), title_text='Cryptocurrency Scatter Plot')
fig.show()

In [73]:
clustered_df.hvplot.table(columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "Class"], width=600)

In [74]:
# Add a new class column to the df_iris
clustered_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="Class Name", hover_cols=["CoinName"], title="Mined vrs Supply")