In [81]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Data Preparation

In [82]:
# Loading data
file_path = Path("crypto_data.csv")
crypto_df = pd.read_csv(file_path)
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [84]:
# Discard all cryptocurrencies that are not being traded - filter and remove
filtered_crypto_df = crypto_df[crypto_df['IsTrading'] == True]
filtered_crypto_df.set_index("Unnamed: 0")
filtered_crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,UOS,SHA-256,True,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [63]:
# Drop the IsTrading column
new_crypto_df = filtered_crypto_df.drop(["IsTrading"], axis='columns')
new_crypto_df

Unnamed: 0,Index,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,PoW,,1000000000
1244,UOS,UOS,SHA-256,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [64]:
# Remove all rows that have at least one null value
new_crypto_df.dropna(inplace=True)
new_crypto_df

Unnamed: 0,Index,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
4,808,808,SHA-256,PoW/PoS,0.000000e+00,0
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.14159E+11
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [65]:
# Filter for cryptocurrencies that have been mined
# new_crypto_df.dtypes
new_crypto_df['TotalCoinSupply'] = pd.to_numeric(new_crypto_df['TotalCoinSupply'])

new_crypto_df = new_crypto_df[new_crypto_df['TotalCoinsMined'] > 0]


new_crypto_df

Unnamed: 0,Index,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01
2,404,404Coin,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08
5,1337,EliteCoin,X13,PoW/PoS,2.927942e+10,3.141590e+11
7,BTC,Bitcoin,SHA-256,PoW,1.792718e+07,2.100000e+07
8,ETH,Ethereum,Ethash,PoW,1.076842e+08,0.000000e+00
...,...,...,...,...,...,...
1238,ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2.000000e+09
1242,GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08
1245,BDX,Beldex,CryptoNight,PoW,9.802226e+08,1.400223e+09
1246,ZEN,Horizen,Equihash,PoW,7.296538e+06,2.100000e+07


In [66]:
# Delete the CoinName from the original dataframe
final_crypto_df = new_crypto_df.drop(["CoinName"], axis='columns')
final_crypto_df

Unnamed: 0,Index,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01
2,404,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08
5,1337,X13,PoW/PoS,2.927942e+10,3.141590e+11
7,BTC,SHA-256,PoW,1.792718e+07,2.100000e+07
8,ETH,Ethash,PoW,1.076842e+08,0.000000e+00
...,...,...,...,...,...
1238,ZEPH,SHA-256,DPoS,2.000000e+09,2.000000e+09
1242,GAP,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08
1245,BDX,CryptoNight,PoW,9.802226e+08,1.400223e+09
1246,ZEN,Equihash,PoW,7.296538e+06,2.100000e+07


In [80]:
# Convert the remaining features with text values, Algorithm and ProofType, into numerical data
# Use Pandas to create dummy variables
X = pd.get_dummies(final_crypto_df, columns=['Algorithm', 'ProofType'])
X.dtypes

Index                                object
TotalCoinsMined                     float64
TotalCoinSupply                     float64
Algorithm_1GB AES Pattern Search      uint8
Algorithm_536                         uint8
                                     ...   
ProofType_Proof of Authority          uint8
ProofType_Proof of Trust              uint8
ProofType_TPoS                        uint8
ProofType_Zero-Knowledge Proof        uint8
ProofType_dPoW/PoW                    uint8
Length: 99, dtype: object

In [75]:
# Standardize your dataset so that columns that contain larger values do not unduly influence the outcome
X_scaled = StandardScaler().fit_transform(X)
X_scaled

ValueError: could not convert string to float: 'BTC'

## Dimensionality Reduction

In [None]:
# Perform dimensionality reduction with PCA - for this project, preserve 90% of the explained variance in dimensionality reduction
PCA(n_components=0.9)
crypto_pca = pca.fit_transform(X_scaled)

In [None]:
# Further reduce the dataset dimensions with t-SNE
# Run t-SNE on the principal components: the output of the PCA transformation

In [None]:
# Create a scatter plot of the t-SNE output. Observe whether there are distinct clusters or not.

## Cluster Analysis with k-means

In [None]:
# Create an elbow plot to identify the best number of clusters

In [None]:
# Use a for-loop to determine the inertia for each k between 1 through 10
# Determine, if possible, where the elbow of the plot is, and at which value of k it appears