In [1]:
# Booth Office Hours

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# Unsupervised Learning Algorithms
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

# Data Processing
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage

# Metrics
from sklearn.metrics import silhouette_samples, silhouette_score

In [3]:
# Location of the raw cryptocurrency dataset (CSV)
file = "crypto_data.csv"

In [4]:
# Read the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(file)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [5]:
# Summarise the raw frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1252 entries, 0 to 1251
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1252 non-null   object 
 1   CoinName         1252 non-null   object 
 2   Algorithm        1252 non-null   object 
 3   IsTrading        1252 non-null   bool   
 4   ProofType        1252 non-null   object 
 5   TotalCoinsMined  744 non-null    float64
 6   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 60.0+ KB


In [6]:
# How many coins are currently trading versus not trading?
df["IsTrading"].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [7]:
# Keep only the rows whose IsTrading flag is True (this filters rows, not
# columns), renumber the index, and summarise what is left.
df2 = df.loc[df.IsTrading].reset_index(drop=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144 entries, 0 to 1143
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       1144 non-null   object 
 1   CoinName         1144 non-null   object 
 2   Algorithm        1144 non-null   object 
 3   IsTrading        1144 non-null   bool   
 4   ProofType        1144 non-null   object 
 5   TotalCoinsMined  685 non-null    float64
 6   TotalCoinSupply  1144 non-null   object 
dtypes: bool(1), float64(1), object(5)
memory usage: 54.9+ KB


In [8]:
# Discard every row that has at least one missing value, then renumber the index.
df2 = df2.dropna(axis="index", how="any")
df2 = df2.reset_index(drop=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685 entries, 0 to 684
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       685 non-null    object 
 1   CoinName         685 non-null    object 
 2   Algorithm        685 non-null    object 
 3   IsTrading        685 non-null    bool   
 4   ProofType        685 non-null    object 
 5   TotalCoinsMined  685 non-null    float64
 6   TotalCoinSupply  685 non-null    object 
dtypes: bool(1), float64(1), object(5)
memory usage: 32.9+ KB


In [9]:
# Keep only the coins with a strictly positive mined total
mask = df2["TotalCoinsMined"].gt(0)
df2 = df2[mask].reset_index(drop=True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532 entries, 0 to 531
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       532 non-null    object 
 1   CoinName         532 non-null    object 
 2   Algorithm        532 non-null    object 
 3   IsTrading        532 non-null    bool   
 4   ProofType        532 non-null    object 
 5   TotalCoinsMined  532 non-null    float64
 6   TotalCoinSupply  532 non-null    object 
dtypes: bool(1), float64(1), object(5)
memory usage: 25.6+ KB


In [10]:
# Show the column labels of the raw frame; df2 has only been row-filtered up to
# this point, so it carries the same columns.
df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

In [13]:
# For every column whose first entry is a Python string, print the column name,
# its number of distinct values, and a blank separator line.
for col in df2.columns:
    if type(df2[col].iloc[0]) is not str:
        continue
    print(col, df2[col].nunique(), "", sep="\n")

Unnamed: 0
532

CoinName
531

Algorithm
71

ProofType
25

TotalCoinSupply
280



In [14]:
# For every column whose first entry is NOT a Python string, print the column
# name, its number of distinct values, and a blank separator line.
for col in df2.columns:
    if type(df2[col].iloc[0]) is str:
        continue
    print(col, df2[col].nunique(), "", sep="\n")

IsTrading
1

TotalCoinsMined
518



In [15]:
# Preview the first five rows of the filtered frame
df2.head(n=5)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
2,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
3,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
4,ETH,Ethereum,Ethash,True,PoW,107684200.0,0


In [16]:
# Drop the columns that are not used as features: the ticker-like "Unnamed: 0",
# the coin name, and IsTrading (a single value after the earlier filter).
# Algorithm and ProofType are kept.
df2 = df2.drop(columns=["Unnamed: 0", "CoinName", "IsTrading"])
df2.head(n=5)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
1,Scrypt,PoW/PoS,1055185000.0,532000000
2,X13,PoW/PoS,29279420000.0,314159265359
3,SHA-256,PoW,17927180.0,21000000
4,Ethash,PoW,107684200.0,0


In [17]:
# TotalCoinSupply is read as text; cast it to 64-bit floats and confirm the dtypes
df2["TotalCoinSupply"] = df2["TotalCoinSupply"].astype("float64")
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532 entries, 0 to 531
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    float64
dtypes: float64(2), object(2)
memory usage: 16.8+ KB


In [20]:
# Determine columns to "One-Hot Encode" or "Label Encode":
# number of distinct algorithms first, then how often each one occurs.
print(df2["Algorithm"].nunique())
df2["Algorithm"].value_counts()

71


Scrypt                   182
X11                       73
SHA-256                   48
CryptoNight               19
X13                       17
                        ... 
Time Travel                1
SkunkHash v2 Raptor        1
VeChainThor Authority      1
Ouroboros                  1
TRC10                      1
Name: Algorithm, Length: 71, dtype: int64

In [24]:
# Collect the algorithms used by fewer than 10 coins and preview the first ten
alg = df2["Algorithm"].value_counts()
alg_repl = alg[alg < 10].index.tolist()
alg_repl[:10]

['Multiple',
 'Ethash',
 'DPoS',
 'XEVAN',
 'CryptoNight-V7',
 'X16R',
 'Lyra2RE',
 'Groestl',
 'X15',
 'SHA-256D']

In [25]:
# Relabel every rarely used algorithm (those listed in alg_repl) as "Other",
# then show the resulting distribution.
mask = df2["Algorithm"].isin(alg_repl)
df2["Algorithm"] = df2["Algorithm"].mask(mask, "Other")

df2["Algorithm"].value_counts()

Scrypt         182
Other          131
X11             73
SHA-256         48
CryptoNight     19
X13             17
PoS             17
Quark           13
Equihash        12
NeoScrypt       10
NIST5           10
Name: Algorithm, dtype: int64

In [26]:
# Consolidate "other" Algorithms
# The rare algorithms were already relabelled "Other" by the preceding step, so
# repeating the assignment here would be a no-op. Verify instead that none of
# them is left before showing the final distribution.
mask = df2.Algorithm.isin(alg_repl)
assert not mask.any(), "rare algorithms remain in df2.Algorithm; run the consolidation step first"

df2.Algorithm.value_counts()

Scrypt         182
Other          131
X11             73
SHA-256         48
CryptoNight     19
X13             17
PoS             17
Quark           13
Equihash        12
NeoScrypt       10
NIST5           10
Name: Algorithm, dtype: int64

In [27]:
# Frequency of each proof type before any clean-up
df2["ProofType"].value_counts()

PoW                     237
PoW/PoS                 176
PoS                      86
DPoS                      9
PoC                       3
PoS/PoW                   2
POBh                      1
LPoS                      1
Proof of Trust            1
Pos                       1
DPOS                      1
Zero-Knowledge Proof      1
HPoW                      1
PoA                       1
PoW + Hive                1
PoW/PoS                   1
PoW and PoS               1
Proof of Authority        1
TPoS                      1
PoW/PoW                   1
dPoW/PoW                  1
PoW/nPoS                  1
PoST                      1
PoS/PoW/PoT               1
PoS/LPoS                  1
Name: ProofType, dtype: int64

In [28]:
# Consolidate Proof Types
# Map spelling/ordering variants of the same proof type onto one canonical
# label. Surrounding whitespace is stripped first, so labels such as "PoW/PoS "
# are normalised without depending on an exact trailing-space match.
proof_type_aliases = {
    "Pos": "PoS",
    "PoW and PoS": "PoW/PoS",
    "PoS/PoW": "PoW/PoS",
    "PoW/PoW": "PoW",
}
df2["ProofType"] = df2.ProofType.str.strip().replace(proof_type_aliases)

In [None]:
# Collect the proof types that occur fewer than 50 times and preview the first ten
types = df2["ProofType"].value_counts()
type_replace = types[types < 50].index.tolist()
type_replace[:10]