In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [21]:
# Load the raw cryptocurrency table from a CSV located relative to the working directory
file_path = Path("crypto_data.csv")
df = pd.read_csv(file_path)
# Preview the first rows to see which columns are present
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [22]:
# (rows, columns) of the frame as loaded, before any filtering
df.shape

(1252, 7)

In [23]:
# Keep only the coins whose IsTrading flag is True, then report the new size
is_trading = df['IsTrading'].eq(True)
df = df.loc[is_trading]
df.shape

(1144, 7)

In [24]:
# Drop IsTrading: every remaining row has IsTrading == True after the filter above,
# so the column carries no information for the model.
# The `columns=` keyword is used instead of a positional axis (`drop(label, 1)`):
# the positional form was deprecated in pandas 1.3 and raises TypeError in pandas 2.0+.
df = df.drop(columns='IsTrading')
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365,365Coin,X11,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,PoW,,611000
4,808,808,SHA-256,PoW/PoS,0.0,0


In [25]:
# Discard every row containing at least one missing value
# (dropna's defaults are row-wise with how='any'), then report the new size
df = df.dropna()
df.shape

(685, 6)

In [26]:
# Inspect column dtypes; any numeric-looking column reported as `object` was parsed as
# strings and needs an explicit numeric cast before modelling
df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [27]:
# Retain only coins with a strictly positive mined amount
has_been_mined = df['TotalCoinsMined'].gt(0)
df = df.loc[has_been_mined]

In [28]:
# Drop the identifier-style columns ('Unnamed: 0' and 'CoinName'); they are not used as model features.
# The `columns=` keyword is used instead of a positional axis (`drop(labels, 1)`):
# the positional form was deprecated in pandas 1.3 and raises TypeError in pandas 2.0+.
df = df.drop(columns=['Unnamed: 0', 'CoinName'])

In [29]:
# Display the remaining columns (pandas elides the middle rows of a long frame)
df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [30]:
# (rows, columns) after row filtering and column drops
df.shape

(532, 4)

In [36]:
# TotalCoinSupply is stored as strings (object dtype); parse it to numbers so that
# get_dummies leaves it alone instead of one-hot encoding it
supply_numeric = pd.to_numeric(df["TotalCoinSupply"])
df["TotalCoinSupply"] = supply_numeric

In [40]:
# Number of distinct Algorithm labels (each becomes a candidate dummy column when encoded)
df['Algorithm'].nunique()

71

In [41]:
# Number of distinct ProofType labels (each becomes a candidate dummy column when encoded)
df['ProofType'].nunique()

25

In [37]:
# One-hot encode the two categorical text columns. drop_first=True omits the first
# category of each column, so k distinct labels yield k - 1 dummy columns.
# The columns are named explicitly so that a stray object-dtype column (e.g.
# TotalCoinSupply if its numeric cast cell was skipped) is never silently expanded
# into one dummy column per distinct value.
# NOTE(review): the encoded preview shows near-duplicate ProofType labels
# (e.g. 'PoW/PoS' twice, and 'Pos'); these look like whitespace/case variants in the
# raw data -- confirm, and normalise the labels before encoding if so.
df1 = pd.get_dummies(df, columns=['Algorithm', 'ProofType'], drop_first=True)

In [38]:
# (rows, columns) of the one-hot encoded frame
df1.shape

(532, 96)

In [39]:
# Check the dtypes of the encoded frame; no object columns should remain
df1.dtypes

TotalCoinsMined                   float64
TotalCoinSupply                   float64
Algorithm_536                       uint8
Algorithm_Argon2d                   uint8
Algorithm_BLAKE256                  uint8
                                   ...   
ProofType_Proof of Authority        uint8
ProofType_Proof of Trust            uint8
ProofType_TPoS                      uint8
ProofType_Zero-Knowledge Proof      uint8
ProofType_dPoW/PoW                  uint8
Length: 96, dtype: object

When we first pulled in the dataframe from a .csv file, the number of rows was 1252, and the number of columns was 7.
After completing the bulk of the data preprocessing/cleaning, we now have 532 rows and 96 columns. The number of rows decreased dramatically because we discarded all cryptocurrencies that weren't currently trading, didn't have complete data (i.e. had null values), or had not yet been mined. The number of columns shot up because one-hot encoding made a new column for every unique string value found in the original Algorithm (71 values) and ProofType (25 values) columns, minus one per column because `drop_first=True` omits the first category of each: 70 + 24 dummy columns plus the two numeric columns gives 96.

In [42]:
# Preview the first rows of the encoded frame
df1.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Standardise the two continuous columns; the 0/1 dummy columns are not passed to the scaler here
numeric_cols = ['TotalCoinsMined', 'TotalCoinSupply']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df1[numeric_cols])