In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Reading CSV into dataframe (relative path, resolved against the current working directory)
df = pd.read_csv("crypto_data.csv")

# EDA

In [None]:
# Previewing the first rows to see which columns and kinds of values are present
df.head()

Null values exist in the dataset; the non-null counts reported by `df.info()` below show which columns are affected.

In [None]:
# Summarizing column dtypes and non-null counts
df.info()

In [None]:
# Counting how many cryptos are traded vs. not traded
df["IsTrading"].value_counts()

In [None]:
# Checking the 'Algorithm' column for near-duplicate names that could be merged (part 1: counts)
df["Algorithm"].value_counts()

In [None]:
# Checking the 'Algorithm' column for near-duplicate names that could be merged (part 2: full list of values)
df["Algorithm"].value_counts().index

In [None]:
# Checking the 'ProofType' column for near-duplicate names that could be merged
df["ProofType"].value_counts()

Several values in the 'ProofType' column can be combined and will be cleaned up later in the data preparation step

In [None]:
# Observing columns with numerical values (summary statistics per numeric column)
df.describe()

Column 'TotalCoinSupply' does not appear in the summary above because its data type is `object`; it needs to be converted to a numeric type in the data preparation step.

# Data Preparation

In [None]:
# Keeping only the cryptocurrencies that are being traded; working on a copy so `df` stays untouched
df_prep = df.copy()
df_prep = df_prep[df_prep["IsTrading"] == True].reset_index(drop=True)
df_prep.info()

In [None]:
# 'IsTrading' is constant after the filter above, so it is removed
df_prep = df_prep.drop(columns=["IsTrading"])
df_prep.head()

In [None]:
# Dropping every row that contains a null in any column, then renumbering the index
df_prep = df_prep.dropna(axis=0, how="any")
df_prep = df_prep.reset_index(drop=True)
df_prep.info()

In [None]:
# Keeping only cryptocurrencies with a positive number of mined coins
df_prep = df_prep[df_prep["TotalCoinsMined"] > 0].reset_index(drop=True)
df_prep.info()

In [None]:
# Converting 'TotalCoinSupply' column to numeric
# NOTE: pd.to_numeric raises by default if any value cannot be parsed as a number
df_prep["TotalCoinSupply"] = pd.to_numeric(df_prep.TotalCoinSupply)
df_prep.info()

'TotalCoinSupply' column is now a float data type

In [None]:
# Re-examining the 'ProofType' values that remain after the row filters above
df_prep["ProofType"].value_counts().index

In [None]:
# Normalizing inconsistent 'ProofType' spellings so equivalent values are counted together.
# Each pair is (value as found in the data, canonical value); matching is exact,
# so the trailing space in "PoW/PoS " is intentional.
for found_value, canonical_value in [
    ("PoW/PoS ", "PoW/PoS"),
    ("Pos", "PoS"),
    ("Proof of Authority", "PoA"),
    ("PoW and PoS", "PoW/PoS"),
    ("Proof of Trust", "PoT"),
]:
    df_prep.loc[df_prep["ProofType"] == found_value, "ProofType"] = canonical_value

In [None]:
# Setting the coin names aside for the final review, then removing the columns
# that are not used as clustering features
df_coin = df_prep["CoinName"]
df_prep = df_prep.drop(columns=["Unnamed: 0", "CoinName"])
df_prep.head()

In [None]:
# Converting 'Algorithm' and 'ProofType' into numerical data
# With no `columns` argument, get_dummies one-hot encodes every object/string/category column;
# presumably only 'Algorithm' and 'ProofType' are still non-numeric here — verify against df_prep.info()
df_prep = pd.get_dummies(df_prep)
df_prep.head()

In [None]:
# Standardizing the two coin-count columns because of their large values;
# the one-hot encoded columns are left as they are
from sklearn.preprocessing import StandardScaler
scale_cols = ["TotalCoinsMined", "TotalCoinSupply"]
scaler = StandardScaler().fit(df_prep.loc[:, scale_cols])

# Writing the scaled values into a copy so the unscaled frame stays available for the final review
df_prep_scale = df_prep.copy()
df_prep_scale.loc[:, scale_cols] = scaler.transform(df_prep.loc[:, scale_cols])
df_prep_scale.head()

# Dimensionality Reduction

In [None]:
# Fitting a PCA that keeps all components to inspect the explained variance of each
from sklearn.decomposition import PCA
pca = PCA().fit(df_prep_scale)
print(pca.explained_variance_ratio_.sum())
print(pca.explained_variance_ratio_)

In [None]:
# Explained variance ratio of every component, plotted against its zero-based position
plt.plot(np.arange(pca.explained_variance_ratio_.size), pca.explained_variance_ratio_)
plt.show()

In [None]:
# Zooming in on the explained variance ratio of the first 10 PCA components
plt.plot(np.arange(10), pca.explained_variance_ratio_[:10])
plt.show()

In [None]:
# Refitting PCA with a 90% explained-variance target; the last print shows how many
# components that requires (11 when this analysis was run)
pca = PCA(n_components=0.90).fit(df_prep_scale)
print(pca.explained_variance_ratio_.sum())
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.size)

In [None]:
# Projecting the scaled data onto the retained PCA components and labelling them PCA1..PCAn
df_prep_scale_pca = pd.DataFrame(
    pca.transform(df_prep_scale),
    columns=[f"PCA{component}" for component in range(1, pca.n_components_ + 1)],
)
df_prep_scale_pca.head()

In [None]:
# Creating TSNE model
from sklearn.manifold import TSNE
# random_state is fixed so the embedding (and every plot built on it) is reproducible across runs
tsne = TSNE(learning_rate=35, random_state=0)
# Any 'class' label column is excluded so that re-running this cell after the
# clustering step still embeds the PCA features only
tsne_features = tsne.fit_transform(df_prep_scale_pca.drop(columns="class", errors="ignore"))
tsne_features.shape

In [None]:
# Plotting the two t-SNE dimensions against each other to look for visible groupings
plt.scatter(x=tsne_features[:, 0], y=tsne_features[:, 1])
plt.show()

# Cluster Analysis with k-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Finding the best value for k
inertia = []
k = list(range(1, 11))

# Any 'class' label column is excluded so that re-running this cell after the
# clustering step still fits on the PCA features only
pca_features = df_prep_scale_pca.drop(columns="class", errors="ignore")

# Calculate the inertia for the range of k values
# n_init is set explicitly because its default differs between scikit-learn
# versions, which would otherwise make the curve depend on the installed version
for i in k:
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(pca_features)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(k)
plt.title('Elbow Curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

Elbow is at k=8

In [None]:
# Predicting clusters with k=8

# Any 'class' column from a previous run of this cell is excluded, so re-running
# the cell never clusters on its own earlier labels
pca_features = df_prep_scale_pca.drop(columns="class", errors="ignore")

# Initialize the k-means model
# n_init is set explicitly because its default differs between scikit-learn versions
model = KMeans(n_clusters=8, n_init=10, random_state=0)

# Fit the model and predict clusters in one step (fit_predict returns the fitted labels)
predictions = model.fit_predict(pca_features)

# Add the predicted class column to the dataframe
df_prep_scale_pca["class"] = predictions
df_prep_scale_pca.head()

In [None]:
# Plotting the t-SNE embedding again, this time colored by the k-means class of each coin
plt.scatter(x=tsne_features[:, 0], y=tsne_features[:, 1], c=df_prep_scale_pca["class"])
plt.show()

In [None]:
# Count of class sets (number of rows assigned to each k-means class)
df_prep_scale_pca["class"].value_counts()

# Analysis and Conclusion

In [None]:
# Joining the unscaled features with the PCA components and class labels, row by row on the index
df_merged = pd.merge(
    df_prep,
    df_prep_scale_pca,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# Keeping only the columns needed for the review and attaching the coin names by index
df_review = df_merged.loc[:, ['TotalCoinsMined', 'TotalCoinSupply', 'class']]
df_final = pd.merge(df_review, df_coin, how='outer', left_index=True, right_index=True)
df_final.head()

In [None]:
# Looking up the class assigned to Bitcoin; .item() raises unless exactly one row matches
is_bitcoin = df_final['CoinName'] == 'Bitcoin'
bitcoin_class = df_final.loc[is_bitcoin, 'class'].item()
bitcoin_class

In [None]:
# Listing every coin that k-means placed in the same class as Bitcoin
df_final[df_final['class'] == bitcoin_class]

Based on recent cryptocurrency activity, Bitcoin and Ethereum are reaching all-time highs. An important thing to note is that their class is the cluster with the highest count of coins. K-means clustering provides us with a list of other cryptos assigned to the same class, which are worth investigating for possible trends. This is not financial advice.