In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Read data/prices.csv into a DataFrame; later cells expect a 'Ticker' column.
df = pd.read_csv(filepath_or_buffer="data/prices.csv")

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Display pandas' summary statistics for df as a quick sanity check of the data.
df.describe()

In [None]:
def optimize_k(data, max_k, random_state=42):
    """Plot the k-means elbow curve (inertia against number of clusters).

    Fits one ``KMeans`` model for every ``k`` from 1 up to and including
    ``max_k`` and plots each fitted model's ``inertia_`` against ``k``.

    Parameters
    ----------
    data : DataFrame or array
        Feature matrix; passed unchanged to ``KMeans.fit``. Must support
        ``len()``.
    max_k : int
        Largest number of clusters to try (inclusive). Capped at
        ``len(data)`` because k-means cannot form more clusters than there
        are samples.
    random_state : int or None, default 42
        Seed forwarded to every ``KMeans`` so the curve is identical on each
        run. Pass ``None`` for unseeded initialisation.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    upper_k = min(max_k, len(data))
    # "+ 1" so that max_k itself is evaluated, as the parameter name promises.
    ks = list(range(1, upper_k + 1))

    inertias = []
    for k in ks:
        # Explicit n_init keeps behaviour stable across scikit-learn versions,
        # whose default for this argument has changed over time.
        model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        model.fit(data)
        inertias.append(model.inertia_)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(ks, inertias, 'o-')
    ax.set_xlabel('clusters')
    ax.set_ylabel('inertia')
    ax.grid(True)
    plt.show()


In [None]:
# Feature matrix for the models: the ticker symbol becomes the row index so
# only the remaining columns are fed to KMeans / PCA.
# A later cell adds a 'cluster' column to df; dropping it here (ignored when
# absent) keeps this cell idempotent, so re-running it never leaks previous
# cluster labels into the features.
new_df = df.set_index('Ticker').drop(columns='cluster', errors='ignore')

In [None]:
# Draw the elbow plot for the feature matrix to help choose the number of clusters.
optimize_k(data=new_df, max_k=10)

In [None]:
# Final model with 4 clusters (presumably chosen from the elbow plot - confirm).
# random_state pins the centroid initialisation so cluster numbering, and
# therefore the colours and listings in later cells, is the same on every run;
# explicit n_init avoids depending on a version-specific default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

In [None]:
# Fit the clustering model on the ticker-indexed feature matrix.
kmeans.fit(X=new_df)

In [None]:
# Attach the fitted label of each row to the original table.
# The assignment is positional: it relies on new_df (built from df via
# set_index) having the same row order as df.
# NOTE(review): this mutates df in place, so cells above that display df now
# show a stale view until they are re-run.
df['cluster'] = kmeans.labels_

In [None]:
# Preview the first five rows again, now including the 'cluster' column.
df.head(n=5)

In [None]:
# Project the feature matrix onto its first two principal components so the
# clusters can be drawn in 2-D. Seeded so the projection stays reproducible
# if a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
pca_components = pca.fit_transform(new_df)

# One row per ticker: 2-D coordinates plus ticker symbol and k-means label.
# .values strips the pandas index so the columns are attached by position,
# matching the row order of the PCA output.
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2']).assign(
    Ticker=df['Ticker'].values,
    Cluster=df['cluster'].values,
)

# How much of the original variance the 2-D picture retains.
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.2%}")
pca_df.head()

In [None]:
# Scatter plot for cluster visualization
fig, ax = plt.subplots(figsize=(12, 8))

# Use the labels actually present in the data instead of a hard-coded
# range(4), so the plot stays correct if the number of clusters changes.
cluster_ids = sorted(pca_df['Cluster'].unique())
cluster_names = [f'Cluster {cluster_id}' for cluster_id in cluster_ids]

# The palette is cycled, so more clusters than colours does not raise.
colors = ['red', 'blue', 'green', 'orange']

for position, cluster_id in enumerate(cluster_ids):
    cluster_data = pca_df[pca_df['Cluster'] == cluster_id]
    ax.scatter(
        cluster_data['PC1'],
        cluster_data['PC2'],
        c=colors[position % len(colors)],
        label=cluster_names[position],
        alpha=0.6,
        s=100,
    )

    # Label every point with its ticker symbol.
    for ticker, pc1, pc2 in zip(
        cluster_data['Ticker'], cluster_data['PC1'], cluster_data['PC2']
    ):
        ax.annotate(ticker, (pc1, pc2), fontsize=8, alpha=0.7)

ax.set_xlabel('First Principal Component', fontsize=12)
ax.set_ylabel('Second Principal Component', fontsize=12)
ax.set_title('K-Means Clustering of Cryptocurrencies (PCA Visualization)', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Show cluster distribution
print("Cluster Distribution:")
print(df['cluster'].value_counts().sort_index())
print("\nCryptocurrencies by Cluster:")
# groupby visits each cluster label present in df in ascending order, so the
# listing follows the data rather than a hard-coded cluster count.
for cluster_id, members in df.groupby('cluster')['Ticker']:
    # str() guards against ticker values that were not parsed as strings.
    tickers = [str(ticker) for ticker in members]
    print(f"\nCluster {cluster_id} ({len(tickers)} cryptos): {', '.join(tickers)}")