In [None]:
import pandas as pd

# Load the dataset from the CSV file into a DataFrame and preview it.
df = pd.read_csv('Open source llms-2.csv') # replace with your dataset
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None" output.
df.info()

In [None]:
# Report missing-value counts and dtypes for the two columns used as clustering features.
print("Missing values in 'Parameter in B':", df['Parameter in B'].isnull().sum())
print("Missing values in 'Typical Download Size(int_4) in GB':", df['Typical Download Size(int_4) in GB'].isnull().sum())

print("\nData type of 'Parameter in B':", df['Parameter in B'].dtype)
print("Data type of 'Typical Download Size(int_4) in GB':", df['Typical Download Size(int_4) in GB'].dtype)

# Drop unnamed columns with all null values
unnamed_cols = [col for col in df.columns if 'Unnamed:' in col and df[col].isnull().all()]
df_cleaned = df.drop(columns=unnamed_cols)

# K-Means cannot be fitted on NaN values, so rows lacking either clustering
# feature are removed here. Doing it where df_cleaned is built keeps the frame
# row-aligned with any label array later derived from these two columns.
rows_before_dropna = len(df_cleaned)
df_cleaned = df_cleaned.dropna(subset=['Parameter in B', 'Typical Download Size(int_4) in GB'])
rows_dropped = rows_before_dropna - len(df_cleaned)
if rows_dropped:
    print(f"\nDropped {rows_dropped} row(s) with missing clustering features")

display(df_cleaned.head())
# info() prints its own summary and returns None; display() would add a stray "None".
df_cleaned.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Select the two numeric features that drive the clustering.
df_clustering = df_cleaned.loc[:, ['Parameter in B', 'Typical Download Size(int_4) in GB']]

# Standardise both features so neither dominates the Euclidean distances
# used by K-Means; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
scaled_data = scaler.fit(df_clustering).transform(df_clustering)

# Preview the first five standardised rows.
display(scaled_data[:5, :])

In [None]:
from sklearn.cluster import KMeans

# Fit a five-cluster K-Means model on the standardised features.
# random_state pins the initialisation so the labels are reproducible.
kmeans_model = KMeans(n_clusters=5, n_init=10, random_state=42).fit(scaled_data)

# One integer cluster id per row of scaled_data.
cluster_labels = kmeans_model.labels_

# Preview the first five assignments.
display(cluster_labels[:5])

In [None]:
# Attach each row's K-Means cluster id to the cleaned frame.
df_cleaned['cluster_label'] = cluster_labels

# Per-cluster mean of the two clustering features, in their original units.
cluster_means = df_cleaned.groupby('cluster_label')[['Parameter in B', 'Typical Download Size(int_4) in GB']].mean()
display(cluster_means)

# Determine the mapping based on the cluster means.
# Assumption: lower means correspond to smaller categories and higher means to larger ones.
# Clusters are ranked by the sum of their mean 'Parameter in B' and
# mean 'Typical Download Size(int_4) in GB'.
cluster_means['sum_means'] = cluster_means['Parameter in B'] + cluster_means['Typical Download Size(int_4) in GB']
cluster_means_sorted = cluster_means.sort_values('sum_means')

# Category names ordered from the smallest-ranked cluster to the largest.
category_names = ['tiny', 'small', 'medium', 'large', 'Ultra']

# Fail loudly if the number of clusters and names disagree: positional lookups
# would otherwise raise an opaque IndexError (too few clusters) or silently
# leave rows with a NaN category (too many clusters).
if len(cluster_means_sorted) != len(category_names):
    raise ValueError(
        f"Expected {len(category_names)} clusters to match the category names, "
        f"found {len(cluster_means_sorted)}"
    )

# Pair each cluster id (in ascending size order) with its category name.
mapping = dict(zip(cluster_means_sorted.index, category_names))

df_cleaned['category'] = df_cleaned['cluster_label'].map(mapping)

display(df_cleaned.head())

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the models, one colour per size category.
fig, ax = plt.subplots(figsize=(10, 7))

# Categories appear in order of first occurrence in the frame.
categories = df_cleaned['category'].unique()
colors = ['blue', 'red', 'green', 'purple','pink'] # Assign a color for each category

for idx, name in enumerate(categories):
    # Rows belonging to the current category only.
    members = df_cleaned.loc[df_cleaned['category'] == name]
    ax.scatter(
        members['Parameter in B'],
        members['Typical Download Size(int_4) in GB'],
        color=colors[idx],
        label=name,
        alpha=0.7,
    )

ax.set_xlabel('Parameter in B')
ax.set_ylabel('Typical Download Size (GB)')
ax.set_title('LLM Clustering based on Parameter Size and Download Size')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Collect the model names of every category into one list per category.
grouped_by_category = df_cleaned.groupby('category')['Model Name'].agg(list)

# Print each category followed by its models and a separator rule.
for label, names in grouped_by_category.items():
    print(f"Category: {label}", f"Models: {names}", "-" * 30, sep="\n")

In [None]:
# Finding the optimum number of clusters using the Elbow Method
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

sse = []
k_range = range(3, 10)  # Check clusters from 3 to 9 (the range end is exclusive)
fitted_models = {}  # k -> fitted KMeans, so the chosen k does not have to be refit

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    fitted_models[k] = kmeans
    sse.append(kmeans.inertia_) # Inertia is the sum of squared distances of samples to their closest cluster center

# Plotting the Elbow Method graph
plt.figure(figsize=(8, 5))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared errors (SSE)')
plt.title('Elbow Method for Optimal k')
plt.xticks(k_range)
plt.grid(True)
plt.show()

# Based on the plot, choose the optimal k (where the elbow occurs).
# The value below is a manual choice: read the elbow off the plot above and
# replace it if your data suggests a different k.
optimal_k = 4 # Replace with the actual optimal k from the plot

# Apply K-Means with the optimal number of clusters. The sweep above already
# fitted a model with identical settings for every k in k_range, so reuse it;
# only fit again when optimal_k lies outside the range that was swept.
if optimal_k in fitted_models:
    kmeans_optimal = fitted_models[optimal_k]
else:
    kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    kmeans_optimal.fit(scaled_data)
cluster_labels_optimal = kmeans_optimal.labels_

# Add the optimal cluster labels to the dataframe
df_cleaned['optimal_cluster_label'] = cluster_labels_optimal

# You can then proceed with mapping these optimal cluster labels to categories
# similar to how you did before, or analyze the clusters directly.
# The mapping logic might need to be adjusted based on the optimal number of clusters.

# Display the head of the dataframe with optimal cluster labels
display(df_cleaned.head())

# To see the distribution of models in the optimal clusters:
print("\nModels in each optimal cluster:")
grouped_by_optimal_cluster = df_cleaned.groupby('optimal_cluster_label')['Model Name'].apply(list)
for cluster, models in grouped_by_optimal_cluster.items():
    print(f"Cluster {cluster}:")
    print("Models:", models)
    print("-" * 30)

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Relies on 'scaled_data' (the standardised feature matrix) and 'df_cleaned'
# having been defined by earlier cells.

silhouette_scores = []
k_range = range(3, 10)  # Check clusters from 3 to 9 (the range end is exclusive)
silhouette_models = []  # fitted models, parallel to silhouette_scores

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    score = silhouette_score(scaled_data, kmeans.labels_)
    silhouette_scores.append(score)
    silhouette_models.append(kmeans)

# Plotting the Silhouette scores
plt.figure(figsize=(8, 5))
plt.plot(k_range, silhouette_scores, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method for Optimal k')
plt.xticks(k_range)
plt.grid(True)
plt.show()

# Find the k with the highest silhouette score (the first one wins on ties)
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_k_silhouette = k_range[best_position]
print(f"Optimal number of clusters based on Silhouette Method: {optimal_k_silhouette}")

# The sweep above already fitted a model for the winning k with the same
# settings, so reuse it instead of running K-Means a second time.
kmeans_silhouette = silhouette_models[best_position]
cluster_labels_silhouette = kmeans_silhouette.labels_

# Add the optimal cluster labels to the dataframe
df_cleaned['silhouette_cluster_label'] = cluster_labels_silhouette

# Display the head of the dataframe with optimal cluster labels from Silhouette method
display(df_cleaned.head())

# To see the distribution of models in the optimal clusters from Silhouette method:
print("\nModels in each optimal cluster (Silhouette Method):")
grouped_by_silhouette_cluster = df_cleaned.groupby('silhouette_cluster_label')['Model Name'].apply(list)
for cluster, models in grouped_by_silhouette_cluster.items():
    print(f"Cluster {cluster}:")
    print("Models:", models)
    print("-" * 30)