In [10]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import fetch_openml
import numpy as np
import pickle

# Fetch version 1 of the 'mnist_784' dataset from OpenML as NumPy arrays
# (as_frame=False) rather than as a pandas DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')

# Store the targets back on the bunch as unsigned 8-bit integers.
mnist.target = mnist.target.astype(np.uint8)

# X holds the feature matrix, y the integer targets.
X, y = mnist.data, mnist.target

In [11]:
from sklearn.metrics import silhouette_score

# Fit k-means for several cluster counts and record the mean silhouette
# coefficient of each clustering, in the same order the counts are tried.
silhouette_scores = []

for n_clusters in [8, 9, 10, 11, 12]:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(X)

    # The score is evaluated on every row of X (no sample_size is passed),
    # so this call dominates the running time of the cell.
    silhouette_avg = silhouette_score(X, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)

    # inertia_ is read from the fitted model, i.e. the run that KMeans
    # retained out of its n_init initialisations.
    print(f"Clusters: {n_clusters}, Best Inertia: {kmeans.inertia_}")

print(silhouette_scores)

Clusters: 8, Best Inertia: 184475663288.58374
Clusters: 9, Best Inertia: 181170513468.66922
Clusters: 10, Best Inertia: 178432239695.42792
Clusters: 11, Best Inertia: 175770300286.51297
Clusters: 12, Best Inertia: 173323112045.8375
[0.07337977998298922, 0.05681625379289227, 0.0586915389505002, 0.05835878745275728, 0.05817356340885259]


In [12]:
# Persist the list of silhouette scores to 'kmeans_sil.pkl'.
with open('kmeans_sil.pkl', 'wb') as sil_file:
    pickle.dump(silhouette_scores, sil_file)

In [13]:
from sklearn.metrics import confusion_matrix

# Cluster X into 10 groups and assign every sample to its cluster.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
y_pred = kmeans.fit(X).predict(X)

# Rows of the confusion matrix correspond to the true labels in y,
# columns to the predicted cluster indices.
cnf_m = confusion_matrix(y, y_pred)

# For each true label, the index of the cluster it is most often assigned to.
argmax = [np.argmax(label_counts) for label_counts in cnf_m]

# Distinct cluster indices that are the top choice for at least one label.
unique_sorted = sorted(set(argmax))

print(unique_sorted)

[0, 1, 2, 3, 5, 6, 8, 9]


In [14]:
# Persist the sorted, de-duplicated argmax indices to 'kmeans_argmax.pkl'.
with open('kmeans_argmax.pkl', 'wb') as argmax_file:
    pickle.dump(unique_sorted, argmax_file)

In [15]:
# Find the smallest non-zero Euclidean distances between each of the first
# `n_reference` samples and every sample in X.
n_reference = 300   # number of leading rows of X used as reference points
n_smallest = 10     # how many of the smallest distances to keep

per_reference = []
for i in range(n_reference):
    # One vectorised call yields the distance from sample i to every row of X,
    # replacing a Python-level loop over len(X) individual norm computations.
    # Values agree with the per-pair computation up to floating-point rounding.
    row_distances = np.linalg.norm(X - X[i], axis=1)
    # Drop exact zeros: the sample compared with itself and any identical rows.
    per_reference.append(row_distances[row_distances != 0])

# All retained distances, in the same (i, then j) order as they were computed.
distances = np.concatenate(per_reference)

# Ascending sort, truncated to the smallest few; kept as a plain list of
# NumPy floats so it prints and pickles like a list.
sorted_distances = list(np.sort(distances)[:n_smallest])


In [19]:
# Show the smallest non-zero distances kept in `sorted_distances`.
# NOTE(review): this cell's execution count (19) is out of sequence with its
# neighbours -- confirm the notebook still reproduces under Restart & Run All.
print(sorted_distances)

[279.26152617215286, 304.37641170103836, 317.5893575043093, 328.7658741414626, 333.4546445920344, 352.89800226127664, 355.1774204534967, 358.07401469528617, 359.64287842247063, 360.42474942767177]


In [16]:
# Persist the list of smallest distances to 'dist.pkl'.
with open('dist.pkl', 'wb') as dist_file:
    pickle.dump(sorted_distances, dist_file)

In [17]:
# Base radius: the mean of the three smallest recorded distances.
s = np.mean(sorted_distances[:3])

# Candidate radii start at s and advance in steps of 0.04 * s, stopping
# before s + 0.1 * s.
eps_val = np.arange(s, s + 0.1 * s, 0.04 * s)

# Number of distinct DBSCAN labels obtained for each candidate radius.
unique_count = []

for eps in eps_val:
    dbscan = DBSCAN(eps=eps).fit(X)
    # Every distinct value in labels_ is counted, whatever it denotes.
    unique_count.append(len(np.unique(dbscan.labels_)))

print(unique_count)

[4, 7, 22]


In [18]:
# Persist the per-radius label counts to 'dbscan_len.pkl'.
with open('dbscan_len.pkl', 'wb') as counts_file:
    pickle.dump(unique_count, counts_file)