In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_olivetti_faces
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from utils import shuffle_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) in inches for every figure created below.
plt.rcParams["figure.figsize"] = (15, 10)

In [None]:
# Load the Olivetti faces dataset through scikit-learn's fetcher; the cells
# below read the flattened images from `faces.data` and the labels from
# `faces.target`.
faces = fetch_olivetti_faces()

In [None]:
# Split the data into train / validation (cv) / test sets with the project
# helper `shuffle_split`.
# NOTE(review): presumably the third argument is the fraction assigned to the
# first returned split (80% train, then the remainder halved into cv and test)
# — confirm against utils.shuffle_split.
# NOTE(review): no seed is passed here; check whether shuffle_split is
# deterministic, otherwise every number below changes between runs.
X_train, y_train, X_test, y_test = shuffle_split(faces.data, faces.target, 0.8)
# The second call re-splits the held-out part and rebinds X_test / y_test.
X_cv, y_cv, X_test, y_test = shuffle_split(X_test, y_test, 0.5)
len(X_train)

In [None]:
# Class balance of the training split: one bar per person ID.
# The bin edges are centred on the integer labels (-0.5, 0.5, 1.5, ...) so that
# each ID gets its own bar even if the lowest or highest ID is missing from the
# split; `bins=40` would instead stretch 40 bins between the observed min/max.
n_people = int(np.max(faces.target)) + 1
fig, ax = plt.subplots()
ax.hist(y_train, bins=np.arange(n_people + 1) - 0.5, rwidth=0.9)
ax.set(
    title="Training images per person",
    xlabel="Person ID",
    ylabel="Number of images",
);

In [None]:
# Fit one KMeans model for every candidate cluster count (2..149) so inertia
# and silhouette score can be compared across k in the cells below.
# random_state is fixed because KMeans initialisation is random; without it a
# Restart & Run All produces different clusterings (and a different "best" k).
kms = []
for i in range(2, 150):
    km = KMeans(n_clusters=i, random_state=42)
    km.fit(X_train)
    kms.append(km)

In [None]:
# Collect the inertia of every fitted model, in the same order as `kms`.
inertias = [fitted.inertia_ for fitted in kms]

In [None]:
# Elbow plot: inertia against the number of clusters.
# The x values are read from the fitted models rather than repeating the
# range(2, 150) literal, so the plot cannot drift out of sync with `kms`.
fig, ax = plt.subplots()
ax.plot([km.n_clusters for km in kms], inertias, "-o")
ax.set(
    title="KMeans inertia vs. number of clusters",
    xlabel="Number of clusters",
    ylabel="Inertia",
);

Hm, the inertia plot is not very helpful for picking the number of clusters; let's try the silhouette score instead.

In [None]:
# Silhouette score of every fitted model on the training data, in the same
# order as `kms`.
scores = [silhouette_score(X_train, km.labels_) for km in kms]

# The x values come from the models themselves instead of a second copy of the
# range(2, 150) literal, so the curve always matches what was actually fitted.
fig, ax = plt.subplots()
ax.plot([km.n_clusters for km in kms], scores, "-o")
ax.set(
    title="Silhouette score vs. number of clusters",
    xlabel="Number of clusters",
    ylabel="Silhouette score",
);

In [None]:
# Position in `scores` / `kms` of the model with the highest silhouette score.
# This is a list index, not a cluster count: `kms` was built from
# range(2, 150), so index i holds the model with i + 2 clusters.
best_km_idx = np.asarray(scores).argmax()
print(best_km_idx)

In [None]:
# The fitted KMeans model that achieved the highest silhouette score.
best_km = kms[best_km_idx]

In [None]:
# Cluster index assigned to every training image by the selected model.
y_train_pred = best_km.predict(X_train)

In [None]:
def plot_clusters(X, y, n_cols=5, image_shape=(64, -1)):
    """Draw one figure per cluster, showing every image assigned to it.

    Parameters
    ----------
    X : ndarray of shape (n_images, n_pixels)
        Flattened images, one per row.
    y : array-like of shape (n_images,)
        Cluster (or class) label of each row of ``X``.
    n_cols : int, default 5
        Number of images per row in each figure.
    image_shape : tuple of int, default (64, -1)
        Shape each row of ``X`` is reshaped to before it is displayed.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}")

    y = np.asarray(y)
    # np.unique yields only the labels that actually occur, in ascending order,
    # so empty clusters need no special-casing and an empty `y` draws nothing.
    for cluster in np.unique(y):
        images = X[y == cluster]
        fig = plt.figure(edgecolor="black", linewidth=1)
        fig.supylabel(f"Cluster {cluster}")
        # Ceiling division: exactly as many rows as needed, with no blank extra
        # row when the cluster size is a multiple of n_cols.
        n_rows = -(-len(images) // n_cols)
        for position, image in enumerate(images, start=1):
            ax = fig.add_subplot(n_rows, n_cols, position)
            ax.imshow(image.reshape(image_shape))

In [None]:
# Show the training faces grouped by the cluster the selected KMeans model
# assigned them to (one figure per cluster).
plot_clusters(X_train, y_train_pred)

# 11

In [None]:
# Baseline classifier: an SVC with default hyper-parameters trained directly on
# the raw pixel features.
clf = SVC()
clf.fit(X_train, y_train)

In [None]:
# Validation accuracy of the classifier; for a classifier `score` is the mean
# accuracy of its predictions on the given data.
clf.score(X_cv, y_cv)

In [None]:
# Refit the SVC on the output of `best_km.transform`, i.e. on the KMeans
# cluster-distance features instead of the raw pixels.
# NOTE(review): this refits the object bound to `clf` in place, so the baseline
# model trained on raw pixels is gone after this cell; re-running the earlier
# validation cell afterwards will no longer evaluate the baseline.
clf.fit(best_km.transform(X_train), y_train)

In [None]:
# Validation accuracy of the SVC trained on the KMeans-transformed features;
# the validation images go through the same `best_km.transform`.
clf.score(best_km.transform(X_cv), y_cv)

In [None]:
# Append the KMeans transform output to the raw pixels and report how the
# validation accuracy of a default SVC changes with the number of clusters
# (50..149).
# `kms` already holds a model fitted on X_train for every k in 2..149, so those
# models are reused here instead of fitting another 100 KMeans models.
for km in kms:
    n_clusters = km.n_clusters
    if not 50 <= n_clusters < 150:
        continue
    X_train_ext = np.hstack((X_train, km.transform(X_train)))
    clf_ext = SVC()
    clf_ext.fit(X_train_ext, y_train)
    # The validation set is extended with the same fitted transform.
    X_cv_ext = np.hstack((X_cv, km.transform(X_cv)))
    print(f"Score for {n_clusters} cluster extension: {clf_ext.score(X_cv_ext, y_cv)}")

# 12

In [None]:
# NOTE(review): `gm_pipeline` (PCA keeping 99% of the variance, followed by a
# GaussianMixture) is built here but never fitted or used in the cells shown;
# the model that is actually used below is `gm`, fitted on the raw pixels.
gm_pipeline = make_pipeline(PCA(0.99), GaussianMixture())
# random_state is fixed so that fitting and the later `gm.sample(...)` call
# give the same result on every Restart & Run All.
gm = GaussianMixture(random_state=42)
gm.fit(X_train)

In [None]:
# Draw 10 synthetic samples from the fitted mixture. `sample` returns the
# generated feature rows together with the mixture-component label of each one.
X_sample, y_sample = gm.sample(10)

In [None]:
# Show every generated sample as an image with 64 rows, one figure per sample.
for i in range(X_sample.shape[0]):
    fig, ax = plt.subplots()
    ax.imshow(X_sample[i].reshape(64, -1))

In [None]:
# Per-sample log-likelihood of the first 10 training faces under the fitted
# mixture; used as a reference for what "normal" faces score.
gm.score_samples(X_train[:10])

In [None]:
# Build a small set of deliberately distorted faces from the first few training
# images, keeping the original person labels alongside them.

# "Rotated": swap the row and column axes of each 64x64 image.
n_rotated = 4
rotated = X_train[:n_rotated].reshape(-1, 64, 64).transpose(0, 2, 1)
rotated = rotated.reshape(-1, 64 * 64)
y_rotated = y_train[:n_rotated]

# "Flipped": reverse the row order of each 64x64 image (upside-down).
n_flipped = 3
flipped = X_train[:n_flipped].reshape(-1, 64, 64)[:, ::-1, :]
flipped = flipped.reshape(-1, 64 * 64)
y_flipped = y_train[:n_flipped]

# "Darkened": scale every pixel except the first and the last one by 0.3.
# Work on a copy so the training data itself is left untouched.
n_darkened = 3
darkened = X_train[:n_darkened].copy()
darkened[:, 1:-1] *= 0.3
y_darkened = y_train[:n_darkened]

# Stack the three groups row-wise, with the labels in the same order.
X_bad_faces = np.concatenate([rotated, flipped, darkened])
y_bad = np.concatenate((y_rotated, y_flipped, y_darkened))

plot_clusters(X_bad_faces, y_bad)

In [None]:
# Average per-sample log-likelihood of the distorted faces under the mixture
# (`score` is the mean of `score_samples`).
gm.score(X_bad_faces)

In [None]:
# Average per-sample log-likelihood of the untouched test faces under the
# mixture (`score` is the mean of `score_samples`).
gm.score(X_test)