# Training a semi-supervised model on the Olivetti faces

## Setup

In [270]:
import os
if os.getcwd().endswith("notebooks"):
    os.chdir('..')

print("Current working directory: ", os.getcwd())
if not os.getcwd().endswith("Clustering-and-Classifying-Olivetti-Faces"):
    raise ValueError("Please change working directory to 'path/Clustering-and-Classifying-Olivetti-Faces' before proceeding")

Current working directory:  /Users/irellzane/MLprojects/Clustering-and-Classifying-Olivetti-Faces


In [271]:
!pip install -r requirements.txt



In [272]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Fetch Olivetti Faces Dataset

In [273]:
from sklearn.datasets import fetch_olivetti_faces

olivetti = fetch_olivetti_faces()
list(olivetti)

['data', 'images', 'target', 'DESCR']

In [274]:
X_olivetti = olivetti.data
y_olivetti = olivetti.target
face_width = int(np.sqrt(X_olivetti.shape[1]))

## Split data

In [275]:
from sklearn.model_selection import train_test_split

X_train, X_valid_test, y_train, y_valid_test = train_test_split(X_olivetti, y_olivetti, test_size=0.3, stratify=y_olivetti, random_state=42)

In [276]:
X_valid, X_test, y_valid, y_test = train_test_split(X_valid_test, y_valid_test, test_size=1/3, stratify=y_valid_test)

In [277]:
X_train_labeled, X_train_unlabeled, y_train_labeled, y_train_unlabeled = train_test_split(X_train, y_train, train_size=2/7, random_state=42)

## Analysing supervised random forest for future comparison

In [279]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

prep_pipeline = make_pipeline(StandardScaler(), PCA(0.99)) 

X_train_prepped = prep_pipeline.fit_transform(X_train)
rndf_clf = RandomForestClassifier(n_estimators=100, max_depth=23, random_state=42)
rndf_clf.fit(X_train_prepped, y_train)

In [280]:
X_valid_prepped = prep_pipeline.transform(X_valid)
rndf_clf.score(X_valid_prepped, y_valid)

0.9375

## Training random forest on 80 instances only

In [299]:
prep_pipeline = make_pipeline(StandardScaler(), PCA(0.99)) 
X_train_labeled_prepped = prep_pipeline.fit_transform(X_train_labeled)
rndf_clf = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=42)
rndf_clf.fit(X_train_labeled_prepped, y_train_labeled)

In [300]:
X_valid_prepped = prep_pipeline.transform(X_valid)
rndf_clf.score(X_valid_prepped, y_valid)

0.5

## Training on cluster 100 instances close to cluster centroids

### Cluster data

In [301]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=100, n_init=10, random_state=42)

X_olivetti_dist = kmeans.fit_transform(X_train)

In [302]:
cluster_dominant_faces_idx = np.argmin(X_olivetti_dist, axis=0)
cluster_dominant_faces = X_train[cluster_dominant_faces_idx]
len(cluster_dominant_faces)

100

#### Supposing we manually label the 100 instances

In [303]:
X_train_labeled = cluster_dominant_faces
y_train_labeled = y_train[cluster_dominant_faces_idx]

#### Training

In [313]:
prep_pipeline = make_pipeline(StandardScaler(), PCA(0.99)) 
X_train_labeled_prepped = prep_pipeline.fit_transform(X_train_labeled)

rndf_clf = RandomForestClassifier(n_estimators=100, max_depth=27, random_state=42)
rndf_clf.fit(X_train_labeled_prepped, y_train_labeled)

In [314]:
X_valid_prepped = prep_pipeline.transform(X_valid)

rndf_clf.score(X_valid_prepped, y_valid)

0.7625

## Train a random Forest on propagated labels

### Propogate dominant faces labels

In [319]:
y_train_propagated = np.empty(len(X_train))

for i in range(kmeans.n_clusters):
    y_train_propagated[kmeans.labels_ == i] = y_train_labeled[i]

#### Training

In [337]:
prep_pipeline = make_pipeline(StandardScaler(), PCA(0.99)) 
X_train_prepped = prep_pipeline.fit_transform(X_train)

rndf_clf = RandomForestClassifier(n_estimators=100, max_depth=27, random_state=42)
rndf_clf.fit(X_train_prepped, y_train_propagated)

In [338]:
X_valid_prepped = prep_pipeline.transform(X_valid)

rndf_clf.score(X_valid_prepped, y_valid)

0.8

## Removing outliers to attempt score improvement

In [350]:
X_cluster_dist = X_olivetti_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(kmeans.n_clusters):
    cluster_indices = (kmeans.labels_ == i)
    cluster_distances = X_cluster_dist[cluster_indices]
    cutoff = np.percentile(cluster_distances, 95)
    X_cluster_dist[np.logical_and(cluster_indices, X_cluster_dist > cutoff)] = -1

X_train_trimmed = X_train[X_cluster_dist != -1]
y_train_propagated_trimmed = y_train_propagated[X_cluster_dist != -1]

In [351]:
len(X_train)

280

In [352]:
len(X_train_trimmed)

231