# P1

In [None]:
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA, SparsePCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle

from skimage.exposure import equalize_hist
from skimage.filters import gaussian

%matplotlib inline

warnings.filterwarnings('ignore')

Let's show some images from the dataset.

In [None]:
# Every flattened row is reshaped to a square (size, size) image for display.
size = 80
shape = (size, size)

# Shuffle images and labels together. The seed is fixed so that
# Restart & Run All reproduces the same ordering -- otherwise the preview grid
# and the seeded hold-out split further down would differ on every run.
X_train, y_train = shuffle(
    np.load('x_train.npy'),
    np.load('y_train.npy'),
    random_state=0,
)
X_test = np.load('x_test.npy')

# Preview grid: `width` columns by `height` rows.
width, height = 8, 8

plt.figure(figsize=(16, 20))
for n, (image, name) in enumerate(zip(X_train, y_train), 1):
    # Stop once every cell of the grid has been filled.
    if n > width * height:
        break

    plt.subplot(height, width, n)
    plt.title(name)
    plt.imshow(image.reshape(shape), cmap='gray')

Define a simple image preprocessing step.

In [None]:
def prepare(img):
    """Preprocess a single flattened image and return it flattened again.

    The image is reshaped to the notebook-level ``shape``, histogram-equalized,
    smoothed with a Gaussian filter (``sigma=1``) and finally mean-centered.

    Parameters
    ----------
    img : numpy.ndarray
        One image whose elements can be reshaped to ``shape``.

    Returns
    -------
    numpy.ndarray
        1-D array containing the preprocessed image with its mean subtracted.
    """
    img = img.reshape(shape)
    img = equalize_hist(img)
    img = gaussian(img, sigma=1)
    # The subtraction must be assigned back: a bare `img - img.mean()`
    # computes the centered image and immediately discards it.
    img = img - img.mean()

    return img.flatten()
    
def transform(X):
    """Run ``prepare`` on every row of ``X`` and return the results as a new array.

    The input is left untouched. Results are collected in a separate
    floating-point array because writing them back into ``X`` would (a) mutate
    the caller's data, making a re-run of the calling cell preprocess the
    images twice, and (b) truncate the float output of ``prepare`` whenever
    ``X`` has an integer dtype.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array with one flattened image per row.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X``. It keeps ``X``'s dtype when that is
        already floating point and is ``float64`` otherwise.
    """
    # Unpacking keeps the original requirement that X is exactly 2-D.
    n_images, _ = X.shape
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    out = np.empty(X.shape, dtype=out_dtype)
    for i in range(n_images):
        out[i] = prepare(X[i])

    return out

In [None]:
# Preprocess both image sets with the same pipeline and rebind the names to the results.
X_train, X_test = transform(X_train), transform(X_test)

Use PCA decomposition to reduce the dimensionality.

In [None]:
# Keep the 200 leading principal components. The seed is fixed because PCA may
# pick its randomized SVD solver for inputs of this size, in which case the
# components (and everything computed from them) would vary between runs.
pca = PCA(n_components=200, random_state=0)
# PCA does not use labels, so the basis is estimated from train and test images together.
pca.fit(np.vstack([X_train, X_test]))

Show some components.

In [None]:
# Grid layout for the component preview: `width` columns by `height` rows.
width, height = 8, 8
n_cells = width * height

plt.figure(figsize=(16, 20))
# Draw the leading components, one per grid cell, each reshaped back into an image.
for position, component in enumerate(pca.components_[:n_cells], start=1):
    plt.subplot(height, width, position)
    plt.imshow(component.reshape(shape), cmap='gray')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the PCA-projected training images for validation.
# a_* are the feature matrices, b_* the matching labels.
a_train, a_test, b_train, b_test = train_test_split(
    pca.transform(X_train),
    y_train,
    test_size=0.4,
    random_state=0,
)

In [None]:
from matplotlib.legend_handler import HandlerLine2D
from sklearn.metrics import accuracy_score

# Sweep n_neighbors and record accuracy on the fitting split and on the hold-out split.
# accuracy_score replaces jaccard_similarity_score, which has been removed from
# scikit-learn; for single-label targets such as these the two return the same value.
neighbors = range(1, 30)
train_results = []
test_results = []
for n in neighbors:
    model = KNeighborsClassifier(n_neighbors=n)
    model.fit(a_train, b_train)
    train_results.append(accuracy_score(b_train, model.predict(a_train)))
    test_results.append(accuracy_score(b_test, model.predict(a_test)))

# The curves show accuracy, so they are labelled as such (they were previously labelled "AUC").
line1, = plt.plot(neighbors, train_results, 'b', label="Train accuracy")
line2, = plt.plot(neighbors, test_results, 'r', label="Test accuracy")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('Accuracy')
plt.xlabel('n_neighbors')
plt.show()

In [None]:
# Pair each hold-out score with the n_neighbors value that produced it.
# A plain enumerate() starts at 0 while n_neighbors starts at 1, so the printed
# index would be off by one from the k it describes.
for k, score in zip(neighbors, test_results):
    print(f'{k} {score}')

In [None]:
from sklearn.model_selection import GridSearchCV

weight_options = ['uniform', 'distance']

# Candidate settings: neighbour count, vote weighting and Minkowski exponent p.
param_grid = dict(n_neighbors=[1, 2, 3], weights=weight_options, p=[1, 1.5, 2])

grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')

# Search in the PCA feature space the final classifier is trained on. Fitting on
# the raw pixels would tune hyper-parameters for a different representation.
grid.fit(pca.transform(X_train), y_train)

print(grid.best_score_)
print(grid.best_params_)

Finally, we use a nearest-neighbors classifier to classify the faces.

In [None]:
# Final model: k-nearest neighbours with a Minkowski (p=1.5) distance,
# trained on the PCA projection of all training images.
# NOTE(review): n_neighbors=10 lies outside the grid searched above -- confirm this choice is intentional.
classifier = KNeighborsClassifier(
    metric='minkowski',
    p=1.5,
    n_neighbors=10,
)
classifier.fit(pca.transform(X_train), y_train)

In [None]:
# Project the test images onto the PCA basis, then label them with the fitted classifier.
prediction = classifier.predict(
    pca.transform(X_test)
)

In [None]:
# Write the predictions as CSV: a header line, then one "<1-based id>,<label>" line per test image.
with open('prediction.csv', 'w') as out:
    print('Id,Name', file=out)
    for row_id, label in enumerate(prediction, 1):
        print('%i,%s' % (row_id, label), file=out)