In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sb
import math
from IPython.display import display, clear_output 
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height in inches) for every matplotlib figure below.
plt.rcParams['figure.figsize'] = (12, 9)

# set random seed to achieve same results each time
# (this seeds NumPy's global generator, which the shuffle in create_training draws from)
np.random.seed(23)

In [2]:
class FacialKeypoints():
    """Data loading, label imputation, plotting and baseline models for facial keypoints.

    Constructing an instance reads ``training.csv``, ``test.csv``,
    ``SampleSubmission.csv`` and ``IdLookupTable.csv`` from the working
    directory, builds two dev/train splits (one keeping rows with missing
    labels, one without) and fits a k-NN and an MLP regressor on the
    complete-label training split.
    """

    def __init__(self):
        self.train_csv = pd.read_csv("training.csv")
        self.test_csv = pd.read_csv("test.csv")
        self.sample_csv = pd.read_csv("SampleSubmission.csv")
        self.ID_csv = pd.read_csv("IdLookupTable.csv")
        # Every column except the last one is treated as a label name.
        self.label_names = self.train_csv.columns.tolist()[:-1]
        # "_n" suffix: split that still contains rows with NaN labels.
        # NOTE(review): the two splits below are shuffled independently, so rows
        # in dev_data may also appear in train_data_n -- keep that in mind when a
        # model trained on train_data_n is scored on dev_data.
        (self.dev_data_n, self.dev_labels_n,
         self.train_data_n, self.train_labels_n) = self.create_training(self.train_csv)
        (self.dev_data, self.dev_labels,
         self.train_data, self.train_labels) = self.create_training(self.train_csv, nans=False)
        self.knn = self.K_nn(self.train_data, self.train_labels)
        self.mlp = self.MLPRegressor(self.train_data, self.train_labels)

    def create_training(self, train_csv, dev_size=1000, nans=True):
        """Turn the raw training frame into shuffled dev/train arrays.

        Parameters
        ----------
        train_csv : pandas.DataFrame
            Frame with an ``'Image'`` column holding space-separated pixel
            values and one column per label.
        dev_size : int
            Number of shuffled rows assigned to the dev set.
        nans : bool
            When false, rows containing any missing value are dropped first.

        Returns
        -------
        tuple of numpy.ndarray
            ``(dev_data, dev_labels, train_data, train_labels)``; pixel values
            are divided by 255.
        """
        if not nans:
            train_csv = train_csv.dropna()
        labels = train_csv.loc[:, train_csv.columns != 'Image'].values
        pixels = np.vstack([np.asarray([int(n) for n in image.split()])
                            for image in train_csv['Image']])
        data = pixels / 255.0                                        # Rescale grayscale values to [0,1].
        shuffle = np.random.permutation(np.arange(data.shape[0]))    # Shuffle the data
        data, labels = data[shuffle], labels[shuffle]
        # First dev_size shuffled rows form the dev set, the rest the training set.
        dev_data, dev_labels = data[:dev_size], labels[:dev_size]
        train_data, train_labels = data[dev_size:], labels[dev_size:]
        return dev_data, dev_labels, train_data, train_labels

    def plot_example(self, data, label, predicted_label=np.nan):
        """Show one image (reshaped to 96x96) with its keypoints overlaid.

        ``label`` and ``predicted_label`` are flat sequences whose even
        positions are plotted as x and odd positions as y. The prediction is
        drawn only when every value in it is finite (the default ``nan``
        therefore draws nothing).
        """
        plt.imshow(data.reshape(96, 96), cmap='gray')
        plt.scatter(label[0::2], label[1::2], c='red', marker='x', label='actual')
        if np.all(np.isfinite(predicted_label)):
            plt.scatter(predicted_label[0::2], predicted_label[1::2], c='blue', marker='x', label='predicted')
        plt.axis('off')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=12)

    def score(self, y_pred, y_actual):
        """Return the root of ``mean_squared_error(y_actual, y_pred)``."""
        return np.sqrt(mean_squared_error(y_actual, y_pred))

    def preprocessor(self, scale_factor=2.2, y_offset=-5):
        """Return a copy of ``self.train_labels_n`` with every NaN filled in.

        Parameters
        ----------
        scale_factor : float
            Multiplier applied to the mean scaled offset before it is
            de-scaled by the face's norm.
        y_offset : float
            Constant added to every substituted y coordinate (the original
            author's note: missing-value noses tend to be labelled too low).

        Returns
        -------
        numpy.ndarray
            Array with the same shape as ``self.train_labels_n``; non-missing
            entries are unchanged.
        """
        labels = np.asarray(self.train_labels_n, dtype=float)
        return self._fill_missing_labels(labels, scale_factor, y_offset)

    def _fill_missing_labels(self, labels, scale_factor, y_offset):
        """Impute missing keypoints from mean origin-relative, norm-scaled offsets.

        For each face the offset of every keypoint from an "origin" keypoint
        is divided by the Euclidean norm of all available offsets of that
        face. The per-keypoint mean of those scaled offsets (NaNs ignored) is
        then multiplied by ``scale_factor`` and the face's norm and added to
        the face's origin to produce the substitute value.
        """
        if labels.ndim != 2 or labels.shape[1] % 2:
            raise ValueError("labels must be 2-D with interleaved x/y columns")

        # The origin is the first keypoint whose x AND y columns have no
        # missing values, so an x column is never paired with an unrelated y.
        column_complete = ~np.isnan(labels).any(axis=0)
        keypoint_complete = column_complete[0::2] & column_complete[1::2]
        candidates = np.flatnonzero(keypoint_complete)
        if candidates.size == 0:
            raise ValueError("no keypoint is present in every row; cannot pick an origin")
        origin_col = 2 * candidates[0]
        origin_x = labels[:, origin_col]
        origin_y = labels[:, origin_col + 1]

        # Offsets of every keypoint from the origin (NaN where the label is missing).
        x_dist = labels[:, 0::2] - origin_x[:, None]
        y_dist = labels[:, 1::2] - origin_y[:, None]

        # Per-face Euclidean norm over all available offsets, used as scale.
        norms = np.sqrt(np.nansum(x_dist ** 2, axis=1) + np.nansum(y_dist ** 2, axis=1))

        # A face with only the origin present has norm 0; 0/0 yields NaN, which
        # nanmean then ignores, so silence the floating-point warning here.
        with np.errstate(divide='ignore', invalid='ignore'):
            x_scaled = x_dist / norms[:, None]
            y_scaled = y_dist / norms[:, None]

        # Mean scaled offset per keypoint, interleaved back into label order.
        mean_scaled = np.empty(labels.shape[1])
        mean_scaled[0::2] = np.nanmean(x_scaled, axis=0)
        mean_scaled[1::2] = np.nanmean(y_scaled, axis=0)

        # De-scale with each face's own norm and shift back to its origin.
        substitute = np.empty_like(labels)
        substitute[:, 0::2] = origin_x[:, None] + (mean_scaled[0::2] * scale_factor * norms[:, None])
        substitute[:, 1::2] = origin_y[:, None] + y_offset + (mean_scaled[1::2] * scale_factor * norms[:, None])

        # Keep every original value; use the substitute only where it is NaN.
        return np.where(np.isnan(labels), substitute, labels)

    def generate_training(self):
        """Placeholder; not implemented yet (returns None)."""
        return

    def blur_training(self):
        """Placeholder; not implemented yet (returns None)."""
        return

    def K_nn(self, data, labels, n_neighbors=3):
        """Fit and return a ``KNeighborsRegressor`` with ``n_neighbors`` neighbours."""
        knn = KNeighborsRegressor(n_neighbors=n_neighbors)
        knn.fit(data, labels)
        return knn

    def LogReg(self, alpha=None):
        """Placeholder; not implemented yet (returns None)."""
        return

    def MLPRegressor(self, data, labels):
        """Fit and return an MLP regressor with hidden layers of 300 and 100 units."""
        # Inside a method the bare name resolves to the module-level
        # MLPRegressor class, not to this method.
        mlp = MLPRegressor(hidden_layer_sizes=(300, 100))
        mlp.fit(data, labels)
        return mlp

    def _average_nans(self, train_labels):
        """Return ``train_labels`` with each NaN replaced by its column mean."""
        df = pd.DataFrame(train_labels)
        return df.fillna(df.mean()).values

## Initialize the Object

In [3]:
# Constructing the object reads the four CSV files from the working directory,
# builds the dev/train splits and fits the k-NN and MLP baselines, so this cell
# fails with FileNotFoundError when the CSVs are not next to the notebook.
self = FacialKeypoints()

FileNotFoundError: File b'training.csv' does not exist

## Missing Labels

We noticed that many examples do not have a complete set of labels. Out of the total 6049 examples, only 1140 have the full set of labels. Below is the fraction of examples in which each label is present.

In [None]:
# Share of rows (0-1, rounded to 3 decimals) in which each label column is non-missing.
self.train_csv.drop(columns='Image').count().div(len(self.train_csv)).round(3)

## Preprocess the Data

In [None]:
# Fill the missing keypoint coordinates of the NaN-containing training labels
# (self.train_labels_n); the result has the same shape with no NaN left.
preprocessed_train_labels = self.preprocessor()

## Plot an Example

In [None]:
# First example of the split that kept incomplete labels, with its raw labels.
self.plot_example(data=self.train_data_n[0], label=self.train_labels_n[0])

In [None]:
# Same example as above, now drawn with the filled-in labels.
self.plot_example(data=self.train_data_n[0], label=preprocessed_train_labels[0])

In [None]:
# An example from the split that only contains complete labels.
self.plot_example(data=self.train_data[25], label=self.train_labels[25])

## K-NN

In [None]:
# self.knn is the k-NN regressor that __init__ already fitted on the
# complete-label training split; this cell only predicts on the dev set.
y_pred = self.knn.predict(self.dev_data)

In [None]:
# RMSE of the k-NN predictions against the dev labels.
self.score(y_pred=y_pred, y_actual=self.dev_labels)

#### Using _average_nans to preprocess and create more training data

In [None]:
# Refit k-NN on the larger split that kept rows with missing labels, with each
# NaN replaced by its label column's mean (_average_nans). This overwrites self.knn.
# NOTE(review): train_data_n comes from a separate, independently shuffled split,
# so rows of self.dev_data may also be in this training set -- confirm the dev
# score below is not inflated by that overlap.
self.knn = self.K_nn(data=self.train_data_n, labels=self._average_nans(self.train_labels_n))

In [None]:
# Predict on the complete-label dev set with the refitted k-NN model.
y_pred = self.knn.predict(self.dev_data)

In [None]:
# RMSE of the refitted k-NN predictions against the dev labels.
self.score(y_pred=y_pred, y_actual=self.dev_labels)

## Plot a Prediction

In [None]:
# Per-example RMSE for dev example 25, then its image with actual and predicted keypoints.
print('RMSE of ' + str(self.score(y_pred=y_pred[25], y_actual=self.dev_labels[25])))
self.plot_example(data=self.dev_data[25], label=self.dev_labels[25], predicted_label=y_pred[25])

## Logistic Regression

## Neural Net

#### Standard Scale Training and Labels

In [None]:
# Standardize labels and pixels using statistics from the training split only.
label_scaler = StandardScaler().fit(self.train_labels)

data_scaler = StandardScaler()
data_scaler.fit(self.train_data)

In [None]:
# Refit the MLP on standardized pixels and standardized labels; this overwrites
# the model that __init__ fitted on the unscaled data.
self.mlp = self.MLPRegressor(data_scaler.transform(self.train_data), label_scaler.transform(self.train_labels))

In [None]:
# Scale the dev pixels with the scaler fitted on the training pixels (data_scaler;
# no object named `scaler` exists), predict in standardized label space, then map
# the predictions back to pixel coordinates before scoring.
y_pred = self.mlp.predict(data_scaler.transform(self.dev_data))
self.score(y_pred=label_scaler.inverse_transform(y_pred), y_actual=self.dev_labels)

## Plot a Prediction

In [None]:
# StandardScaler expects 2-D (n_samples, n_features) input, so undo the label
# scaling for the whole prediction matrix first and only then pick example 25.
y_pred_unscaled = label_scaler.inverse_transform(y_pred)
print('RMSE of ' + str(self.score(y_pred=y_pred_unscaled[25], y_actual=self.dev_labels[25])))
self.plot_example(self.dev_data[25], self.dev_labels[25], y_pred_unscaled[25])

## Grid Search to Find Best Parameters

In [None]:
# Cross-validated search over the neighbour count (1 through 24) for k-NN,
# fitted on the complete-label training split; the cell shows the best setting.
knn = KNeighborsRegressor()
parameters = {'n_neighbors': range(1, 25)}
clf = GridSearchCV(estimator=knn, param_grid=parameters).fit(self.train_data, self.train_labels)
clf.best_params_