### End-to-End: Train

### Procedure Outline
1. Filter the dataset 
    - Detect faces among all the images. Reject images that have more than one face, or no face.
    - Save filtered dataset to new location.
2. Generate Train-Test Splits
    - Create folds.
3. Evaluate ***<--- This notebook***
    - Generate embeddings from the splits
    - Train classifier on the embeddings
    - Test classifier on the embeddings
4. Tune classifier ***<--- This notebook***
    - Grid-search the classifier's regularization parameter `C` using cross-validation
5. Save the model ***<--- This notebook***

### Imports

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import pprint
import logging
import tqdm
import numpy as np
import pandas as pd
import random

from collections import Counter
from multiprocessing import cpu_count

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.externals import joblib

import face_trigger

In [3]:
from face_trigger.utils.train import generate_embeddings_for_dataset

In [4]:
# Directory of the filtered dataset that embeddings are generated from
# (presumably the output of step 1 in the outline above -- confirm before re-running elsewhere).
dataset_path = "/media/ankurrc/new_volume/softura/facerec/softura_filtered/"

In [5]:
# Configure the root logger at DEBUG level so library debug messages show up in cell output.
logging.basicConfig(level=logging.DEBUG)

### Get the embeddings for the dataset

In [6]:
def get_embeddings(dataset_path=None):
    """Generate the embeddings for the dataset located at ``dataset_path``.

    Thin wrapper around ``generate_embeddings_for_dataset``: the argument is
    forwarded unchanged and the helper's result is returned as-is. In this
    notebook the result is unpacked as ``(X, y)``.
    """
    embeddings_and_labels = generate_embeddings_for_dataset(dataset_path=dataset_path)
    return embeddings_and_labels

In [7]:
# X holds the embeddings and y the matching labels; both are converted to numpy arrays below.
X, y = get_embeddings(dataset_path=dataset_path)

0it [00:00, ?it/s]DEBUG:face_trigger.model.deep.FaceRecognizer:No DNN model path specified, using default.
100%|██████████| 40/40 [01:54<00:00,  2.39s/it]


In [8]:
# Labels as a numpy array, so they can be fed to np.unique and the LabelEncoder later on.
y_arr = np.array(y)

In [9]:
# Embeddings as a numpy array; a later cell asserts the data has 128 columns per sample.
X_arr = np.array(X)

### Visualize embeddings (Optional)

In [21]:
# Tab-separated output files for visualization: x.tsv receives the embeddings, y.tsv the labels.
x_file_path = "/media/ankurrc/new_volume/softura/facerec/embeddings/x.tsv"
y_file_path = "/media/ankurrc/new_volume/softura/facerec/embeddings/y.tsv"

In [23]:
# Embeddings: one tab-separated row per sample.
np.savetxt(fname=x_file_path, X=X_arr, delimiter="\t")
# Labels: written as plain strings, one per line.
np.savetxt(fname=y_file_path, X=y_arr, delimiter="\t", fmt="%s")

#### Project the embeddings
Upload the generated files to the Embedding Projector at http://projector.tensorflow.org/ to visualize the embeddings: use the file at `x_file_path` as the data (vectors) and the file at `y_file_path` as the metadata (labels).

### Encode the labels

In [24]:
# Distinct labels present in the ground truth
classes = np.unique(y_arr)
# Fit the encoder on the distinct labels (fit returns the encoder itself)
encoder = LabelEncoder().fit(classes)
# Integer-encoded ground truth
y_enc = encoder.transform(y_arr)
# Lookup table from integer code back to the original label, saved with the model later
encoder_mapping = {
    code: label
    for code, label in zip(encoder.transform(classes), encoder.classes_)
}

In [25]:
# Inspect the code -> label mapping. Labels are strings, so codes follow
# lexicographic order (e.g. '10' is encoded before '2').
encoder_mapping

{0: '1',
 1: '10',
 2: '11',
 3: '12',
 4: '13',
 5: '14',
 6: '16',
 7: '17',
 8: '18',
 9: '19',
 10: '2',
 11: '20',
 12: '21',
 13: '22',
 14: '23',
 15: '24',
 16: '25',
 17: '26',
 18: '27',
 19: '28',
 20: '29',
 21: '3',
 22: '30',
 23: '31',
 24: '32',
 25: '33',
 26: '34',
 27: '35',
 28: '36',
 29: '37',
 30: '38',
 31: '39',
 32: '4',
 33: '40',
 34: '41',
 35: '5',
 36: '6',
 37: '7',
 38: '8',
 39: '9'}

### Oversample categories if required

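Classes with fewer than `k_core` samples are treated as minority classes and are randomly resampled (with replacement) until each has `k_core` samples.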

In [26]:
# Every class must end up with at least this many samples.
k_core = 10

# Accumulators for the resampled embeddings and their encoded labels.
X_oversampled, y_oversampled = [], []

In [27]:
# Number of samples per encoded class label (keys are the integer codes from y_enc).
y_counts = Counter(y_enc)

In [28]:
# Inspect the per-class sample counts to see which classes fall below k_core.
y_counts

Counter({0: 12,
         1: 12,
         2: 12,
         3: 12,
         4: 10,
         5: 12,
         6: 10,
         7: 12,
         8: 12,
         9: 12,
         10: 14,
         11: 10,
         12: 12,
         13: 10,
         14: 12,
         15: 12,
         16: 16,
         17: 12,
         18: 12,
         19: 12,
         20: 10,
         21: 12,
         22: 12,
         23: 14,
         24: 12,
         25: 10,
         26: 12,
         27: 12,
         28: 12,
         29: 12,
         30: 10,
         31: 12,
         32: 12,
         33: 10,
         34: 28,
         35: 10,
         36: 12,
         37: 12,
         38: 10,
         39: 10})

In [29]:
# Classes holding fewer than k_core samples are the ones that need oversampling.
minority_classes = [label for label, count in y_counts.items() if count < k_core]

In [30]:
# Inspect the minority classes; an empty list means no oversampling is needed.
minority_classes

[]

In [31]:
# Randomly resample (with replacement) every minority class up to k_core samples.

# Start from empty accumulators so that re-running this cell never duplicates samples
# (and still works after the accumulators were converted to numpy arrays further down).
X_oversampled = []
y_oversampled = []

# A fixed seed keeps the oversampled dataset reproducible across kernel restarts.
rng = np.random.RandomState(42)

for minority in minority_classes:
    # number of extra samples needed to bring this class up to k_core
    samples_required = k_core - y_counts[minority]
    # positions of this class' samples within X_arr / y_enc
    minority_indices = np.flatnonzero(y_enc == minority)

    # draw all required indices in one call instead of shuffling the whole
    # index array in place once per draw
    sampled_indices = rng.choice(minority_indices, size=samples_required, replace=True)
    X_oversampled.extend(X_arr[sampled_indices])
    y_oversampled.extend(y_enc[sampled_indices])

In [32]:
# Turn the accumulated python lists into numpy arrays (empty arrays if nothing was resampled).
X_oversampled, y_oversampled = np.array(X_oversampled), np.array(y_oversampled)

In [36]:
# Append the resampled minority rows (if any) to the original data to obtain the
# k-core dataset; when nothing was resampled the original arrays are used as they are.
if X_oversampled.shape[0] > 0:
    X_total = np.concatenate((X_arr, X_oversampled), axis=0)
    # previously misspelled as `y_toal`, which left `y_total` undefined (or stale) on this branch
    y_total = np.concatenate((y_enc, y_oversampled), axis=0)
else:
    X_total = X_arr
    y_total = y_enc

In [38]:
# Each embedding is expected to have exactly 128 dimensions.
assert X_total.shape[1] == 128
# There must be exactly one label per embedding.
assert X_total.shape[0] == y_total.shape[0]

### Train the classifier

In [46]:
# All hyperparameter knobs live in this cell.

# Solve the dual problem; scikit-learn recommends dual=False when n_samples > n_features.
dual = True
# Re-weight classes inversely to their frequency to counter class imbalance.
class_weight = "balanced"
# Candidate regularization strengths: 7 log-spaced values from 1e-3 to 1e3.
Cs = np.logspace(start=-3, stop=3, num=7, base=10.0)
# Run the grid search on every available core.
n_jobs = cpu_count()

In [47]:
# Cross-validation strategy: 3 stratified shuffle splits, 20% held out in each, fixed seed.
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
# Search space: the C parameter of the base estimator wrapped by the calibrated classifier.
param_grid = {"base_estimator__C": Cs}

In [48]:
# Linear SVM configured from the hyperparameter knobs above.
svm = LinearSVC(class_weight=class_weight, dual=dual)
# Probabilities are needed downstream, so the SVM is wrapped in a calibrated classifier.
clf = CalibratedClassifierCV(svm)

In [49]:
# Grid search over param_grid with the cross-validation strategy defined above;
# train scores are recorded as well so over/under-fitting can be inspected.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=cv,
    n_jobs=n_jobs,
    verbose=True,
    return_train_score=True
)

In [50]:
# Run the grid search on the full (possibly oversampled) dataset.
grid.fit(X_total, y_total)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=4)]: Done  21 out of  21 | elapsed:    8.1s finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid'),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'base_estimator__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [51]:
# Report the winning hyperparameters and their cross-validated score.
# `!s` converts the params dict with str(); a `:s` format spec applied to a dict
# raises TypeError on Python 3, while the printed text is unchanged on Python 2.
print("The best parameters are {0!s} with a score of {1:0.2f}".format(grid.best_params_, grid.best_score_))

The best parameters are {'base_estimator__C': 10.0} with a score of 1.00


In [52]:
# Tabulate the per-candidate cross-validation results for inspection.
grid_results = pd.DataFrame(grid.cv_results_)
grid_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_base_estimator__C,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,1.035603,0.026554,0.134021,0.135931,0.001,{u'base_estimator__C': 0.001},7,0.134021,0.119481,0.134021,0.135065,0.134021,0.153247,0.145223,0.007725,0.0,0.013799
1,0.909461,0.029253,0.924399,0.928139,0.01,{u'base_estimator__C': 0.01},6,0.938144,0.932468,0.907216,0.932468,0.927835,0.919481,0.058705,0.022121,0.012858,0.006122
2,0.960495,0.030457,0.955326,0.966234,0.1,{u'base_estimator__C': 0.1},5,0.948454,0.974026,0.958763,0.971429,0.958763,0.953247,0.078059,0.017008,0.00486,0.009244
3,1.21351,0.018982,0.986254,0.988745,1.0,{u'base_estimator__C': 1.0},4,0.969072,0.987013,1.0,0.98961,0.989691,0.98961,0.052523,0.007539,0.012858,0.001224
4,1.64643,0.015985,1.0,1.0,10.0,{u'base_estimator__C': 10.0},1,1.0,1.0,1.0,1.0,1.0,1.0,0.09253,0.00386,0.0,0.0
5,2.212373,0.015418,1.0,1.0,100.0,{u'base_estimator__C': 100.0},1,1.0,1.0,1.0,1.0,1.0,1.0,0.063996,0.010137,0.0,0.0
6,1.80746,0.011239,1.0,1.0,1000.0,{u'base_estimator__C': 1000.0},1,1.0,1.0,1.0,1.0,1.0,1.0,0.269827,0.006685,0.0,0.0


### Save the model

In [55]:
# Output directory and file names for the persisted artifacts.
# The directory is used as-is by joblib.dump below, so it must already exist.
save_path = "/media/ankurrc/new_volume/softura/facerec/softura_trained"
clf_name = "classifier.pkl"  # the fitted grid-search object
label_map_file = "label_mapping.pkl"  # the integer-code -> label dictionary

In [56]:
# Persist the fitted grid-search object under save_path.
classifier_file = os.path.join(save_path, clf_name)
joblib.dump(grid, classifier_file)

['/media/ankurrc/new_volume/softura/facerec/softura_trained/classifier.pkl']

In [57]:
# Persist the integer-code -> label mapping next to the classifier.
label_map_path = os.path.join(save_path, label_map_file)
joblib.dump(encoder_mapping, label_map_path)

['/media/ankurrc/new_volume/softura/facerec/softura_trained/label_mapping.pkl']

### Test the models by reloading

#### Classifier Sanity Check

In [58]:
# Reload the persisted object to verify that it survives a save/load round trip.
# NOTE(review): this rebinds `clf`. Earlier it named the unfitted CalibratedClassifierCV;
# from here on it refers to the loaded grid-search object that was dumped above.
clf = joblib.load(os.path.join(save_path, clf_name))

In [60]:
# Class-probability estimates for every sample, computed on the same data the model was fit on.
predictions = clf.predict_proba(X_total)

In [62]:
# Hard (encoded) label predictions for the same samples.
predicted_labels = clf.predict(X_total)

In [64]:
# NOTE: X_total is the data the grid search was fit on, so this number is a
# reload sanity check rather than an estimate of generalization accuracy.
print("Accuracy before thresholding: {:0.5%}".format(accuracy_score(y_total, predicted_labels)))

Accuracy before thresholding: 100.00000%


#### Classifier Confidence Thresholding

In [65]:
# Highest class probability for each sample, i.e. the confidence of its prediction.
prediction_probabilities = predictions.max(axis=1)

In [66]:
# Minimum confidence a prediction needs in order to be kept.
threshold = 0.2
# Boolean mask: True where the best class probability falls below the threshold.
thresholded_probabilities = prediction_probabilities < threshold

In [67]:
# Positions of the low-confidence predictions (np.nonzero returns a tuple of index arrays).
thresholded_indices = np.nonzero(thresholded_probabilities)

In [68]:
# Work on a copy so the original predictions stay intact.
thresholded_predicted_labels = predicted_labels.copy()
# Low-confidence predictions are replaced by -1, which is not a valid encoded class,
# so they count as errors in the accuracy computed next.
thresholded_predicted_labels[thresholded_indices] = -1

In [70]:
# Accuracy once predictions below the confidence threshold are rejected (labelled -1).
print("Accuracy after thresholding with {0:0.2%} confidence: {1:0.5%}".format(threshold, accuracy_score(y_total, thresholded_predicted_labels)))

Accuracy after thresholding with 20.00% confidence: 100.00000%


#### Label Map

In [71]:
# Reload the persisted code -> label mapping to verify the round trip.
label_map = joblib.load(os.path.join(save_path, label_map_file)) 

In [72]:
# Inspect the reloaded mapping; it should match encoder_mapping shown earlier.
label_map

{0: '1',
 1: '10',
 2: '11',
 3: '12',
 4: '13',
 5: '14',
 6: '16',
 7: '17',
 8: '18',
 9: '19',
 10: '2',
 11: '20',
 12: '21',
 13: '22',
 14: '23',
 15: '24',
 16: '25',
 17: '26',
 18: '27',
 19: '28',
 20: '29',
 21: '3',
 22: '30',
 23: '31',
 24: '32',
 25: '33',
 26: '34',
 27: '35',
 28: '36',
 29: '37',
 30: '38',
 31: '39',
 32: '4',
 33: '40',
 34: '41',
 35: '5',
 36: '6',
 37: '7',
 38: '8',
 39: '9'}