### End-to-End: Train

### Procedure Outline
1. Filter the dataset 
    - Detect faces among all the images. Reject images that have more than one face, or no face.
    - Save filtered dataset to new location.
2. Generate Train-Test Splits
    - Create folds.
3. Evaluate ***<--- This notebook***
    - Generate embeddings from the splits
    - Train classifier on the embeddings
    - Test classifier on the embeddings
4. Tune classifier ***<--- This notebook***
    - Tune the classifier 
5. Save the model ***<--- This notebook***

### Imports

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import pprint
import logging
import tqdm
import numpy as np
import pandas as pd
import random

from collections import Counter
from multiprocessing import cpu_count

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.externals import joblib

import face_trigger

In [3]:
from face_trigger.utils.train import generate_embeddings_for_dataset

In [4]:
dataset_path = "/media/ankurrc/new_volume/softura/facerec/softura_filtered/"

In [5]:
logging.basicConfig(level=logging.DEBUG)

### Get the embeddings for the dataset

In [6]:
def get_embeddings(dataset_path=None):
    return generate_embeddings_for_dataset(dataset_path=dataset_path)

In [7]:
X, y = get_embeddings(dataset_path=dataset_path)

0it [00:00, ?it/s]DEBUG:face_trigger.model.deep.FaceRecognizer:No DNN model path specified, using default.


Exception: No label mapping provided!

In [254]:
# convert to numpy array
y_arr = np.array(y)

In [111]:
# convert to numpy array
X_arr = np.array(X)

### Encode the labels

In [255]:
encoder = LabelEncoder()
classes = np.unique(y_arr)
encoder.fit(classes)
# transform our ground truth
y_enc = encoder.transform(y_arr)
# get a mapping to use later
encoder_mapping = dict(zip(encoder.transform(classes),encoder.classes_))

In [256]:
encoder_mapping

{0: '000001',
 1: '000002',
 2: '000003',
 3: '000004',
 4: '000005',
 5: '000006',
 6: '000007',
 7: '000008',
 8: '000009',
 9: '000010',
 10: '000011',
 11: '000012',
 12: '000013',
 13: '000014',
 14: '000015',
 15: '000016',
 16: '000017',
 17: '000019',
 18: '000020',
 19: '000021',
 20: '000022',
 21: '000023',
 22: '000024',
 23: '000025',
 24: '000026',
 25: '000027',
 26: '000028',
 27: '000029',
 28: '000030',
 29: '000031',
 30: '000032',
 31: '000033',
 32: '000034',
 33: '000035',
 34: '000036',
 35: '000037',
 36: '000038',
 37: '000039',
 38: '000040',
 39: '000041',
 40: '000042',
 41: '000043',
 42: '000044',
 43: '000045',
 44: '000046',
 45: '000047',
 46: '000048',
 47: '000049',
 48: '000050',
 49: '000051',
 50: '000052',
 51: '000053',
 52: '000054',
 53: '000055',
 54: '000056',
 55: '000057',
 56: '000058',
 57: '000059',
 58: '000060',
 59: '000061',
 60: '000062',
 61: '000063',
 62: '000064',
 63: '000065',
 64: '000066',
 65: '000067',
 66: '000068',
 67: 

### Oversample categories if required

We use random-resampling for minority classes.

In [114]:
# minimum samples for each class
k_core = 10

X_oversampled = []
y_oversampled = []

In [115]:
# Category counts
y_counts = Counter(y_enc)

In [116]:
y_counts

Counter({0: 6,
         1: 7,
         2: 14,
         3: 4,
         4: 5,
         5: 4,
         6: 8,
         7: 6,
         8: 13,
         9: 13,
         10: 6,
         11: 17,
         12: 4,
         13: 6,
         14: 22,
         15: 6,
         16: 9,
         17: 6,
         18: 14,
         19: 13,
         20: 14,
         21: 35,
         22: 14,
         23: 23,
         24: 26,
         25: 11,
         26: 20,
         27: 6,
         28: 9,
         29: 8,
         30: 20,
         31: 11,
         32: 17,
         33: 19,
         34: 9,
         35: 7,
         36: 16,
         37: 18,
         38: 12,
         39: 29,
         40: 16,
         41: 15,
         42: 7,
         43: 7,
         44: 18,
         45: 20,
         46: 17,
         47: 19,
         48: 10,
         49: 4,
         50: 6,
         51: 16,
         52: 15,
         53: 5,
         54: 6,
         55: 4,
         56: 14,
         57: 4,
         58: 9,
         59: 18,
         60: 8,
 

In [117]:
# figure out which classes have a count less than k_core
minority_classes = []
for key, val in y_counts.items():
    if val < k_core:
        minority_classes.append(key)

In [118]:
minority_classes

[0,
 1,
 3,
 4,
 5,
 6,
 7,
 10,
 12,
 13,
 15,
 16,
 17,
 27,
 28,
 29,
 34,
 35,
 42,
 43,
 49,
 50,
 53,
 54,
 55,
 57,
 58,
 60,
 61,
 62,
 63,
 69,
 75,
 83,
 92,
 93,
 94,
 95,
 102,
 103,
 104,
 105,
 106,
 113,
 114,
 116,
 117,
 119,
 120,
 123,
 124,
 132,
 134,
 136,
 141,
 143,
 147,
 149,
 151,
 155,
 157,
 158,
 161,
 162,
 163,
 164,
 165,
 167,
 169,
 172,
 173,
 177,
 180,
 181,
 183,
 184,
 188,
 191,
 193,
 195,
 196,
 197,
 198,
 203,
 205,
 206,
 210,
 211,
 212,
 217,
 221,
 222,
 223,
 225,
 226,
 231,
 233,
 235,
 239,
 240,
 241,
 242,
 243,
 246,
 247,
 249,
 251,
 252,
 253,
 254,
 255,
 257,
 259,
 262,
 263,
 265,
 267,
 268,
 269,
 276,
 277,
 278,
 279,
 280,
 281,
 282,
 283,
 284,
 286,
 287,
 288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 299,
 300,
 301,
 303,
 304,
 306,
 307,
 310,
 311,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 320,
 321,
 322,
 324,
 325,
 326,
 327,
 328,
 330,
 331,
 332,
 333,
 334,
 335,
 340,
 342,
 343,
 

In [119]:
# randomly resample for each minority class
for minority in minority_classes:
    # num of samples our minority class has
    minority_count = y_counts[minority]
    # num of samples required to make it k-core
    samples_required = k_core - minority_count
    # minority class' indices
    minority_indices = np.argwhere((y_enc == minority)).flatten()

    # loop till we have added enough samples
    for i in range(samples_required):
        random.Random().shuffle(minority_indices)
        index = minority_indices[0]
        X_oversampled.append(X_arr[index])
        y_oversampled.append(y_enc[index])

In [120]:
# convert to numpy arrays
X_oversampled = np.array(X_oversampled)
y_oversampled = np.array(y_oversampled)

In [121]:
# concatenate both our original array and resampled minority ones to obtain a k-core dataset
X_total = np.concatenate((X, X_oversampled), axis=0)
y_toal = np.concatenate((y_enc, y_oversampled), axis=0)

In [122]:
assert X_total.shape[1] == 128
assert X_total.shape[0] == y_final.shape[0]

### Train the classifier

In [123]:
# Set all hyperparameter knobs here
dual = True # set false for datasets that have num_dims > num_samples
class_weight = "balanced" # set for class imbalance
Cs = np.logspace(-3, 3, 7, base=10.0)
n_jobs = cpu_count()

In [124]:
# setup parameter grid
param_grid = dict(base_estimator__C=Cs)
# setup cross-validation strategy
cv = StratifiedShuffleSplit(n_splits=3,test_size=0.2, random_state=42)

In [125]:
# setup classifier
svm = LinearSVC(dual=dual,class_weight=class_weight)
# we need probabilities, so we use a calibrated classifier
clf = CalibratedClassifierCV(svm) 

In [126]:
# setup our grid-search object
grid = GridSearchCV(clf, param_grid=param_grid, cv=cv, verbose=True, n_jobs=n_jobs, return_train_score=True)

In [127]:
# fit the data
grid.fit(X_final, y_final)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=4)]: Done  21 out of  21 | elapsed: 25.1min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=3, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
            cv=3, method='sigmoid'),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'base_estimator__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=True)

In [128]:
print("The best parameters are {0:s} with a score of {1:0.2f}".format(grid.best_params_, grid.best_score_))

The best parameters are {'base_estimator__C': 10.0} with a score of 0.99


In [129]:
grid_results = pd.DataFrame(data=grid.cv_results_)
grid_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_base_estimator__C,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,252.462956,1.18251,0.01599,0.017087,0.001,{u'base_estimator__C': 0.001},7,0.0123,0.016404,0.01681,0.017736,0.01886,0.017121,3.762975,0.128142,0.00274,0.000545
1,199.408525,1.287059,0.552276,0.59825,0.01,{u'base_estimator__C': 0.01},6,0.558426,0.596473,0.542845,0.597909,0.555556,0.600369,17.142563,0.10539,0.00677,0.001609
2,216.913789,1.230099,0.988793,0.996548,0.1,{u'base_estimator__C': 0.1},5,0.98729,0.996514,0.98893,0.996924,0.99016,0.996207,23.328893,0.264456,0.001176,0.000294
3,299.5494,1.21628,0.990707,0.998701,1.0,{u'base_estimator__C': 1.0},3,0.98852,0.998872,0.99139,0.998872,0.99221,0.99836,1.460922,0.191722,0.001582,0.000242
4,325.026359,0.914613,0.991117,0.999863,10.0,{u'base_estimator__C': 10.0},1,0.98893,0.999897,0.99221,0.999897,0.99221,0.999795,19.874132,0.024783,0.001546,4.8e-05
5,285.929334,1.181592,0.990843,0.999966,100.0,{u'base_estimator__C': 100.0},2,0.98934,1.0,0.99139,1.0,0.9918,0.999897,5.497058,0.170784,0.001076,4.8e-05
6,271.938955,0.977232,0.990433,1.0,1000.0,{u'base_estimator__C': 1000.0},4,0.98934,1.0,0.99057,1.0,0.99139,1.0,53.914836,0.265413,0.000842,0.0


### Save the model

In [142]:
# save paths 
save_path = "/media/ankurrc/new_volume/softura/facerec/trained"
clf_name = "classifier.pkl"
label_map_file = "label_mapping.pkl"

In [133]:
joblib.dump(grid, os.path.join(save_path, clf_name))

['/media/ankurrc/new_volume/softura/facerec/trained/classifier.pkl']

In [257]:
joblib.dump(encoder_mapping, os.path.join(save_path, label_map_file))

['/media/ankurrc/new_volume/softura/facerec/trained/label_mapping.pkl']

### Test the models by reloading

#### Classifier Sanity Check

In [295]:
clf = joblib.load(os.path.join(save_path, clf_name))

In [296]:
predictions = clf.predict_proba(X_final)

In [297]:
predicted_labels = clf.predict(X_final)

In [307]:
print("Accuracy before thresholding: {:0.5%}".format(accuracy_score(y_final, predicted_labels)))

Accuracy before thresholding: 99.98360%


#### Classifier Confidence Thresholding

In [299]:
prediction_probabilities = np.max(predictions, axis=1)

In [300]:
threshold = 0.2
thresholded_probabilities = prediction_probabilities < threshold

In [301]:
thresholded_indices = np.nonzero(thresholded_probabilities)

In [302]:
thresholded_predicted_labels = predicted_labels.copy()
thresholded_predicted_labels[thresholded_indices] = -1

In [310]:
print("Accuracy after thresholding with {0:0.2%} confidence: {1:0.5%}".format(threshold, accuracy_score(y_final, thresholded_predicted_labels)))

Accuracy after thresholding with 20.00% confidence: 99.95899%


#### Label Map

In [None]:
label_map = joblib.load(os.path.join(save_path, label_map_file)) 

In [259]:
label_map

{0: '000001',
 1: '000002',
 2: '000003',
 3: '000004',
 4: '000005',
 5: '000006',
 6: '000007',
 7: '000008',
 8: '000009',
 9: '000010',
 10: '000011',
 11: '000012',
 12: '000013',
 13: '000014',
 14: '000015',
 15: '000016',
 16: '000017',
 17: '000019',
 18: '000020',
 19: '000021',
 20: '000022',
 21: '000023',
 22: '000024',
 23: '000025',
 24: '000026',
 25: '000027',
 26: '000028',
 27: '000029',
 28: '000030',
 29: '000031',
 30: '000032',
 31: '000033',
 32: '000034',
 33: '000035',
 34: '000036',
 35: '000037',
 36: '000038',
 37: '000039',
 38: '000040',
 39: '000041',
 40: '000042',
 41: '000043',
 42: '000044',
 43: '000045',
 44: '000046',
 45: '000047',
 46: '000048',
 47: '000049',
 48: '000050',
 49: '000051',
 50: '000052',
 51: '000053',
 52: '000054',
 53: '000055',
 54: '000056',
 55: '000057',
 56: '000058',
 57: '000059',
 58: '000060',
 59: '000061',
 60: '000062',
 61: '000063',
 62: '000064',
 63: '000065',
 64: '000066',
 65: '000067',
 66: '000068',
 67: 