#### Imports

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
%env JOBLIB_TEMP_FOLDER=data/antispoofing/tmp
import operator
from multiprocessing import Pool, TimeoutError
import glob
import os
import cv2

import numpy as np
from tqdm import tqdm_notebook

env: JOBLIB_TEMP_FOLDER=data/antispoofing/tmp


In [3]:
from matplotlib import pyplot as plt

In [26]:
from sklearn.model_selection import StratifiedKFold

In [46]:
from sklearn.metrics import roc_auc_score

In [27]:
import lightgbm as lgb

In [35]:
import gc

#### Paths

In [4]:
PATH = "data/antispoofing/"

In [5]:
TEST_PATH = "data/antispoofing/test/"

In [6]:
TRN_REAL_PATH = os.path.join(PATH, 'train/real')
TRN_SPOOF_PATH = os.path.join(PATH, 'train/spoof')
VAL_REAL_PATH = os.path.join(PATH, 'valid/real')
VAL_SPOOF_PATH = os.path.join(PATH, 'valid/spoof')

#### Const

In [7]:
NEW_WIDTH = 480
NEW_HEIGHT = 480
POOL_THREADS = 16

#### Workflow 

##### Funcs

In [13]:
# Feature extractor
def extract_feature_vector(image, vector_size=32):
    try:
        # Using KAZE, cause SIFT, ORB and other was moved to additional module
        # which is adding addtional pain during install
        alg = cv2.KAZE_create()
        # Dinding image keypoints
        kps = alg.detect(image)
        # Getting first 32 of them.
        # Number of keypoints is varies depend on image size and color pallet
        # Sorting them based on keypoint response value(bigger is better)
        kps = sorted(kps, key=lambda x: -x.response)[:vector_size]
        # computing descriptors vector
        kps, dsc = alg.compute(image, kps)
        # Flatten all of them in one big vector - our feature vector
        if dsc is not None:
            dsc = dsc.flatten()
            # Making descriptor of same size
            # Descriptor vector size is 64
            needed_size = (vector_size * 64)
            if dsc.size < needed_size:
                # if we have less the 32 descriptors then just adding zeros at the
                # end of our feature vector
                dsc = np.concatenate([dsc, np.zeros(needed_size - dsc.size)])
        else:
            needed_size = (vector_size * 64)
            dsc = np.zeros(needed_size)
    except cv2.error as e:
        print( 'Error: ', e)
        return None

    return dsc

In [14]:
def process_single(file):
    image = cv2.imread(file)
    h, w, c = image.shape
    dw, dh = w // 4, h // 4
    return extract_feature_vector(cv2.resize(image[dh:h-dh, dw:w-dw, :], (NEW_WIDTH, NEW_HEIGHT)))

In [15]:
def extract_features(filelist):
    def __impl(files):
        pool = Pool(POOL_THREADS) 
        feats = list(tqdm_notebook(pool.imap(process_single, files), total=len(files)))        
        return feats
    feature_list = __impl(filelist) 
    return [x for x in feature_list if x is not None]

##### Load data and extract features

In [21]:
# load valid
val_real_filelist  = glob.glob(os.path.join(VAL_REAL_PATH, '*.png'))
val_spoof_filelist = glob.glob(os.path.join(VAL_SPOOF_PATH, '*.png'))

val_real_features = extract_features(val_real_filelist)

HBox(children=(IntProgress(value=0, max=373), HTML(value='')))




In [22]:
val_spoof_features = extract_features(val_spoof_filelist)

val_data = val_real_features + val_spoof_features
val_labels = [0] * len(val_real_features) + [1] * len(val_spoof_features)

HBox(children=(IntProgress(value=0, max=632), HTML(value='')))




In [23]:
len(val_real_features), len(val_real_filelist)

(373, 373)

In [24]:
len(val_spoof_features), len(val_spoof_filelist)

(632, 632)

In [18]:
# load train
train_real_filelist  = glob.glob(os.path.join(TRN_REAL_PATH, '*.png'))
train_real_features = extract_features(train_real_filelist)

HBox(children=(IntProgress(value=0, max=1223), HTML(value='')))




In [19]:
len(train_real_features), len(train_real_filelist)

(1223, 1223)

In [16]:
train_spoof_filelist = glob.glob(os.path.join(TRN_SPOOF_PATH, '*.png'))
train_spoof_features = extract_features(train_spoof_filelist)

HBox(children=(IntProgress(value=0, max=7076), HTML(value='')))




In [17]:
len(train_spoof_features), len(train_spoof_filelist)

(7076, 7076)

In [20]:
train_data = train_real_features + train_spoof_features
labels = [0] * len(train_real_features) + [1] * len(train_spoof_features)

#### Modeling 

In [28]:
n_folds = 5

In [29]:
# Convert to np arrays
features = np.array(train_data)
val_features = np.array(val_data)

# Create the kfold object
k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=50)

In [34]:
len(labels), features.shape

(8299, (8299, 2048))

In [37]:
labels = np.array(labels)
val_labels = np.array(val_labels)

In [38]:
# Lists for recording validation and training scores
valid_scores = []
train_scores = []
models = []
best_iters = []

# Iterate through each fold
for train_indices, valid_indices in k_fold.split(features, labels):

    # Training data for the fold
    train_features, train_labels = features[train_indices], labels[train_indices]
    # Validation data for the fold
    valid_features, valid_labels = features[valid_indices], labels[valid_indices]

    # Create the model
    model = lgb.LGBMClassifier(n_estimators=10000, objective='binary',
                               class_weight='balanced', learning_rate=0.05,
                               reg_alpha=0.1, reg_lambda=0.1,
                               subsample=0.8, n_jobs=-1,
                               random_state=50)

    # Train the model
    model.fit(train_features, train_labels, eval_metric='auc',
              eval_set=[(valid_features, valid_labels),
                        (train_features, train_labels)],
              eval_names=['valid', 'train'],
              early_stopping_rounds=100, verbose=200)

    # Record the best iteration
    best_iters.append(model.best_iteration_)

    # Record the best score
    valid_score = model.best_score_['valid']['auc']
    train_score = model.best_score_['train']['auc']

    valid_scores.append(valid_score)
    train_scores.append(train_score)
    
    models.append(model)

    # Clean up memory
    gc.enable()
    del model, train_features, valid_features
    gc.collect()

Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.727431	train's auc: 0.999995
Early stopping, best iteration is:
[234]	valid's auc: 0.733263	train's auc: 0.999997
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.717646	train's auc: 0.999993
[400]	valid's auc: 0.721102	train's auc: 0.999996
Early stopping, best iteration is:
[314]	valid's auc: 0.719308	train's auc: 0.999996
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.737835	train's auc: 0.999994
Early stopping, best iteration is:
[177]	valid's auc: 0.739087	train's auc: 0.999995
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.721624	train's auc: 0.999997
Early stopping, best iteration is:
[151]	valid's auc: 0.718905	train's auc: 0.999997
Training until validation scores don't improve for 100 rounds.
[200]	valid's auc: 0.725821	train's auc: 0.999995
Early stopping, best iteration is:
[239]	vali

In [41]:
opt_model = models[2]
best_it = best_iters[2]

In [44]:
valid_preds = opt_model.predict_proba(val_features, num_iteration=best_it)[:, 1]

In [47]:
roc_auc_score(val_labels, valid_preds)

0.5782527912580174