### Importing the necessary modules

Note: For the successful loading of all the modules, it is necessary that LightGBM is installed in the current python environment.

The assumed directory structure is,
1. Current dir: ../
2. Data dir: ../data/ contains pose/train, pose/test/, labels.csv, files.txt
3. Code dir: ../code/
4. Execution file dir: ../code/Sign_Language_Recognition_Ashutosh_Vyas_01601649.ipynb
5. Utilities dir: ../code/util/ contains helpers.py, vis.py, stratified_group_cv.py, results_plots_evaluation.py
6. Feature processing dir: ../code/feature_engineering contains data_augmentation.py, feature_extractors_4D_array.py, feature_preprocessing.py, features_4D_array.py

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from os.path import join as pjoin
import util.vis as V
import util.helpers as H
import data_analysis

import csv
import random
import gc
from glob import glob
import sklearn as sk
from sklearn import preprocessing

import feature_engineering.feature_preprocessing as feat_prepro
import feature_engineering.feature_extractors_4D_array as feat_extract
from feature_engineering.data_augmentation import SLRImbAugmentation
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GroupKFold
from util.stratified_group_cv import StratifiedGroupKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from util.results_plots_evaluation import map3_scorer
import util.results_plots_evaluation as results
from sklearn.metrics import accuracy_score
import util.helpers as kaggle_submission

from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

print("Imports done...")



### Initialize feature extraction flags and data paths

In [None]:
# Raise on every NumPy floating-point error category; all='raise' on its
# own already covers divide, over, under and invalid.
np.seterr(all='raise')
rng = np.random.RandomState(42)
startTime = datetime.now()  # run start, used for the elapsed-time report

# Number of cross-validation splits, and the frame count every sample is
# interpolated to.
n_splits = 5
interpolated_total_frames = 15

# Feature-extraction switches passed to the feature extractor.
# Trailing notes record the alternative setting used for feature set 1.
face_flag = True  # False for feature set 1
body_flag = True  # False
hand_flag = True  # False
physics_flag = True
trajectory_flag = True  # False
linear_flag = True
angular_flag = False
std_flag = False
velocity_flag = False
acceleration_flag = False
remove_keypoints = True  # False
save_plot = False

# Indices (out of the 137 keypoints) dropped when remove_keypoints is True.
# The current selection targets lower-body and many face keypoints.
# Written as inclusive (first, last) runs and expanded into a flat list.
_unwanted_ranges = ((10, 11), (13, 32), (34, 51), (53, 72), (85, 94))
unwanted_keypoints = [
    index
    for first, last in _unwanted_ranges
    for index in range(first, last + 1)
]

# Locations the data is loaded from
DATA_DIR = '../data'
POSE_DIR = '../data/pose'
TRAIN_DIR = "/".join((POSE_DIR, "train"))
TEST_DIR = "/".join((POSE_DIR, "test"))


### Load the data and interpolate

In [None]:

def _load_train_pose(title):
    """Load the array stored as <TRAIN_DIR>/<title>.npy."""
    return np.load(pjoin(TRAIN_DIR, title + ".npy"))


# labels.csv is read into a DataFrame; a 'Data' column is then added that
# holds the array loaded for each row's 'File' entry.
full_dataframe = pd.read_csv(pjoin(DATA_DIR, "labels.csv"))
full_dataframe['Data'] = full_dataframe['File'].apply(_load_train_pose)
print(full_dataframe.head())

# Interpolate all samples to the configured number of frames.
# 4D data as (n_samples, n_frames, n_keypoints, n_coords)
samples_centered_4D_array = feat_prepro.interpolate_allsamples(
    full_dataframe['Data'],
    interpolated_total_frames=interpolated_total_frames,
    x_resolution=1.0,
    y_resolution=1.0,
)

print("\nInterpolated training data shape", samples_centered_4D_array.shape)


### Train-test split if necessary

### Data augmentation

### Extract features

Extract features as per above enabled flags

In [None]:
### Use this block when only cross-validating on the original data.
### It is NOT meant for the train-test split / augmentation workflow,
### which needs its own extraction step.
print("\nExtracting features for training data")
X_train = feat_extract.main_feature_extractor(
    array_4D_data=samples_centered_4D_array,
    face=face_flag,
    body=body_flag,
    hands=hand_flag,
    physics=physics_flag,
    trajectory=trajectory_flag,
    linear_flag=linear_flag,
    angular_flag=angular_flag,
    std_flag=std_flag,
    velocity_flag=velocity_flag,
    acceleration_flag=acceleration_flag,
    remove_keypoints=remove_keypoints,
    unwanted_keypoints=unwanted_keypoints,
)

# Targets come from the 'Label' column; the 'Person' column is kept as a
# per-sample group array.
y_train = np.asarray(full_dataframe['Label'])
group_train = np.asarray(full_dataframe['Person'])
print("Training shape", X_train.shape)
print("NAN values:", np.isnan(X_train).sum())

### Initialize objects for Scaling, PCA, Cross-validation, Feature selection

In [None]:
### Scaling step (swap in the commented RobustScaler to try the alternative)
scl = StandardScaler()
# scl = RobustScaler()

### Optional dimensionality reduction (disabled)
# pca_obj = PCA(n_components=0.95, random_state=42)

### Cross-validation splitter; the commented lines are the alternatives
# cvld = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2, train_size=None, random_state=42)
# cvld = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
cvld = GroupKFold(n_splits=n_splits)

### Feature selection: a variance threshold of 0.0
feature_selector = VarianceThreshold(threshold=0.0)
# feature_selector = SelectKBest(k=int(0.5*X_train.shape[1]))

### Run a garbage-collection pass before training starts
gc.collect()

### Initialize one estimator

Note: The hyper-parameters are set and initialized as per the GridSearchCV tuning.

In [None]:
### Estimator selection
### Exactly one `estimator = ...` assignment should be active at a time;
### the commented lines are the other tuned configurations.

# estimator = LogisticRegression(C=0.275, tol=1e-4, max_iter=5000, penalty='l2', class_weight=None, multi_class='ovr', random_state=42, n_jobs=-1)

# estimator = SVC(C=6.5, decision_function_shape='ovo', kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, break_ties=False, random_state=42)

# estimator = SVC(C=6.5, decision_function_shape='ovr', kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, break_ties=False, random_state=42)

# estimator = SVC(C=0.0775, decision_function_shape='ovo', kernel='linear', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, break_ties=False, random_state=42)

# estimator = SVC(C=0.0775, decision_function_shape='ovr', kernel='linear', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, break_ties=False, random_state=42)

# estimator = RandomForestClassifier(n_estimators=130, max_depth=16, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=150, bootstrap=True, criterion='entropy', min_weight_fraction_leaf=0.0, min_impurity_decrease=0.0, min_impurity_split=None, oob_score=True, n_jobs=-1, random_state=42, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

# estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=15, max_depth=15, learning_rate=0.525, n_estimators=102, objective='multiclass', min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, importance_type='gain', subsample_for_bin=200000, class_weight=None, random_state=42, n_jobs=-1, silent=True)

# estimator =  GaussianNB(priors=None, var_smoothing=110)

########### Below are the estimators for body+mean features
# Active choice: L2-penalised one-vs-rest logistic regression.
estimator = LogisticRegression(
    C=0.9,
    tol=1e-4,
    max_iter=5000,
    penalty='l2',
    class_weight=None,
    multi_class='ovr',
    random_state=42,
    n_jobs=-1,
)
# estimator = LGBMClassifier(boosting_type='gbdt', num_leaves=18, max_depth=5, learning_rate=0.525, n_estimators=170, objective='multiclass', min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, importance_type='gain', subsample_for_bin=200000, class_weight=None, random_state=42, n_jobs=-1, silent=True)

# print() converts each argument with str(), so the estimator is passed as-is.
print("\nTraining the model", estimator)

### Initialize the pipeline object with above configurations

In [None]:
# Alternative with PCA instead of feature selection:
# pipe = Pipeline([('scale', scl), ('reduce_dims', pca_obj), ('clf', estimator)])

# Active pipeline: feature selection -> scaling -> classifier. The step
# name 'clf' is what the 'clf__' prefix in the parameter grid refers to.
pipe = Pipeline(
    steps=[
        ('selection', feature_selector),
        ('scale', scl),
        ('clf', estimator),
    ]
)

### Initialize and perform hyperparameter tuning with grid search

Note: The GridSearchCV object is currently initialized for Logistic Regression. Refer to the Appendix at the end of this file for the parameter grids of the other estimators.

In [None]:
### Grid search with cross-validation over the pipeline
# Keys use the '<step>__<param>' form; 'clf' is the classifier step.
param_grid = {'clf__C': [0.85, 0.9, 0.95]}  # Refer Appendix

print("Running GSCV.....")
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=cvld,
    n_jobs=-1,
    verbose=100,
    scoring=map3_scorer,
)
# group_train is forwarded to the splitter through `groups`.
grid.fit(X_train, y_train, groups=group_train)
print(grid.best_params_, grid.best_estimator_, grid.best_score_, sep="\n")

# The best estimator found by the search is the one used for the submission.
pipe_submission = grid.best_estimator_

# NOTE(review): the training data is passed in both data/label positions, so
# if the helper treats the second pair as a validation set, map3_vld is not
# a held-out score -- confirm against predict_print_results.
map3_trn, map3_vld = results.predict_print_results(
    pipe_submission, X_train, X_train, y_train, y_train
)



### Testing and Kaggle submission

Note: Currently testing is performed with GridSearchCV best estimator.

In [None]:
### Build the test-set features, predict class probabilities and write the
### submission file with the provided helper.

# Test files are processed in sorted path order.
all_test_files = sorted(glob(pjoin(TEST_DIR, '*.npy')))
test_samples = [np.load(numpy_file) for numpy_file in all_test_files]

# Same interpolation settings as used for the training data.
samples_centered_4D_array_test = feat_prepro.interpolate_allsamples(
    test_samples,
    interpolated_total_frames=interpolated_total_frames,
    x_resolution=1.0,
    y_resolution=1.0,
)

print("Interpolated test data shape", samples_centered_4D_array_test.shape)
print("\nExtracting features for testing data")
# Feature flags are identical to the training extraction call.
X_test = feat_extract.main_feature_extractor(
    array_4D_data=samples_centered_4D_array_test,
    face=face_flag,
    body=body_flag,
    hands=hand_flag,
    physics=physics_flag,
    trajectory=trajectory_flag,
    linear_flag=linear_flag,
    angular_flag=angular_flag,
    std_flag=std_flag,
    velocity_flag=velocity_flag,
    acceleration_flag=acceleration_flag,
    remove_keypoints=remove_keypoints,
    unwanted_keypoints=unwanted_keypoints,
)
print("Test shape", X_test.shape)
print("NAN values:", np.isnan(X_test).sum())

test_probas = pipe_submission.predict_proba(X_test)
# NOTE(review): the file stem mentions LightGBM -- confirm it matches the
# estimator that was actually selected before submitting.
fname_txt = 'main10_LightGBM_body_phy_traj'  # 'main1_logreg'
H.create_submission(test_probas, '{txt}.csv'.format(txt=fname_txt))

print("\nKaggle submission {txt}.csv generated. Check the current directory.".format(txt=fname_txt))
print("\n~~~~~#####      Done     #####~~~~~\n")

# Total wall-clock time since startTime was recorded.
timeElapsed = datetime.now() - startTime
print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

## Appendix
### Parameter grid for various estimators

Feature set 1: mean value of x-y-c over 15 frames i.e. 137*3 = 411 features

Feature set 2: Removing keypoints. (body+face+hands) + mean of x-y-c + trajectory = 14+3+16 + 216 + 8 = 256 features

#### Logistic Regression
Feature set 1: param_grid = dict(clf__C=[0.25, 0.275, 0.30])
Feature set 2: param_grid = dict(clf__C=[0.85, 0.875, 0.9, 0.925, 0.95]) 

#### Support Vector Classifier 4 models
1. With RBF kernel and OneVsOne decision function: param_grid = dict(clf__C=[5.75, 6.0, 6.25, 6.5, 6.75, 7.0], clf__gamma=['scale', 'auto'])
2. With RBF kernel and OneVsRest decision function: param_grid = dict(clf__C=[5.75, 6.0, 6.25, 6.5, 6.75, 7.0], clf__gamma=['scale', 'auto'])
3. With Linear kernel and OneVsOne decision function: param_grid = dict(clf__C=[0.075, 0.0775, 0.08, 0.0825, 0.085], clf__gamma=['scale', 'auto'])
4. With Linear kernel and OneVsRest decision function: param_grid = dict(clf__C=[0.075, 0.0775, 0.08, 0.0825, 0.085], clf__gamma=['scale', 'auto'])

#### Random Forest
Note: Please tune only one to three parameters at a time to reduce the computation time.

param_grid = dict(clf__n_estimators=[128, 130, 132, 134, 136], clf__max_depth=[12, 14, 16, 18], clf__max_features=['log2', 'auto', None], clf__max_leaf_nodes=[130, 140, 150, 160, None], clf__bootstrap=[True, False], clf__criterion=['gini', 'entropy'], clf__max_samples=[0.2, 0.4, 0.6, 0.8, None])


#### Light Gradient Boosting Machine - LightGBM
Note: Please tune only one to three parameters at a time to reduce the computation time.

Feature set 1: param_grid = dict(clf__num_leaves=[11, 13, 15, 17, 19], clf__max_depth=[11, 12, 13, 14, 15, 16], clf__learning_rate=[0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65], clf__n_estimators=[101, 102, 103, 104, 105], clf__importance_type=['split', 'gain'])

Feature set 2: param_grid = dict(clf__num_leaves=[16, 18, 20], clf__max_depth=[4, 5, 6, 7], clf__learning_rate=[0.5, 0.525, 0.55], clf__n_estimators=[140, 150, 160, 170, 180])


#### Gaussian Naive Bayes
param_grid = dict(clf__var_smoothing=[90, 95, 100, 105, 110, 115, 120])