In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from sklearn.utils import shuffle
import sklearn 
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import matplotlib.style as style 
style.use('seaborn-dark')

In [43]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below (deprecations, numeric overflow, etc.) will not be shown.
import warnings
warnings.filterwarnings('ignore')

In [112]:
# Read the three CSV inputs from the current working directory.
# X_train is the per-measurement frame that later cells group by 'series_id';
# y_train supplies the 'surface' labels and 'group_id' values used for
# cross-validation. X_test is loaded but not used in the visible cells.
X_train = pd.read_csv("./X_train.csv")
y_train = pd.read_csv("./y_train.csv")
X_test = pd.read_csv('./X_test.csv')

In [67]:
# Copy of the raw frame without the four quaternion orientation columns.
orientation_columns = ['orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W']
X_train_wo = X_train.drop(columns=orientation_columns)

In [48]:
#feature Extraction
def change1(x):
    """Return the mean absolute difference between consecutive values of ``x``."""
    steps = np.diff(x)
    step_sizes = np.abs(steps)
    return np.mean(step_sizes)

def change2(x):
    """Return the mean change between consecutive absolute first differences of ``x``."""
    step_sizes = np.abs(np.diff(x))
    # Second-level difference: how much the step size itself changes.
    step_size_changes = np.diff(step_sizes)
    return np.mean(step_size_changes)

def feature_extraction(df):
    """Aggregate each signal column of ``df`` into per-series summary statistics.

    Every column from the fourth onward (``df.columns[3:]``) is treated as a
    signal and summarised per ``series_id``.
    NOTE(review): the positional skip presumably covers row_id, series_id and
    measurement_number -- confirm the column order of the frame passed in,
    because any extra columns appended to it are summarised as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``series_id`` column and one row per measurement.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``series_id`` with seven columns per signal:
        ``_mean``, ``_std``, ``_max``, ``_min``, ``_max_to_min``,
        ``_mean_abs_change`` and ``_mean_abs_change2``.

    Notes
    -----
    ``_max_to_min`` is a plain ratio, so it is infinite when a series minimum
    is exactly 0 and negative when max and min have opposite signs.
    """
    feat = pd.DataFrame()
    # Group once and reuse the grouping; it used to be rebuilt for every
    # statistic of every column.
    by_series = df.groupby('series_id')
    for col in df.columns[3:]:
        signal = by_series[col]
        feat[col + '_mean'] = signal.mean()
        feat[col + '_std'] = signal.std()
        feat[col + '_max'] = signal.max()
        feat[col + '_min'] = signal.min()
        feat[col + '_max_to_min'] = feat[col + '_max'] / feat[col + '_min']
        feat[col + '_mean_abs_change'] = signal.apply(change1)
        feat[col + '_mean_abs_change2'] = signal.apply(change2)

    return feat
    
    

In [53]:
def group_kfold(train_df,y,folds):
    """Split ``train_df`` into ``folds`` grouped folds and return the index arrays.

    Groups are taken from ``y['group_id']``, so rows sharing a group id never
    appear on both sides of a split.

    Returns a pair of lists ``(train_indices, test_indices)``; element ``i`` of
    each list holds the positional row indices of fold ``i``.
    """
    splitter = GroupKFold(n_splits=folds)
    splits = list(splitter.split(train_df, groups=y['group_id'].values))
    train_indices = [fit_rows for fit_rows, _ in splits]
    test_indices = [held_out_rows for _, held_out_rows in splits]
    return train_indices, test_indices



In [6]:
#Machine Learning Algorithm (MLA) Selection and Initialization
def MLA_selection(X_train, y_train, folds):
    """Benchmark a panel of default-configured classifiers with grouped cross-validation.

    Time-domain features are extracted from ``X_train`` with
    ``feature_extraction``, the rows are split with ``group_kfold`` and every
    classifier is fitted and scored on each fold. Features are standardised
    per fold with a scaler fitted on the training part only.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Raw measurement frame accepted by ``feature_extraction``.
    y_train : pandas.DataFrame
        Must provide the ``surface`` (label) and ``group_id`` (CV group)
        columns. NOTE(review): rows are assumed to be in the same order as the
        rows of the extracted feature frame -- confirm.
    folds : int
        Number of grouped folds. Every fold produced is evaluated (the loop
        used to be hard-coded to five folds regardless of this argument).

    Returns
    -------
    pandas.DataFrame
        One row per classifier with its name, parameters, per-fold and mean
        train/test accuracy and the test-accuracy standard deviation, sorted
        by mean test accuracy in descending order.
    """
    Train = feature_extraction(X_train)
    t1, t2 = group_kfold(Train, y_train, folds)
    le = LabelEncoder()
    # Keep the encoded labels one-dimensional: estimators expect shape
    # (n_samples,), and a single-column DataFrame triggers a conversion warning.
    target = le.fit_transform(y_train['surface'])
    MLA = [
        #Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        #Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        #GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        #Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        #Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        #SVM
        svm.SVC(),
        svm.LinearSVC(),

        #Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        #Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        XGBClassifier()
        ]

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy','MLA Train Accuracy Mean', 'MLA Test Accuracy', 'MLA Test Accuracy Mean','MLA Test Accuracy Std' ]
    # Collect one record per classifier and build the frame once at the end,
    # instead of growing it cell by cell with list-valued .loc assignments.
    records = []
    for alg in MLA:
        test = []
        train = []
        # Iterate over the folds that were actually generated.
        for fit_rows, held_out_rows in tqdm(list(zip(t1, t2))):
            y_fit = target[fit_rows]
            y_held_out = target[held_out_rows]
            #Scaling: statistics come from the training part of the fold only
            scaler = StandardScaler()
            X_fit = scaler.fit_transform(Train.iloc[fit_rows, :])
            X_held_out = scaler.transform(Train.iloc[held_out_rows, :])

            alg.fit(X_fit, y_fit)
            test.append(metrics.accuracy_score(y_held_out, alg.predict(X_held_out)))
            train.append(metrics.accuracy_score(y_fit, alg.predict(X_fit)))

        records.append({
            'MLA Name': alg.__class__.__name__,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Accuracy': train,
            'MLA Train Accuracy Mean': np.mean(train),
            'MLA Test Accuracy': test,
            'MLA Test Accuracy Mean': np.mean(test),
            'MLA Test Accuracy Std': np.std(test),
        })

    MLA_compare = pd.DataFrame(records, columns = MLA_columns)
    # The row labels still identify each classifier's position in MLA.
    return MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)

In [96]:
# Benchmark the classifiers with 5 grouped folds, passing the raw measurement frame.
# NOTE(review): MLA_selection is redefined further down with a different first
# argument (a precomputed feature frame); this call needs the variant that
# runs feature_extraction itself, so it depends on cell execution order.
MLA_selection(X_train, y_train, 5)

100%|██████████| 5/5 [00:09<00:00,  1.88s/it]
100%|██████████| 5/5 [00:09<00:00,  1.92s/it]
100%|██████████| 5/5 [00:03<00:00,  1.59it/s]
100%|██████████| 5/5 [03:42<00:00, 45.62s/it]
100%|██████████| 5/5 [00:06<00:00,  1.31s/it]
100%|██████████| 5/5 [04:56<00:00, 59.10s/it]
100%|██████████| 5/5 [00:16<00:00,  3.35s/it]
100%|██████████| 5/5 [00:00<00:00, 10.21it/s]
100%|██████████| 5/5 [00:00<00:00, 37.82it/s]
100%|██████████| 5/5 [00:01<00:00,  4.67it/s]
100%|██████████| 5/5 [00:00<00:00, 17.54it/s]
100%|██████████| 5/5 [00:00<00:00, 52.40it/s]
100%|██████████| 5/5 [00:00<00:00, 46.73it/s]
100%|██████████| 5/5 [00:01<00:00,  3.19it/s]
100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
100%|██████████| 5/5 [00:10<00:00,  2.13s/it]
100%|██████████| 5/5 [00:00<00:00,  5.06it/s]
100%|██████████| 5/5 [00:00<00:00, 61.27it/s]
100%|██████████| 5/5 [00:00<00:00,  9.98it/s]
100%|██████████| 5/5 [00:00<00:00, 24.07it/s]
  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:05<00:20,  5.02s/it]



 40%|████      | 2/5 [00:09<00:14,  4.78s/it]



 60%|██████    | 3/5 [00:13<00:09,  4.62s/it]



 80%|████████  | 4/5 [00:17<00:04,  4.53s/it]



100%|██████████| 5/5 [00:22<00:00,  4.46s/it]


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9973761889143982, 0.9908136482939632, 0.996...",0.993635,"[0.48751642575558474, 0.37139107611548555, 0.4...",0.476381,0.0558823
20,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.49934296977660975, 0.37270341207349084, 0.4...",0.471669,0.0576054
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4980289093298292, 0.34908136482939633, 0.42...",0.469048,0.0722208
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.5256241787122208, 0.3320209973753281, 0.424...",0.460394,0.0749429
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.9931124959002952, 0.9957349081364829, 0.994...",0.993635,"[0.507227332457293, 0.30708661417322836, 0.385...",0.431791,0.0754181
16,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.43626806833114323, 0.3320209973753281, 0.39...",0.407885,0.0426353
17,ExtraTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4152431011826544, 0.2874015748031496, 0.363...",0.394239,0.0627825
15,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.6188914398163332, 0.6551837270341208, 0.668...",0.646394,"[0.4336399474375821, 0.34120734908136485, 0.30...",0.393211,0.0606288
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.6448015742866514, 0.6850393700787402, 0.689...",0.670541,"[0.43889618922470436, 0.3451443569553806, 0.30...",0.391637,0.0625872
18,LinearDiscriminantAnalysis,"{'covariance_estimator': None, 'n_components':...","[0.5998688094457199, 0.6322178477690289, 0.641...",0.617457,"[0.43889618922470436, 0.35170603674540685, 0.3...",0.382187,0.0533244


### Frequency-level feature engineering

In [96]:
# Load the frequency-domain feature table from the working directory.
# NOTE(review): the code that produced fft_train.csv is not part of the
# visible notebook, so this cell cannot be reproduced from here.
X_train_fft = pd.read_csv("./fft_train.csv")

In [98]:
# Display the loaded frame (rendered as the cell output).
X_train_fft

Unnamed: 0,series_id,group_id,surface,offt_X_sum,offt_X_mean,offt_X_std,offt_Y_sum,offt_Y_mean,offt_Y_std,offt_Z_sum,...,afft_Z_std,lfft_X_sum,lfft_X_mean,lfft_X_std,lfft_Y_sum,lfft_Y_mean,lfft_Y_std,lfft_Z_sum,lfft_Z_mean,lfft_Z_std
0,0,13,fine_concrete,138.411875,2.129413,17.015342,128.354633,1.974687,15.772233,143.353244,...,1.911020,208.101388,3.201560,4.551770,210.553031,3.239277,6.528539,193.882079,2.982801,3.112246
1,1,31,concrete,175.641478,2.702177,21.609155,30.394924,0.467614,3.709957,24.666889,...,3.438063,339.173735,5.218057,4.077873,352.890366,5.429083,8.141335,358.530355,5.515852,6.262698
2,2,20,concrete,92.939858,1.429844,11.348448,166.902564,2.567732,20.491226,172.668360,...,3.288737,210.733768,3.242058,3.156317,296.042565,4.554501,8.048390,213.244111,3.280679,3.057722
3,3,31,concrete,172.041805,2.646797,21.162528,42.713170,0.657126,5.228671,33.006995,...,2.183239,436.493436,6.715284,5.720067,652.301794,10.035412,19.599177,558.336689,8.589795,10.646387
4,4,22,soft_tiles,164.526126,2.531171,20.060062,66.854904,1.028537,7.848612,60.748163,...,8.572508,110.328803,1.697366,2.703601,149.786263,2.304404,5.525958,126.019010,1.938754,2.539218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3805,3805,55,tiled,39.857107,0.613186,4.840717,160.375819,2.467320,19.722903,161.359143,...,2.345485,528.359219,8.128603,8.831259,479.566722,7.377950,5.827256,654.512578,10.069424,14.163133
3806,3806,67,wood,105.203384,1.618514,12.871995,136.212221,2.095573,16.716340,142.285122,...,1.278774,7.469435,0.114914,0.170451,6.131051,0.094324,0.152862,3.759598,0.057840,0.047775
3807,3807,48,fine_concrete,81.949549,1.260762,9.142543,151.572043,2.331878,18.240972,161.509640,...,22.155936,842.756969,12.965492,14.997013,1136.168825,17.479520,17.484220,935.049902,14.385383,16.120517
3808,3808,54,tiled,48.204868,0.741613,5.660261,159.209130,2.449371,19.521916,159.038106,...,8.029539,640.708490,9.857054,11.892794,602.867188,9.274880,18.195620,527.884916,8.121306,9.387676


In [95]:
# Keep only the feature columns: drop the identifiers and the label.
non_feature_columns = ['series_id', 'group_id','surface' ]
Train = X_train_fft.drop(columns=non_feature_columns)

In [14]:
#Machine Learning Algorithm (MLA) Selection and Initialization
def MLA_selection(Train, y_train, folds):
    """Benchmark a panel of default-configured classifiers with grouped cross-validation.

    ``Train`` is used as the feature matrix as-is (no feature extraction is
    done here). The rows are split with ``group_kfold`` and every classifier
    is fitted and scored on each fold. Features are standardised per fold with
    a scaler fitted on the training part only.

    Parameters
    ----------
    Train : pandas.DataFrame
        Precomputed feature frame, one row per sample to classify.
    y_train : pandas.DataFrame
        Must provide the ``surface`` (label) and ``group_id`` (CV group)
        columns. NOTE(review): rows are assumed to be in the same order as the
        rows of ``Train`` -- confirm.
    folds : int
        Number of grouped folds. Every fold produced is evaluated (the loop
        used to be hard-coded to five folds regardless of this argument).

    Returns
    -------
    pandas.DataFrame
        One row per classifier with its name, parameters, per-fold and mean
        train/test accuracy and the test-accuracy standard deviation, sorted
        by mean test accuracy in descending order.
    """
    t1, t2 = group_kfold(Train, y_train, folds)
    le = LabelEncoder()
    # Keep the encoded labels one-dimensional: estimators expect shape
    # (n_samples,), and a single-column DataFrame triggers a conversion warning.
    target = le.fit_transform(y_train['surface'])
    MLA = [
        #Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        #Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        #GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        #Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        #Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        #SVM
        svm.SVC(),
        svm.LinearSVC(),

        #Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        #Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        XGBClassifier()
        ]

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy','MLA Train Accuracy Mean', 'MLA Test Accuracy', 'MLA Test Accuracy Mean','MLA Test Accuracy Std' ]
    # Collect one record per classifier and build the frame once at the end,
    # instead of growing it cell by cell with list-valued .loc assignments.
    records = []
    for alg in MLA:
        test = []
        train = []
        # Iterate over the folds that were actually generated.
        for fit_rows, held_out_rows in tqdm(list(zip(t1, t2))):
            y_fit = target[fit_rows]
            y_held_out = target[held_out_rows]
            #Scaling: statistics come from the training part of the fold only
            scaler = StandardScaler()
            X_fit = scaler.fit_transform(Train.iloc[fit_rows, :])
            X_held_out = scaler.transform(Train.iloc[held_out_rows, :])

            alg.fit(X_fit, y_fit)
            test.append(metrics.accuracy_score(y_held_out, alg.predict(X_held_out)))
            train.append(metrics.accuracy_score(y_fit, alg.predict(X_fit)))

        records.append({
            'MLA Name': alg.__class__.__name__,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Accuracy': train,
            'MLA Train Accuracy Mean': np.mean(train),
            'MLA Test Accuracy': test,
            'MLA Test Accuracy Mean': np.mean(test),
            'MLA Test Accuracy Std': np.std(test),
        })

    MLA_compare = pd.DataFrame(records, columns = MLA_columns)
    # The row labels still identify each classifier's position in MLA.
    return MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False)

In [15]:
# Benchmark the classifiers with 5 grouped folds on whatever frame `Train`
# currently holds. NOTE(review): `Train` is rebound by several cells, so the
# result depends on which of them ran last.
MLA_selection(Train, y_train, 5)

100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
100%|██████████| 5/5 [00:03<00:00,  1.30it/s]
100%|██████████| 5/5 [00:02<00:00,  1.87it/s]
100%|██████████| 5/5 [02:34<00:00, 30.76s/it]
100%|██████████| 5/5 [00:07<00:00,  1.52s/it]
100%|██████████| 5/5 [07:52<00:00, 94.92s/it] 
100%|██████████| 5/5 [00:29<00:00,  5.96s/it]
100%|██████████| 5/5 [00:00<00:00,  9.84it/s]
100%|██████████| 5/5 [00:01<00:00,  4.82it/s]
100%|██████████| 5/5 [00:01<00:00,  3.63it/s]
100%|██████████| 5/5 [00:00<00:00, 12.24it/s]
100%|██████████| 5/5 [00:00<00:00, 34.83it/s]
100%|██████████| 5/5 [00:00<00:00, 32.73it/s]
100%|██████████| 5/5 [00:02<00:00,  1.97it/s]
100%|██████████| 5/5 [00:05<00:00,  1.20s/it]
100%|██████████| 5/5 [00:12<00:00,  2.43s/it]
100%|██████████| 5/5 [00:00<00:00,  6.92it/s]
100%|██████████| 5/5 [00:00<00:00, 47.53it/s]
100%|██████████| 5/5 [00:00<00:00, 23.70it/s]
100%|██████████| 5/5 [00:00<00:00, 30.34it/s]
  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:07<00:28,  7.01s/it]



 40%|████      | 2/5 [00:11<00:18,  6.14s/it]



 60%|██████    | 3/5 [00:15<00:11,  5.52s/it]



 80%|████████  | 4/5 [00:19<00:05,  5.08s/it]



100%|██████████| 5/5 [00:23<00:00,  4.69s/it]


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.545335085413929, 0.33727034120734906, 0.463...",0.477974,0.0773815
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.979993440472286, 0.979002624671916, 0.97472...",0.976443,"[0.5229960578186597, 0.3779527559055118, 0.484...",0.475601,0.0551668
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.5243101182654402, 0.32808398950131235, 0.47...",0.470615,0.0728671
5,GaussianProcessClassifier,"{'copy_X_train': True, 'kernel': None, 'max_it...","[0.8950475565759265, 0.8996062992125984, 0.893...",0.897703,"[0.5256241787122208, 0.3438320209973753, 0.440...",0.469051,0.0709422
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.9924565431288948, 0.9908136482939632, 0.994...",0.993504,"[0.533508541392904, 0.3123359580052493, 0.4692...",0.465371,0.0817689
20,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4888304862023653, 0.35039370078740156, 0.47...",0.465358,0.061768
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[0.7176123319121023, 0.7509842519685039, 0.726...",0.732481,"[0.5045992115637319, 0.32677165354330706, 0.42...",0.449889,0.0672817
13,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...","[0.875696949819613, 0.8848425196850394, 0.8831...",0.880053,"[0.4520367936925099, 0.32545931758530183, 0.42...",0.43964,0.0658387
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.5677271236470974, 0.6069553805774278, 0.581...",0.574476,"[0.41655716162943496, 0.3556430446194226, 0.42...",0.41312,0.0312195
16,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.44415243101182655, 0.3136482939632546, 0.44...",0.410499,0.0490865


### Both (time-domain and frequency-level features combined)

In [31]:
# Per-series summary statistics of the raw signals (indexed by series_id).
# NOTE(review): feature_extraction summarises every column after the third, so
# the result differs if X_train has already had extra columns added by the
# in-place feature-engineering steps further down -- confirm execution order.
time_features = feature_extraction(X_train)
# Frequency-domain features without the identifier and label columns.
frequency_features = X_train_fft.drop(['series_id', 'group_id','surface' ], axis = 1)

In [40]:
# Join the two feature sets column-wise. concat(axis=1) aligns rows on index
# labels: time_features is indexed by series_id, frequency_features keeps the
# default row index from read_csv.
# NOTE(review): assumes series_id values coincide with row positions -- confirm.
Train = pd.concat([time_features,frequency_features],axis = 1)

In [41]:
# Benchmark the classifiers with 5 grouped folds on the combined feature frame
# built in the previous cell.
MLA_selection(Train, y_train, 5)

100%|██████████| 5/5 [00:13<00:00,  2.71s/it]
100%|██████████| 5/5 [00:14<00:00,  2.90s/it]
100%|██████████| 5/5 [00:04<00:00,  1.13it/s]
100%|██████████| 5/5 [08:31<00:00, 102.57s/it]
100%|██████████| 5/5 [00:16<00:00,  3.29s/it]
100%|██████████| 5/5 [08:58<00:00, 106.78s/it]
100%|██████████| 5/5 [00:30<00:00,  6.21s/it]
100%|██████████| 5/5 [00:01<00:00,  4.29it/s]
100%|██████████| 5/5 [00:00<00:00, 16.68it/s]
100%|██████████| 5/5 [00:02<00:00,  1.94it/s]
100%|██████████| 5/5 [00:00<00:00,  7.80it/s]
100%|██████████| 5/5 [00:00<00:00, 21.53it/s]
100%|██████████| 5/5 [00:00<00:00, 17.72it/s]
100%|██████████| 5/5 [00:02<00:00,  2.00it/s]
100%|██████████| 5/5 [00:08<00:00,  1.80s/it]
100%|██████████| 5/5 [00:21<00:00,  4.30s/it]
100%|██████████| 5/5 [00:02<00:00,  2.09it/s]
100%|██████████| 5/5 [00:00<00:00, 29.87it/s]
100%|██████████| 5/5 [00:00<00:00, 14.86it/s]
100%|██████████| 5/5 [00:00<00:00, 14.69it/s]
  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:09<00:39,  9.78s/it]



 40%|████      | 2/5 [00:19<00:29,  9.87s/it]



 60%|██████    | 3/5 [00:31<00:20, 10.47s/it]



 80%|████████  | 4/5 [00:40<00:09,  9.96s/it]



100%|██████████| 5/5 [00:50<00:00, 10.04s/it]


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9983601180714988, 0.994750656167979, 0.9957...",0.994882,"[0.48226018396846254, 0.37139107611548555, 0.4...",0.472703,0.0525797
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.5400788436268068, 0.31758530183727035, 0.43...",0.46617,0.0835424
20,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.5019710906701709, 0.36089238845144356, 0.43...",0.465635,0.0610009
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.533508541392904, 0.3241469816272966, 0.4154...",0.461711,0.0810807
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.9921285667431945, 0.9954068241469817, 0.995...",0.993898,"[0.5111695137976346, 0.3136482939632546, 0.398...",0.436775,0.0767904
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.7412266316825189, 0.7893700787401575, 0.793...",0.766408,"[0.4783180026281209, 0.2874015748031496, 0.407...",0.424166,0.0745984
15,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.7015414890127911, 0.7526246719160105, 0.754...",0.731434,"[0.4507227332457293, 0.2887139107611549, 0.359...",0.418659,0.0822999
14,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[0.830108232207281, 0.833989501312336, 0.82868...",0.832087,"[0.47963206307490147, 0.27165354330708663, 0.3...",0.417616,0.0884374
13,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...","[0.8743850442768121, 0.8884514435695539, 0.877...",0.876509,"[0.4783180026281209, 0.3530183727034121, 0.346...",0.417358,0.0586299
16,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4533508541392904, 0.30446194225721784, 0.39...",0.414714,0.0670718


### Advanced Feature Engineering

In [72]:
# https://stackoverflow.com/questions/53033620/how-to-convert-euler-angles-to-quaternions-and-get-the-same-euler-angles-back-fr?rq=1
def quaternion_to_euler(x, y, z, w):
    """Convert one quaternion ``(x, y, z, w)`` into three Euler angles in radians.

    Returns the tuple ``(X, Y, Z)``. ``X`` and ``Z`` come from ``atan2``;
    ``Y`` comes from ``asin`` with its argument clamped to ``[-1, 1]`` so that
    rounding error cannot push it outside the valid domain.
    """
    from math import asin, atan2

    # Angle about the x axis.
    x_num = +2.0 * (w * x + y * z)
    x_den = +1.0 - 2.0 * (x * x + y * y)
    X = atan2(x_num, x_den)

    # Angle about the y axis; clamp before asin.
    y_sin = +2.0 * (w * y - z * x)
    if y_sin > +1.0:
        y_sin = +1.0
    elif y_sin < -1.0:
        y_sin = -1.0
    Y = asin(y_sin)

    # Angle about the z axis.
    z_num = +2.0 * (w * z + x * y)
    z_den = +1.0 - 2.0 * (y * y + z * z)
    Z = atan2(z_num, z_den)

    return X, Y, Z

In [73]:
def fe_step0(actual):
    """Add the quaternion norm, modulus and normalised components to ``actual``.

    The frame is modified in place and also returned. Added columns, in order:
    ``norm_quat`` (sum of squared orientation components), ``mod_quat`` (its
    square root) and ``norm_X``/``norm_Y``/``norm_Z``/``norm_W`` (each
    orientation component divided by ``mod_quat``).
    """
    # https://www.mathworks.com/help/aeroblks/quaternionnorm.html
    # https://www.mathworks.com/help/aeroblks/quaternionmodulus.html
    # https://www.mathworks.com/help/aeroblks/quaternionnormalize.html

    squared_sum = (actual['orientation_X']**2 + actual['orientation_Y']**2
                   + actual['orientation_Z']**2 + actual['orientation_W']**2)
    actual['norm_quat'] = squared_sum
    actual['mod_quat'] = actual['norm_quat'] ** 0.5

    for axis in ('X', 'Y', 'Z', 'W'):
        actual['norm_' + axis] = actual['orientation_' + axis] / actual['mod_quat']

    return actual

In [74]:
# Add the quaternion norm columns. fe_step0 writes onto the frame it is given
# and returns that same frame, so X_train is extended in place here.
X_train = fe_step0(X_train)
print(X_train.shape)
X_train.head()

(487680, 19)


Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z,norm_quat,mod_quat,norm_X,norm_Y,norm_Z,norm_W
0,0_0,0,0,-0.75853,-0.63435,-0.10488,-0.10597,0.10765,0.017561,0.000767,-0.74857,2.103,-9.7532,0.999997,0.999999,-0.758531,-0.634351,-0.10488,-0.10597
1,0_1,0,1,-0.75853,-0.63434,-0.1049,-0.106,0.067851,0.029939,0.003385,0.33995,1.5064,-9.4128,0.999995,0.999998,-0.758532,-0.634342,-0.1049,-0.106
2,0_2,0,2,-0.75853,-0.63435,-0.10492,-0.10597,0.007275,0.028934,-0.005978,-0.26429,1.5922,-8.7267,1.000006,1.000003,-0.758528,-0.634348,-0.10492,-0.10597
3,0_3,0,3,-0.75852,-0.63436,-0.10495,-0.10597,-0.013053,0.019448,-0.008974,0.42684,1.0993,-10.096,1.000009,1.000005,-0.758516,-0.634357,-0.10495,-0.10597
4,0_4,0,4,-0.75852,-0.63435,-0.10495,-0.10596,0.005135,0.007652,0.005245,-0.50969,1.4689,-10.441,0.999995,0.999997,-0.758522,-0.634352,-0.10495,-0.10596


In [77]:
def fe_step1(actual):
    """Quaternions to Euler Angles.

    Reads the ``norm_X``/``norm_Y``/``norm_Z``/``norm_W`` columns row by row,
    converts each quaternion with ``quaternion_to_euler`` and stores the
    results in new ``euler_x``/``euler_y``/``euler_z`` columns. The frame is
    modified in place and also returned.
    """
    quaternions = zip(
        actual['norm_X'].tolist(),
        actual['norm_Y'].tolist(),
        actual['norm_Z'].tolist(),
        actual['norm_W'].tolist(),
    )
    angles = [quaternion_to_euler(qx, qy, qz, qw) for qx, qy, qz, qw in quaternions]

    # Split the (X, Y, Z) triples into one list per output column.
    actual['euler_x'] = [triple[0] for triple in angles]
    actual['euler_y'] = [triple[1] for triple in angles]
    actual['euler_z'] = [triple[2] for triple in angles]
    return actual

In [78]:
# Add the euler_x / euler_y / euler_z columns. fe_step1 reads the norm_*
# columns, so fe_step0 must already have been applied to X_train.
X_train = fe_step1(X_train)
# test = fe_step1(test)
print (X_train.shape)
# X_train.head()



(487680, 22)


Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,...,linear_acceleration_Z,norm_quat,mod_quat,norm_X,norm_Y,norm_Z,norm_W,euler_x,euler_y,euler_z
0,0_0,0,0,-0.75853,-0.63435,-0.10488,-0.10597,0.10765,0.017561,0.000767,...,-9.7532,0.999997,0.999999,-0.758531,-0.634351,-0.10488,-0.10597,2.843273,-0.024668,1.396667
1,0_1,0,1,-0.75853,-0.63434,-0.1049,-0.106,0.067851,0.029939,0.003385,...,-9.4128,0.999995,0.999998,-0.758532,-0.634342,-0.1049,-0.106,2.843201,-0.024662,1.396651
2,0_2,0,2,-0.75853,-0.63435,-0.10492,-0.10597,0.007275,0.028934,-0.005978,...,-8.7267,1.000006,1.000003,-0.758528,-0.634348,-0.10492,-0.10597,2.843222,-0.024728,1.396677
3,0_3,0,3,-0.75852,-0.63436,-0.10495,-0.10597,-0.013053,0.019448,-0.008974,...,-10.096,1.000009,1.000005,-0.758516,-0.634357,-0.10495,-0.10597,2.843183,-0.024769,1.396712
4,0_4,0,4,-0.75852,-0.63435,-0.10495,-0.10596,0.005135,0.007652,0.005245,...,-10.441,0.999995,0.999997,-0.758522,-0.634352,-0.10495,-0.10596,2.843197,-0.024785,1.396698


In [76]:
from scipy.stats import kurtosis
from scipy.stats import skew

def _kurtosis(x):
    return kurtosis(x)

def CPT5(x):
    """Return ``mean(exp(x)) / exp(std(x))``, evaluated in log space.

    The direct formula ``sum(exp(x)) / (len(x) * exp(std(x)))`` overflows as
    soon as a sample exceeds roughly 709, yielding ``inf`` (or ``nan`` from
    ``inf / inf``) even when the ratio itself is representable. Working with
    logarithms returns the finite value in those cases. The result can still
    be ``inf`` when the true ratio exceeds the float64 range.

    Parameters
    ----------
    x : array-like of numbers
        Samples of one series (a pandas Series or NumPy array).

    Returns
    -------
    float
        The ratio described above, using the population standard deviation;
        ``nan`` for empty input or when any sample is NaN or infinite, as the
        direct formula also produced.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0 or not np.all(np.isfinite(values)):
        return np.nan

    # log(mean(exp(v))) = peak + log(mean(exp(v - peak))); every shifted
    # exponent is <= 0, so exp() cannot overflow and the mean is >= 1/n > 0.
    peak = np.max(values)
    log_mean_exp = peak + np.log(np.mean(np.exp(values - peak)))
    return np.exp(log_mean_exp - np.std(values))

def skewness(x):
    """Return the skewness of ``x`` using SciPy's default (biased) estimator."""
    return skew(x, bias=True)

def SSC(x):
    """Count the slope sign changes in ``x``.

    A sample counts when it is strictly greater than both neighbours or
    strictly smaller than both. Neighbours wrap around circularly (the last
    sample is followed by the first), and the first sample is excluded from
    the count.
    """
    samples = np.array(x)
    # Circular padding: last sample in front, first sample at the back.
    padded = np.append(np.append(samples[-1], samples), samples[0])
    current = padded[1:-1]
    following = padded[2:]
    preceding = padded[:-2]
    # Product is positive only when both neighbours lie on the same side.
    turning_point = np.heaviside((current - preceding) * (current - following), 0)
    return sum(turning_point[1:])

def wave_length(x):
    """Return the summed absolute difference between each sample and the next.

    The sequence is treated as circular, so the step from the last sample
    back to the first is included in the total.
    """
    samples = np.array(x)
    # Circular padding: last sample in front, first sample at the back.
    padded = np.append(np.append(samples[-1], samples), samples[0])
    current = padded[1:-1]
    following = padded[2:]
    step_sizes = abs(following - current)
    return sum(step_sizes)
    
def norm_entropy(x):
    """Return the sum of ``|x|`` raised to the third power."""
    exponent = 3
    powered = np.power(abs(x), exponent)
    return sum(powered)

def SRAV(x):
    """Return the square of the mean square root of ``|x|``."""
    roots = np.sqrt(abs(x))
    mean_root = sum(roots) / len(x)
    return np.power(mean_root, 2)

def mean_abs(x):
    """Return the mean of the absolute values of ``x``."""
    magnitudes = abs(x)
    total = sum(magnitudes)
    return total / len(x)

def zero_crossing(x):
    """Count the positions where a sample and the next one have opposite signs.

    The sequence is treated as circular, so the pair (last sample, first
    sample) is included. Pairs containing a zero are not counted.
    """
    samples = np.array(x)
    # Circular padding: last sample in front, first sample at the back.
    padded = np.append(np.append(samples[-1], samples), samples[0])
    current = padded[1:-1]
    following = padded[2:]
    # The negated product is positive exactly when the signs differ.
    crossings = np.heaviside(-current * following, 0)
    return sum(crossings)

In [80]:
def feat_eng(data):
    """Build per-series summary features for every signal column of ``data``.

    Side effect: four derived per-measurement columns are added to ``data``
    in place before aggregation -- ``totl_anglr_vel``, ``totl_linr_acc``,
    ``totl_xyz`` and ``acc_vs_vel``.

    Every column except ``row_id``, ``series_id`` and ``measurement_number``
    is then summarised per ``series_id`` into 26 statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Measurement frame with ``series_id`` plus the ``angular_velocity_*``,
        ``linear_acceleration_*`` and ``orientation_X/Y/Z`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``series_id``, one column per (signal, statistic) pair, in
        the same order and with the same names as before.

    Notes
    -----
    ``acc_vs_vel`` and the ``_maxtoMin`` features are plain ratios and become
    infinite when the denominator is exactly zero.
    """
    # Per-measurement derived signals, written onto the caller's frame.
    data['totl_anglr_vel'] = (data['angular_velocity_X']**2 + data['angular_velocity_Y']**2 + data['angular_velocity_Z']**2)** 0.5
    data['totl_linr_acc'] = (data['linear_acceleration_X']**2 + data['linear_acceleration_Y']**2 + data['linear_acceleration_Z']**2)**0.5
    data['totl_xyz'] = (data['orientation_X']**2 + data['orientation_Y']**2 + data['orientation_Z']**2)**0.5
    data['acc_vs_vel'] = data['totl_linr_acc'] / data['totl_anglr_vel']

    def mean_change_of_abs_change(x):
        return np.mean(np.diff(np.abs(np.diff(x))))

    def mean_abs_deviation(x):
        # Same formula pandas used for GroupBy.mad(), which was removed in
        # pandas 2.0.
        return (x - x.mean()).abs().mean()

    # Group once (after the derived columns exist) and reuse the grouping;
    # it used to be rebuilt for every statistic of every column.
    by_series = data.groupby('series_id')
    # Collect the columns in a dict and build the frame once; inserting
    # hundreds of columns one at a time fragments the DataFrame.
    features = {}
    for col in tqdm(data.columns):
        if col in ['row_id','series_id','measurement_number']:
            continue
        signal = by_series[col]
        features[col + '_mean'] = signal.mean()
        features[col + '_median'] = signal.median()
        features[col + '_max'] = signal.max()
        features[col + '_min'] = signal.min()
        features[col + '_std'] = signal.std()
        features[col + '_range'] = features[col + '_max'] - features[col + '_min']
        features[col + '_maxtoMin'] = features[col + '_max'] / features[col + '_min']
        features[col + '_mean_abs_chg'] = signal.apply(lambda x: np.mean(np.abs(np.diff(x))))
        features[col + '_mean_change_of_abs_change'] = signal.apply(mean_change_of_abs_change)
        features[col + '_abs_max'] = signal.apply(lambda x: np.max(np.abs(x)))
        features[col + '_abs_min'] = signal.apply(lambda x: np.min(np.abs(x)))
        features[col + '_abs_avg'] = (features[col + '_abs_min'] + features[col + '_abs_max'])/2

        features[col + '_skew'] = signal.skew()
        features[col + '_mad'] = signal.apply(mean_abs_deviation)
        features[col + '_q25'] = signal.quantile(0.25)
        features[col + '_q75'] = signal.quantile(0.75)
        features[col + '_q95'] = signal.quantile(0.95)
        features[col + '_iqr'] = features[col + '_q75'] - features[col + '_q25']
        features[col + '_CPT5'] = signal.apply(CPT5)
        features[col + '_SSC'] = signal.apply(SSC)
        features[col + '_skewness'] = signal.apply(skewness)
        # The misspelled suffix is kept so existing column names do not change.
        features[col + '_wave_lenght'] = signal.apply(wave_length)
        features[col + '_norm_entropy'] = signal.apply(norm_entropy)
        features[col + '_SRAV'] = signal.apply(SRAV)
        features[col + '_kurtosis'] = signal.apply(_kurtosis)
        features[col + '_zero_crossing'] = signal.apply(zero_crossing)
    return pd.DataFrame(features)
    



In [82]:
# Aggregate X_train into one feature row per series_id. feat_eng also adds
# four derived columns to X_train in place before aggregating.
Train = feat_eng(X_train)
# test = feat_eng(test)
# print ("New features: ",Train.shape)
Train.head()


  0%|          | 0/26 [00:00<?, ?it/s][A
 15%|█▌        | 4/26 [00:11<01:04,  2.93s/it][A
 19%|█▉        | 5/26 [00:24<02:03,  5.86s/it][A
 23%|██▎       | 6/26 [00:37<02:39,  7.97s/it][A
 27%|██▋       | 7/26 [00:49<02:57,  9.36s/it][A
 31%|███       | 8/26 [01:02<03:05, 10.31s/it][A
 35%|███▍      | 9/26 [01:15<03:07, 11.01s/it][A
 38%|███▊      | 10/26 [01:27<03:05, 11.56s/it][A
 42%|████▏     | 11/26 [01:40<02:59, 11.95s/it][A
 46%|████▌     | 12/26 [01:54<02:53, 12.40s/it][A
 50%|█████     | 13/26 [02:06<02:42, 12.50s/it][A
 54%|█████▍    | 14/26 [02:19<02:31, 12.66s/it][A
 58%|█████▊    | 15/26 [02:32<02:20, 12.76s/it][A
 62%|██████▏   | 16/26 [02:44<02:04, 12.45s/it][A
 65%|██████▌   | 17/26 [02:56<01:49, 12.22s/it][A
 69%|██████▉   | 18/26 [03:08<01:36, 12.05s/it][A
 73%|███████▎  | 19/26 [03:19<01:23, 11.97s/it][A
 77%|███████▋  | 20/26 [03:31<01:11, 11.94s/it][A
 81%|████████  | 21/26 [03:43<00:59, 11.91s/it][A
 85%|████████▍ | 22/26 [03:55<00:47, 11.83s/i

Unnamed: 0_level_0,orientation_X_mean,orientation_X_median,orientation_X_max,orientation_X_min,orientation_X_std,orientation_X_range,orientation_X_maxtoMin,orientation_X_mean_abs_chg,orientation_X_mean_change_of_abs_change,orientation_X_abs_max,...,acc_vs_vel_q95,acc_vs_vel_iqr,acc_vs_vel_CPT5,acc_vs_vel_SSC,acc_vs_vel_skewness,acc_vs_vel_wave_lenght,acc_vs_vel_norm_entropy,acc_vs_vel_SRAV,acc_vs_vel_kurtosis,acc_vs_vel_zero_crossing
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.758666,-0.75853,-0.75822,-0.75953,0.000363,0.00131,0.998275,1.5e-05,2.380952e-07,0.75953,...,447.111952,152.070826,inf,71.0,2.364265,14636.382656,3736465000.0,207.565488,8.825131,0.0
1,-0.958606,-0.958595,-0.95837,-0.95896,0.000151,0.00059,0.999385,2.3e-05,-4.761905e-07,0.95896,...,257.764692,62.3699,2.11832e+225,69.0,3.03487,7428.529167,836511000.0,108.428982,11.665781,0.0
2,-0.512057,-0.512035,-0.50944,-0.51434,0.001377,0.0049,0.990473,4.1e-05,0.0,0.51434,...,454.481504,110.303778,inf,68.0,2.373981,10684.625968,2860362000.0,173.727266,7.30758,0.0
3,-0.939169,-0.93917,-0.93884,-0.93968,0.000227,0.00084,0.999106,2.6e-05,-6.349206e-07,0.93968,...,305.250486,87.872089,inf,61.0,3.159149,10080.003112,1313681000.0,97.947012,13.384316,0.0
4,-0.891301,-0.89094,-0.88673,-0.89689,0.002955,0.01016,0.988672,8e-05,7.936508e-08,0.89689,...,96.514784,22.041252,1.807905e+39,41.0,0.7365,733.47425,44267580.0,65.071722,-0.078722,0.0


In [92]:
# Machine Learning Algorithm (MLA) selection and initialization
def MLA_selection(Train, y_train, folds):
    """Compare several classifiers with grouped K-fold cross-validation.

    Every candidate model is fitted once per fold on standardized features
    and scored with plain accuracy on both the training and the held-out
    part of the fold.

    Parameters
    ----------
    Train : pandas.DataFrame
        Feature matrix. Rows are selected positionally (``.iloc``), so they
        must be in the same order as the rows of ``y_train``.
    y_train : pandas.DataFrame
        Must provide a ``'surface'`` column holding the class labels. It is
        also handed to ``group_kfold`` to build the splits.
    folds : int
        Number of splits requested from ``group_kfold``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with its name, parameters, per-fold train and
        test accuracies, their means and the test standard deviation,
        sorted by mean test accuracy in descending order.
    """
    train_folds, test_folds = group_kfold(Train, y_train, folds)
    # Iterate over the folds that were actually produced rather than a
    # hard-coded 5, so that any value of `folds` is honoured.
    n_splits = len(train_folds)

    # Integer-encode the labels once; a 1-D array avoids scikit-learn's
    # "column-vector y" conversion warning during fit.
    labels = LabelEncoder().fit_transform(y_train['surface'])

    MLA = [
        # Ensemble methods
        # ensemble.AdaBoostClassifier(),
        # ensemble.BaggingClassifier(),
        # ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian processes
        # gaussian_process.GaussianProcessClassifier(),

        # GLM
        # linear_model.LogisticRegressionCV(),
        # linear_model.PassiveAggressiveClassifier(),
        # linear_model.RidgeClassifierCV(),
        # linear_model.SGDClassifier(),
        # linear_model.Perceptron(),

        # Naive Bayes
        # naive_bayes.BernoulliNB(),
        # naive_bayes.GaussianNB(),

        # Nearest neighbor
        # neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(),
        # svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        # tree.ExtraTreeClassifier(),

        # Discriminant analysis
        # discriminant_analysis.LinearDiscriminantAnalysis(),
        # discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # Gradient boosting (xgboost)
        XGBClassifier(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy', 'MLA Train Accuracy Mean',
                   'MLA Test Accuracy', 'MLA Test Accuracy Mean', 'MLA Test Accuracy Std']

    records = []
    for alg in MLA:
        MLA_name = alg.__class__.__name__
        train_scores = []
        test_scores = []
        for i in tqdm(range(n_splits), desc=MLA_name):
            train_idx, test_idx = train_folds[i], test_folds[i]

            # Fit the scaler on the training part only so that no statistics
            # of the held-out rows leak into the model.
            scaler = StandardScaler()
            X_fit = scaler.fit_transform(Train.iloc[train_idx, :])
            X_val = scaler.transform(Train.iloc[test_idx, :])
            y_fit = labels[train_idx]
            y_val = labels[test_idx]

            alg.fit(X_fit, y_fit)
            test_scores.append(metrics.accuracy_score(y_val, alg.predict(X_val)))
            train_scores.append(metrics.accuracy_score(y_fit, alg.predict(X_fit)))

        records.append({
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Accuracy': train_scores,
            'MLA Train Accuracy Mean': np.mean(train_scores),
            'MLA Test Accuracy': test_scores,
            'MLA Test Accuracy Mean': np.mean(test_scores),
            'MLA Test Accuracy Std': np.std(test_scores),
        })

    # Build the table in one go (no cell-by-cell .loc assignment of lists),
    # keeping the 0..n-1 index in model-definition order before sorting.
    MLA_compare = pd.DataFrame(records, columns=MLA_columns)
    return MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False)

In [93]:
# Sanitize the feature matrix in place: missing values and +/- infinity
# (seen in some extracted features) are all replaced by 0.
Train.fillna(0, inplace=True)
Train.replace([np.inf, -np.inf], 0, inplace=True)
# The same clean-up for the `test` features is currently disabled:
# test.fillna(0, inplace=True)
# test.replace([np.inf, -np.inf], 0, inplace=True)


In [94]:
# Rank the candidate classifiers using 5 grouped cross-validation folds;
# the returned comparison table is displayed as the cell output.
MLA_selection(Train, y_train, folds=5)



  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [05:21<21:27, 322.00s/it][A[A

 40%|████      | 2/5 [10:43<16:05, 321.93s/it][A[A

 60%|██████    | 3/5 [16:05<10:43, 321.94s/it][A[A

 80%|████████  | 4/5 [20:51<05:11, 311.13s/it][A[A

100%|██████████| 5/5 [26:13<00:00, 314.28s/it][A[A

  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [00:03<00:14,  3.61s/it][A[A

 40%|████      | 2/5 [00:07<00:10,  3.60s/it][A[A

 60%|██████    | 3/5 [00:10<00:07,  3.63s/it][A[A

 80%|████████  | 4/5 [00:14<00:03,  3.63s/it][A[A

100%|██████████| 5/5 [00:18<00:00,  3.67s/it][A[A

  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [00:04<00:19,  4.98s/it][A[A

 40%|████      | 2/5 [00:09<00:14,  4.86s/it][A[A

 60%|██████    | 3/5 [00:13<00:09,  4.70s/it][A[A

 80%|████████  | 4/5 [00:18<00:04,  4.61s/it][A[A

100%|██████████| 5/5 [00:22<00:00,  4.56s/it][A[A

  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1





 20%|██        | 1/5 [00:28<01:52, 28.20s/it][A[A





 40%|████      | 2/5 [00:54<01:23, 27.77s/it][A[A





 60%|██████    | 3/5 [01:23<00:56, 28.02s/it][A[A





 80%|████████  | 4/5 [01:51<00:28, 28.08s/it][A[A





100%|██████████| 5/5 [02:20<00:00, 28.23s/it][A[A

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
4,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.6110381077529566, 0.36220472440944884, 0.46...",0.506337,0.0856768
0,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[1.0, 0.9983595800524935, 0.9993436166721366, ...",0.998819,"[0.5965834428383706, 0.3648293963254593, 0.483...",0.503442,0.0782397
1,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.5926412614980289, 0.3569553805774278, 0.422...",0.491121,0.0874737
2,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...","[0.9307969826172515, 0.916994750656168, 0.9327...",0.927625,"[0.5597897503285151, 0.33989501312335957, 0.41...",0.483764,0.0915117
3,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4664914586070959, 0.26246719160104987, 0.39...",0.412092,0.0812608
