In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
from tqdm import tqdm
from sklearn.utils import shuffle
import sklearn 
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [90]:
import warnings
# Suppress every warning category for the rest of the session to keep the
# cell outputs compact.
# NOTE(review): this blanket filter also hides warnings raised by the
# estimators fitted later in the notebook; consider narrowing it.
warnings.filterwarnings('ignore')

In [14]:
# Raw sensor measurements: one row per (series_id, measurement_number) with
# orientation, angular-velocity and linear-acceleration columns.
X_train = pd.read_csv("./X_train.csv")
# Labels for the training series; the 'surface' and 'group_id' columns are
# used further down in this notebook.
y_train = pd.read_csv("./y_train.csv")
# Test-set measurements (presumably the same layout as X_train -- TODO confirm).
X_test = pd.read_csv('./X_test.csv')

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z
0,0_0,0,0,-0.75853,-0.63435,-0.104880,-0.105970,0.107650,0.017561,0.000767,-0.74857,2.1030,-9.7532
1,0_1,0,1,-0.75853,-0.63434,-0.104900,-0.106000,0.067851,0.029939,0.003385,0.33995,1.5064,-9.4128
2,0_2,0,2,-0.75853,-0.63435,-0.104920,-0.105970,0.007275,0.028934,-0.005978,-0.26429,1.5922,-8.7267
3,0_3,0,3,-0.75852,-0.63436,-0.104950,-0.105970,-0.013053,0.019448,-0.008974,0.42684,1.0993,-10.0960
4,0_4,0,4,-0.75852,-0.63435,-0.104950,-0.105960,0.005135,0.007652,0.005245,-0.50969,1.4689,-10.4410
...,...,...,...,...,...,...,...,...,...,...,...,...,...
487675,3809_123,3809,123,0.62871,-0.76878,-0.084391,0.081093,0.003167,0.093760,-0.142740,3.27180,2.0115,-9.0063
487676,3809_124,3809,124,0.62884,-0.76868,-0.084365,0.081099,0.014994,0.032637,-0.132380,4.42750,3.0696,-8.1257
487677,3809_125,3809,125,0.62891,-0.76861,-0.084345,0.081178,-0.031184,-0.003961,-0.138940,2.70480,4.2622,-8.1443
487678,3809_126,3809,126,0.62903,-0.76850,-0.084414,0.081231,-0.069153,0.013229,-0.130210,2.54100,4.7130,-9.4435


In [15]:
# Feature extraction: per-series summary statistics of the measurement columns
def change1(x):
    """Return the mean absolute difference between consecutive values of ``x``."""
    step_sizes = np.abs(np.diff(x))
    return np.mean(step_sizes)

def change2(x):
    """Return the mean change between successive absolute first differences of ``x``."""
    abs_steps = np.abs(np.diff(x))
    step_deltas = np.diff(abs_steps)
    return np.mean(step_deltas)

def feature_extraction(df):
    """Aggregate every measurement column into per-series summary features.

    The frame is grouped by ``series_id`` and, for each column from the
    fourth one onwards (``df.columns[3:]``), the following statistics are
    computed per series: mean, standard deviation, max, min, max/min ratio,
    mean absolute change (``change1``) and mean change of the absolute
    change (``change2``).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format measurements. Must contain a ``series_id`` column; the
        first three columns are skipped (assumed to be identifier columns
        such as row_id / series_id / measurement_number -- verify against
        the loaded CSV layout).

    Returns
    -------
    pandas.DataFrame
        One row per ``series_id`` (used as the index) and seven feature
        columns per measurement column, named ``<column>_<statistic>``.

    Notes
    -----
    ``<column>_max_to_min`` is a plain division, so a series whose minimum
    is 0 yields ``inf``/``NaN`` in that feature.
    """
    # Group once and reuse the grouping for every column/statistic instead
    # of rebuilding the same groupby for each aggregate.
    grouped = df.groupby('series_id')

    # Collect the feature columns in insertion order and build the frame in
    # one go; this avoids growing a DataFrame one column at a time.
    features = {}
    for col in df.columns[3:]:
        per_series = grouped[col]
        col_max = per_series.max()
        col_min = per_series.min()

        features[col + '_mean'] = per_series.mean()
        features[col + '_std'] = per_series.std()
        features[col + '_max'] = col_max
        features[col + '_min'] = col_min
        features[col + '_max_to_min'] = col_max / col_min
        features[col + '_mean_abs_change'] = per_series.apply(change1)
        features[col + '_mean_abs_change2'] = per_series.apply(change2)

    return pd.DataFrame(features)
    
    

In [49]:
def group_kfold(train_df,y,folds):
    """Split ``train_df`` into grouped folds and return the index arrays.

    Rows are grouped by ``y['group_id']`` and partitioned by a
    ``GroupKFold`` configured with ``folds`` splits.

    Returns a ``(train_indices, test_indices)`` pair of lists; entry ``k``
    of each list holds the row positions selected for fold ``k``.
    """
    splitter = GroupKFold(n_splits=folds)
    group_labels = y['group_id'].values
    splits = list(splitter.split(train_df, groups=group_labels))

    train_indices = [train_idx for train_idx, _ in splits]
    test_indices = [test_idx for _, test_idx in splits]
    return train_indices, test_indices



In [95]:
#Machine Learning Algorithm (MLA) Selection and Initialization
def MLA_selection(X_train, y_train, folds):
    """Cross-validate a panel of default classifiers and rank them by accuracy.

    Features are built with ``feature_extraction``, folds come from
    ``group_kfold`` (grouped by ``y_train['group_id']``), and the target is
    the label-encoded ``y_train['surface']`` column. Inside every fold the
    features are standardised with a ``StandardScaler`` fitted on the
    training part only.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Raw measurements, passed unchanged to ``feature_extraction``.
    y_train : pandas.DataFrame
        Must provide the ``surface`` and ``group_id`` columns. Its rows are
        assumed to be in the same order as the rows produced by
        ``feature_extraction`` (one per series) -- TODO confirm.
    folds : int
        Number of grouped cross-validation folds. Every fold produced by
        ``group_kfold`` is evaluated (the loop is no longer tied to a
        hard-coded count of 5).

    Returns
    -------
    pandas.DataFrame
        One row per classifier with its name, parameters, per-fold and mean
        train accuracy, per-fold and mean test accuracy and the test
        accuracy standard deviation, sorted by mean test accuracy in
        descending order.
    """
    features = feature_extraction(X_train)
    train_folds, test_folds = group_kfold(features, y_train, folds)

    # Keep the encoded target 1-D: that is the shape scikit-learn estimators
    # expect for ``y``.
    target = LabelEncoder().fit_transform(y_train['surface'])

    # The scaled fold data does not depend on the estimator, so prepare it
    # once instead of refitting the scaler for every classifier.
    fold_data = []
    for train_idx, test_idx in zip(train_folds, test_folds):
        scaler = StandardScaler()
        fold_X_train = scaler.fit_transform(features.iloc[train_idx, :])
        fold_X_test = scaler.transform(features.iloc[test_idx, :])
        fold_data.append(
            (fold_X_train, target[train_idx], fold_X_test, target[test_idx])
        )

    MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy','MLA Train Accuracy Mean', 'MLA Test Accuracy', 'MLA Test Accuracy Mean','MLA Test Accuracy Std' ]

    rows = []
    for alg in _candidate_classifiers():
        # Record the configuration before fitting.
        parameters = str(alg.get_params())
        train_scores = []
        test_scores = []

        # One progress bar per classifier, one tick per fold.
        for fold_X_train, fold_y_train, fold_X_test, fold_y_test in tqdm(fold_data):
            alg.fit(fold_X_train, fold_y_train)
            test_scores.append(
                metrics.accuracy_score(fold_y_test, alg.predict(fold_X_test))
            )
            train_scores.append(
                metrics.accuracy_score(fold_y_train, alg.predict(fold_X_train))
            )

        rows.append({
            'MLA Name': alg.__class__.__name__,
            'MLA Parameters': parameters,
            'MLA Train Accuracy': train_scores,
            'MLA Train Accuracy Mean': np.mean(train_scores),
            'MLA Test Accuracy': test_scores,
            'MLA Test Accuracy Mean': np.mean(test_scores),
            'MLA Test Accuracy Std': np.std(test_scores),
        })

    # Build the comparison table in one step rather than cell by cell.
    MLA_compare = pd.DataFrame(rows, columns=MLA_columns)
    return MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False)


def _candidate_classifiers():
    """Return fresh, default-configured instances of every classifier to compare."""
    return [
        # Ensemble methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian processes
        gaussian_process.GaussianProcessClassifier(),

        # Generalised linear models
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest neighbours
        neighbors.KNeighborsClassifier(),

        # Support vector machines
        svm.SVC(),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # Gradient boosting (xgboost)
        XGBClassifier(),
    ]

In [96]:
# Compare all candidate classifiers, requesting 5 grouped folds; the returned
# comparison table is the last expression, so it is rendered as the cell output.
MLA_selection(X_train, y_train, 5)

100%|██████████| 5/5 [00:09<00:00,  1.88s/it]
100%|██████████| 5/5 [00:09<00:00,  1.92s/it]
100%|██████████| 5/5 [00:03<00:00,  1.59it/s]
100%|██████████| 5/5 [03:42<00:00, 45.62s/it]
100%|██████████| 5/5 [00:06<00:00,  1.31s/it]
100%|██████████| 5/5 [04:56<00:00, 59.10s/it]
100%|██████████| 5/5 [00:16<00:00,  3.35s/it]
100%|██████████| 5/5 [00:00<00:00, 10.21it/s]
100%|██████████| 5/5 [00:00<00:00, 37.82it/s]
100%|██████████| 5/5 [00:01<00:00,  4.67it/s]
100%|██████████| 5/5 [00:00<00:00, 17.54it/s]
100%|██████████| 5/5 [00:00<00:00, 52.40it/s]
100%|██████████| 5/5 [00:00<00:00, 46.73it/s]
100%|██████████| 5/5 [00:01<00:00,  3.19it/s]
100%|██████████| 5/5 [00:05<00:00,  1.04s/it]
100%|██████████| 5/5 [00:10<00:00,  2.13s/it]
100%|██████████| 5/5 [00:00<00:00,  5.06it/s]
100%|██████████| 5/5 [00:00<00:00, 61.27it/s]
100%|██████████| 5/5 [00:00<00:00,  9.98it/s]
100%|██████████| 5/5 [00:00<00:00, 24.07it/s]
  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:05<00:20,  5.02s/it]



 40%|████      | 2/5 [00:09<00:14,  4.78s/it]



 60%|██████    | 3/5 [00:13<00:09,  4.62s/it]



 80%|████████  | 4/5 [00:17<00:04,  4.53s/it]



100%|██████████| 5/5 [00:22<00:00,  4.46s/it]


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Train Accuracy Mean,MLA Test Accuracy,MLA Test Accuracy Mean,MLA Test Accuracy Std
3,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...","[0.9973761889143982, 0.9908136482939632, 0.996...",0.993635,"[0.48751642575558474, 0.37139107611548555, 0.4...",0.476381,0.0558823
20,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.49934296977660975, 0.37270341207349084, 0.4...",0.471669,0.0576054
4,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4980289093298292, 0.34908136482939633, 0.42...",0.469048,0.0722208
2,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.5256241787122208, 0.3320209973753281, 0.424...",0.460394,0.0749429
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...","[0.9931124959002952, 0.9957349081364829, 0.994...",0.993635,"[0.507227332457293, 0.30708661417322836, 0.385...",0.431791,0.0754181
16,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.43626806833114323, 0.3320209973753281, 0.39...",0.407885,0.0426353
17,ExtraTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...","[1.0, 1.0, 1.0, 1.0, 1.0]",1.0,"[0.4152431011826544, 0.2874015748031496, 0.363...",0.394239,0.0627825
15,LinearSVC,"{'C': 1.0, 'class_weight': None, 'dual': True,...","[0.6188914398163332, 0.6551837270341208, 0.668...",0.646394,"[0.4336399474375821, 0.34120734908136485, 0.30...",0.393211,0.0606288
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...","[0.6448015742866514, 0.6850393700787402, 0.689...",0.670541,"[0.43889618922470436, 0.3451443569553806, 0.30...",0.391637,0.0625872
18,LinearDiscriminantAnalysis,"{'covariance_estimator': None, 'n_components':...","[0.5998688094457199, 0.6322178477690289, 0.641...",0.617457,"[0.43889618922470436, 0.35170603674540685, 0.3...",0.382187,0.0533244
