# Experiment

In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('..')

from IPython.display import HTML
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

import definitions
from modules.preparation import FeatureExtraction
from modules.model.classifier import Classifier
from modules.model.evaluation import Evaluation
from modules.common import Feature

## Loading

In [2]:
labels = pd.read_csv(os.path.join(definitions.DATA_LABEL, os.listdir(definitions.DATA_LABEL)[0]))
labels.head()

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [3]:
data_raw = pd.read_csv(os.path.join(definitions.DATA_PREP, os.listdir(definitions.DATA_PREP)[0]))
data_raw.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [138]:
data_sample = data_raw[data_raw['bookingID'] == 0]
data_sample.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [135]:
data = pd.read_csv(os.path.join(definitions.DATA_AGGREGATED, os.listdir(definitions.DATA_AGGREGATED)[0]))
data.head()

Unnamed: 0,bookingID,mean_acceleration_x,mean_acceleration_y,mean_acceleration_z,mean_accuracy,mean_bearing,mean_gyro_x,mean_gyro_y,mean_gyro_z,mean_speed
0,0,-0.711264,-9.613822,-1.619658,10.165339,176.526099,0.003328,-0.006118,-0.004188,8.994822
1,1,-0.525406,9.532086,-2.198999,3.718763,124.19859,-0.002467,-0.00754,0.000405,7.881588
2,2,0.306786,9.843183,0.139347,3.930626,173.794872,0.006458,-0.012861,0.002597,3.157213
3,4,-0.365117,-9.406439,-2.613639,10.0,151.807013,-0.022884,0.023232,-0.000376,6.150996
4,6,0.490616,9.538043,2.355059,4.586721,197.812785,0.003877,0.000436,0.00293,4.628921


In [6]:
for key, df in labels.groupby('bookingID'):
    if (len(df)>=2):
        print(key, df['label'].values)

13 [0 1]
154618822837 [1 0]
223338299461 [0 1]
395136991308 [1 0]
403726925929 [0 1]
455266533495 [1 0]
481036337234 [1 0]
515396075694 [0 1]
695784702084 [0 1]
919123001348 [1 0]
970662608932 [0 1]
1279900254294 [1 0]
1348619731077 [1 0]
1391569403991 [0 1]
1408749273124 [0 1]
1511828488211 [1 0]
1632087572573 [1 0]
1649267441751 [1 0]


In [7]:
data_complete = pd.merge(data, labels, on=Feature.FEAT_booking_id, suffixes=(False, False)).drop_duplicates(subset='bookingID', keep=False)
data_complete.head()

Unnamed: 0,bookingID,mean_acceleration_x,mean_acceleration_y,mean_acceleration_z,mean_accuracy,mean_bearing,mean_gyro_x,mean_gyro_y,mean_gyro_z,mean_speed,label
0,0,-0.711264,-9.613822,-1.619658,10.165339,176.526099,0.003328,-0.006118,-0.004188,8.994822,0
1,1,-0.525406,9.532086,-2.198999,3.718763,124.19859,-0.002467,-0.00754,0.000405,7.881588,1
2,2,0.306786,9.843183,0.139347,3.930626,173.794872,0.006458,-0.012861,0.002597,3.157213,1
3,4,-0.365117,-9.406439,-2.613639,10.0,151.807013,-0.022884,0.023232,-0.000376,6.150996,1
4,6,0.490616,9.538043,2.355059,4.586721,197.812785,0.003877,0.000436,0.00293,4.628921,0


In [8]:
print('0 : ' + str(len(data_complete[data_complete['label']==0])))
print('1 : ' + str(len(data_complete[data_complete['label']==1])))
print('Total : ' + str(len(data_complete)))

0 : 14999
1 : 4983
Total : 19982


## Exploration

In [140]:
df = FeatureExtraction.expand(data_sample)
# HTML(df.to_html())
df.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,...,valid_bearing,scalar_acceleration,scalar_gyro,deltasec,deltasec_speed,deltasec_bearing,delta_speed,delta_bearing,delta_scalar_acceleration,delta_scalar_gyro
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,...,1,10.176551,0.118788,,,,,,,
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,...,1,10.059553,0.101508,1.0,1.0,1.0,-3.214536,0.0,-0.116998,-0.01728
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,...,1,9.503762,0.046282,1.0,1.0,1.0,0.0,0.0,-0.555791,-0.055227
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,...,1,9.83032,0.034511,1.0,1.0,1.0,0.0,0.0,0.326559,-0.01177
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,...,1,9.967466,0.053095,1.0,1.0,1.0,0.0,0.0,0.137145,0.018583


In [123]:
df_sel = df[np.invert(np.isnan(df['delta_speed']))]
delspe = df_sel['delta_speed'].values
np.flip(np.sort(np.absolute(delspe)))[:50]

array([6.58114243, 4.81720352, 4.79992777, 3.96127343, 3.59464788,
       3.51147717, 3.31231403, 3.30077553, 3.28798103, 3.21453607,
       3.09515178, 3.05595704, 3.0213815 , 2.94464254, 2.79476762,
       2.73102093, 2.62628984, 2.4831863 , 2.47003095, 2.40783978,
       2.40376264, 2.37760258, 2.25821733, 2.19961648, 2.16615009,
       2.14760368, 2.12154198, 2.0841372 , 2.07672596, 2.00910664,
       1.95775771, 1.93812513, 1.93665314, 1.92293191, 1.91452122,
       1.9066906 , 1.87064886, 1.82662106, 1.78648233, 1.76472521,
       1.76128197, 1.72339487, 1.70558739, 1.70363164, 1.66785431,
       1.6548748 , 1.6435101 , 1.63406348, 1.6324501 , 1.61570501])

## Train

In [9]:
X = data_complete.drop('label', axis=1).values
y = data_complete['label'].values

In [10]:
model = Classifier()
scores = Evaluation.evaluate(model, X, y)

Fold 1: 0.6561095524382098
Fold 2: 0.6469792919171676
Fold 3: 0.633498997995992
Fold 4: 0.6607228915662651
Fold 5: 0.6430548862115127
Fold 6: 0.6499625167336012
Fold 7: 0.6466171352074966
Fold 8: 0.6434819277108433
Fold 9: 0.657330655957162
Fold 10: 0.6539674374616545
MEAN: 0.6491725293199905
STDDEV: 0.008154833805841557
