# Experiment

In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('..')

from IPython.display import HTML
import pandas as pd
import numpy as np
import os
import pickle
from tqdm import tqdm

import definitions
from modules.preparation import FeatureExtraction
from modules.model.classifier import Classifier
from modules.model.evaluation import Evaluation
from modules.common import Feature

## Loading

In [2]:
# Load the label table from the first entry of the label directory.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this
# presumably relies on the directory holding exactly one file - confirm.
label_dir = definitions.DATA_LABEL
label_file = os.listdir(label_dir)[0]
labels = pd.read_csv(os.path.join(label_dir, label_file))
labels.head()

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [3]:
# Load the prepared per-second telemetry from the first entry of the prep directory.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this
# presumably relies on the directory holding exactly one file - confirm.
prep_dir = definitions.DATA_PREP
prep_file = os.listdir(prep_dir)[0]
data_raw = pd.read_csv(os.path.join(prep_dir, prep_file))
data_raw.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [4]:
# Keep only the rows belonging to bookingID 0 as a small sample to explore.
is_sample_booking = data_raw['bookingID'] == 0
data_sample = data_raw.loc[is_sample_booking]
data_sample.head()

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454


In [5]:
# Load the aggregated per-booking feature table from the first entry of the
# aggregated-data directory.
# NOTE(review): os.listdir() returns entries in arbitrary order, so this
# presumably relies on the directory holding exactly one file - confirm.
aggregated_dir = definitions.DATA_AGGREGATED
aggregated_file = os.listdir(aggregated_dir)[0]
data = pd.read_csv(os.path.join(aggregated_dir, aggregated_file))
data.head()

Unnamed: 0,best_accuracy,bookingID,mean1_acceleration_x,mean1_acceleration_y,mean1_acceleration_z,mean1_delta_acceleration_x,mean1_delta_acceleration_y,mean1_delta_acceleration_z,mean1_delta_bearing,mean1_delta_gyro_x,...,stddev_delta_scalar_acceleration,stddev_delta_scalar_gyro,stddev_delta_speed,stddev_gyro_x,stddev_gyro_y,stddev_gyro_z,stddev_scalar_acceleration,stddev_scalar_gyro,stddev_speed,worst_accuracy
0,4.0,0,2.840349,-7.209301,1.620598,4.279216,2.946221,4.600614,23.382351,0.335549,...,0.873805,0.09871,0.724042,0.065921,0.100175,0.063654,0.623848,0.091343,7.192993,48.0
1,3.0,1,2.999995,11.39368,0.666485,2.267892,2.560028,2.728265,30.25,0.146817,...,0.746419,0.058891,0.750789,0.027724,0.091645,0.033818,0.51965,0.077417,7.021788,7.709
2,3.0,2,1.956122,13.333716,2.31287,2.063865,3.373533,2.942564,157.0,0.300546,...,0.789266,0.084953,0.857855,0.053765,0.11702,0.036122,0.51385,0.092789,2.890323,8.0
3,10.0,4,1.653586,-7.581676,-0.29825,1.726963,3.868589,2.615402,23.04743,0.2891,...,0.87678,0.089444,0.746951,0.042322,0.112516,0.065897,0.619783,0.08946,5.593342,10.0
4,3.0,6,3.858134,12.010161,5.682811,4.038663,2.825723,3.832276,52.4,0.327974,...,0.796394,0.08197,0.672737,0.055144,0.106766,0.057411,0.585079,0.09866,5.312417,12.0


In [6]:
# Print every bookingID that has two or more rows in `labels`, together with
# the label values recorded for it.
for booking_id, booking_rows in labels.groupby('bookingID'):
    if len(booking_rows) < 2:
        continue
    print(booking_id, booking_rows['label'].values)

13 [0 1]
154618822837 [1 0]
223338299461 [0 1]
395136991308 [1 0]
403726925929 [0 1]
455266533495 [1 0]
481036337234 [1 0]
515396075694 [0 1]
695784702084 [0 1]
919123001348 [1 0]
970662608932 [0 1]
1279900254294 [1 0]
1348619731077 [1 0]
1391569403991 [0 1]
1408749273124 [0 1]
1511828488211 [1 0]
1632087572573 [1 0]
1649267441751 [1 0]


In [7]:
# Join the aggregated features with their labels on the booking id.
# suffixes=(False, False) means no suffix is applied to overlapping column names.
# keep=False then removes *every* row of a bookingID that occurs more than
# once after the merge, rather than keeping one of them.
data_complete = (
    data
    .merge(labels, on=Feature.FEAT_booking_id, suffixes=(False, False))
    .drop_duplicates(subset='bookingID', keep=False)
)
data_complete.head()

Unnamed: 0,best_accuracy,bookingID,mean1_acceleration_x,mean1_acceleration_y,mean1_acceleration_z,mean1_delta_acceleration_x,mean1_delta_acceleration_y,mean1_delta_acceleration_z,mean1_delta_bearing,mean1_delta_gyro_x,...,stddev_delta_scalar_gyro,stddev_delta_speed,stddev_gyro_x,stddev_gyro_y,stddev_gyro_z,stddev_scalar_acceleration,stddev_scalar_gyro,stddev_speed,worst_accuracy,label
0,4.0,0,2.840349,-7.209301,1.620598,4.279216,2.946221,4.600614,23.382351,0.335549,...,0.09871,0.724042,0.065921,0.100175,0.063654,0.623848,0.091343,7.192993,48.0,0
1,3.0,1,2.999995,11.39368,0.666485,2.267892,2.560028,2.728265,30.25,0.146817,...,0.058891,0.750789,0.027724,0.091645,0.033818,0.51965,0.077417,7.021788,7.709,1
2,3.0,2,1.956122,13.333716,2.31287,2.063865,3.373533,2.942564,157.0,0.300546,...,0.084953,0.857855,0.053765,0.11702,0.036122,0.51385,0.092789,2.890323,8.0,1
3,10.0,4,1.653586,-7.581676,-0.29825,1.726963,3.868589,2.615402,23.04743,0.2891,...,0.089444,0.746951,0.042322,0.112516,0.065897,0.619783,0.08946,5.593342,10.0,1
4,3.0,6,3.858134,12.010161,5.682811,4.038663,2.825723,3.832276,52.4,0.327974,...,0.08197,0.672737,0.055144,0.106766,0.057411,0.585079,0.09866,5.312417,12.0,0


In [8]:
# Report how many rows carry each label value, then the overall row count.
label_column = data_complete['label']
for label_value in (0, 1):
    print(f'{label_value} : {len(data_complete[label_column == label_value])}')
print(f'Total : {len(data_complete)}')

0 : 14999
1 : 4983
Total : 19982


## Exploration

In [9]:
# Run the project's feature expansion on the single-booking sample and render
# its first five rows as an HTML table.
df = FeatureExtraction.expand(data_sample)
HTML(df.iloc[:5].to_html())

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,valid_speed,valid_bearing,scalar_acceleration,scalar_gyro,deltasec,deltasec_speed,deltasec_bearing,delta_speed,delta_bearing,delta_scalar_acceleration,delta_scalar_gyro,delta_acceleration_x,delta_acceleration_y,delta_acceleration_z,delta_gyro_x,delta_gyro_y,delta_gyro_z
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991,1,1,10.176551,0.118788,,,,,,,,,,,,,
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454,1,1,10.059553,0.101508,1.0,1.0,1.0,-3.214536,0.0,-0.116998,-0.01728,-0.271707,0.105872,-0.023926,-0.030847,0.015166,-0.027545
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454,1,1,9.503762,0.046282,1.0,1.0,1.0,0.0,0.0,-0.555791,-0.055227,-2.252612,0.564798,0.829477,0.018127,0.046222,-0.027797
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454,1,1,9.83032,0.034511,1.0,1.0,1.0,0.0,0.0,0.326559,-0.01177,0.289502,-0.27724,-0.651529,0.006552,0.037701,-0.041143
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454,1,1,9.967466,0.053095,1.0,1.0,1.0,0.0,0.0,0.137145,0.018583,0.818561,-0.305502,0.482404,0.008117,-0.051254,0.047654


## Train

In [10]:
# Feature matrix: every column except the target and the booking id, with the
# columns put in sorted-name order, converted to a NumPy array.
X = (
    data_complete
    .drop(columns=['label', 'bookingID'])
    .pipe(lambda frame: frame.reindex(sorted(frame.columns), axis=1))
    .values
)
# Target vector taken from the 'label' column.
y = data_complete['label'].values

In [11]:
def get_weight(cls, positive_weight=2.0, negative_weight=1.0):
    """Return one sample weight per class label.

    Parameters
    ----------
    cls : array-like
        Class labels. Entries equal to 1 receive ``positive_weight``.
    positive_weight : float, default 2.0
        Weight assigned where the label equals 1.
    negative_weight : float, default 1.0
        Weight assigned to every other label.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``cls`` holding the selected weights.
    """
    # Coerce first: comparing a plain Python list with ``== 1`` yields a single
    # bool, which would make np.where return a 0-d array instead of one weight
    # per sample.
    labels_arr = np.asarray(cls)
    return np.where(labels_arr == 1, positive_weight, negative_weight)

# One weight per training row, derived from the label vector y.
sample_weight = get_weight(y)

In [12]:
# Build the project Classifier and score it with Evaluation.evaluate, passing
# the per-sample weights. The recorded output shows ten fold scores followed by
# their mean and standard deviation; the metric itself is defined in
# modules.model.evaluation and is not visible here.
model = Classifier()
scores = Evaluation.evaluate(model, X, y, sample_weight)

Fold 1: 0.6967040748162991
Fold 2: 0.7304769539078156
Fold 3: 0.7291783567134268
Fold 4: 0.7069986613119142
Fold 5: 0.7044377510040161
Fold 6: 0.7061499330655958
Fold 7: 0.7041566265060241
Fold 8: 0.7059973226238286
Fold 9: 0.719429718875502
Fold 10: 0.7374930006885447
MEAN: 0.7141022399512967
STDDEV: 0.01392573036581773


In [13]:
# Fit the model on the full feature matrix and labels, using the sample weights.
model.fit(X, y, sample_weight)

In [14]:
# Persist the fitted model. The context manager closes (and therefore flushes)
# the file handle deterministically instead of leaving it to garbage collection.
with open(definitions.MODEL_FINAL, 'wb') as model_file:
    pickle.dump(model, model_file)

In [15]:
# Reload the persisted model, closing the file handle as soon as it is read.
# NOTE: pickle.load can execute arbitrary code, so only load files this
# project wrote itself.
with open(definitions.MODEL_FINAL, 'rb') as model_file:
    model_load = pickle.load(model_file)

In [16]:
# Sanity-check the reloaded model on the first 1000 rows.
# NOTE(review): these rows are part of the X the model was fitted on, so this
# score is in-sample and not comparable to the fold scores reported by
# Evaluation.evaluate.
pred = model_load.predict(X[:1000])
Evaluation.score(y[:1000], pred)

0.9382779352918702

In [17]:
# Peek at the first ten predictions.
pred[:10]

array([0.60137829, 0.55775538, 0.71847224, 0.47609618, 0.30444702,
       0.09665763, 0.15954784, 0.07492398, 0.26740922, 0.33080954])