## Warning: the process can take up to 4 hours on an EC2 r4.4xlarge instance
#### We create features for the XGBoost model using the tsfresh library
#### The script reads data from the '../data/1-preprocessed/' folder and writes one large dataframe as a parquet file to '../data/2-features/'

In [38]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tsfresh import extract_features
import json

import warnings
warnings.simplefilter("ignore", UserWarning)

# Show up to 300 columns when a DataFrame is displayed.
pd.options.display.max_columns = 300

# We don't calculate all the possible features with tsfresh, just a part of it.
# The selection is read from a JSON file; the context manager guarantees the
# file handle is closed once the settings are loaded.
with open('./feature_calculator.json', encoding='utf-8') as _settings_file:
    features_to_calculate = json.load(_settings_file)

# Names of the sensor/telemetry columns, including the derived 'acceleration'
# and 'gyro' magnitudes.
# NOTE(review): this list is not referenced by the code visible in this
# notebook section - confirm whether it is still needed.
data_cols = ['Accuracy','Bearing',
             'acceleration_x','acceleration_y','acceleration_z','acceleration',
             'gyro_x','gyro_y','gyro_z','gyro',
             'Speed']

In [40]:
def create_matrix(_filename, n_jobs=14):
    """Build the tsfresh feature matrix for one preprocessed file and save it.

    Reads '../data/1-preprocessed/<_filename>', adds the Euclidean norm of the
    three accelerometer axes and of the three gyroscope axes, renumbers
    'second' per booking, runs tsfresh's ``extract_features`` with the
    module-level ``features_to_calculate`` settings, attaches one label per
    bookingID and writes the result to
    '../data/2-features/<_filename without ".parquet">_xgboost2.parquet'.

    Args:
        _filename: Name of a parquet file inside '../data/1-preprocessed/'.
        n_jobs: Number of parallel jobs passed to ``extract_features``.
            Defaults to 14, the value that used to be hard-coded.

    Returns:
        None. The feature matrix is written to disk.
    """
    print("Processing %s" %_filename)
    df = pd.read_parquet('../data/1-preprocessed/'+_filename)
    dt = pd.read_csv('../data/0-raw_data/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv')
    # Collapse the label file to one row per bookingID, keeping the largest
    # label when a booking appears several times.
    dt = dt.groupby('bookingID', as_index=False)['label'].max()

    # Magnitude (Euclidean norm) of the 3-axis sensor readings.
    df['acceleration'] = (df[["acceleration_x", "acceleration_y", "acceleration_z"]]**2).sum(axis=1)**0.5
    df['gyro'] = (df[["gyro_x", "gyro_y", "gyro_z"]]**2).sum(axis=1)**0.5

    # Replace 'second' by a 0-based running counter within each bookingID,
    # so every booking is sorted by time and starts from 0.
    df = df.sort_values(by=['bookingID','second']).reset_index(drop=True)
    df['second'] = df.groupby(['bookingID'])['second'].cumcount()
    # 'Accuracy' is excluded from feature extraction.
    df = df.drop(['Accuracy'], axis=1)

    extracted_features = extract_features(df, column_id="bookingID", column_sort="second",
                                          default_fc_parameters=features_to_calculate, n_jobs=n_jobs)

    # The result is indexed by the booking id. Name that index explicitly so
    # the 'bookingID' column exists after reset_index() regardless of what
    # name (if any) the returned index carries; the merge below needs it.
    extracted_features = extracted_features.rename_axis('bookingID').reset_index()
    # Add labels; a left join keeps bookings that have no label row.
    df = extracted_features.merge(dt, on='bookingID', how='left')
    # Save file to parquet.
    df.to_parquet('../data/2-features/'+_filename.replace('.parquet','')+'_xgboost2.parquet')
    print("Finished with %s " %_filename)


In [41]:
# Folder holding the preprocessed parquet inputs. It is not used by the loop
# below; create_matrix builds the same path itself.
mypath = '../data/1-preprocessed/'
# Names of the files to process.
onlyfiles = ['features.parquet']

# Build and save the feature matrix for each listed file, one at a time.
for filename in onlyfiles:
    create_matrix(filename)
print("Finished to create additional features for XGBoost.")

Processing features.parquet






Feature Extraction:   0%|          | 0/70 [00:00<?, ?it/s][A[A[A[A



Feature Extraction:   1%|▏         | 1/70 [33:55<39:00:11, 2034.95s/it][A[A[A[A



Feature Extraction:   3%|▎         | 2/70 [35:15<27:21:48, 1448.66s/it][A[A[A[A



Feature Extraction:   4%|▍         | 3/70 [35:39<19:00:25, 1021.27s/it][A[A[A[A



Feature Extraction:   6%|▌         | 4/70 [36:11<13:16:44, 724.31s/it] [A[A[A[A



Feature Extraction:   7%|▋         | 5/70 [36:17<9:11:24, 509.00s/it] [A[A[A[A



Feature Extraction:   9%|▊         | 6/70 [36:21<6:21:20, 357.51s/it][A[A[A[A



Feature Extraction:  10%|█         | 7/70 [36:25<4:24:02, 251.47s/it][A[A[A[A



Feature Extraction:  11%|█▏        | 8/70 [36:44<3:07:42, 181.65s/it][A[A[A[A



Feature Extraction:  13%|█▎        | 9/70 [36:56<2:12:52, 130.70s/it][A[A[A[A



Feature Extraction:  14%|█▍        | 10/70 [37:02<1:33:15, 93.26s/it][A[A[A[A



Feature Extraction:  16%|█▌        | 11/70 [37:23<1:10:33, 71.7

Finished with features.parquet 
Finished to create additional features for XGBoost.
