#### Create features for the XGBoost model
#### The script reads the preprocessed data from '../data/1-preprocessed/' and writes one aggregated feature table (one row per bookingID) to '../data/2-features/'

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

import warnings
# Suppress UserWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=UserWarning)

# Let pandas display up to 300 columns before truncating wide frames.
pd.set_option("display.max_columns", 300)

# Names of the raw signal columns found in the preprocessed data.
data_cols = [
    'Accuracy',
    'Bearing',
    'acceleration_x',
    'acceleration_y',
    'acceleration_z',
    'gyro_x',
    'gyro_y',
    'gyro_z',
    'Speed',
]

In [2]:
# Default locations used by create_matrix; override them through its keyword arguments.
_PREPROCESSED_DIR = '../data/1-preprocessed/'
_LABELS_CSV = '../data/0-raw_data/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv'
_FEATURES_DIR = '../data/2-features/'


def _add_signal_features(df):
    """Add the hand-crafted per-row columns to ``df`` in place and return it.

    ``df`` must already be ordered the way the lagged differences should see
    it, and must contain ``bookingID``, ``bookingID2``, the three
    ``acceleration_*`` columns, the three ``gyro_*`` columns and ``Speed``.
    """
    # Every acceleration reading with magnitude >= 9.8 m/s2 is considered harsh.
    for col in ['acceleration_x', 'acceleration_y', 'acceleration_z']:
        df['harsh_' + col] = (np.abs(df[col]) >= 9.8).astype(int)

    # Lagged differences for gyro and speed: (value `lag` rows earlier within
    # the same (bookingID, bookingID2) group - current value) / lag.
    for col in ['gyro_x', 'gyro_y', 'gyro_z', 'Speed']:
        for lag in range(1, 6):
            previous = df.groupby(['bookingID', 'bookingID2'])[col].shift(lag)
            df['d_' + col + "-" + str(lag)] = (previous - df[col]) / lag

    # Flag extreme slopes. The thresholds are taken from the slope column
    # itself: the 80th percentile of its positive values and the 20th
    # percentile of its negative values. When a slope has no positive (or no
    # negative) values the quantile is NaN and nothing is flagged.
    slope_cols = [name for name in df.columns if name.startswith('d_')]
    for d_col in slope_cols:
        slope = df[d_col]
        upper = slope[slope > 0].quantile(.8)
        lower = slope[slope < 0].quantile(.2)
        df['harsh_positive_' + d_col] = ((slope >= upper) & (slope > 0)).astype(int)
        df['harsh_negative_' + d_col] = ((slope <= lower) & (slope < 0)).astype(int)

    return df


def create_matrix(_filename, input_dir=_PREPROCESSED_DIR, labels_path=_LABELS_CSV,
                  output_dir=_FEATURES_DIR):
    """Build the per-booking feature table for one preprocessed parquet file.

    Reads ``input_dir/_filename``, adds the hand-crafted signal features,
    aggregates every remaining column per ``bookingID`` with count, mean, std,
    sum, min, median and max, attaches the label (the maximum ``label`` found
    for that ``bookingID`` in ``labels_path``) and writes the result to
    ``output_dir`` as ``<name>_xgboost.parquet``.

    Parameters
    ----------
    _filename : str
        File name of the preprocessed parquet file inside ``input_dir``.
    input_dir : str, optional
        Folder containing the preprocessed parquet files.
    labels_path : str, optional
        CSV file with ``bookingID`` and ``label`` columns.
    output_dir : str, optional
        Existing folder that receives the feature file.

    Returns
    -------
    None
        The result is written to disk; progress is printed to stdout.
    """
    print("Processing %s" %_filename)
    df = pd.read_parquet(join(input_dir, _filename))
    dt = pd.read_csv(labels_path)
    # A booking may appear several times in the label file; keep the maximum.
    dt = dt.groupby('bookingID', as_index=False)['label'].max()

    # Order rows in time within each booking so the lagged shifts are meaningful.
    df = df.sort_values(by=['bookingID','second']).reset_index(drop=True)
    df = _add_signal_features(df)

    # bookingID2 and second are only helpers for ordering/grouping, not features.
    df2 = df.drop(['bookingID2','second'], axis=1)
    df2 = df2.groupby(['bookingID']).agg(['count', 'mean', 'std', 'sum', 'min', 'median', 'max'])
    # Flatten the (column, statistic) MultiIndex into "column_statistic" names.
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]
    df2 = df2.reset_index()

    #add labels
    df2 = df2.merge(dt, on='bookingID', how='left')
    #save file to parquet
    df2.to_parquet(join(output_dir, _filename.replace('.parquet','')+'_xgboost.parquet'))
    print("Finished with %s " %_filename)


    

In [3]:
# Folder holding the preprocessed inputs. Not referenced again in this cell;
# create_matrix builds its own input path.
mypath = "../data/1-preprocessed/"

# Preprocessed parquet files to convert into feature tables.
onlyfiles = ["features.parquet"]

for filename in onlyfiles:
    create_matrix(filename)

print("Finished to create features for XGBoost.")

Processing features.parquet
Finished with features.parquet 
Finished to create features for XGBoost.
