## This script redoes all the preprocessing steps done in the 01-preprocessing folder and also creates all the features needed for the prediction

####  We regroup the different bookingIDs from all csv files into one big parquet file
#### We assume that the hold-out data will have the same format as the training data available online
#### For missing information we interpolate the data as long as the gap between two consecutive records is at most 2 minutes

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tsfresh import extract_features
import json
import warnings

# Ignore every UserWarning raised from here on.
warnings.simplefilter("ignore", UserWarning)

# Let pandas display up to 300 columns (the feature frames are wide).
pd.options.display.max_columns = 300

# We won't calculate all the possible features with tsfresh, just a part of it.
# The selection is loaded from a JSON file; the context manager guarantees the
# file handle is closed once the settings are parsed.
with open('./feature_calculator.json', encoding='utf-8') as fc_file:
    features_to_calculate = json.load(fc_file)

### 0 - Configure paths for hold out data

In [4]:
# Folder holding the raw holdout feature files (one or more 'part-*' csv files).
raw_mypath = '../data/0-raw_data/holdout/features/'
# listdir() returns entries in arbitrary order; sorting makes the run reproducible.
raw_onlyfiles = sorted(f for f in listdir(raw_mypath) if f.startswith('part-'))

# Folder holding the raw holdout label file.
label_mypath = '../data/0-raw_data/holdout/labels/'
# Normally there should be only one file for the holdout label data
label_files = sorted(f for f in listdir(label_mypath) if f.startswith('part-'))
if not label_files:
    # Fail early with an explicit message instead of an IndexError on [0].
    raise FileNotFoundError("No 'part-*' label file found in %s" % label_mypath)
# If several files match, the first one in sorted order is used.
filename_label = label_files[0]

# Destination of the regrouped / interpolated holdout data.
pre_mypath = '../data/1-preprocessed/'
pre_filename = 'holdout_features.parquet'

# Destination of the per-booking feature files.
ft_mypath = '../data/2-features/'

# Destination of the final merged ML dataset.
ml_mypath = '../data/3-ml_datasets/'

### 1- Regroup all csv files of the holdout data into one big parquet file

In [5]:
print("Regrouping all csv in one dataframe...")

# Columns kept from every raw csv file.
cols = ['bookingID','Accuracy','Bearing',
        'acceleration_x','acceleration_y','acceleration_z',
        'gyro_x','gyro_y','gyro_z','second','Speed']

# Columns that are resampled / interpolated (everything except the booking key).
data_cols = ['Accuracy','Bearing',
             'acceleration_x','acceleration_y','acceleration_z',
             'gyro_x','gyro_y','gyro_z','Speed','second']

# Collect the part files in a list and concatenate once: DataFrame.append in a
# loop copies the accumulated frame on every iteration and no longer exists in
# pandas 2.x. pd.concat raises ValueError if no part file was found.
frames = []
for filename in raw_onlyfiles:
    temp = pd.read_csv(raw_mypath+filename)
    temp = temp[cols]
    # rows without a bookingID cannot be attached to any trip
    temp = temp.loc[~temp.bookingID.isnull()]
    frames.append(temp)
df = pd.concat(frames, ignore_index=True)
# a single sort at the end is enough (the original re-sorted after every file)
df = df.sort_values(by=['bookingID','second']).reset_index(drop=True)
print("Finished to regroup all bookingID into 1 dataframe")
print(df.shape)

#interpolate data for diff<=120
# gap (in seconds) between a record and the previous record of the same booking
gap = df['second'] - df.groupby(['bookingID'])['second'].shift(+1)
#if the gap is > 2 minutes we consider that the trip has to be split into 2 sub parts
# we create a second bookingID because it doesn't make sense to interpolate for such a long duration
# bookingID2 is the running count of such gaps within the booking (0 for the first part)
df['bookingID2'] = (gap > 120).astype('int64').groupby(df['bookingID']).cumsum()

# turn the 'second' counter into a timestamp so that the frame can be resampled
df['time'] = pd.to_datetime(df["second"], unit='s')

df = df.set_index(['time'])
#expand and interpolate trip by booking ID and bookingID2
print("Expanding")
# Every (bookingID, bookingID2) part is expanded to one row per second; the rows
# created that way are empty and are filled by linear interpolation. Each part
# starts and ends on an observed row, so the created rows always lie between two
# observations of the same part.
df = df.groupby(['bookingID','bookingID2'])[data_cols].resample('1S').asfreq().interpolate(method='linear')
df = df.reset_index()
print(df.shape)

# the timestamp was only needed for resampling; 'second' carries the same information
df = df.drop(['time'], axis=1)

#for testing only
#df = df.loc[df.bookingID.isin(np.random.choice(df.bookingID.unique(), 100))].reset_index(drop=True)

print("Saving preprocessed features...")
df.to_parquet(pre_mypath+pre_filename)
print("Finished to save file")

Finished to regroup all bookingID into 1 dataframe
(3227110, 11)
Expanding
(16579136, 13)
Saving preprocessed features...
Finished to save file


### 2 - Do the 1st feature engineering for classic ML
#### We create features for Xgboost/lightgbm model
#### The script takes data from the 1-preprocessed folder and returns a classic dataframe in '../data/2-features/'

In [12]:
def create_matrix_ml1(_filename, _filename_label):
    """Build the first batch of per-booking features and save it as parquet.

    Reads the preprocessed file ``pre_mypath + _filename`` and the label csv
    ``label_mypath + _filename_label``, derives row-level features (harsh
    acceleration flags, lagged slopes of gyro/speed and harsh-slope flags),
    aggregates every column per ``bookingID`` and writes the result, with the
    label attached, to ``ft_mypath + <_filename without '.parquet'> +
    '_xgboost1.parquet'``.

    Parameters
    ----------
    _filename : str
        Name of the preprocessed parquet file inside ``pre_mypath``.
    _filename_label : str
        Name of the label csv file inside ``label_mypath``.

    Returns
    -------
    None
        The result is written to disk.
    """
    print("Processing %s" %_filename)
    df = pd.read_parquet(pre_mypath+_filename)
    dt = pd.read_csv(label_mypath+_filename_label)
    # one label per booking: if a booking is listed several times keep the max
    dt = dt.groupby('bookingID', as_index=False)['label'].max()

    # rows must be in time order inside each booking for the lagged slopes below
    df = df.sort_values(by=['bookingID','second']).reset_index(drop=True)

    #manual feature creation
    #every acceleration >= 9.8m/s2 is considered as harsh
    for col in ['acceleration_x','acceleration_y','acceleration_z']:
        df['harsh_'+col] = (np.abs(df[col])>= 9.8).astype(int)

    # calculate the slope for gyro and speed
    # d_<col>-<i> = (value i rows earlier in the same trip part - current value) / i
    for col in ['gyro_x','gyro_y','gyro_z','Speed']:
        per_part = df.groupby(['bookingID','bookingID2'])[col]
        for i in range(1,6):
            df['d_'+col+"-"+str(i)] = (per_part.shift(+i) - df[col])/i

    # for the slopes, flag when they are above the 80 percentile of the positive
    # slopes / below the 20 percentile of the negative slopes.
    # The thresholds are taken from the slope column itself. The previous code
    # used the leftover loop variable `col` (always 'Speed') instead of `d_col`.
    # NOTE(review): this changes the values of the harsh_positive_*/harsh_negative_*
    # features; a model fitted on features built with the old expression has to be
    # refitted on features built with this one.
    slope_cols = [name for name in df.columns if name.startswith('d_')]
    for d_col in slope_cols:
        slope = df[d_col]
        high = slope[slope > 0].quantile(.8)
        low = slope[slope < 0].quantile(.2)
        df['harsh_positive_'+d_col] = ((slope >= high) & (slope > 0)).astype(int)
        df['harsh_negative_'+d_col] = ((slope <= low) & (slope < 0)).astype(int)

    # aggregate every remaining column per booking; the helper columns are not features
    df2 = df.drop(['bookingID2','second'], axis=1)
    df2 = df2.groupby(['bookingID']).agg(['count', 'mean', 'std', 'sum', 'min', 'median', 'max'])
    # flatten the (column, statistic) MultiIndex into '<column>_<statistic>'
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]

    df2 = df2.reset_index()

    #add labels
    df2 = df2.merge(dt, on='bookingID', how='left')
    #save file to parquet
    df2.to_parquet(ft_mypath+_filename.replace('.parquet','')+'_xgboost1.parquet')
    print("Finished with %s " %_filename)
    

In [13]:
# Build the first batch of features from the preprocessed holdout file and the
# holdout label file configured above.
print("Creating 1st batch of features for Xgboost/Lgbm...")
create_matrix_ml1(pre_filename, filename_label)
print("Finished to create features for XGBoost/lgbm.")

Creating 1st batch of features for Xgboost/Lgbm...
Processing holdout_features.parquet
Finished with holdout_features.parquet 
Finished to create features for XGBoost/lgbm.


### 3 - Do the 2nd feature engineering for classic ML
### Be careful: this part may take a while (4 hours on an EC2 r4.4xlarge instance)
#### We create additional features for the Xgboost/lightgbm model using tsfresh
#### The script takes data from the 1-preprocessed folder and returns an additional classic dataframe in '../data/2-features/'

In [15]:
def create_matrix_ml2(_filename, _filename_label, n_jobs=14):
    """Build the second (tsfresh) batch of per-booking features and save it.

    Reads the preprocessed file ``pre_mypath + _filename`` and the label csv
    ``label_mypath + _filename_label``, adds the norms of the acceleration and
    gyro vectors, runs tsfresh with the ``features_to_calculate`` settings and
    writes the result, with the label attached, to ``ft_mypath + <_filename
    without '.parquet'> + '_xgboost2.parquet'``.

    Parameters
    ----------
    _filename : str
        Name of the preprocessed parquet file inside ``pre_mypath``.
    _filename_label : str
        Name of the label csv file inside ``label_mypath``.
    n_jobs : int, optional
        Value forwarded to tsfresh's ``extract_features``. Defaults to 14, the
        value that used to be hard-coded.

    Returns
    -------
    None
        The result is written to disk.
    """
    print("Processing %s" %_filename)
    df = pd.read_parquet(pre_mypath+_filename)
    dt = pd.read_csv(label_mypath+_filename_label)
    # one label per booking: if a booking is listed several times keep the max
    dt = dt.groupby('bookingID', as_index=False)['label'].max()

    # Euclidean norm of the 3-axis acceleration and gyro readings
    df['acceleration'] = (df[["acceleration_x", "acceleration_y", "acceleration_z"]]**2).sum(axis=1)**0.5
    df['gyro'] = (df[["gyro_x", "gyro_y", "gyro_z"]]**2).sum(axis=1)**0.5

    #we reset second to start from 0 for every (bookingID)
    df = df.sort_values(by=['bookingID','second']).reset_index(drop=True)
    df['second'] = df.groupby(['bookingID'])['second'].cumcount()
    #df = df.loc[df.bookingID.isin(df.bookingID.unique()[:300])].copy()
    # Accuracy is excluded from the tsfresh features
    df = df.drop(['Accuracy'], axis=1)

    # NOTE(review): bookingID2 is still a column of df at this point, so tsfresh
    # presumably extracts features from it as well - confirm this is intended.
    extracted_features = extract_features(df, column_id="bookingID", column_sort="second",
                                          default_fc_parameters=features_to_calculate, n_jobs=n_jobs)

    # NOTE(review): assumes the index of the tsfresh result is named 'id' - confirm
    # against the installed tsfresh version.
    extracted_features = extracted_features.reset_index().rename(columns={'id':'bookingID'})
    #add labels
    df = extracted_features.merge(dt, on='bookingID', how='left')
    #save file to parquet
    df.to_parquet(ft_mypath+_filename.replace('.parquet','')+'_xgboost2.parquet')
    print("Finished with %s " %_filename)


In [16]:
# Build the second (tsfresh) batch of features from the preprocessed holdout
# file and the holdout label file configured above.
print("Creating 2nd batch of features for Xgboost/Lgbm...")
create_matrix_ml2(pre_filename, filename_label)
print("Finished to create additional features for XGBoost/lgbm.")

Creating 2nd batch of features for Xgboost/Lgbm...
Processing holdout_features.parquet


Feature Extraction: 100%|██████████| 69/69 [01:40<00:00,  1.74s/it]


Finished with holdout_features.parquet 
Finished to create additional features for XGBoost/lgbm.


### 4 - Regroup holdout_features_xgboost1.parquet and holdout_features_xgboost2.parquet into one file

In [18]:
print("Regrouping 1st and 2nd batches of features for Xgboost/Lgbm...")
# First batch: its label column is removed, so the label of the merged frame
# comes from the second batch.
df_xgb1 = pd.read_parquet(ft_mypath+'holdout_features_xgboost1.parquet').drop(columns='label')
df_xgb2 = pd.read_parquet(ft_mypath+'holdout_features_xgboost2.parquet')
# From the second batch keep the join key plus every column that the first
# batch does not already provide.
already_present = df_xgb1.columns
keep_cols2 = ['bookingID']
keep_cols2.extend(name for name in df_xgb2.columns if name not in already_present)
# Left join: one output row per booking of the first batch.
df_xgb = pd.merge(df_xgb1, df_xgb2[keep_cols2], on='bookingID', how='left')

df_xgb.to_parquet(ml_mypath+'holdout_xgb.parquet')

print("Finished to create final ml dataset for XGBoost/lgbm.")

Regrouping 1st and 2nd batches of features for Xgboost/Lgbm...
Finished to create final ml dataset for XGBoost/lgbm.
