#### We combine the records of every bookingID from all the CSV files into one big parquet file
#### Missing samples are filled in by interpolation as long as the gap between two consecutive samples is at most 2 minutes

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join


In [3]:
# Directory holding the raw CSV partitions ("part-*" files).
mypath = '../data/0-raw_data/safety/features/'
# listdir() returns entries in arbitrary order; sort so the load order is
# the same on every run.
onlyfiles = sorted(f for f in listdir(mypath) if f.startswith('part-'))
if not onlyfiles:
    # Fail here with a clear message instead of a KeyError further down.
    raise FileNotFoundError("No 'part-*' files found in " + mypath)

# Columns kept from each CSV, in the order used for the merged frame.
cols = ['bookingID','Accuracy','Bearing',
        'acceleration_x','acceleration_y','acceleration_z',
        'gyro_x','gyro_y','gyro_z','second','Speed']

# Value columns that are resampled and interpolated later on
# (everything except the bookingID key).
data_cols = ['Accuracy','Bearing',
             'acceleration_x','acceleration_y','acceleration_z',
             'gyro_x','gyro_y','gyro_z','Speed','second']

# Collect every partition first and concatenate once: DataFrame.append was
# removed in pandas 2.0 and copied the whole accumulated frame on each call.
parts = []
for filename in onlyfiles:
    part = pd.read_csv(join(mypath, filename))
    # Keep only the known columns and drop rows without a bookingID.
    part = part.loc[part['bookingID'].notna(), cols]
    parts.append(part)

df = pd.concat(parts, ignore_index=True)
# A single sort after the merge is enough; the later per-booking gap
# computation relies on rows being ordered by time within each booking.
df = df.sort_values(by=['bookingID','second']).reset_index(drop=True)
print("Finished to regroup all bookingID into 1 dataframe")
print(df.shape)

# Largest gap (in seconds) between two consecutive samples of a booking that
# is still filled by interpolation; a longer gap splits the trip.
MAX_GAP_SECONDS = 120

# Time elapsed since the previous sample of the same booking (NaN for the
# first sample of each booking). Relies on df being sorted by
# bookingID / second.
gap = df.groupby('bookingID')['second'].diff()

# If the gap is longer than 2 minutes the trip is split into sub-trips,
# because interpolating over such a long duration does not make sense.
# bookingID2 is the running count of long gaps seen so far within the
# booking, i.e. the 0-based index of the sub-trip.
df['bookingID2'] = (
    (gap > MAX_GAP_SECONDS).astype('int64').groupby(df['bookingID']).cumsum()
)

# Turn the elapsed seconds into a DatetimeIndex so the frame can be resampled.
df['time'] = pd.to_datetime(df['second'], unit='s')
df = df.set_index('time')

# Expand every (bookingID, bookingID2) segment to one row per second and fill
# the inserted rows by linear interpolation.
# '1s' replaces the '1S' alias, which is deprecated in recent pandas.
# NOTE(review): interpolate() runs over the concatenated result, not per
# segment. Each segment starts and ends on an observed row, so nothing should
# be filled across segment boundaries as long as the raw value columns
# contain no NaN - confirm against the raw data.
print("Expanding")
df = (
    df.groupby(['bookingID','bookingID2'])[data_cols]
    .resample('1s')
    .asfreq()
    .interpolate(method='linear')
)
df = df.reset_index()
print(df.shape)

# The helper timestamp is no longer needed; 'second' carries the time.
df = df.drop(columns=['time'])
print("Saving features...")
df.to_parquet('../data/1-preprocessed/features.parquet')
print("Finished to save file")

Finished to regroup all bookingID into 1 dataframe
(16135561, 11)
Expanding
(16970447, 13)
Saving features...
Finished to save file
