# Preprocess OBD-II dataset

## Load raw data
At this stage, the `datasets/` directory next to this notebook should contain the raw file `VehicularData(anonymized).csv`.

In [1]:
import numpy as np
import os, copy, pickle, json
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# All paths in this notebook are resolved relative to the current working directory.
parent = os.path.abspath('')
raw_csv_name = 'VehicularData(anonymized).csv'
dataset = os.path.join(parent, 'datasets', raw_csv_name)

# Load the raw CSV and list its columns for a quick visual check.
df1 = pd.read_csv(dataset)
raw_columns = df1.columns
print(f"Columns : {raw_columns}")

Columns : Index(['Car_Id', 'Person_Id', 'Trip', 'GPS_Time', 'Device_Time', 'GPS_Long',
       'GPS_Lat', 'GPS_Speed_Ms', 'GPS_HDOP', 'GPS_Bearing', 'Gx', 'Gy', 'Gz',
       'G_Calibrated', 'OBD_KPL_Average', 'OBD_Trip_KPL_Average',
       'OBD_Intake_Air_Temp_C', 'Device_Barometer_M', 'GPS_Altitude_M',
       'OBD_Engine_Load', 'OBD_Fuel_Level', 'GPS_Accuracy_M', 'OBD_Speed_Km',
       'GPS_Speed_Km', 'Device_Trip_Dist_Km', 'OBD_Engine_Coolant_Temp_C',
       'OBD_Engine_RPM', 'OBD_Adapter_Voltage', 'OBD_KPL_Instant',
       'OBD_Fuel_Flow_CCmin', 'Device_Fuel_Remaining',
       'OBD_Ambient_Air_Temp_C', 'OBD_CO2_gkm_Average', 'OBD_CO2_gkm_Instant',
       'Device_Cost_Km_Inst', 'Device_Cost_Km_Trip', 'OBD_Air_Pedal',
       'Context', 'Acceleration_kmhs', 'Reaction_Time', 'Air_Drag_Force',
       'Speed_RPM_Relation', 'KPL_Instant'],
      dtype='object')


  df1 = pd.read_csv(dataset)


## Drop non-OBD/unnecessary columns
Save the filtered dataframe to a separate CSV

In [2]:
# Columns removed from the raw frame: everything that is not an OBD reading
# needed downstream (GPS, accelerometer, device-derived and bookkeeping fields).
drop_cols = ['GPS_Time', 'Device_Time', 'Trip', 'GPS_Long', 'GPS_Lat', 'GPS_Speed_Ms', 'GPS_HDOP', 'GPS_Bearing', 'Gx', 'Gy', 'Gz', 'G_Calibrated', 'OBD_Trip_KPL_Average',
             'Device_Barometer_M', 'GPS_Altitude_M', 'GPS_Accuracy_M', 'GPS_Speed_Km', 'Device_Trip_Dist_Km', 'OBD_Adapter_Voltage', 'Device_Fuel_Remaining',
             'Device_Cost_Km_Inst', 'Device_Cost_Km_Trip', 'Context', 'Reaction_Time', 'Speed_RPM_Relation', 'KPL_Instant']

# Drop all listed columns in one call. errors='ignore' keeps the best-effort
# behaviour for columns that are absent (e.g. when this cell is re-run), without
# a bare `except` that would also hide unrelated failures or a KeyboardInterrupt.
df1 = df1.drop(columns=drop_cols, errors='ignore')

# `columns` is reused by the normalization cell to map names to array positions.
columns = df1.columns
print(f"Columns after dropping :\n{columns}\nCount : {len(columns)}")
print(f"Number of samples : {len(df1)}")

# Persist the filtered (still unscaled) frame next to the raw CSV.
df1.to_csv(os.path.join(parent, 'datasets', 'vehicular_modified.csv'), index=False)

Columns after dropping :
Index(['Car_Id', 'Person_Id', 'OBD_KPL_Average', 'OBD_Intake_Air_Temp_C',
       'OBD_Engine_Load', 'OBD_Fuel_Level', 'OBD_Speed_Km',
       'OBD_Engine_Coolant_Temp_C', 'OBD_Engine_RPM', 'OBD_KPL_Instant',
       'OBD_Fuel_Flow_CCmin', 'OBD_Ambient_Air_Temp_C', 'OBD_CO2_gkm_Average',
       'OBD_CO2_gkm_Instant', 'OBD_Air_Pedal', 'Acceleration_kmhs',
       'Air_Drag_Force'],
      dtype='object')
Count : 17
Number of samples : 91794


## Normalize the dataset
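Min-max scale every feature column (all columns except `Car_Id` and `Person_Id`) to [0, 1] using scalers fitted on the Car 1 rows only, then apply the same scalers to the Car 2 rows. The scaled data is saved to JSON and the fitted scalers are pickled for later use.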

In [3]:
# One dense array per car, with columns in the same order as `columns`.
data1 = np.array(df1[df1['Car_Id']==1])
data2 = np.array(df1[df1['Car_Id']==2])
num_samples1 = len(data1)
num_samples2 = len(data2)

id_columns = ('Car_Id', 'Person_Id')  # identifier columns are left unscaled
scalers = {}
for c_idx, col_name in enumerate(columns):
    if col_name in id_columns:
        continue

    car1_column = data1[:, c_idx].reshape(-1, 1)
    car2_column = data2[:, c_idx].reshape(-1, 1)

    # The scaling range is learned from the CarID=1 rows only.
    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler = StandardScaler()
    scaler.fit(car1_column)
    data1[:, c_idx] = scaler.transform(car1_column).reshape((num_samples1,))

    # CarID=2 rows reuse the CarID=1 scaler, so their values may leave [0, 1].
    data2[:, c_idx] = scaler.transform(car2_column).reshape((num_samples2,))

    # Keep one fitted scaler per feature name for future use.
    scalers[col_name] = copy.deepcopy(scaler)

# Scaled data goes to JSON: the column names plus one list of rows per car.
new_dataset = {
    'columns': list(columns),
    'car1': data1.tolist(),
    'car2': data2.tolist(),
}

json_path = os.path.join(parent, 'datasets', 'vehicular_modified.json')
with open(json_path, 'w') as fp:
    json.dump(new_dataset, fp)

# Fitted scalers are pickled separately, keyed by column name.
scalers_path = os.path.join(parent, 'datasets', 'obd2_features_scalers.pk')
with open(scalers_path, 'wb') as fp:
    pickle.dump(scalers, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Load Filtered CSV and JSON for sanity check

In [5]:
# Reload the normalized JSON written by the previous step.
json_path = os.path.join(parent, 'datasets', 'vehicular_modified.json')
with open(json_path, 'r') as fp:
    df = json.load(fp)
columns = list(df['columns'])
for car_key in ('car1', 'car2'):
    df[car_key] = np.array(df[car_key], dtype=np.float32)
print(f"Columns : {columns}")

# Report the Car 1 array dimensions (rows, columns).
data = df['car1']
num_samples, dim = data.shape
print(num_samples, dim)

# Reload the filtered CSV as well and show its header (note: `df` is rebound here).
csv_path = os.path.join(parent, 'datasets', 'vehicular_modified.csv')
df = pd.read_csv(csv_path)
print(df.columns)

Columns : ['Car_Id', 'Person_Id', 'OBD_KPL_Average', 'OBD_Intake_Air_Temp_C', 'OBD_Engine_Load', 'OBD_Fuel_Level', 'OBD_Speed_Km', 'OBD_Engine_Coolant_Temp_C', 'OBD_Engine_RPM', 'OBD_KPL_Instant', 'OBD_Fuel_Flow_CCmin', 'OBD_Ambient_Air_Temp_C', 'OBD_CO2_gkm_Average', 'OBD_CO2_gkm_Instant', 'OBD_Air_Pedal', 'Acceleration_kmhs', 'Air_Drag_Force']
85095 17
Index(['Car_Id', 'Person_Id', 'OBD_KPL_Average', 'OBD_Intake_Air_Temp_C',
       'OBD_Engine_Load', 'OBD_Fuel_Level', 'OBD_Speed_Km',
       'OBD_Engine_Coolant_Temp_C', 'OBD_Engine_RPM', 'OBD_KPL_Instant',
       'OBD_Fuel_Flow_CCmin', 'OBD_Ambient_Air_Temp_C', 'OBD_CO2_gkm_Average',
       'OBD_CO2_gkm_Instant', 'OBD_Air_Pedal', 'Acceleration_kmhs',
       'Air_Drag_Force'],
      dtype='object')


# Separate Driver-wise data from JSON

This creates a separate directory (`datasets/obd_driverwise_data`) that holds one `.npy` file per car/driver pair.

## Load JSON and separate

In [6]:
# Output directory for the per-driver arrays; created on first run.
driverwise_datapath = os.path.join(parent, 'datasets', 'obd_driverwise_data')
if not os.path.exists(driverwise_datapath):
    os.mkdir(driverwise_datapath)

json_path = os.path.join(parent, 'datasets', 'vehicular_modified.json')
with open(json_path, 'r') as fp:
    df = json.load(fp)
columns = list(df['columns'])
for car_key in ('car1', 'car2'):
    df[car_key] = np.array(df[car_key], dtype=np.float32)
print(f"Columns : {columns}")

# Position of the driver-id column, kept as a one-element index array.
driver_idx = np.where(np.array(df['columns'])=='Person_Id')[0]

# Same procedure for both cars: one .npy file per driver id found in that car.
for car_key in ('car1', 'car2'):
    car_rows = df[car_key]
    drivers = np.unique(car_rows[:, driver_idx]).tolist()
    for driver in drivers:
        arr_idxs = np.where(car_rows[:, driver_idx]==driver)[0]
        # Slice off the first two columns (Car_Id, Person_Id per the printed
        # column list) so that only the feature columns are stored.
        arr = car_rows[arr_idxs, 2:]
        out_name = f"obd_{car_key}_driver{int(driver)}.npy"
        np.save(os.path.join(driverwise_datapath, out_name), arr)

Columns : ['Car_Id', 'Person_Id', 'OBD_KPL_Average', 'OBD_Intake_Air_Temp_C', 'OBD_Engine_Load', 'OBD_Fuel_Level', 'OBD_Speed_Km', 'OBD_Engine_Coolant_Temp_C', 'OBD_Engine_RPM', 'OBD_KPL_Instant', 'OBD_Fuel_Flow_CCmin', 'OBD_Ambient_Air_Temp_C', 'OBD_CO2_gkm_Average', 'OBD_CO2_gkm_Instant', 'OBD_Air_Pedal', 'Acceleration_kmhs', 'Air_Drag_Force']


## Sanity check

In [7]:
import numpy as np
import os

parent = os.path.abspath('')
datapath = os.path.join(parent, 'datasets', 'obd_driverwise_data')

# os.listdir() returns entries in arbitrary order, so sort them to make the
# printed report reproducible. Only .npy files are kept, because np.load is
# applied to every entry and other files in this directory should be skipped.
all_splits = sorted(
    x for x in os.listdir(datapath)
    if 'driver' in x and x.endswith('.npy')
)

# Print the array shape stored in each per-driver file.
for driver in all_splits:
    X = np.load(os.path.join(datapath, driver))
    print(f"{driver} : {X.shape}")

obd_car1_driver1.npy : (37519, 15)
obd_car1_driver10.npy : (4006, 15)
obd_car1_driver2.npy : (19930, 15)
obd_car1_driver3.npy : (10992, 15)
obd_car1_driver4.npy : (1193, 15)
obd_car1_driver5.npy : (6176, 15)
obd_car1_driver6.npy : (146, 15)
obd_car1_driver7.npy : (1676, 15)
obd_car1_driver8.npy : (1418, 15)
obd_car1_driver9.npy : (2039, 15)
obd_car2_driver1.npy : (2038, 15)
obd_car2_driver2.npy : (1041, 15)
obd_car2_driver3.npy : (1927, 15)
obd_car2_driver4.npy : (1693, 15)


# Create HDF5 from NPY for Lazy loading

Given a window size (`context`), this step cuts the OBD data into chunks. The left part (input) has shape (N, T, D) and the right part (ground truth) has shape (N, 1, D); in the run below, `context = 1000` gives T = 999. The chunks are also split into training and evaluation sets.

This creates four separate `.h5` files, one each for `X_train_left`, `X_train_right`, `X_val_left`, and `X_val_right`. Alternatively, you could store all four arrays as separate datasets inside a single `.h5` file.

In [8]:
from services import createOBDchunks
import h5py

# Chunking parameters passed straight to createOBDchunks.
context = 1000
test_size = 0.15
verbose = True

X_train_left, X_train_right, X_val_left, X_val_right = createOBDchunks(
    datapath, context=context, test_size=test_size, verbose=verbose
)

# Each split goes to its own HDF5 file, stored under the dataset name "data".
split_arrays = {
    'X_train_left': X_train_left,
    'X_train_right': X_train_right,
    'X_val_left': X_val_left,
    'X_val_right': X_val_right,
}
for split_name, split_array in split_arrays.items():
    h5_path = os.path.join(datapath, f'obd___{split_name}_{context}.h5')
    with h5py.File(h5_path, 'w') as f:
        f.create_dataset("data", data=split_array)

Loading "obd_car1_driver1.npy" ...	Complete!
Loading "obd_car1_driver10.npy" ...	Complete!
Loading "obd_car1_driver2.npy" ...	Complete!
Loading "obd_car1_driver3.npy" ...	Complete!
Loading "obd_car1_driver4.npy" ...	Complete!
Loading "obd_car1_driver5.npy" ...	Complete!
Loading "obd_car1_driver7.npy" ...	Complete!
Loading "obd_car1_driver8.npy" ...	Complete!
Loading "obd_car1_driver9.npy" ...	Complete!
Loading "obd_car2_driver1.npy" ...	Complete!
Loading "obd_car2_driver2.npy" ...	Complete!
Loading "obd_car2_driver3.npy" ...	Complete!
Loading "obd_car2_driver4.npy" ...	Complete!
X_left : (78661, 999, 15)	X_right : (78661, 1, 15)


## Sanity check

In [11]:
import os, h5py
from utils import load_data

parent = os.path.abspath('')
data = 'data' # 'mix'
dataset_name = "obd" # "obdmix"

context = 1000

# Directory holding the HDF5 splits written by the chunking step.
h5_dir = os.path.join(parent, 'datasets', f'obd_driverwise_{data}')

print(f"Loading dataset : {dataset_name} ...", flush=True)
X_train_left = load_data(os.path.join(h5_dir, f'obd___X_train_left_{context}.h5'))
X_train_right = load_data(os.path.join(h5_dir, f'obd___X_train_right_{context}.h5'))
X_val_left = load_data(os.path.join(h5_dir, f'obd___X_val_left_{context}.h5'))
X_val_right = load_data(os.path.join(h5_dir, f'obd___X_val_right_{context}.h5'))
print("Dataset loaded.", flush=True)

# Report the three leading dimensions of every split. The earlier format string
# wrapped the tuple in literal parentheses and printed "((N, T, D))"; the tuple
# is now formatted directly so the output reads "(N, T, D)".
loaded_splits = (
    ('X_train_left', X_train_left),
    ('X_train_right', X_train_right),
    ('X_val_left', X_val_left),
    ('X_val_right', X_val_right),
)
for split_name, split in loaded_splits:
    # len() is used instead of .shape because load_data's return type is not
    # visible here; it only needs to support len() and indexing.
    print(f"{split_name} : {(len(split), len(split[0]), len(split[0][0]))}")

Loading dataset : obd ...
Dataset loaded.
X_train_left : ((66862, 999, 15))
X_train_right : ((66862, 1, 15))
X_val_left : ((11799, 999, 15))
X_val_right : ((11799, 1, 15))


# HDF5 of Single Instances for Single-Instance Inference

In [12]:
import os, h5py
from utils import load_data

parent = os.path.abspath('')

# Directory for the single-instance sample files; created on first run.
sample_dir = os.path.join(parent, 'datasets', 'sample_data')
if not os.path.exists(sample_dir):
    os.mkdir(sample_dir)
data = 'data' # 'mix'
dataset_name = "obd" # "obdmix"

context = 1000

h5_dir = os.path.join(parent, 'datasets', f'obd_driverwise_{data}')
print(f"Loading dataset : {dataset_name} ...", flush=True)
X_train_left = load_data(os.path.join(h5_dir, f'obd___X_train_left_{context}.h5'))
X_train_right = load_data(os.path.join(h5_dir, f'obd___X_train_right_{context}.h5'))
print(f"Dataset loaded.", flush=True)

# Write the first five training instances, one file each, with the input
# stored as "x_l" and its ground truth as "x_r".
for sample_no in range(5):
    sample_left, sample_right = X_train_left[sample_no], X_train_right[sample_no]
    sample_path = os.path.join(sample_dir, f'sample{sample_no}_{dataset_name}_{context}.h5')
    with h5py.File(sample_path, 'w') as f:
        f.create_dataset("x_l", data=sample_left)
        f.create_dataset("x_r", data=sample_right)

Loading dataset : obd ...
Dataset loaded.
