# Data Exploration and Preprocessing

In [None]:
import pandas as pd
# Read every CSV export from the current working directory.
# Tables referenced by the cells visible in this section:
summary = pd.read_csv('Summary.csv')
host = pd.read_csv('Host.csv')
rvbsm = pd.read_csv('RvBsm.csv')
evtwarn = pd.read_csv('EvtWarn.csv')
spat = pd.read_csv('Spat.csv')

# NOTE(review): the tables below are loaded but not referenced by any cell
# visible in this section.
alertl = pd.read_csv('AlertLevel.csv')
eventappid = pd.read_csv('EventAppID.csv')
rvzone = pd.read_csv('RvZone.csv')
vehclass = pd.read_csv('RvBasicVehClass.csv')

In [None]:
# Raise the notebook display limits: up to 500 columns and 100 rows per frame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100

In [None]:
# Number of Summary rows per (Device, Trip) pair; show the first few pairs.
summary.groupby(['Device', 'Trip']).size().reset_index(name='count').head()

In [None]:
# One (Device, Trip) pair inspected by the cells below.
# Author's note: eventually loop over the pairs found in `summary`.
device, trip = 2004, 12

In [None]:
# Summary row(s) for the selected device and trip.
summary.loc[summary['Device'].eq(device) & summary['Trip'].eq(trip)]

In [None]:
# Restrict each table to the selected (device, trip) and renumber its rows from 0.
# reset_index(drop=True) discards the old index directly instead of first turning it
# into an 'index' column and dropping that again (a round trip that would also
# remove a genuine data column that happened to be named 'index').
host_data = host[(host['Device'] == device) & (host['Trip'] == trip)].reset_index(drop=True)
rvbsm_data = rvbsm[(rvbsm['Device'] == device) & (rvbsm['Trip'] == trip)].reset_index(drop=True)
evt_data = evtwarn[(evtwarn['Device'] == device) & (evtwarn['Trip'] == trip)].reset_index(drop=True)
spat_data = spat[(spat['Device'] == device) & (spat['Trip'] == trip)].reset_index(drop=True)

In [None]:
# Add a join key of the form "<Device>_<Trip>_<Time>" to each per-trip table.
for table in (host_data, rvbsm_data, evt_data, spat_data):
    key_parts = [table[col].astype(str) for col in ('Device', 'Trip', 'Time')]
    table['merge_id'] = key_parts[0] + '_' + key_parts[1] + '_' + key_parts[2]

In [None]:
# Re-index every per-trip table by its merge_id join key.
host_data, rvbsm_data, evt_data, spat_data = (
    table.set_index('merge_id')
    for table in (host_data, rvbsm_data, evt_data, spat_data)
)

In [None]:
# Join the rvbsm, evtwarn and spat rows onto the host rows by their shared
# merge_id index; overlapping column names get the suffixes given below.
ht_ = (
    host_data
    .join(rvbsm_data, lsuffix='_host', rsuffix='_rvbsm')
    .join(evt_data, rsuffix='_evtwarn')
    .join(spat_data, rsuffix='_spat')
)

In [None]:
# Write the joined table for this trip to disk (output name is tentative).
ht_.to_csv(path_or_buf='trip0.csv')

In [None]:
# List the joined table's column names, one per row.
pd.DataFrame(data=ht_.columns)

In [None]:
# Preview only: rows of ht_ that have at least 50 non-null values.
# The result is not assigned, so ht_ itself is unchanged.
ht_.dropna(axis=0, thresh=50)

In [None]:
# Preview only: ht_ without the suffixed columns listed below.
# The result is not assigned, so ht_ itself is unchanged.
ht_.drop(
    columns=[
        'LocalTimeMS_spat',
        'NativeFlag_spat',
        'Time_spat',
        'Trip_spat',
        'Device_spat',
        'RvRandomId_evtwarn',
        'RvDevice_evtwarn',
    ]
)

In [None]:
# The cells above are exploratory only; the analysis starts here.
# Keep only the warnings whose EventAppId is 2.
evt_ = evtwarn.loc[evtwarn['EventAppId'] == 2]

In [None]:
# Collapse bursts of repeated warnings into a single warning each.
# A row counts as a repeat when it sits directly after the previous warning in the
# original table (row label exactly one higher), has the same Trip and Device, and
# is at most ignore_interval later. The first row is never a repeat.
# This is computed column-wise instead of growing a DataFrame one row at a time,
# which was quadratic and left every column with object dtype.
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
# Kept rows are renumbered 0..k-1, matching the numbering the old loop produced.
e_ = evt_[~is_repeat].reset_index(drop=True)
            

In [None]:
# e_.loc[0]['Time']-list(host[(host['Device']==e_.loc[0]['Device']) & (host['Trip']==e_.loc[0]['Trip']) & (host['Time']>e_.loc[0]['Time']) & (host['Time']<(e_.loc[0]['Time']+6000)) & (host['BrakeStatus']==1)]['Time'])[0]

In [None]:
# Peek at the new table of unique warnings.
e_.head(n=5)

In [None]:
# Reaction time per warning: seconds from the warning to the earliest host sample
# with BrakeStatus == 1, searched strictly after the warning and within 60 s of it.
max_look_time = 60 * 100  # 60 s, assuming Time is in 1/100 s (hence the /100 below)
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)

In [None]:
# Display the full per-warning reaction-time table.
reactiveness

In [None]:
# Number of warning rows per EventAppId, most frequent first.
evtwarn.loc[:, 'EventAppId'].value_counts()

In [None]:
# Reaction times for EventAppId == 0 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 0]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

In [None]:
# Reaction times for EventAppId == 1 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 1]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

In [None]:
# Reaction times for EventAppId == 3 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 3]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

In [None]:
# Reaction times for EventAppId == 9 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 9]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

In [None]:
# Reaction times for EventAppId == 10 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 10]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

In [None]:
# Reaction times for EventAppId == 11 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 11]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

In [None]:
# Reaction times for EventAppId == 12 warnings.
# NOTE(review): this pipeline is repeated in several cells with only the EventAppId
# changed; a single parameterized function would remove the duplication.
evt_ = evtwarn[evtwarn['EventAppId'] == 12]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness

# Train an LSTM (RNN) Model
Prepare the training data, then train the model.

In [1]:
import pandas as pd
# Read every CSV export from the current working directory.
# Tables referenced by the cells visible in this section:
evtwarn = pd.read_csv('EvtWarn.csv')
host = pd.read_csv('Host.csv')

# NOTE(review): the tables below are loaded but not referenced by any cell
# visible in this section.
alertl = pd.read_csv('AlertLevel.csv')
rvbsm = pd.read_csv('RvBsm.csv')
summary = pd.read_csv('Summary.csv')
eventappid = pd.read_csv('EventAppID.csv')
rvzone = pd.read_csv('RvZone.csv')
vehclass = pd.read_csv('RvBasicVehClass.csv')
spat = pd.read_csv('Spat.csv')

In [2]:
# Reaction times for EventAppId == 2 warnings.
evt_ = evtwarn[evtwarn['EventAppId'] == 2]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness.head()

In [3]:
# Label a warning as reactive (1) when the driver braked less than
# reaction_threshold_s seconds after it, otherwise 0.
# ReactionTime is -1 when no braking was found, so the lower bound is checked
# explicitly. The previous abs(x) < 1 test only excluded that sentinel because
# abs(-1) is not below 1; raising the threshold would have labelled "no braking"
# events as reactive.
reaction_threshold_s = 1
reaction_time = reactiveness['ReactionTime'].astype(float)
reactiveness['reactiveness'] = (
    (reaction_time >= 0) & (reaction_time < reaction_threshold_s)
).astype(int)

In [4]:
# Keep only the first warning of each (Device, Trip) for now; averaging the
# warnings of a trip is a possible later refinement.
reactiveness = reactiveness.drop_duplicates(subset=['Device', 'Trip'], keep='first')

In [5]:
# Class balance of the reactiveness label.
reactiveness.loc[:, 'reactiveness'].value_counts()

1    80
0    20
Name: reactiveness, dtype: int64

In [6]:
# Features: ['BrakeStatus','Speed','ThrottlePosPct','SteerAngle']
# For every labelled (Device, Trip), take that trip's host rows for these features
# as one sample, and record the longest sample length for the padding step.
feature_cols = ['BrakeStatus', 'Speed', 'ThrottlePosPct', 'SteerAngle']
x_train = []
y_train = []

for _, row in reactiveness.iterrows():
    in_trip = (host['Device'] == row['Device']) & (host['Trip'] == row['Trip'])
    x_train.append(host.loc[in_trip, feature_cols].values)
    y_train.append(row['reactiveness'])

# Integer length (0 when there are no samples) so it can be used directly in array
# shapes; the previous float -inf starting value broke np.zeros for an empty table.
max_len = max((len(x) for x in x_train), default=0)

In [7]:
# Zero-pad every sample at the end so all samples have max_len time steps.
# NOTE(review): padded rows are plain zeros and the model visible below has no
# Masking layer, so padding is fed to the LSTM as if it were real data.
import numpy as np
x_t = []
for x in x_train:
    # Pad width follows the sample's own feature count instead of a hard-coded 4,
    # so changing the feature list does not break this cell.
    padding = np.zeros((max_len - len(x), x.shape[1]))
    x_t.append(np.concatenate([x, padding], axis=0))

In [8]:
# Stack samples and labels into arrays, then shuffle both in unison (fixed seed).
from sklearn.utils import shuffle
x_, y_ = shuffle(np.array(x_t), np.array(y_train), random_state=0)

In [9]:
# imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM 
from sklearn.model_selection import train_test_split

In [10]:
# Binary classifier: two stacked 128-unit LSTM layers, a 32-unit dense layer and a
# single sigmoid output; each hidden stage is followed by dropout with rate 0.2.
model = Sequential([
    LSTM(128, input_shape=x_.shape[1:], activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(128, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
# Compile for binary classification: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Hold out 20% of the samples for validation, then train for a single epoch.
# Note: x_train / y_train are rebound here from the earlier Python lists to the
# array halves of the split.
x_train, x_test, y_train, y_test = train_test_split(
    x_, y_, test_size=0.2, random_state=42
)
model.fit(x=x_train, y=y_train, epochs=1, validation_data=(x_test, y_test))

Train on 80 samples, validate on 20 samples


# Train on smaller subset of trips



In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read every CSV export from the current working directory.
# Tables referenced by the cells visible in this section:
evtwarn = pd.read_csv('EvtWarn.csv')
host = pd.read_csv('Host.csv')

# NOTE(review): the tables below are loaded but not referenced by any cell
# visible in this section.
alertl = pd.read_csv('AlertLevel.csv')
rvbsm = pd.read_csv('RvBsm.csv')
summary = pd.read_csv('Summary.csv')
eventappid = pd.read_csv('EventAppID.csv')
rvzone = pd.read_csv('RvZone.csv')
vehclass = pd.read_csv('RvBasicVehClass.csv')
spat = pd.read_csv('Spat.csv')

In [4]:
# Reaction times for EventAppId == 2 warnings.
evt_ = evtwarn[evtwarn['EventAppId'] == 2]

# 1) Collapse bursts of repeated warnings. A row is a repeat when it sits directly
#    after the previous warning in the original table (row label exactly one higher),
#    has the same Trip and Device, and is at most ignore_interval later. Computed
#    column-wise instead of growing a DataFrame one row at a time (quadratic).
ignore_interval = 2.5 * 100  # 2.5 s, assuming Time is in 1/100 s
follows_previous = evt_.index.to_series().diff().eq(1)
same_trip = evt_['Trip'].eq(evt_['Trip'].shift())
same_device = evt_['Device'].eq(evt_['Device'].shift())
close_in_time = evt_['Time'].diff().le(ignore_interval)
is_repeat = follows_previous & same_trip & same_device & close_in_time
e_ = evt_[~is_repeat].reset_index(drop=True)

# 2) For each remaining warning, find the earliest host sample with the brake on,
#    strictly after the warning and within max_look_time of it.
max_look_time = 60 * 100  # 60 s in the same Time units
braking = host[host['BrakeStatus'] == 1]  # loop-invariant filter, applied once
records = []
for _, event in e_.iterrows():
    brake_times = braking.loc[
        (braking['Device'] == event['Device'])
        & (braking['Trip'] == event['Trip'])
        & (braking['Time'] > event['Time'])
        & (braking['Time'] < event['Time'] + max_look_time),
        'Time',
    ]
    # min() picks the earliest brake sample regardless of host row order;
    # -1 marks "no braking found inside the window".
    rt = (brake_times.min() - event['Time']) / 100 if len(brake_times) else -1
    records.append([event['Device'], event['Trip'], rt, event['Time']])
# Build the frame once rather than appending one row per iteration.
reactiveness = pd.DataFrame(records, columns=['Device','Trip','ReactionTime','Time'], index=e_.index)
reactiveness.head()

In [5]:
# Label a warning as reactive (1) when the driver braked less than
# reaction_threshold_s seconds after it, otherwise 0.
# ReactionTime is -1 when no braking was found, so the lower bound is checked
# explicitly. The previous abs(x) < 1 test only excluded that sentinel because
# abs(-1) is not below 1; raising the threshold would have labelled "no braking"
# events as reactive.
reaction_threshold_s = 1
reaction_time = reactiveness['ReactionTime'].astype(float)
reactiveness['reactiveness'] = (
    (reaction_time >= 0) & (reaction_time < reaction_threshold_s)
).astype(int)

In [6]:
# Keep only the first warning of each (Device, Trip) for now; averaging the
# warnings of a trip is a possible later refinement.
reactiveness = reactiveness.drop_duplicates(subset=['Device', 'Trip'], keep='first')

In [7]:
# Class balance of the reactiveness label.
reactiveness.loc[:, 'reactiveness'].value_counts()

1    80
0    20
Name: reactiveness, dtype: int64

In [8]:
# Features: ['BrakeStatus','Speed','ThrottlePosPct','SteerAngle']
# Cut every labelled trip's host feature rows into consecutive, non-overlapping
# windows of chunk_size rows; each window inherits the trip's label.
feature_cols = ['BrakeStatus', 'Speed', 'ThrottlePosPct', 'SteerAngle']
x_train = []
y_train = []
chunk_size = 100

for _, row in reactiveness.iterrows():
    in_trip = (host['Device'] == row['Device']) & (host['Trip'] == row['Trip'])
    x = host.loc[in_trip, feature_cols].values
    # "+ 1" keeps the last full window when len(x) is an exact multiple of
    # chunk_size (the previous bound silently dropped it). A trailing partial
    # window is still discarded so every sample has the same length.
    for start in range(0, len(x) - chunk_size + 1, chunk_size):
        x_train.append(x[start:start + chunk_size])
        y_train.append(row['reactiveness'])

In [9]:
# Stack samples and labels into arrays, then shuffle both in unison (fixed seed).
from sklearn.utils import shuffle
x_, y_ = shuffle(np.array(x_train), np.array(y_train), random_state=0)

In [10]:
# imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM 
from sklearn.model_selection import train_test_split

In [11]:
# Binary classifier: two stacked 128-unit LSTM layers, a 32-unit dense layer and a
# single sigmoid output; each hidden stage is followed by dropout with rate 0.2.
model = Sequential([
    LSTM(128, input_shape=x_.shape[1:], activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(128, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [12]:
# Compile for binary classification: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [13]:
# Hold out 20% of the chunks for validation, then train for a single epoch.
# Note: x_train / y_train are rebound here from the earlier Python lists to the
# array halves of the split.
# NOTE(review): the split is over individual chunks, so chunks cut from the same
# trip can end up in both the training and the validation set.
x_train, x_test, y_train, y_test = train_test_split(
    x_, y_, test_size=0.2, random_state=42
)
model.fit(x=x_train, y=y_train, epochs=1, validation_data=(x_test, y_test))

Train on 2468 samples, validate on 618 samples
Instructions for updating:
Use tf.cast instead.


<tensorflow.python.keras.callbacks.History at 0x7f010d2716a0>