In [48]:
import datetime as dt

import numpy as np
import pandas as pd
import tensorflow as tf
from numpy import arange
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

from cm import plot_confusion_matrix

In [49]:
# Load the two project CSVs from the local data/ folder.
# low_memory = False makes pandas parse each file in one pass instead of in
# internal chunks, which avoids mixed-dtype inference within a column.
real_codes = pd.read_csv('data/J1939Faults.csv', low_memory = False)
# NOTE(review): `onboard` is not referenced again in the visible part of this notebook.
onboard = pd.read_csv('data/VehicleDiagnosticOnboardData.csv', low_memory = False)

# Initial data cleaning and filtering

In [50]:
#Per the project's instructions: 

# Build one boolean mask per project rule, then keep only rows that satisfy all three.
has_short_equipment_id = real_codes['EquipmentID'].str.len() < 6
ecu_make_is_cmmns = real_codes['ecuMake'] == 'CMMNS'
fault_is_active = real_codes['active'] == True

real_codes = real_codes[has_short_equipment_id & ecu_make_is_cmmns & fault_is_active]

In [51]:
real_codes = real_codes.drop(columns=['actionDescription', 'faultValue', 'MCTNumber'])

In [52]:
real_codes[real_codes['spn'] == 3362]

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,LocationTimeStamp
11438,12321,1514613,2015-03-24 11:07:15.000,Condition Exists Catalyst Dosing Unit Input Lines,04993120*00019058*082113134117*07700053*I0*BBZ*,79461375,6X1u10D1500000000,CMMNS,0,3362,31,True,1,1370,36.876574,-81.397361,2015-03-24 11:07:51.000
11475,12358,1519318,2015-03-24 14:37:17.000,Condition Exists Catalyst Dosing Unit Input Lines,04993120*00019058*082113134117*07700053*I0*BBZ*,79461375,6X1u10D1500000000,CMMNS,0,3362,31,True,1,1370,36.194861,-83.174768,2015-03-24 14:37:52.000
11491,12374,1520761,2015-03-24 15:41:08.000,Condition Exists Catalyst Dosing Unit Input Lines,04993120*00019058*082113134117*07700053*I0*BBZ*,79461375,6X1u10D1500000000,CMMNS,0,3362,31,True,1,1370,36.194814,-83.174722,2015-03-24 15:41:45.000
11646,12529,1531190,2015-03-25 07:58:21.000,Condition Exists Catalyst Dosing Unit Input Lines,04993120*00019058*082113134117*07700053*I0*BBZ*,79461375,6X1u10D1500000000,CMMNS,0,3362,31,True,1,1370,36.194907,-83.175000,2015-03-25 07:58:58.000
17240,18123,1816124,2015-04-10 10:08:09.000,Condition Exists Catalyst Dosing Unit Input Lines,05317106*04260187*092514204317*09400024*G1*BDR*,79813609,6X1u13D1500000000,CMMNS,0,3362,31,True,1,1785,36.174722,-86.022592,2015-04-10 10:08:46.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1176634,1236775,118452393,2020-01-26 05:02:23.000,Condition Exists Catalyst Dosing Unit Input Lines,04358814*06107315*061516161145*09401661*G1*BDR*,79931760,6X1u13D1500000000,CMMNS,0,3362,31,True,1,1991,35.840787,-86.428287,2020-01-26 05:02:58.000
1181915,1242056,120991271,2020-02-14 07:27:54.000,Condition Exists Catalyst Dosing Unit Input Lines,04358814*06011766*122016144238*09401671*G1*BDR*,79897856,6X1u13D1500000000,CMMNS,0,3362,31,True,1,1862,40.437129,-75.908657,2020-02-14 07:28:30.000
1182447,1242588,121281871,2020-02-17 05:38:19.000,Condition Exists Catalyst Dosing Unit Input Lines,05317106*05031273*050815201656*09400035*G1*BDR*,79857685,6X1u13D1500000000,CMMNS,0,3362,31,True,1,1817,35.607453,-82.329027,2020-02-17 05:38:55.000
1182512,1242653,121330201,2020-02-17 10:44:39.000,Condition Exists Catalyst Dosing Unit Input Lines,05317106*05005224*051718172255*09401583*G1*BDR*,79845785,6X1u13D1500000000,CMMNS,0,3362,31,True,1,1814,35.828333,-86.414629,2020-02-17 10:45:15.000


In [53]:
#Selects lat and lon coordinates that are not in the bounded boxes

# Each entry is ((lat_min, lat_max), (lon_min, lon_max)) for one box whose rows are removed.
excluded_boxes = [
    ((36.05942, 36.07392), (-86.44366, -86.42579)),
    ((35.58108, 35.59558), (-86.4528, -86.435)),
    ((36.18775, 36.20225), (-83.1837, -83.1658)),
]

geo_codes = real_codes
for (lat_min, lat_max), (lon_min, lon_max) in excluded_boxes:
    # A row is inside the box only when BOTH coordinates fall within its bounds;
    # removing those rows equals keeping "latitude outside OR longitude outside".
    inside_box = (geo_codes['Latitude'].between(lat_min, lat_max)
                  & geo_codes['Longitude'].between(lon_min, lon_max))
    geo_codes = geo_codes[~inside_box]

#Here is an alternative method for selecting lats and lons that are not in the bounded boxes: 

#geo_codes = real_codes.query('(Latitude <= 36.05942 or Latitude >= 36.07392) or (Longitude <= -86.5419 or Longitude >= -86.3275)')


In [54]:
geo_codes = geo_codes.sort_values(["EquipmentID", "EventTimeStamp"], ascending = (False, True))

# Some EDA

In [55]:
#Number of unique trucks in the data

geo_codes.EquipmentID.nunique()

928

In [56]:
geo_codes_select1569 = geo_codes[geo_codes['spn'].isin([1569])]
geo_codes_select5246 = geo_codes[geo_codes['spn'].isin([5246])]

In [57]:
#Number of unique 1569s thrown by the ECUs

geo_codes_select1569['RecordID'].nunique()

4277

In [58]:
#Number of unique 5246s thrown by the ECUs

geo_codes_select5246['RecordID'].nunique()

270

In [59]:
#Number of unique RecordIDs left after filtering (default dropna=True, so missing IDs are not counted)
geo_codes.RecordID.nunique()

200154

In [60]:
#Looking at the names of the metrics that are recorded when the ECU throws a code

geo_codes[geo_codes['RecordID']  == 4246].head(25)

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,LocationTimeStamp
4245,4246,1048627,2015-02-24 13:45:06.000,Low (Severity Low) Catalyst Tank Level,05317106*04119044*051914190353*09400015*G1*BDR*,79751302,6X1u13D1500000000,CMMNS,0,1761,17,True,43,R1762,41.254166,-85.088888,2015-02-24 13:45:41.000


In [61]:
geo_codes.eventDescription.value_counts()

Low (Severity Low) Engine Coolant Level                                   137206
Low (Severity Medium) Engine Coolant Level                                 14676
High (Severity Low) Water In Fuel Indicator                                 6992
Condition Exists Engine Protection Torque Derate                            4277
High Voltage (Water In Fuel Indicator)                                      3768
                                                                           ...  
High Voltage (Engine Exhaust Gas Pressure)                                     1
High (Severity Medium) Aftertreatment Diesel Particulate Filter System         1
High Voltage (Particulate Trap Outlet Pressure 1)                              1
High Voltage (Aftertreatment 1 Fuel Pressure 1)                                1
Data May Be Invalid Wheel-Based Vehicle Speed                                  1
Name: eventDescription, Length: 266, dtype: int64

# More data cleaning and filtering based on prior EDA

In [62]:
geo_codes = geo_codes.drop(columns=['LocationTimeStamp', 'active', 'ecuSource', 'ecuMake', 'ESS_Id'])

In [63]:
#Reorders our columns in a logical and readable way: index types, data types, others

#'spnfmi' is not an existing column at this point, so reindex inserts it as an all-NaN placeholder
#in the requested position; it is populated by the spn/fmi concatenation in the next cell.
#Any column not named in this list is dropped by reindex.
geo_codes = geo_codes.reindex(columns = ['RecordID', 'EquipmentID', 'EventTimeStamp', 'spn', 'fmi', 'spnfmi',
                                         'activeTransitionCount', 'ecuModel', 'ecuSerialNumber', 'ecuSoftwareVersion',
                                         'eventDescription', 'Latitude', 'Longitude'])

In [64]:
#Concats the spn and fmi fields into a single field, then converts them back to an int

#Zero-pad fmi to two digits before concatenating so every (spn, fmi) pair maps to a distinct code.
#Without padding, spn=111/fmi=1 and spn=11/fmi=11 would both become 1111.
#Codes whose fmi already has two digits are unchanged (spn=1569, fmi=31 is still 156931).
#Assumes fmi never exceeds two digits (J1939 FMI is a 5-bit field) - TODO confirm against the data.
spn_text = geo_codes['spn'].astype(str)
fmi_text = geo_codes['fmi'].astype(str).str.zfill(2)
geo_codes['spnfmi'] = (spn_text + fmi_text).astype('int32')

# Get dummy variables in preparation for logistic regression

In [65]:
geo_codes_spnfmi = pd.get_dummies(geo_codes, columns = ['spnfmi'])

# Convert datetime column for indexing and sorting

In [66]:
geo_codes_spnfmi['EventTimeStamp'] = geo_codes_spnfmi['EventTimeStamp'].astype('datetime64[ns]')

In [67]:
geo_codes_spnfmi['EventTimeStamp'] = pd.to_datetime(geo_codes_spnfmi['EventTimeStamp'], utc = True)

In [68]:
geo_codes_dropped = geo_codes_spnfmi.reset_index()

In [69]:
#List comprehension of columns created from the get_dummies

List = [x for x in geo_codes_dropped.columns if 'spnfmi_' in x]

# Pandas .rolling() to create a rolling 14-day count of spn-fmi combo codes thrown per truck

In [70]:
geo_codes_dropped = geo_codes_dropped.sort_values(['EquipmentID', 'EventTimeStamp']).reset_index(drop = True)

In [71]:
#For each truck (EquipmentID), sum every spnfmi dummy column over a trailing "14d" window keyed on EventTimeStamp.
#The window ends at the row's own timestamp, so each value is a rolling count rather than a running total.
#reset_index() then moves the resulting index levels back into ordinary columns.
geo_codes_dropped_rolling = geo_codes_dropped.groupby('EquipmentID').rolling(window = "14d", on = 'EventTimeStamp')[List].sum().reset_index()

In [72]:
#Index-on-index join: row i of the sorted event frame is paired with row i of the rolling result.
#Column names present in both frames receive pandas' default '_x' (left) / '_y' (right) suffixes.
#NOTE(review): this relies on the groupby/rolling output being in the same (EquipmentID, EventTimeStamp)
#row order as geo_codes_dropped after its sort + reset_index(drop = True) - confirm.
merge_geo_codes = geo_codes_dropped.merge(geo_codes_dropped_rolling, left_index = True, right_index = True)

In [73]:
merge_geo_codes = merge_geo_codes.rename(columns = {'EquipmentID_x' : 'EquipmentID', 'EventTimeStamp_x' : 'EventTimeStamp'})

In [74]:
merge_geo_codes = merge_geo_codes.loc[:,~merge_geo_codes.columns.str.contains('_x', case=False)]

In [75]:
merge_geo_codes = merge_geo_codes.drop(columns=['index'])

In [76]:
#Saved for future use when coming back and merging onboard

In [77]:
def backfill (df):
    """Add a 'flagtime' column holding the timestamp of the next flagged row.

    A row is flagged when its 'spnfmi_156931_y' value equals 1. Flagged rows
    get their own 'EventTimeStamp'; every earlier row is back-filled with the
    timestamp of the nearest flagged row below it. Rows after the last flagged
    row are left as NaT. The frame is modified in place and returned.

    NOTE(review): 'spnfmi_156931_y' is the '_y' column coming from the rolling
    merge, so "== 1" presumably also matches later rows while exactly one such
    event sits in the window - confirm that this is intended.
    """
    is_flagged = df['spnfmi_156931_y'].eq(1)
    # where() blanks out non-flagged timestamps; bfill() pulls the next flag upward.
    df['flagtime'] = df['EventTimeStamp'].where(is_flagged).bfill()
    return df

In [78]:
#group_keys = False keeps the original row index on the result. Without it, pandas >= 2.0 prepends
#EquipmentID as an extra index level, leaving EquipmentID as both an index level and a column,
#which makes later groupby('EquipmentID') calls ambiguous.
merge_geo_codes = merge_geo_codes.groupby('EquipmentID', group_keys = False).apply(backfill)

In [79]:
merge_geo_codes['deltatime'] = merge_geo_codes['flagtime'] - merge_geo_codes['EventTimeStamp']

In [80]:
#Target label: True when deltatime (flagtime minus this row's EventTimeStamp) is between -1 second
#and 86400 seconds (24 hours), bounds included. Rows with NaT deltatime are labelled False.
merge_geo_codes['targettime'] = merge_geo_codes.deltatime.dt.total_seconds().between(-1, 86400)

In [81]:
#Remove rows whose deltatime is between -1 second and 3600 seconds (1 hour), bounds included:
#that is the flagged rows themselves plus anything logged in the hour leading up to the flag.
#Rows with NaT deltatime are kept because between() evaluates to False for missing values.
merge_geo_codes = merge_geo_codes[~merge_geo_codes['deltatime'].dt.total_seconds().between(-1, 3600)]

In [82]:
merge_geo_codes = merge_geo_codes[merge_geo_codes['spn'] != 5246]

In [83]:
merge_geo_codes[merge_geo_codes['spn'] == 4334]

Unnamed: 0,RecordID,EquipmentID,EventTimeStamp,spn,fmi,activeTransitionCount,ecuModel,ecuSerialNumber,ecuSoftwareVersion,eventDescription,...,spnfmi_6553531_y,spnfmi_8812116_y,spnfmi_11486311_y,spnfmi_19660829_y,spnfmi_33504023_y,spnfmi_52103214_y,spnfmi_52428731_y,flagtime,deltatime,targettime
5875,371938,1378,2016-02-04 16:20:27+00:00,4334,3,1,6X1u10D1500000000,79461379,04993120*00018641*082113134117*07700053*I0*BBZ*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
20868,348795,1413,2016-01-17 10:09:43+00:00,4334,16,1,6X1u10D1500000000,79466583,04993120*00027778*082113134117*07700053*I0*BBZ*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-02-09 11:59:58+00:00,23 days 01:50:15,False
20900,368137,1413,2016-02-02 03:45:55+00:00,4334,16,1,6X1u10D1500000000,79466583,04993120*00027778*082113134117*07700053*I0*BBZ*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-02-09 11:59:58+00:00,7 days 08:14:03,False
24698,167126,1437,2015-08-14 20:10:00+00:00,4334,18,1,6X1u10D1500000000,79722862,04993120*00033706*082113134117*07700053*I0*BBZ*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
25663,192663,1453,2015-09-04 01:30:18+00:00,4334,3,1,6X1u10D1500000000,79476011,04993120*00044979*042114185815*07700062*I0*BBZ*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-11-18 10:49:09+00:00,75 days 09:18:51,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195713,1099402,2082,2018-12-27 19:26:00+00:00,4334,4,1,6X1u17D1500000000,80004541,04384413*22043780*031617122339*60701702*G1*BGT*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
195947,1232249,2089,2020-01-13 15:48:26+00:00,4334,18,1,6X1u17D1500000000,80011058,04384413*22053303*120718181232*60701742*G1*BGT*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
196555,1089549,2124,2018-12-04 14:03:19+00:00,4334,4,1,6X1u17D1500000000,80020355,04384413*22076119*090617144354*60701715*G1*BGT*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
197248,1010829,2143,2018-05-09 10:05:59+00:00,4334,18,1,6X1u17D1500000000,80015366,04384413*22062764*090617144354*60701715*G1*BGT*,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False


In [84]:
merge_geo_codes.to_csv('data/merge_geo_codes2.csv', index = False)

# NOTE(review): predictors_train is first assigned in the Logistic Regression section further down,
# so this line raises NameError when the notebook is run top to bottom on a fresh kernel.
predictors_train.max()

In [86]:
merge_geo_codes[merge_geo_codes['spn'] == 3362]

Unnamed: 0,RecordID,EquipmentID,EventTimeStamp,spn,fmi,activeTransitionCount,ecuModel,ecuSerialNumber,ecuSoftwareVersion,eventDescription,...,spnfmi_6553531_y,spnfmi_8812116_y,spnfmi_11486311_y,spnfmi_19660829_y,spnfmi_33504023_y,spnfmi_52103214_y,spnfmi_52428731_y,flagtime,deltatime,targettime
925,306040,1358,2015-12-09 12:38:45+00:00,3362,31,1,6X1u10D1500000000,79445804,04993120*00042558*040213150018*07700044*I0*BBZ*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-12-09 17:20:34+00:00,0 days 04:41:49,True
1565,12321,1370,2015-03-24 11:07:15+00:00,3362,31,1,6X1u10D1500000000,79461375,04993120*00019058*082113134117*07700053*I0*BBZ*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-03-24 12:07:25+00:00,0 days 01:00:10,True
2444,385283,1372,2016-02-16 08:51:51+00:00,3362,31,1,6X1u10D1500000000,79461374,04993120*00017231*040213150018*07700044*I0*BBZ*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-02-16 09:52:02+00:00,0 days 01:00:11,True
3031,323852,1373,2015-12-24 09:34:30+00:00,3362,31,1,6X1u10D1500000000,79461373,04993120*00016941*051215183709*07700066*I0*BBZ*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2016-01-07 16:38:34+00:00,14 days 07:04:04,False
4370,280526,1375,2015-11-16 12:50:51+00:00,3362,31,1,6X1u10D1500000000,79461127,04993120*00018954*040213150018*07700044*I0*BBZ*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200056,403360,309,2016-03-02 12:05:01+00:00,3362,31,2,6X1u13D1500000000,79755342,05317106*04125773*051914190353*09400015*G1*BDR*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-03-12 12:33:31+00:00,740 days 00:28:30,False
200069,841926,309,2017-08-03 06:22:57+00:00,3362,31,1,6X1u13D1500000000,79755342,05317106*04125773*061416163421*09401361*G1*BDR*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-03-12 12:33:31+00:00,221 days 06:10:34,False
200075,1025294,309,2018-06-14 08:02:14+00:00,3362,31,1,6X1u13D1500000000,79755342,05317106*04125773*061416163421*09401361*G1*BDR*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False
200087,1158983,309,2019-05-15 06:49:16+00:00,3362,31,1,6X1u13D1500000000,79755342,05317106*04125773*061416163421*09401361*G1*BDR*,Condition Exists Catalyst Dosing Unit Input Lines,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaT,NaT,False


# Create a modified version of a train test split

In [38]:
#A function that simulates a train test split while keeping each truck's time sequence together:
#within every truck, the earliest 80% of events go to training and the most recent 20% go to testing

def train_test_brake (df, train_frac = 0.8):
    """Split a frame chronologically into an earlier and a later part.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split; must contain an 'EventTimeStamp' column.
    train_frac : float, default 0.8
        Share of the rows (taken from the earliest timestamps) placed in the
        first part. Must lie between 0 and 1 inclusive.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The first int(len(df) * train_frac) rows in timestamp order, followed
        by the remaining rows. Small frames can produce an empty first part
        (a single row with the default fraction lands in the second part).

    Raises
    ------
    ValueError
        If train_frac is outside the range [0, 1].
    """
    if not 0 <= train_frac <= 1:
        raise ValueError("train_frac must be between 0 and 1, got {!r}".format(train_frac))

    ordered = df.sort_values('EventTimeStamp')
    # Truncate toward zero so the split point is always a valid row position.
    split_at = int(len(ordered) * train_frac)
    return (ordered.iloc[:split_at], ordered.iloc[split_at:])

In [39]:
#Run the chronological split separately for every truck (EquipmentID).
#train_test_brake returns a (first part, second part) tuple, so the result holds one tuple per truck
#rather than a single DataFrame; the following cells concatenate element 0 and element 1 of each tuple.
dataframe_80_20 = merge_geo_codes.groupby('EquipmentID').apply(train_test_brake)

In [40]:
#Creates training dataset

df80 = pd.concat([x[0] for x in dataframe_80_20])

In [41]:
#Creates testing dataset

df20 = pd.concat([x[1] for x in dataframe_80_20])

# Logistic Regression

#predictors = df80.columns.tolist()
bad_predictors = ['RecordID', 'EquipmentID', 'EventTimeStamp', 
                  'spn', 'fmi', 'ecuModel', 'activeTransitionCount',
                  'ecuSerialNumber', 'ecuSoftwareVersion', 
                  'eventDescription', 'Latitude', 'Longitude', 
                  'EquipmentID_y', 'EventTimeStamp_y', 'spnfmi_156931_y', 
                  'flagtime', 'deltatime', 'targettime']

In [42]:
#predictors.remove(bad_predictors)

# Remove every column named in bad_predictors from both splits, leaving only the model inputs.
predictors_train, predictors_test = (
    split_frame.drop(columns = bad_predictors) for split_frame in (df80, df20)
)

#df20 = df20.drop(bad_predictors, axis = 1)

predictors_train

# Predictors
X_train = predictors_train

X_test = predictors_test

# Target
y_train = df80.targettime

y_test = df20.targettime

logreg = LogisticRegression().fit(X_train, y_train)

lr_weights = LogisticRegression(class_weight = {0:1, 1:12})

lr_weights.fit(X_train, y_train)

y_pred = lr_weights.predict(X_test)

accuracy_score(y_test, y_pred)

logreg.coef_

logreg.intercept_

y_pred = logreg.predict(X_test)

y_pred

logreg.predict_proba(X_test)

confusion_matrix(y_test, y_pred)

plot_confusion_matrix(y_test, y_pred, labels = ['0', '1'], metric = 'accuracy')

plot_confusion_matrix(y_test, y_pred, labels = ['0', '1'], metric = 'precision')

#NOTE(review): with a two-class target, average = 'micro' pools TP/FP/FN over both classes,
#which makes this value equal to plain accuracy - confirm that is the intended metric.
f1_score(y_test, y_pred, average = 'micro')

print(classification_report(y_test, y_pred))

# LASSO TESTING

In [43]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [44]:
#scikit-learn estimators do not use TensorFlow device placement, so the fit is not wrapped in tf.device(...).

#Smaller C values specify stronger regularization
#random_state fixes the data shuffling done by the 'saga' solver so reruns give the same coefficients.
lasso = LogisticRegression(penalty = 'l1',
                           C = 0.05,
                           solver = 'saga',
                           class_weight = 'balanced',
                           max_iter = 1000,
                           random_state = 42,
                           ).fit(X_train, y_train)

#Unfitted L1 estimator with the default C and a larger iteration budget
lr = LogisticRegression(penalty = 'l1',
                       solver = 'saga',
                       class_weight = 'balanced',
                       max_iter = 5000,
                       random_state = 42)

In [45]:
#param_grid = dict()

In [46]:
#param_grid['C'] = arange(0, 0.2, 0.05) 

In [47]:
#gs = GridSearchCV(estimator = lr, 
#                 param_grid = param_grid, 
#                 scoring = f1, 
#                 cv = 3)

#`gs` is created here because the GridSearchCV cell above is fully commented out.
#C must be strictly positive for LogisticRegression, so the grid starts at 0.05 rather than 0.
param_grid = {'C': [0.05, 0.1, 0.15]}

#scoring takes the scorer name as a string ('f1'); cv = 3 gives 3-fold cross-validation.
gs = GridSearchCV(estimator = lr,
                  param_grid = param_grid,
                  scoring = 'f1',
                  cv = 3)

#scikit-learn does not use TensorFlow device placement, so the fit is called directly.
results = gs.fit(X_train, y_train)

print(results.best_score_)

print(results.best_params_)

results