In [1]:
# 1. run_id assignor

# Trains a model on a few weeks of statewide bus locations for all NJTransit buses from June 2018.
# Given a vehicle id (obtained from the Clever Devices API, getStopPredictions.jsp),
# it predicts which GTFS run # the bus is on,
# allowing the vehicle's schedule to be cross-referenced.
# N.B. the run_id is omitted from the getStopPredictions.jsp API response for buses inbound to the stop.

In [2]:
# Notebook parameters: the feed identifier and the route whose rows are
# pulled from the training table.
source, route = 'nj', 119

In [None]:
# relative import
# Puts the parent of `currentdir` at the front of sys.path so that the `src`
# package below can be imported.
import os,sys,inspect
# NOTE(review): inside a notebook kernel the current frame has no real source
# file, so `currentdir` presumably ends up being the working directory the
# kernel was started in -- confirm before relying on it from another location.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

# NOTE(review): wildcard import -- whatever names reportcard_helpers exports
# land in the notebook namespace and can shadow names defined in other cells.
from src.buses.reportcard_helpers import *

In [None]:
import pandas as pd
import numpy as np
import copy

In [None]:
# Load the training rows for the configured route from the MySQL position log.
# Takes about a minute for Rt. 119; how long the entire database would take
# is still an open question.

import os

from mysql.connector import connection

# Connection settings can be overridden through environment variables so real
# credentials do not have to live in the notebook.  The fallbacks are the
# original local-development values.
# NOTE(review): move the password out of the notebook entirely if this
# database is reachable from anywhere other than localhost.
db_user = os.environ.get('BUSWATCHER_DB_USER', 'buswatcher')
db_password = os.environ.get('BUSWATCHER_DB_PASSWORD', 'njtransit')
db_host = os.environ.get('BUSWATCHER_DB_HOST', 'localhost')
db_name = os.environ.get('BUSWATCHER_DB_NAME', 'bus_position_log')

# The route is bound as a query parameter instead of being spliced into the
# SQL text.  It is sent as a string, matching the original rt="119" comparison.
arrival_query = 'SELECT * FROM run_predictor_training_set WHERE (rt=%s);'

conn = connection.MySQLConnection(user=db_user, password=db_password, host=db_host, database=db_name)
try:
    df = pd.read_sql_query(arrival_query, conn, params=(str(route),))
finally:
    # Always release the connection, even if the query fails.
    conn.close()

# Replace every cell that is exactly 'MAN' with the text '666'
# (the alternative considered was value=np.NaN).  The u'' literal yields the
# same text value on Python 2 and Python 3, unlike the Python-2-only unicode().
df = df.replace(to_replace='MAN', value=u'666')


In [None]:
# Normalise the timestamp column, index the frame by it, and derive a numeric
# time-of-day feature.

def _clock_as_number(clock):
    """Collapse a time-of-day value into a float by removing the colons from its text form."""
    return float(str(clock).replace(":",""))

# keep only the text before the first '.' (presumably dropping fractional seconds)
whole_second_stamps = df['timestamp'].str.split('.').str.get(0)
df['timestamp'] = whole_second_stamps

# index the rows by the parsed timestamp; drop=False keeps the column as well
when = pd.DatetimeIndex(df['timestamp'])
df = df.set_index(when, drop=False)

# one float per row, built from the time-of-day part of the index
df['timestamp_ml'] = [_clock_as_number(clock) for clock in df.index.time]

In [None]:
# cleanup columns

# negatives in lon: keep only the magnitude
df['lon'] = df['lon'].abs()

# straggler strings in run: strip every run of non-digit characters.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the column untouched.
df['run'] = df['run'].str.replace(r'\D+', '', regex=True)


In [None]:
# Distinct run labels present after cleanup, in sorted order.
distinct_runs = df['run'].unique()
sorted(distinct_runs)

# setup training and test set, LogRegression and RandomForest models

In [None]:
# after https://blog.myyellowroad.com/using-categorical-data-in-machine-learning-with-python-from-dummy-variables-to-deep-category-66041f734512
#
# Builds the feature/label frames and the two classifiers (`l`, `r`) that the
# encoding cells fit and score.

from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

features = ['lat','lon','bid','rt','run','timestamp_ml']

# NOTE(review): 'run' is a column of X_train/X_test *and* the label in
# y_train/y_test, so a model fitted on X_train unchanged is shown the answer
# it is asked to predict.
# NOTE(review): the "test" frames are the same rows as the training frames, so
# any loss computed from them is a training loss, not a held-out estimate.
X_train = df.loc[:,features]
X_test = df.loc[:,features]
y_train = df.loc[:,['run']]
y_test = df.loc[:,['run']]

l = LogisticRegression()

# Fixed seed so the forest (and its reported loss) is reproducible across
# kernel restarts.
r = RandomForestClassifier(n_estimators=25,max_depth=10,random_state=0)


## feature hashing --> LogRegression and RandomForest

In [None]:
# from https://blog.myyellowroad.com/using-categorical-data-in-machine-learning-with-python-from-dummy-variables-to-deep-category-66041f734512
#
# Feature hashing: every input value is turned into a string and hashed into
# a 100-column sparse matrix, then both classifiers are fitted and scored.

from sklearn.feature_extraction import FeatureHasher

# 'run' is the label being predicted, so it must not also be hashed into the
# inputs (target leakage).
hash_inputs = [col for col in X_train.columns if col != 'run']

# FeatureHasher(input_type="string") consumes rows of strings; converting the
# whole frame at once avoids per-column positional assignment, which newer
# pandas versions reject or warn about when the dtype changes.
X_train_hash = X_train[hash_inputs].astype(str)
X_test_hash = X_test[hash_inputs].astype(str)
h = FeatureHasher(n_features=100,input_type="string")
X_train_hash = h.transform(X_train_hash.values)
X_test_hash = h.transform(X_test_hash.values)

# Labels are flattened to 1-D, which is what the estimators expect.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

# Passing labels=<model>.classes_ keeps the probability columns aligned with
# the label set even if the scored rows do not contain every class.
# The 0.4 previously noted for the logistic model was measured with 'run'
# among the inputs, so it is not comparable to the figure printed here.
l.fit(X_train_hash,y_train_flat)
y_pred = l.predict_proba(X_test_hash)
print(log_loss(y_test_flat,y_pred,labels=l.classes_))

r.fit(X_train_hash,y_train_flat)
y_pred = r.predict_proba(X_test_hash)
print(log_loss(y_test_flat,y_pred,labels=r.classes_))

In [None]:
import pickle

# Persist the two classifiers in their current fitted state.  Each file is
# opened in a `with` block so the handle is flushed and closed straight away.
# NOTE: only unpickle these files from a trusted location -- pickle.load can
# execute arbitrary code.
filename = 'runid_predictor_model_hashing_logistic.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(l, model_file)

filename2 = 'runid_predictor_model_hashing_randomforest.sav'
with open(filename2, 'wb') as model_file:
    pickle.dump(r, model_file)

## one-hot encoding --> LogRegression and RandomForest

In [None]:
# from https://blog.myyellowroad.com/using-categorical-data-in-machine-learning-with-python-from-dummy-variables-to-deep-category-66041f734512
#
# One-hot encoding: every distinct value of every input column becomes its own
# indicator column, then both classifiers are refitted and scored.

from sklearn.preprocessing import OneHotEncoder

# 'run' is the label being predicted, so it is excluded from the encoded
# inputs (target leakage otherwise).
one_hot_inputs = [col for col in X_train.columns if col != 'run']
X_train_values = X_train[one_hot_inputs].values
X_test_values = X_test[one_hot_inputs].values

# handle_unknown='ignore': values not seen during fit encode as all zeros
# instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train_values)
X_train_one_hot = enc.transform(X_train_values)
X_test_one_hot = enc.transform(X_test_values)

# Labels are flattened to 1-D, matching how the hashing cell fits the models.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

l.fit(X_train_one_hot,y_train_flat)
y_pred = l.predict_proba(X_test_one_hot)
print(log_loss(y_test_flat,y_pred,labels=l.classes_))

r.fit(X_train_one_hot,y_train_flat)
y_pred = r.predict_proba(X_test_one_hot)
print(log_loss(y_test_flat,y_pred,labels=r.classes_))

# rows x total number of indicator columns produced by the encoder
print(X_train_one_hot.shape)


In [None]:
# Inspect the raw value matrix that was handed to the one-hot encoder.
X_train_values

In [None]:
# with changes for preprocessing the category labels from 
# https://stackoverflow.com/questions/43588679/issue-with-onehotencoder-for-categorical-features

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode.  'run' is left out because it is the label being
# predicted; encoding it as an input would leak the answer into the model.
# NOTE(review): lat, lon and timestamp_ml look continuous; treating them as
# categories gives every distinct value its own column -- confirm this is
# intended.
cat_features = ['lat','lon','bid','rt','timestamp_ml']

# build and fit the model
# Column selection happens on the DataFrame, so the encoder simply encodes
# everything it is given.  Do not pass `categorical_features=`: it expects
# column indices or a boolean mask (not binarized column names) and was
# removed from scikit-learn in 0.22.
X_train_values = X_train[cat_features].values
X_test_values = X_test[cat_features].values
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train_values)
X_train_one_hot = enc.transform(X_train_values)
X_test_one_hot = enc.transform(X_test_values)

# Labels are flattened to 1-D, which is what the estimators expect.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()
l.fit(X_train_one_hot,y_train_flat)

# score some predictions
y_pred = l.predict_proba(X_test_one_hot)
print(log_loss(y_test_flat,y_pred,labels=l.classes_))
r.fit(X_train_one_hot,y_train_flat)
y_pred = r.predict_proba(X_test_one_hot)
print(log_loss(y_test_flat,y_pred,labels=r.classes_))
print(X_train_one_hot.shape)

In [None]:
# Distinct run labels carried in the training frame.
X_train['run'].unique()

# do a specific operational prediction - here is a getStopPrediction, what run is it on?

In [None]:
# TODO: call stopwatcher --- use it by feeding it parameters from stopwatcher

### FUTURE WORK

In [None]:
# 2. look up the run and scheduled stop time 
# to see how late it is, and then log that to a run history file


In [None]:
# 3. Can we also back out from this to the entire route?
# (e.g. look up all stops for the run, then find the vehicle in the busgrabber corpus, if we've been grabbing that all along?)
