# Guess Route of a NJT Bus from Clever Devices API 

In [1]:
# 1. run_id assignor

# trains model using a few weeks of statewide bus locations for all NJTransit buses from June 2018
# given a vehicle id (obtained from Clever Devices API getStopPredictions.jsp)
# will predict what GTFS run # the bus is on
# allowing for the vehicle's schedule to be cross-referenced
# n.b. the run_id is omitted from the getStopPredictions.jsp API response for inbound buses to the stop

# 1st version also used lat + lon
# but it turns out that a given bus isn't always at the same geo location on a given run at a given time
# so tarining the model that way actually makes it less accurate than simply going on bus id, route, and time
# chances are you, you'll line those 3 up much more easily.


In [2]:
# # config 1
# source = 'nj'
# route = 119
# stop_no = 30189 # webster and congess, jersey city

In [3]:
# config 2
source = 'nj'
route = 87
stop_no = 21062 # palisade and south, jersey city

## create and train the model

In [4]:
import pandas as pd
import numpy as np
import copy


In [5]:
# get the training dataset - takes about a minute for Rt. 119
# how long for entire database?

from mysql.connector import connection
db_user ='buswatcher'
db_password = 'njtransit'
db_host = 'localhost'
db_name = 'bus_position_log'
conn = connection.MySQLConnection(user=db_user, password=db_password, host=db_host, database=db_name)

arrival_query = ('SELECT bid,rt,timestamp,run FROM run_predictor_training_set WHERE (rt="%s");' % route)
df = pd.read_sql_query(arrival_query, conn)


In [6]:
# cleanup crew ------------------------------------------------------------------------

# recode 'manager' runs
df.replace(
        to_replace='MAN',
        value=unicode('666'), # vs value=np.NaN,
        inplace=True,
        limit=None,
        regex=False, 
        method='pad')

# fix the timestamp
df['timestamp'] = df['timestamp'].str.split('.').str.get(0)
df = df.set_index(pd.DatetimeIndex(df['timestamp']), drop=False)

# extract the hh:mm:ss part of timestamp
df['timestamp_ml'] = df.index.time
df['timestamp_ml'] = df['timestamp_ml'].apply(lambda x: float(str(x).replace(":","")))


# change bid to v for consistency with stopwatcher
df.rename(columns = {'bid':'v'}, inplace = True)

# straggler strings in run
df['run'] = df['run'].str.replace(r'\D+', '')

# get rid of the timestamp
df = df.reset_index(drop=True)
df = df.drop(columns=['timestamp'])

# inspect
df.head()


Unnamed: 0,v,rt,run,timestamp_ml
0,6960,87,244,191839.0
1,6852,87,10,191839.0
2,6943,87,257,191839.0
3,6963,87,239,191839.0
4,6938,87,236,191839.0


In [None]:
# NOW.....

# 1. split df in X and y
# 2. encode X (all features) with one-hot so X will be 150,000 by 700, y will be 150,000 by 1
# 3. build the model, fit and predict

In [7]:
# GOING CRAZY. DO I ENCODE THE TARGET (run) OR NOT? IF DO I GET THE BAD SHAPE THING BELOW. 
# IF I DON'T I ALWAYS GET THE SAME PREDICTION

In [19]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)
enc.fit_transform(df)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# encode with pandas
df_dummies = pd.get_dummies(df)

features = df_dummies.filter(regex='(v|rt|timestamp_ml)', axis=1) # regex for v,rt,run
targets= df_dummies.filter(regex='(run)', axis=1) # regex for v,rt,run
X = features.values
y = targets.values


In [11]:
# test train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X,y, test_size=0.2, random_state=42)

In [12]:
X_train.shape

(120083, 155)

In [13]:
y_train.shape

(120083, 62)

In [14]:
# fit the models

from sklearn.linear_model import LogisticRegression
l = LogisticRegression()
l.fit(X_train,y_train)
print (">>One-Hot Encoding: Logistic Regression<<")
print "Train Accuracy :: ", accuracy_score(y_train, l.predict(X_train))
print "Test Accuracy  :: ", accuracy_score(y_test, l.predict(X_test))
print ("Accuracy on training set: {:.3f}".format(l.score(X_train,y_train)))  # AT add
print ("Accuracy on test set: {:.3f}".format(l.score(X_test,y_test))) # AT add

# from sklearn.ensemble import RandomForestClassifier
# r = RandomForestClassifier(n_estimators=25,max_depth=10)
# r.fit(X_train_one_hot,y_train.values.ravel()) 
# print (">>One-Hot Encoding: Random Forest<<")
# print ("Accuracy on training set: {:.3f}".format(r.score(X_train_one_hot,y_train)))  # AT add
# print ("Accuracy on test set: {:.3f}".format(r.score(X_test_one_hot,y_test))) # AT add

# from sklearn.neighbors import KNeighborsClassifier
# k = KNeighborsClassifier(n_neighbors=3)


ValueError: bad input shape (120083, 62)

## make some predictions on actual data

In [None]:
# get the arrival predictions for NOW for this STOP
from src.buses.Buses import *
import datetime
now = datetime.datetime.now()
arrivals = parse_stopprediction_xml(get_xml_data(source, 'stop_predictions', stop=stop_no, route=route))
arrivals

In [None]:
# make arrivals look like X_train

arrivals_temp = []

for bus in arrivals:
    bus.timestamp_ml = float(str(now.hour)+str(now.minute)+str(now.second))
    arrivals_temp.append([bus.v,bus.rd,bus.timestamp_ml])

X_production_one_hot=enc.transform(arrivals_temp)


In [None]:
l.predict(X_production_one_hot)

In [None]:
import sys
sys.exit()

In [None]:
# loop over arrivals, adding predicted run_id

for bus in X_production_one_hot:
    run_id_predicted = l.predict(bus)
    print run_id_predicted

In [None]:
# look up the scheduled arrival time for this run, stop

# setup the schedules

import pygtfs
sched = pygtfs.Schedule('../data/gtfs/njtbus.sqlite3')
gtfs_route_id = 13 # hardcoded for now (NJT #119)

In [None]:
# exploring the run schedule
# TRIP -- STOP -- TIME
schedule_as_list=[]
route_schedule = sched.routes_by_id(gtfs_route_id)[0]
for trip in route_schedule.trips:
    print trip.trip_id, trip.direction_id
    for stoptime in trip.stop_times:
        # print stoptime.trip_id, stoptime.stop_id, stoptime.arrival_time
        schedule_as_list.append([stoptime.trip_id, stoptime.stop_id, stoptime.arrival_time])

        
# for x in schedule_as_list:
#     print x[0],
#     if int(x[0]) == gtfs_route_id:
#         print x[0],

In [None]:
# compute how late it is

In [None]:
# then log to a run history file