In [25]:
# The ultimate target feature: time from one stop to another

# Imports
import pandas as pd
import numpy as np
from datetime import date, datetime
from patsy import dmatrices
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import export_graphviz, DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.tree import export_graphviz 
from sklearn import tree
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from statsmodels.formula.api import ols
from numpy import loadtxt


# Read csv file into a dataframe.
df = pd.read_csv('csv_data/beta4.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,LineID,Direction,Journey_Pattern_ID,Timeframe,Vehicle_Journey_ID,Lon,Lat,Vehicle_ID,...,End_Stop,Stops_To_Travel,Time_To_Travel_Dirty,Time_To_Travel,Scheduled_Speed_Per_Stop,time_bins,Rain,Temperature,Wind_Speed,Hour_Of_Day_y
0,143,1352181949000000,4,0,40001,2012-11-06,5332,-6.26497,53.402534,43034,...,57,52,2637000000,43.0,1.016949,4,0.0,2.2,10.0,23
1,177,1352182129000000,4,0,40001,2012-11-06,5332,-6.264059,53.396149,43034,...,57,52,2457000000,40.0,1.016949,4,0.0,2.2,10.0,23
2,196,1352182208000000,4,0,40001,2012-11-06,5332,-6.263807,53.392265,43034,...,57,52,2378000000,39.0,1.016949,4,0.0,2.2,10.0,23
3,229,1352182391000000,4,0,40001,2012-11-06,5332,-6.265748,53.380424,43034,...,57,52,2195000000,36.0,1.016949,4,0.0,2.2,10.0,23
4,247,1352182484000000,4,0,40001,2012-11-06,5317,-6.225992,53.327579,43032,...,59,19,802000000,13.0,1.016949,1,0.0,2.2,10.0,23


# 1.0 Model Analysis

## 1.1 Random Forest Regression

In [26]:

y, X = dmatrices('Time_To_Travel ~ Day_Of_Week + Time_Bin_Start +  Scheduled_Speed_Per_Stop + Wind_Speed +  Temperature  + Holiday + Stops_To_Travel + Stop_Sequence', df, return_type="dataframe") 
y = np.ravel(y)
#X

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=33) 

In [28]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [29]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=10))

In [30]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [31]:
clf = GridSearchCV(pipeline, hyperparameters, cv=8)

In [32]:
clf.fit(X_train, y_train)

GridSearchCV(cv=8, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [34]:
pred = clf.predict(X_test)

In [35]:
r2_score(y_test, pred)

0.97346094420683948

Good, closer to 1 the better

In [36]:
mean_squared_error(y_test, pred)

11.291805654964442

Good, closer to 0 the better.

This saves the model for later.

In [None]:
joblib.dump(clf, '../flask_app/static/rf_regressor.pkl')

In [None]:
X_test.head()

This can then be reloaded to be used in the flask app. As seen below.

In [None]:
clf2 = joblib.load('../flask_app/static/rf_regressor.pkl')

In [None]:
predictions = list(clf2.predict(X_test))
predictions

modified=[]

for i in predictions:
    modified.append(i)
# print(modified)

X_test["Prediction"]=0
X_test["Prediction"]=modified
X_test.head()

In [None]:
rfc = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1)

In [None]:
df.shape

## To Do

Date is in the model as a variable making significant impact as cannot group random forest by dates - to confirm why it makes an impact (improves score about 7% but we should  understand is this equiv to groupby or coincidence or a proxy for events).

Getting a memory error with feature importance so for now using the linear results to guide - issue with rain: clearly important - we know that - and humidity is important, but no rain columns are making any impact. See commented out code below.

In [None]:
rfc.fit(X_train, y_train)

In [None]:
pd.DataFrame({'feature': X_train.columns, 'importance':rfc.feature_importances_})

# Decision tree for single tree - not going to work here but gives visual of the tree structure

In [None]:
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4,min_samples_leaf=5)
clf = clf.fit(X_train,y_train)

In [None]:
tree.export_graphviz(clf, out_file='tree.dot', feature_names = X_train.columns)
!dot -Tpng tree.dot > tree.png
from IPython.display import Image 
Image(filename='tree.png')

In [None]:
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True,show_classification_report=True, show_confussion_matrix=True):
    
    y_pred=clf.predict(X)
    
    if show_accuracy:
        print ("accuracy",metrics.accuracy_score(y, y_pred))
        
    if show_classification_report:
        print ("classification",metrics.classification_report(y,y_pred))
            
    if show_confussion_matrix:
        print ("confusion matrix",metrics.confusion_matrix(y,y_pred))
        
measure_performance(X_train,y_train,clf)

# K nearest neighbours

In [None]:
import random
import math
from numpy.random import permutation

# Randomly shuffle the index of df.
random_indices = permutation(df.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(df)/3)
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
test = df.loc[random_indices[1:test_cutoff]]
# Generate the train set with the rest of the data.
train = df.loc[random_indices[test_cutoff:]]

In [None]:
x_columns = ['speed', 'stop_order','DayOfWeek', 'HourOfDay', 'MinsOfHour', 'Direction_north']
y_column = ['time_bins']

from sklearn.neighbors import KNeighborsRegressor
# Create the knn model.
# Look at the five closest neighbors.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])
# Make point predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])


In [None]:
# Get the actual values for the test set.
actual = test[y_column]

# Compute the mean squared error of our predictions.
mse = (((predictions - actual) ** 2).sum()) / len(predictions)


In [None]:
mse

# Linear model

In [None]:
import statsmodels.formula.api as sm

# Train all continuous features.
check1 = sm.ols(formula='Time_To_Travel ~ Day_Of_Week + Time_Bin_Start + Scheduled_Speed_Per_Stop +  Temperature + Wind_Speed +  Stops_To_Travel + Stop_Sequence', data=df).fit()
# Print the weights learned for each feature.
print(check1.summary())

# Logistic Regression Model with Bins

In [None]:
# Training the model with original selection of four features:

logreg = sm.logit(formula='Time_To_Travel ~ Day_Of_Week + Time_Bin_Start + Scheduled_Speed_Per_Stop +  Temperature + Wind_Speed +  Stops_To_Travel + Stop_Sequence', data=df).fit()

# Print the weights learned for each feature.
print(logreg.params)