In [1]:
# The ultimate target feature: time from one stop to another

# Imports
import pandas as pd
import numpy as np
from datetime import date, datetime
from patsy import dmatrices
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import export_graphviz, DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.tree import export_graphviz 
from sklearn import tree
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from statsmodels.formula.api import ols

# Read csv file into a dataframe.
df = pd.read_csv('csv_data/bus_route4_clean.csv')

# 1.0 Model Analysis

## 1.1 Random Forest Regression

In [2]:
#journey pattern ID here just represents direction - we can switch for direction for user input
#took out RDS event as not right to simply include one event - not consist for all events at RDS in Jan
#stop order is essentially the number of times that bus stopped - we will not know that
#speed - uses the stop order to work out time travelling (includes time journey started) / stop order (position) - we will not know that
#therefore is the best we can do in real time is to record the journey times/distances as speeds for each scheduled run?
y, X = dmatrices('time_to_travel ~ Stop_sequence + scheduled_speed_per_stop + stops_travelled + Time_bin_xxx + DayOfWeek', df, return_type="dataframe") 
y = np.ravel(y)
X

Unnamed: 0,Intercept,Stop_sequence,scheduled_speed_per_stop,stops_travelled,Time_bin_xxx,DayOfWeek
0,1.0,3.0,0.983607,54.0,8110.0,1.0
1,1.0,3.0,0.983607,54.0,12000.0,1.0
2,1.0,3.0,0.983607,54.0,6110.0,2.0
3,1.0,3.0,0.983607,54.0,6110.0,2.0
4,1.0,3.0,0.983607,54.0,6111.0,2.0
5,1.0,3.0,0.983607,54.0,7100.0,2.0
6,1.0,3.0,0.983607,54.0,7110.0,2.0
7,1.0,3.0,0.983607,54.0,7110.0,2.0
8,1.0,3.0,0.983607,54.0,7110.0,2.0
9,1.0,3.0,0.983607,54.0,8000.0,2.0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=33) 

In [4]:
scaler = preprocessing.StandardScaler().fit(X_train)

In [5]:
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=10))

In [6]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [7]:
clf = GridSearchCV(pipeline, hyperparameters, cv=8)

In [8]:
clf.fit(X_train, y_train)

GridSearchCV(cv=8, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [9]:
pred = clf.predict(X_test)

In [10]:
r2_score(y_test, pred)

0.96277202083421209

Good, closer to 1 the better

In [11]:
mean_squared_error(y_test, pred)

18.435334457138449

Good, closer to 0 the better.

This saves the model for later.

In [12]:
joblib.dump(clf, '../flask_app/static/rf_regressor.pkl')

['../flask_app/static/rf_regressor.pkl']

In [13]:
X_test.head()

Unnamed: 0,Intercept,Stop_sequence,scheduled_speed_per_stop,stops_travelled,Time_bin_xxx,DayOfWeek
88330,1.0,18.0,0.983607,38.0,14100.0,2.0
31264,1.0,57.0,0.983607,56.0,23111.0,1.0
113644,1.0,14.0,0.983607,55.0,19110.0,0.0
59900,1.0,61.0,0.983607,53.0,20000.0,2.0
125864,1.0,38.0,0.983607,55.0,22000.0,1.0


This can then be reloaded to be used in the flask app. As seen below.

In [14]:
clf2 = joblib.load('../flask_app/static/rf_regressor.pkl')

In [15]:
predictions = list(clf2.predict(X_test))
predictions

modified=[]

for i in predictions:
    modified.append(i)
# print(modified)

X_test["Prediction"]=0
X_test["Prediction"]=modified
X_test.to_csv('output_random_forest.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [16]:
rfc = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1)

## Linear

In [17]:
import statsmodels.formula.api as sm

# Train all continuous features.
check1 = sm.ols(formula= 'time_to_travel ~ Stop_sequence + scheduled_journey_time + Time_bin_xxx + DayOfWeek', data=df).fit()
# Print the weights learned for each feature.
print(check1.summary())

                            OLS Regression Results                            
Dep. Variable:         time_to_travel   R-squared:                       0.856
Model:                            OLS   Adj. R-squared:                  0.856
Method:                 Least Squares   F-statistic:                 2.100e+05
Date:                Fri, 07 Jul 2017   Prob (F-statistic):               0.00
Time:                        15:54:54   Log-Likelihood:            -5.0124e+05
No. Observations:              141263   AIC:                         1.002e+06
Df Residuals:                  141258   BIC:                         1.003e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 39

# Logistic Regression Model with Bins