# Dataset

In [None]:
import json
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from sqlalchemy import create_engine
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is
# its replacement and is what the train/test split cell further down imports.
from sklearn.model_selection import train_test_split
from sklearn import metrics

pd.set_option('display.max_columns', 500)


# Read the database connection settings from the local credentials file.
with open("credentials.json") as f:
    credentials = json.load(f)

host = credentials["host"]
user = credentials["db_user"]
password = credentials["db_pass"]
db = credentials["db_name"]

# Percent-encode the user name and password so characters such as '@', ':' or
# '/' cannot corrupt the connection URL.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}:3306/{db}"
)

# Load every 2017 trip of line 46A in direction 1.
df = pd.read_sql_query('SELECT * FROM trips_2017 WHERE lineid = "46A" AND direction = 1', engine)
df.head()

In [None]:
# Replace missing actual departure times with the timetabled departure time.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# Series obtained through attribute access is a chained in-place operation and
# is not guaranteed to update df itself.
df['actualtime_dep'] = df['actualtime_dep'].fillna(df['plannedtime_dep'])
df.head()

In [None]:
# Remove rows with no actual arrival time, as we cannot safely assume these trips ran as per timetable.
# An explicit copy is taken so that the column assignments in the following
# cells work on an independent frame rather than on a slice of the original.
df = df.loc[df['actualtime_arr'].notna()].copy()
df.head()

In [None]:
# Trip duration: actual arrival time minus actual departure time
# (expressed in the same unit as those two columns).
df['trip_duration'] = df['actualtime_arr'].sub(df['actualtime_dep'])
df.head()

In [None]:
# Departure time divided by 3600 and rounded to the nearest whole number.
# NOTE(review): assuming the column holds seconds since midnight, this rounds to
# the nearest hour rather than truncating (08:45 -> 9) — confirm that is intended.
df['actualtime_dep_H'] = (df['actualtime_dep'] / 3600).round()
df.head()

In [None]:
# Arrival time divided by 3600 and rounded to the nearest whole number
# (same treatment as the departure hour).
df['actualtime_arr_H'] = (df['actualtime_arr'] / 3600).round()
df.head()

In [None]:
# Midpoint of the departure hour and the arrival hour of each trip.
df['avg_H'] = df['actualtime_dep_H'].add(df['actualtime_arr_H']).div(2)
df.head()

In [None]:
# Store the midpoint hour as an integer; the cast truncates, so x.5 becomes x.
df = df.astype({'avg_H': int})
df.head()

In [None]:
# Build the key used to join with the historical weather table: the trip's
# timestamp plus its midpoint hour converted to seconds.
df['time'] = df['timestamp'].add(df['avg_H'].mul(3600))
df['time']

In [None]:
# Remove suppressed trips (suppressed == 1.0); rows with any other value,
# including a missing one, are kept.
# An explicit copy is taken because new columns are assigned to this frame in
# the following cells.
df = df.loc[df['suppressed'] != 1.0].copy()

In [None]:
# Renumber the remaining rows 0..n-1 now that rows have been filtered out.
df = df.reset_index(drop=True)

In [None]:
# Seed two calendar columns with the raw timestamp; they are converted to
# weekday and month values in the cells that follow.
df = df.assign(dayofweek=df['timestamp'], monthofyear=df['timestamp'])

In [None]:
# Interpret both calendar columns as Unix times in seconds and convert them to datetimes.
calendar_cols = ['dayofweek', 'monthofyear']
df[calendar_cols] = df[calendar_cols].apply(pd.to_datetime, unit='s')

In [None]:
# Reduce the datetimes to the weekday name and to the month number (1-12), in separate columns.
# Series.dt.weekday_name was removed in pandas 1.0; day_name() is its replacement
# and yields the same English names (Monday ... Sunday) by default.
df['dayofweek'] = df['dayofweek'].dt.day_name()
df['monthofyear'] = df['monthofyear'].dt.month

In [None]:
# One indicator column per weekday name (only the weekday is encoded here).
df_dayofweek_dummies = pd.get_dummies(df.dayofweek)


In [None]:
# Keep only the trips that took place in March (month number 3).
df = df[df['monthofyear'] == 3]

In [None]:
df

In [None]:
df.shape

In [None]:
# Attach the weekday indicator columns to the March trips. The indicators were
# built before the March filter, so they are first aligned to df's index.
# (concat's join_axes argument was removed in pandas 1.0; reindexing the other
# frame is the documented replacement.)
df1 = pd.concat([df, df_dayofweek_dummies.reindex(df.index)], axis=1)

In [None]:
df1

In [None]:
# Load the 2017 rows of the historical weather table from the database.
df2 = pd.read_sql_query(
    sql='SELECT * FROM DarkSky_historical_weather_data WHERE year = 2017',
    con=engine,
)
df2.head()

In [None]:
# Collapse the day/night variants of these condition labels into one label each,
# wherever the values occur in df2.
d = dict.fromkeys(['clear-day', 'clear-night'], 'clear')
d.update(dict.fromkeys(['partly-cloudy-day', 'partly-cloudy-night'], 'partly-cloudy'))
df2 = df2.replace(d)

In [None]:
# Give the weather table's calendar columns the same names as the trips table uses.
df2 = df2.rename(columns={'day_of_week': 'dayofweek', 'month': 'monthofyear'})

In [None]:
# Join trips and weather rows that share the same 'time' value (default inner
# join: trips without a matching weather row are dropped).
df3 = df1.merge(df2, on='time')

In [None]:
df3.head()

In [None]:
# Keep only the model inputs (hour, weekday indicators, temperature, rain
# intensity) and the target, trip_duration, as the last column.
# An explicit copy is taken because the following cells overwrite columns of
# this frame in place.
df3 = df3[[
    'avg_H',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    'temp', 'precip_intensity',
    'trip_duration',
]].copy()

In [None]:
# Convert the trip duration from seconds to minutes, rounded to the nearest whole minute.
df3['trip_duration'] = df3['trip_duration'].div(60).round()

In [None]:
# Store the rounded trip duration as an integer column.
df3 = df3.astype({'trip_duration': int})

In [None]:
# Round the temperature to the nearest whole number.
df3['temp'] = df3['temp'].round()

In [None]:
# Store the rounded temperature as an integer column.
df3 = df3.astype({'temp': int})

In [None]:
#df3 = df3[['avg_H', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'temp','trip_duration']]

In [None]:
df3.head()

In [None]:
df3.shape

# Preprocessing
You can see that our dataset has eleven columns. The task is to predict the trip duration (last column) based on the day of the week, the time of the day and the weather conditions (temperature and rain intensity). The next step is to split our dataset into attributes and labels. 

In [None]:
# Features: every column except the target, i.e. the hour, the seven weekday
# indicators, temp and precip_intensity (ten columns). Dropping the target by
# name keeps this correct even if the column order of df3 changes.
X = df3.drop(columns=['trip_duration'])

# Target: the trip duration in whole minutes.
y = df3['trip_duration']

In [None]:
y.head()

In [None]:
# Split the dataset 80/20 into training and test sets (test_size=0.20).
# The fixed random_state makes the split, and therefore every model score
# computed from it, reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Gradient Boosting Regression 
http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html

In [None]:
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [None]:
#n_estimators : int (default=100)
    #The number of boosting stages to perform. 
    #Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.

#max_depth : integer, optional (default=3)
    #maximum depth of the individual regression estimators. 
    #The maximum depth limits the number of nodes in the tree. 
    #Tune this parameter for best performance; the best value depends on the interaction of the input variables.
    
#min_samples_split : int, float, optional (default=2)
    #The minimum number of samples required to split an internal node:
    #If int, then consider min_samples_split as the minimum number.
    #If float, then min_samples_split is a percentage and ceil(min_samples_split * n_samples) are the minimum number of samples for each split.
        #Changed in version 0.18: Added float values for percentages.

#learning_rate : float, optional (default=0.1)
    #learning rate shrinks the contribution of each tree by learning_rate. 
    #There is a trade-off between learning_rate and n_estimators.

#loss : optional (default = least squares)
    #loss function to be optimized by the regressor.
    #GradientBoostingRegressor supports least squares, least absolute deviation, Huber and quantile losses
    #(the exact option names depend on the scikit-learn version).
    #‘deviance’ and ‘exponential’ are options of GradientBoostingClassifier, not of the regressor used below.

In [None]:
# Fit regression model
# 'loss' is left at its default, which is least squares (what 'ls' selected);
# the explicit 'ls' spelling is rejected by newer scikit-learn releases.
params = {'n_estimators': 600, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.02}
clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)

# Mean squared error of the fitted model on the held-out test set.
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

In [None]:
# Tabulate the fitted model's importance score next to each feature name.
pd.DataFrame(
    list(zip(X.columns, clf.feature_importances_)),
    columns=['feature', 'importance'],
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Plot training and test deviance against the number of boosting iterations

# Test-set deviance after each boosting stage. The model is fitted with the
# least-squares loss, whose deviance is the mean squared error, so it is computed
# directly instead of through clf.loss_, which newer scikit-learn releases no
# longer provide.
test_score = np.array(
    [mean_squared_error(y_test, stage_pred) for stage_pred in clf.staged_predict(X_test)],
    dtype=np.float64,
)

# 1-based iteration numbers for the x axis, one per recorded training score.
iterations = np.arange(1, len(clf.train_score_) + 1)

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(iterations, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(iterations, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

In [None]:
# predict for 9 am on a Tuesday with 0.0 rain and 12 degrees
# Feature order: hour, Monday..Sunday indicators, temp, precip_intensity.
gbr_sample = [[9, 0, 1, 0, 0, 0, 0, 0, 12, 0.0]]
print(f"{round(clf.predict(gbr_sample)[0])} minutes")

In [None]:
# Gradient boosting predictions for every row of the test set.
pred = clf.predict(X_test)

In [None]:
# Gradient boosting predictions as a one-column frame of whole minutes
# (rounded, then cast to int).
predictions = pd.DataFrame({'estimated_time': pred}).round().astype(int)
predictions.head()

In [None]:
# Mean absolute error (in minutes) of the rounded gradient boosting predictions on the test set.
print(metrics.mean_absolute_error(y_test,predictions)) 

# KNN Regression
n_neighbors : int, optional (default = 5)

    Number of neighbors to use by default for kneighbors queries.

weights : str or callable

    weight function used in prediction. Possible values:

    ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.
    ‘distance’ : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have  a greater influence than neighbors which are further away.
    [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.
    Uniform weights are used by default.

algorithm : {‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}, optional

    Algorithm used to compute the nearest neighbors:

    ‘ball_tree’ will use BallTree
    ‘kd_tree’ will use KDTree
    ‘brute’ will use a brute-force search.
    ‘auto’ will attempt to decide the most appropriate algorithm based on the values passed to fit method.

In [None]:
# k-nearest-neighbours regressor: 5 neighbours, equal weights, algorithm chosen automatically.
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(
    algorithm="auto",
    weights="uniform",
    n_neighbors=5,
)
knn.fit(X_train, y_train)

In [None]:
# predict for 9 am on a Tuesday with 0.0 rain and 12 degrees
# Feature order: hour, Monday..Sunday indicators, temp, precip_intensity.
knn_sample = [[9, 0, 1, 0, 0, 0, 0, 0, 12, 0.0]]
print(f"{round(knn.predict(knn_sample)[0])} minutes")

In [None]:
# KNN predictions for every row of the test set.
pred2 = knn.predict(X_test)

In [None]:
# KNN predictions as a one-column frame of whole minutes (rounded, then cast to int).
predictions2 = pd.DataFrame({'estimated_time': pred2}).round().astype(int)
predictions2.head()

In [None]:
# Mean absolute error (in minutes) of the rounded KNN predictions on the test set.
# Values noted from earlier runs:
#   around 9.4 with 2 neighbours
#   around 8.6 with 5 neighbours
#   around 8.4 with 5 neighbours and uniform weights
#   NOTE(review): the original note said "uniform distance"; the last two entries
#   presumably differ in the weights setting — confirm which was used for each.
print(metrics.mean_absolute_error(y_test,predictions2)) 

# Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [None]:
# Random forest of 100 trees limited to depth 3; the fixed random_state makes
# the fit repeatable for a given training set.
regr = RandomForestRegressor(
    random_state=0,
    max_depth=3,
    n_estimators=100,
)
regr.fit(X_train, y_train)

In [None]:
# predict for 9 am on a Tuesday with 0.0 rain and 12 degrees
# Feature order: hour, Monday..Sunday indicators, temp, precip_intensity.
rf_sample = [[9, 0, 1, 0, 0, 0, 0, 0, 12, 0.0]]
print(f"{round(regr.predict(rf_sample)[0])} minutes")

In [None]:
# Random forest predictions for every row of the test set.
pred3 = regr.predict(X_test)

In [None]:
pred3

In [None]:
# Random forest predictions as a one-column frame of whole minutes (rounded, then cast to int).
predictions3 = pd.DataFrame({'estimated_time': pred3}).round().astype(int)
predictions3.head()

In [None]:
# Mean absolute error (in minutes) of the rounded random forest predictions on the test set.
print(metrics.mean_absolute_error(y_test,predictions3)) 

# GBR with XGBoost
https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

In [None]:
import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBRegressor

In [None]:
# Train an XGBoost regressor, with its default hyper-parameters, on the training split.
boost = XGBRegressor()
boost.fit(X_train, y_train)

In [None]:
# XGBoost predictions for every row of the test set.
y_pred = boost.predict(X_test)

In [None]:
#print(round(boost.predict([[9, 0, 1, 0, 0, 0, 0, 0, 12, 0.0]])[0]),"minutes")

In [None]:
# XGBoost predictions rounded to whole minutes, as a plain list.
predictions6 = list(map(round, y_pred))

In [None]:
# Mean absolute error (in minutes) of the rounded XGBoost predictions on the test set.
print(metrics.mean_absolute_error(y_test,predictions6)) 

# ANN Regression

In [None]:
# Feature scaling: standardise every feature with statistics learned from the
# training split only, then apply the same transform to both splits.
# NOTE: X_train and X_test are overwritten with the scaled arrays, so this cell
# must not be run twice, and the models fitted above on the unscaled data must
# not be re-fitted after it has run.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Train the NN regressor: three hidden layers of 100 units each, at most 2000
# training iterations, fitted on the scaled training features.
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(max_iter=2000, hidden_layer_sizes=(100,) * 3)
mlp.fit(X_train, y_train.values.ravel())

In [None]:
# predict for 9 am on a Tuesday with 0.0 rain and 12 degrees
# The network was trained on standardised features, so the raw sample is scaled
# with the fitted scaler. A pre-scaled vector pasted from an earlier run would
# not match the current scaler, which is refit on every new train/test split.
# Feature order: hour, Monday..Sunday indicators, temp, precip_intensity.
mlp_sample = scaler.transform(
    pd.DataFrame([[9, 0, 1, 0, 0, 0, 0, 0, 12, 0.0]], columns=X.columns)
)
print(round(mlp.predict(mlp_sample)[0]), "minutes")

In [None]:
# Neural network regressor predictions for every row of the (scaled) test set.
pred4 = mlp.predict(X_test)

In [None]:
# NN regressor predictions as a one-column frame of whole minutes (rounded, then cast to int).
predictions4 = pd.DataFrame({'estimated_time': pred4}).round().astype(int)
predictions4.head()

In [None]:
# Mean absolute error (in minutes) of the rounded NN regressor predictions on the test set.
print(metrics.mean_absolute_error(y_test,predictions4)) 

In [None]:
# Train the NN classifier variant: the same architecture as the regressor (three
# hidden layers of 100 units, at most 2000 iterations), but each distinct
# whole-minute duration in y_train is treated as a class label.
from sklearn.neural_network import MLPClassifier
nnc = MLPClassifier(max_iter=2000, hidden_layer_sizes=(100,) * 3)
nnc.fit(X_train, y_train.values.ravel())

In [None]:
# predict for 9 am on a Tuesday with 0.0 rain and 12 degrees
# The classifier was trained on standardised features, so the raw sample is
# scaled with the fitted scaler. A pre-scaled vector pasted from an earlier run
# would not match the current scaler, which is refit on every new split.
# Feature order: hour, Monday..Sunday indicators, temp, precip_intensity.
nnc_sample = scaler.transform(
    pd.DataFrame([[9, 0, 1, 0, 0, 0, 0, 0, 12, 0.0]], columns=X.columns)
)
print(round(nnc.predict(nnc_sample)[0]), "minutes")

In [None]:
# Neural network classifier predictions for every row of the (scaled) test set.
pred5 = nnc.predict(X_test)

In [None]:
# NN classifier predictions as a one-column frame of whole minutes (rounded, then cast to int).
predictions5 = pd.DataFrame({'estimated_time': pred5}).round().astype(int)
predictions5.head()

In [None]:
# Mean absolute error (in minutes) of the NN classifier predictions on the test set.
print(metrics.mean_absolute_error(y_test,predictions5)) 

# Overall summary

In [None]:
# GBR: test-set mean absolute error (minutes) of the gradient boosting regressor
print(metrics.mean_absolute_error(y_test,predictions)) 

In [None]:
# ANN - R: test-set mean absolute error (minutes) of the neural network regressor
print(metrics.mean_absolute_error(y_test,predictions4)) 

In [None]:
# RFR: test-set mean absolute error (minutes) of the random forest regressor
print(metrics.mean_absolute_error(y_test,predictions3)) 

In [None]:
# KNN: test-set mean absolute error (minutes) of the k-nearest-neighbours regressor
print(metrics.mean_absolute_error(y_test,predictions2)) 

In [None]:
# ANN - C: test-set mean absolute error (minutes) of the neural network classifier
print(metrics.mean_absolute_error(y_test,predictions5)) 

In [None]:
# XGB: test-set mean absolute error (minutes) of the XGBoost regressor
print(metrics.mean_absolute_error(y_test,predictions6)) 

In [None]:
# test_time takes: hour[0], day of week[1:8], temp[8], rain[9]
test_time = [[9, 0, 0, 0, 1, 0, 0, 0, 7, 0.0]]
# The two neural networks were trained on standardised features, so their input
# is derived from the same sample with the fitted scaler. This way all five
# models are scored on exactly the same trip, and the scaled values always
# match the scaler fitted in this run.
test_time_nn = scaler.transform(pd.DataFrame(test_time, columns=X.columns))

print("%.2f" % clf.predict(test_time)[0],"minutes") #GBR
print("%.2f" % mlp.predict(test_time_nn)[0],"minutes") #ANN - R
print("%.2f" % regr.predict(test_time)[0],"minutes") # RFR
print("%.2f" % knn.predict(test_time)[0],"minutes") #KNN
print("%.2f" % nnc.predict(test_time_nn)[0],"minutes") #ANN - C

In [None]:
# Display the rows of df3 from position 2364 to the end.
# NOTE(review): 2364 is hard-coded; presumably chosen for one particular run's row count — confirm.
df3[2364:]

In [None]:
# Display the second row of X_train. After the feature-scaling cell X_train is
# the array returned by scaler.transform, so this is positional indexing.
X_train[1]