# Dataset

In [None]:
import json
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from sqlalchemy import create_engine
pd.set_option('display.max_columns', 500)

# Read the database connection settings from a local JSON file so that no
# secrets are hard-coded in the notebook.
with open("credentials.json") as f:
    credentials = json.load(f)

host = credentials["host"]
user = credentials["db_user"]
password = credentials["db_pass"]
db = credentials["db_name"]

# URL-encode the user name and password: characters such as "@", ":" or "/"
# would otherwise be parsed as URL delimiters and corrupt the connection string.
engine = create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}:3306/{db}"
)

# Load the rows of trips_2017 for line 46A, direction 2.
df = pd.read_sql_query('SELECT * FROM trips_2017 WHERE lineid = "46A" AND direction = 2', engine)
df.head()

In [None]:
# Replace missing actual departure times with the timetable (planned) values.
# The result is assigned back to the column: fillna(inplace=True) on the Series
# returned by attribute access is chained assignment, which pandas warns about
# and which leaves df unchanged under copy-on-write.
df['actualtime_dep'] = df['actualtime_dep'].fillna(df['plannedtime_dep'])
df.head()

In [None]:
# Remove rows with a missing actual arrival time, as we cannot safely assume
# these trips arrived as per the timetable.
# .copy() makes df an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
df = df[df['actualtime_arr'].notnull()].copy()
df.head()

In [None]:
# Trip duration = actual arrival time minus actual departure time
df['trip_duration'] = df['actualtime_arr'].sub(df['actualtime_dep'])
df.head()

In [None]:
# Departure time divided by 3600 (seconds -> hours) and rounded to the
# nearest whole hour
df['actualtime_dep_H'] = (df['actualtime_dep'] / 3600).round()
df.head()

In [None]:
# Arrival time divided by 3600 (seconds -> hours) and rounded to the
# nearest whole hour
df['actualtime_arr_H'] = (df['actualtime_arr'] / 3600).round()
df.head()

In [None]:
# Midpoint between the (rounded) departure hour and arrival hour
df['avg_H'] = df['actualtime_dep_H'].add(df['actualtime_arr_H']).div(2)
df.head()

In [None]:
# Cast the midpoint hour to an integer; astype(int) truncates, so x.5 becomes x
df['avg_H'] = df.avg_H.astype(int)
df.head()

In [None]:
# Build the 'time' column: the trip's timestamp plus avg_H hours expressed in
# seconds. It is used later as the key for merging with the historical
# weather data table.
df['time'] = df['timestamp'].add(df['avg_H'].mul(3600))
df['time']

In [None]:
# Keep only the rows whose 'suppressed' flag is not 1.0
df = df[df['suppressed'] != 1.0]

In [None]:
# Renumber the rows 0..n-1 after the filtering steps above
df = df.reset_index(drop=True)

In [None]:
# Seed two new columns with the raw timestamp; they are converted to
# weekday / month values in the following cells
df['dayofweek'] = df['monthofyear'] = df['timestamp']

In [None]:
# Interpret both columns as Unix time in seconds and convert to datetime
df['dayofweek'] = pd.to_datetime(df['dayofweek'], unit='s')
df['monthofyear'] = pd.to_datetime(df['monthofyear'], unit='s')

In [None]:
# Convert the datetimes to the weekday name ('Monday', ...) and to the month
# number (1-12), in separate columns.
# Series.dt.weekday_name was removed in pandas 1.0; day_name() is its replacement.
df['dayofweek'] = df['dayofweek'].dt.day_name()
df['monthofyear'] = df['monthofyear'].dt.month

In [None]:
# One-hot encode the weekday names (one indicator column per weekday present in df)
df_dayofweek_dummies = pd.get_dummies(df.dayofweek)


In [None]:
# Keep only the trips that took place in March (month number 3)
df = df[df['monthofyear'] == 3]

In [None]:
df

In [None]:
df.shape

In [None]:
# Attach the weekday indicator columns to the trips.
# The dummies were built before df was filtered to March, so they are first
# reindexed to df's index. (The join_axes argument of pd.concat was removed in
# pandas 1.0; reindexing beforehand is the documented replacement.)
df1 = pd.concat([df, df_dayofweek_dummies.reindex(df.index)], axis=1)

In [None]:
df1

In [None]:
# Load the 2017 rows of the DarkSky historical weather table
df2 = pd.read_sql_query(
    sql='SELECT * FROM DarkSky_historical_weather_data WHERE year = 2017',
    con=engine,
)
df2.head()

In [None]:
# Map the '-day' / '-night' variants of each value onto a single label,
# e.g. 'clear-day' and 'clear-night' both become 'clear'
d = {
    f'{label}-{part}': label
    for label in ('clear', 'partly-cloudy')
    for part in ('day', 'night')
}
df2 = df2.replace(d)

In [None]:
# Give the weather columns the same names as the corresponding trip columns
df2 = df2.rename(columns={'day_of_week': 'dayofweek', 'month': 'monthofyear'})

In [None]:
# Inner-join trips and weather records on the shared 'time' key
df3 = df1.merge(df2, on='time')

In [None]:
df3.head()

In [None]:
# Keep only the model inputs plus the target (trip_duration, last column).
# .copy() makes df3 an independent frame, so the column updates in the
# following cells do not raise SettingWithCopyWarning.
df3 = df3[['avg_H', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'temp', 'precip_intensity','trip_duration']].copy()

In [None]:
# Convert the trip duration from seconds to minutes, rounded to a whole number
df3['trip_duration'] = df3['trip_duration'].div(60).round()

In [None]:
# Store the rounded minutes as integers
df3['trip_duration'] = df3.trip_duration.astype(int)

In [None]:
# Round the temperature to a whole number
df3['temp'] = df3['temp'].round()

In [None]:
# Store the rounded temperature as integers
df3['temp'] = df3.temp.astype(int)

In [None]:
#df3 = df3[['avg_H', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'temp','trip_duration']]

In [None]:
df3.head()

# Preprocessing
You can see that our dataset has eleven columns. The task is to predict the trip duration (last column) based on the day of the week, the time of the day and the weather conditions (temperature and rain intensity). The next step is to split our dataset into attributes and labels. 

In [None]:
# Features: every column except the target, i.e. avg_H, the seven weekday
# indicators, temp and precip_intensity. (Selecting by name avoids the
# off-by-one of a positional slice such as iloc[:, 0:9], which silently
# dropped precip_intensity.)
X = df3.drop(columns=['trip_duration'])

# Target: the trip duration in minutes
y = df3['trip_duration']

In [None]:
y.head()

# Train Test Split
To avoid over-fitting, we will divide our dataset into training and test splits. The training data will be used to train the neural network and the test data will be used to evaluate the performance of the neural network. This helps with the problem of over-fitting because we're evaluating our neural network on data that it has not seen (i.e. been trained on) before.

In [None]:
# Split the dataset 80/20 (test_size=0.20 holds out 20% of the rows for testing).
# A fixed random_state makes the split reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

# Feature Scaling
Before making actual predictions, it is always a good practice to scale the features so that all of them can be uniformly evaluated. The scaler is fitted on the training data only, and the same fitted transformation is then applied to both the training data and the test data. Fitting on the training data alone prevents information about the test set from leaking into the model, so the test data remains a fair stand-in for unseen real-world data.

In [None]:
# Feature scaling: learn the scaling parameters from the training set only,
# then apply that same transformation to both splits
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Train the NN model: three hidden layers of 20 nodes each, at most 2000
# training iterations. A fixed random_state makes the weight initialisation,
# and therefore the reported scores, reproducible.
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(20, 20, 20), max_iter=2000, random_state=1)
mlp.fit(X_train, y_train.values.ravel())

Let's see what is happening in the above script. The first step is to import the MLPClassifier class from the sklearn.neural_network library. In the second line, this class is initialized with two parameters.

The first parameter, hidden_layer_sizes, is used to set the size of the hidden layers. In our script we will create three layers of 20 nodes each. There is no standard formula for choosing the number of layers and nodes for a neural network and it varies quite a bit depending on the problem at hand. The best way is to try different combinations and see what works best.

The second parameter to MLPClassifier specifies the number of iterations, or the epochs, that you want your neural network to execute. Remember, one epoch is a combination of one cycle of feed-forward and back propagation phase.

By default the 'relu' activation function is used with 'adam' cost optimizer. However, you can change these functions using the activation and solver parameters, respectively.

In the third line the fit function is used to train the algorithm on our training data i.e. X_train and y_train.

The final step is to make predictions on our test data. 

In [None]:
# Predict the trip-duration class for every row of the (scaled) test set
predictions = mlp.predict(X_test)  

In [None]:
# Accuracy = fraction of test rows whose predicted duration equals the actual one exactly
from sklearn import metrics
print(metrics.accuracy_score(y_test,predictions))  

In [None]:
predictions

# Evaluating the Algorithm
Now is the time to evaluate how well our algorithm performs. To evaluate an algorithm, the most commonly used metrics are a confusion matrix, precision, recall, and f1 score. The confusion_matrix and classification_report methods of the sklearn.metrics library can help us find these scores. 

In [None]:
# Confusion matrix and per-class precision / recall / f1 for the MLP test predictions
from sklearn.metrics import classification_report, confusion_matrix 
print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test,predictions))  

The main diagonal (top left to bottom right) contains our correct classifications. The remaining cells show the errors the model made. 


# Random Forest on the same dataframe df3

In [None]:
df3.head()

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# and cross_val_score now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Descriptive features: weekday indicators, temperature, rain intensity and hour.
# Target feature: the trip duration.
X = df3.loc[:, ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'temp', 'precip_intensity', 'avg_H']]
y = df3.loc[:, 'trip_duration']
print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

In [None]:
# Fit a depth-3 classification tree on the complete dataset
# (fit() returns the estimator itself, so construction and fitting are chained)
dtc = DecisionTreeClassifier(max_depth=3, random_state=1).fit(X, y)
print(dtc)

In [None]:
# Tabulate how much each feature contributes to the fitted decision tree
pd.DataFrame(dict(feature=X.columns, importance=dtc.feature_importances_))

In [None]:
# In-sample evaluation: the tree is scored on the same rows it was trained on,
# so these figures are optimistic compared with the hold-out results below.
predictions = dtc.predict(X)
print("Accuracy: ", metrics.accuracy_score(y, predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, predictions))
print("Classification report:\n ", metrics.classification_report(y, predictions))

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

In [None]:
# Refit the model on the training set only, so that the test set stays unseen
dtc.fit(X_train, y_train)

In [None]:

# Predict on the hold-out test set and put actual and predicted values side by side
predictions_test = dtc.predict(X_test)
df_true_vs_predicted_test = pd.DataFrame(
    dict(ActualClass=y_test, PredictedClass=predictions_test)
)
df_true_vs_predicted_test

In [None]:
# Hold-out evaluation of the decision tree (trained on X_train, scored on X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, predictions_test))

In [None]:
# 3-fold cross-validation of a fresh depth-3 tree: each fold is used once for
# testing while the other two folds are used for training.
cv_tree = DecisionTreeClassifier(max_depth=3, random_state=1)
scores = cross_val_score(cv_tree, X, y, scoring='accuracy', cv=3)
print(scores)
print(scores.mean())

In [None]:
# RANDOM FOREST
# Train RF with 100 trees; oob_score=True also computes the out-of-bag accuracy.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# was removed in scikit-learn 1.3.
rfc = RandomForestClassifier(n_estimators=100, max_features='sqrt', oob_score=True, random_state=1)

In [None]:
# Fit model on full dataset (all rows of X / y, no hold-out)
rfc.fit(X, y)

In [None]:
# Tabulate how much each feature contributes to the fitted random forest
pd.DataFrame(dict(feature=X.columns, importance=rfc.feature_importances_))

In [None]:
# In-sample predictions: the forest predicts the very rows it was trained on.
# Actual and predicted values are put side by side for inspection.
rfc_predictions = rfc.predict(X)
df_true_vs_rfc_predicted = pd.DataFrame(
    dict(ActualClass=y, PredictedClass=rfc_predictions)
)
df_true_vs_rfc_predicted

In [None]:
# In-sample evaluation of the random forest (scored on its own training rows)
print("Accuracy: ", metrics.accuracy_score(y, rfc_predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, rfc_predictions))
print("Classification report:\n ", metrics.classification_report(y, rfc_predictions))

In [None]:
# Refit the forest on the training split only, so that the test split stays unseen
rfc.fit(X_train, y_train)

In [None]:
# Predict on the hold-out test set and put actual and predicted values side by side
rfc_predictions_test = rfc.predict(X_test)
df_true_vs_rfc_predicted_test = pd.DataFrame(
    dict(ActualClass=y_test, PredictedClass=rfc_predictions_test)
)
df_true_vs_rfc_predicted_test

In [None]:
# Hold-out evaluation of the random forest (trained on X_train, scored on X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, rfc_predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, rfc_predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, rfc_predictions_test))

In [None]:
# 3-fold cross-validation of a 10-tree random forest.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# was removed in scikit-learn 1.3.
# oob_score is not requested here: cross_val_score fits throw-away clones whose
# oob_score_ is never read, and with only 10 trees it merely produces warnings.
scores = cross_val_score(RandomForestClassifier(n_estimators=10, max_features='sqrt', random_state=1), X, y, scoring='accuracy', cv=3)
print(scores)
print(scores.mean())

In [None]:
# compute the out-of-bag classification accuracy
rfc.oob_score_

In [None]:
df3["trip_duration"].mean()

In [None]:
df3["trip_duration"].median()

In [None]:
df3.describe()

In [None]:
df3.trip_duration.value_counts()
# 3192 rows; 109 rows are 98 minutes %3.4

In [None]:
df3.shape

In [None]:

x = 10900/3192
x

In [None]:
18700/2349