# Dataset

In [None]:
import json
import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
from sklearn import neighbors
from sklearn.neighbors import NearestNeighbors
pd.set_option('display.max_columns', 500)

In [None]:
# Credentials: read the database connection settings from a local JSON file
from urllib.parse import quote_plus

with open("credentials.json") as f:
    credentials = json.load(f)

# The file is only needed for parsing, so the settings are read after it is closed
host = credentials["host"]
user = credentials["db_user"]
password = credentials["db_pass"]
db = credentials["db_name"]

# URL-escape the user name and password so that special characters such as
# '@', ':' or '/' cannot corrupt the connection URL
engine = create_engine(
    f"mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}:3306/{db}"
)

In [None]:
# Load every row of trips_2017 for line 46A travelling in direction 2
df = pd.read_sql_query(
    sql='SELECT * FROM trips_2017 WHERE lineid = "46A" AND direction = 2',
    con=engine,
)
df.head()

In [None]:
# Replace missing actual departure times with the timetabled (planned) values.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# Series pulled out of the frame is not guaranteed to update the frame itself.
df['actualtime_dep'] = df['actualtime_dep'].fillna(df['plannedtime_dep'])
df.head()

In [None]:
# Keep only rows with a recorded actual arrival time; a missing arrival cannot
# safely be assumed to match the timetable. The explicit copy makes df an
# independent frame, so the column assignments that follow do not raise
# SettingWithCopyWarning.
df = df[pd.notnull(df['actualtime_arr'])].copy()
df.head()

In [None]:
# Trip duration = actual arrival time minus actual departure time
df['trip_duration'] = df['actualtime_arr'].sub(df['actualtime_dep'])
df.head()

In [None]:
# Departure time divided by 3600 and rounded to the nearest whole number,
# i.e. the hour nearest to the actual departure
df['actualtime_dep_H'] = (df['actualtime_dep'] / 3600).round()
df.head()

In [None]:
# Arrival time divided by 3600 and rounded to the nearest whole number,
# i.e. the hour nearest to the actual arrival
df['actualtime_arr_H'] = (df['actualtime_arr'] / 3600).round()
df.head()

In [None]:
# Midpoint of the journey in hours: mean of the departure hour and the arrival hour
df['avg_H'] = df['actualtime_dep_H'].add(df['actualtime_arr_H']).div(2)
df.head()

In [None]:
# Store the average hour as an integer; astype(int) truncates, so a half-hour
# average (e.g. 9.5) becomes the lower hour (9)
df['avg_H'] = df['avg_H'].astype(int)
df.head()

In [None]:
# Time key for joining with the historical weather data table:
# the row's timestamp plus the average journey hour converted to seconds
df['time'] = df['timestamp'].add(df['avg_H'].mul(3600))
df['time']

In [None]:
# Drop suppressed trips (rows where suppressed == 1.0). The explicit copy makes
# df an independent frame, so the columns added to it afterwards do not raise
# SettingWithCopyWarning.
df = df.query('suppressed != 1.0').copy()

In [None]:
# Renumber the rows 0..len(df)-1 after the filtering steps above
df = df.reset_index(drop=True)

In [None]:
# Duplicate the timestamp into two working columns that are later turned into
# the weekday and the month
df = df.assign(dayofweek=df['timestamp'], monthofyear=df['timestamp'])

In [None]:
# Interpret both working columns as unix time in seconds and convert them to datetimes
df['dayofweek'] = pd.to_datetime(df['dayofweek'], unit='s')
df['monthofyear'] = pd.to_datetime(df['monthofyear'], unit='s')

In [None]:
# Convert the datetimes to the weekday name and to the month number (in separate columns).
# Series.dt.day_name() is used because the .dt.weekday_name attribute is no
# longer available in current pandas releases.
df['dayofweek'] = df['dayofweek'].dt.day_name()
df['monthofyear'] = df['monthofyear'].dt.month

In [None]:
# Creating dummy (one-hot) columns for the weekday names, one column per weekday
df_dayofweek_dummies = pd.get_dummies(df['dayofweek'])


In [None]:
# Keep only the rows whose month number is 3 (March)
df = df[df['monthofyear'] == 3]

In [None]:
df

In [None]:
df.shape

In [None]:
# Attach the weekday dummy columns to the remaining rows. The dummies were
# built before the month filter, so they are first aligned to df's index; this
# replaces the join_axes argument, which current pandas releases no longer accept.
df1 = pd.concat([df, df_dayofweek_dummies.reindex(df.index)], axis=1)

In [None]:
df1

In [None]:
# Load the historical weather rows for the year 2017
df2 = pd.read_sql_query(
    sql='SELECT * FROM DarkSky_historical_weather_data WHERE year = 2017',
    con=engine,
)
df2.head()

In [None]:
# Collapse the day/night variants of these values into a single value each
d = {
    'clear-day': 'clear',
    'clear-night': 'clear',
    'partly-cloudy-day': 'partly-cloudy',
    'partly-cloudy-night': 'partly-cloudy',
}
df2 = df2.replace(to_replace=d)

In [None]:
# Give the weather columns the same names as the corresponding bus data columns
df2 = df2.rename(columns={'day_of_week': 'dayofweek', 'month': 'monthofyear'})

In [None]:
# Join bus trips and weather on the shared 'time' key; with the default join
# type only rows whose time appears in both tables are kept
df3 = df1.merge(df2, on='time')

In [None]:
df3.head()

In [None]:
# Keep only the model inputs and the target, with the target (trip_duration) last.
# The explicit copy makes df3 an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
df3 = df3[[
    'avg_H',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    'temp', 'precip_intensity',
    'trip_duration',
]].copy()

In [None]:
# Trip duration is in seconds: divide by 60 to get minutes, then round to a whole number
df3['trip_duration'] = df3['trip_duration'].div(60).round()

In [None]:
# Store the trip duration as an integer column
df3['trip_duration'] = df3['trip_duration'].astype(int)

In [None]:
# Round the temperature to a whole number
df3['temp'] = df3['temp'].round()

In [None]:
# Store the temperature as an integer column
df3['temp'] = df3['temp'].astype(int)

In [None]:
#df3 = df3[['avg_H', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'temp','trip_duration']]

In [None]:
df3.head()

# Preprocessing
You can see that our dataset has eleven columns. The task is to predict the trip duration (the last column) from the hour of the day, the day of the week and the weather conditions (temperature and rain intensity). The next step is to split our dataset into attributes and labels.

In [None]:
# Features: every column except the target (hour of day, weekday dummies,
# temperature and rain intensity). Selecting by name keeps the split correct
# even if the column order of df3 changes.
X = df3.drop(columns=['trip_duration'])

# Target: the trip duration column
y = df3['trip_duration']

In [None]:
X.head()

In [None]:
y.head()

# KNN Regression 


In [None]:
# k-nearest-neighbours regressor configured to use the 2 nearest neighbours
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=2)
# Fit on the full feature matrix X and target y (no rows are held out here)
knn.fit(X, y)

In [None]:
# Predict for 9 am on a Tuesday with 0.0 rain intensity and 12 degrees.
# The sample is built by feature name and aligned to X's columns (weekday
# dummies that are not listed default to 0), so the values cannot end up in
# the wrong position and the model is queried with the column names it was
# fitted with.
sample = pd.DataFrame([{'avg_H': 9, 'Tuesday': 1, 'temp': 12, 'precip_intensity': 0.0}])
sample = sample.reindex(columns=X.columns, fill_value=0)
print(round(knn.predict(sample)[0]),"minutes")

In [None]:
# Predict on the same rows the model was fitted on (in-sample predictions)
pred = knn.predict(X)

In [None]:
# Collect the model output in a one-column frame of whole-number estimates
predictions = pd.DataFrame({'estimated_time': pred})
predictions['estimated_time'] = predictions['estimated_time'].round().astype(int)
predictions.head()

In [None]:
from sklearn import metrics

# Trip duration is a numeric target, so the fit is summarised with regression
# metrics. A confusion matrix or classification report would treat every
# distinct duration as its own class, which is not meaningful here.
# NOTE: y and predictions both come from the rows the model was fitted on, so
# these are in-sample scores.
estimated = predictions['estimated_time']
# Share of rows whose rounded estimate equals the actual duration exactly
print("Exact-minute match rate: ", metrics.accuracy_score(y, estimated))
print("R^2: ", metrics.r2_score(y, estimated))
print("RMSE (minutes): ", np.sqrt(metrics.mean_squared_error(y, estimated)))

In [None]:
# Mean absolute error divided by the mean of the estimated durations
# NOTE(review): the denominator is the mean of the predictions, not of the actual values y — confirm this is intended
metrics.mean_absolute_error(y,predictions)/predictions.mean()

In [None]:
# Mean absolute error between the actual and the estimated trip durations
print(metrics.mean_absolute_error(y,predictions)) 