In [1]:
# The ultimate target feature: time from one stop to another

# Imports
import pandas as pd
import numpy as np
from datetime import date, datetime
from patsy import dmatrices
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import export_graphviz, DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from statsmodels.formula.api import ols

# Read csv file into a dataframe.
# Journey_Pattern_ID is compared against zero-padded strings ('00040002',
# '00041001', ...) and Stop_ID is merged against a str-cast column further down.
# Left to type inference, all-digit values can be parsed as numbers, which drops
# the leading zeros and makes those string comparisons/merges silently miss.
# Pin both columns to str so they are read exactly as written in the file.
df = pd.read_csv(
    'csv_data/route4.csv',
    dtype={'Journey_Pattern_ID': str, 'Stop_ID': str},
)

## 1.1 Rename column names

In [None]:
# 'Timeframe' is referred to as 'Start_date' by every later cell.
column_renames = {'Timeframe': 'Start_date'}
df = df.rename(columns=column_renames)

## 1.2 Dropping duplicates

In [None]:
# Flag every row that repeats an earlier one across all columns, then keep the rest
# (i.e. the first occurrence of each distinct row survives).
is_repeated_row = df.duplicated(keep='first')
df = df[~is_repeated_row]

## 1.3 Dropping constant columns or columns with missing data

In [None]:
# Drop the columns this section treats as constant / missing, in a single call.
df = df.drop(['Direction', 'Unnamed: 0', 'Congestion'], axis=1)

# Exclude the '00040002' and '00041002' journey patterns.
# The original cell also evaluated `df[df.Journey_Pattern_ID == 'null']` at this
# point, but the result was neither the cell's last expression nor assigned, so
# it did nothing and has been removed. 'null' patterns are filtered in section 3.1.
excluded_patterns = ['00040002', '00041002']
df = df[~df['Journey_Pattern_ID'].isin(excluded_patterns)]

## 1.4 Remove rows where bus is not at stop

In [None]:
# Keep only the rows in which no column equals 0.
# NOTE(review): the test spans every column, not just an at-stop indicator, so a
# row with a 0 in any other field is discarded as well - confirm this is intended.
row_has_zero = (df == 0).any(axis=1)
df = df[~row_has_zero]

## 1.5 Group to normalise time & remove rows where bus is idle at stop

In [None]:
# Create empty column which will hold normalised time
df['normal_time'] = 0

In [None]:
# Create empty column which will hold the stop order
df['stop_order'] = 0

In [None]:
grouped_df = df.groupby(['Vehicle_Journey_ID', 'Start_date'])

In [None]:
def normalize_time(df):
    """Express each row's timestamp relative to the first row of ``df``.

    Written to be used with ``groupby(...).apply``, where ``df`` is the block
    of rows belonging to one journey.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``Timestamp`` column. A ``normal_time`` column
        is overwritten if present and created otherwise.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``normal_time`` column holds
        ``(Timestamp - Timestamp of the first row) / 1000000``.
        An empty frame is returned unchanged (as a copy).

    Notes
    -----
    The previous implementation filled the column element by element through
    ``df['normal_time'].values[i] = ...``. That mutated the caller's frame,
    silently truncated fractions into the integer placeholder column, and
    fails outright when ``.values`` is a read-only view (pandas Copy-on-Write).
    The calculation is now a single vectorised expression on a copy.
    """
    result = df.copy()

    # No first row to measure against; nothing to normalise.
    if result.empty:
        return result

    timestamps = result['Timestamp']
    # Offset from the first row in this frame (row order, not the minimum).
    result['normal_time'] = (timestamps - timestamps.iloc[0]) / 1000000

    return result

In [None]:
# Run normalize_time on every (Vehicle_Journey_ID, Start_date) group.
# NOTE(review): the result is stored in norm_gb only; `df` is not reassigned here.
norm_gb = grouped_df.apply(normalize_time)

In [None]:
# Re-group the normalised frame by the same journey keys for the next step.
grouped_df = norm_gb.groupby(['Vehicle_Journey_ID', 'Start_date'])

In [None]:
def remove_idle_at_stop(df):
    """Keep only the first row recorded for each ``Stop_ID`` in ``df``.

    Every later row that repeats an already-seen ``Stop_ID`` is discarded,
    whether or not it is adjacent to the first one. The input frame is not
    modified; a new frame is returned.
    """
    first_row_per_stop = df.drop_duplicates(subset=['Stop_ID'], keep='first')
    return first_row_per_stop

In [None]:
# Within each journey group, keep only the first row per Stop_ID.
# NOTE(review): the de-duplicated result lives in norm_gb / grouped_df only.
# `df` is never reassigned from it, and the later cells visible in this notebook
# keep operating on `df` - confirm whether `df` was meant to be replaced here.
norm_gb = grouped_df.apply(remove_idle_at_stop)

In [None]:
# Re-group the reduced frame by journey.
grouped_df = norm_gb.groupby(['Vehicle_Journey_ID', 'Start_date'])

## 1.6 Add new features

In [None]:
df["Time"] = pd.to_datetime(df['Timestamp']*1000, unit="ns")

In [None]:
df['HourOfDay'] = df['Time'].dt.hour

In [None]:
df['MinsOfHour'] = df['Time'].dt.minute
df['MinsOfHour30'] = np.where((df['MinsOfHour'] > 30), 1, 0)
df['MinsOfHour15'] = np.where((df['MinsOfHour'] > 15), 1, 0)
df['MinsOfHour45'] = np.where((df['MinsOfHour'] > 45), 1, 0)

In [None]:
# Encode the time bin as the integer read from "<hour><flag15><flag30><flag45>",
# e.g. hour 9 with only the 15-minute flag set becomes 9100.
hour_digits = df['HourOfDay'].astype('str')
quarter_flags = (
    df['MinsOfHour15'].astype('str')
    + df['MinsOfHour30'].astype('str')
    + df['MinsOfHour45'].astype('str')
)
df['Time_bin_xxx'] = (hour_digits + quarter_flags).astype('int')


In [None]:
df['DayOfWeek'] = df['Time'].dt.dayofweek

In [None]:
df['Direction'] = np.where((df['Journey_Pattern_ID'] == '00041001'), 1,0)

# 2.0 Merge Datasets

## 2.1 Merge bus stop info

In [None]:
df_all_routes = pd.read_csv('csv_data/Route_4_stops.csv', encoding='latin-1')

In [None]:
df_all_routes['Stop_ID']=df_all_routes['Stop_ID'].astype('str')

In [None]:
df_routes = df_all_routes[['Stop_ID','Stop_name','Stop_sequence']]

In [None]:
df = pd.merge(df, df_routes, on=['Stop_ID'])

# 3.0 Remove and categorise columns

## 3.1 Drop missing values

In [None]:
# Drop rows that have no journey pattern. Depending on how the CSV was parsed,
# a missing pattern shows up either as the literal string 'null' or as a real
# NaN (pandas' read_csv treats the text "null" as a missing value by default).
# Comparing against 'null' alone would let the NaN rows through, so filter both.
pattern = df['Journey_Pattern_ID']
has_pattern = pattern.notna() & (pattern != 'null')
df = df[has_pattern]

## 3.3 Drop columns no longer needed

In [None]:
# Columns that play no part in the remaining feature engineering
unneeded_columns = ['Lat', 'Lon', 'Block_ID', 'Operator']
df = df.drop(unneeded_columns, axis=1)

## 3.4 Create time to destination feature

In [None]:
df['end_time'] = df.groupby(['Vehicle_Journey_ID', 'Start_date'])['Timestamp'].transform(max)

In [None]:
df['start_time'] = df.groupby(['Vehicle_Journey_ID', 'Start_date'])['Timestamp'].transform(min)

In [None]:
df['start_stop'] = df.groupby(['Vehicle_Journey_ID', 'Start_date'])['Stop_sequence'].transform(min)

In [None]:
df['max_stop_sequence'] = df.groupby(['Direction'])['Stop_sequence'].transform(max)

In [None]:
df['end_stop'] = df.groupby(['Vehicle_Journey_ID', 'Start_date'])['Stop_sequence'].transform(max)

In [None]:
df['stops_travelled'] = ((df['end_stop'] - df['start_stop']) )

In [None]:
df['scheduled__overall_journey_time']=60

In [None]:
df['journey_time'] = ((df['end_time'] - df['start_time']) )
df['time_travelling'] = ((df['Timestamp'] - df['start_time']) )
df['time_to_travel'] = ((df['end_time'] - df['Timestamp'] ) )
df['time_travelling'] = pd.to_timedelta(df['time_travelling']*1000, unit="ns").astype('timedelta64[m]')
df['time_to_travel'] = pd.to_timedelta(df['time_to_travel']*1000, unit="ns").astype('timedelta64[m]')
df['journey_time'] = pd.to_timedelta(df['journey_time']*1000, unit="ns").astype('timedelta64[m]')
df['Timestamp'] = pd.to_timedelta(df['Timestamp']*1000, unit="ns").astype('timedelta64[m]')
df['mins_late'] = ((df['journey_time'].astype(int) - 60))
df['late'] = np.where((df['mins_late'] > 1), 1, 0)
df['speed_journey_full']= ((df['journey_time'] / df['max_stop_sequence'].astype('float64') ))
df['speed_trip']= ((df['time_travelling'] / df['Stop_sequence'].astype('float64')) )
df.head()

In [None]:
df['scheduled_speed_per_stop'] = df['scheduled__overall_journey_time']/df['max_stop_sequence']

In [None]:
df['scheduled_journey_time']=df['scheduled_speed_per_stop'] * df['stops_travelled']
df.to_csv('check.csv')

In [None]:
# Coerce journey ids to numbers; values that cannot be parsed become NaN.
journey_ids = pd.to_numeric(df['Vehicle_Journey_ID'], errors='coerce')
df['Vehicle_Journey_ID'] = journey_ids
df.shape

In [None]:
bins=[10,20,30,40,50,60,70,80,90,100,110]

In [None]:
df['time_bins'] = np.digitize(df.journey_time.values, bins=bins)
df.time_bins.unique()

In [None]:
# Persist the cleaned dataframe to a new CSV file (row index omitted)
cleaned_csv_path = 'csv_data/bus_route4_clean.csv'
df.to_csv(cleaned_csv_path, index=False)