In [65]:
# general 
import datetime

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Import statsmodels package for training a linear regression model.
import statsmodels.formula.api as sm

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Importing and Cleaning Data

In [66]:
# Load the header-less CSV for this bus line, then label its columns.
column_names = [
    "Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame",
    "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat",
    "Delay", "BlockID", "VehicleID", "StopID", "AtStop",
]
df = pd.read_csv("bus_data/cleaned_data/line15.csv", header=None, low_memory=False)
df.columns = column_names

In [67]:
# Collect the labels of every column whose dtype is 'object'
object_columns = df.select_dtypes(include=['object']).columns

In [68]:
# Re-type each of the previously selected object columns as 'category'
for column in object_columns:
    as_category = df[column].astype('category')
    df[column] = as_category

In [69]:
# Convert the raw Unix timestamp to datetime.
#
# The raw value is floor-divided by 1,000,000 and the quotient is interpreted
# as whole seconds since the Unix epoch, so anything below one second is
# discarded. The division is applied to the whole column at once rather than
# through a per-row Python lambda; the resulting values are the same.
df['Timestamp'] = pd.to_datetime(df['Timestamp'] // 1000000, unit='s')

In [70]:
# New 'Day' column: day-of-week number taken from each row's timestamp
df['Day'] = df.Timestamp.dt.dayofweek

In [71]:
# New 'Hour' column: hour-of-day taken from each row's timestamp
df['Hour'] = df.Timestamp.dt.hour

In [72]:
# Treat these identifier / flag / calendar features as categorical
categorical_features = [
    'LineID', 'Direction', 'VehicleJourneyID', 'Congestion', 'BlockID',
    'VehicleID', 'AtStop', 'Day', 'Hour',
]
for column in categorical_features:
    df[column] = df[column].astype('category')

In [73]:
# Remove irrelevant features.
#
# The labels are passed through the `columns=` keyword in a single call.
# Passing the axis positionally (`df.drop(label, 1)`) was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
df = df.drop(columns=['BlockID', 'Operator', 'Delay', 'Congestion'])

In [74]:
# Keep only the rows that belong to the journey pattern under study
is_target_pattern = df['JourneyPatternID'] == '00150001'
df = df.loc[is_target_pattern]

## Creating Basetable

In [75]:
# Build a list of dictionaries (one per retained journey), to be turned into a
# dataframe. A journey is every row sharing the same TimeFrame and
# VehicleJourneyID; its duration is the gap between its earliest and latest
# timestamp.

# Journeys whose duration does not exceed this threshold are left out.
min_duration = datetime.timedelta(minutes=30)

rows = []

for date in df.TimeFrame.unique():
    temp_df = df[df.TimeFrame == date]

    for journey in temp_df.VehicleJourneyID.unique():

        inner_df = temp_df[temp_df.VehicleJourneyID == journey]

        # Earliest observation of the journey; computed once and reused below.
        start = inner_df.Timestamp.min()
        duration = inner_df.Timestamp.max() - start

        if duration > min_duration:
            row = {
                'hour': start.hour,
                'day': start.dayofweek,
                # Whole minutes over the full span. `Timedelta.seconds` holds only
                # the sub-day component, so it would wrap around for spans of
                # 24 hours or more; total_seconds() does not.
                'duration': int(duration.total_seconds() // 60),
            }

            rows.append(row)

In [76]:
# Assemble the basetable: one dataframe row per dictionary collected in `rows`
basetable = pd.DataFrame(data=rows)

In [77]:
# Preview the first ten rows of the basetable
basetable.head(n=10)

Unnamed: 0,day,duration,hour
0,1,105,6
1,1,92,6
2,1,93,6
3,1,94,6
4,1,148,6
5,1,96,6
6,1,108,6
7,1,107,7
8,1,107,7
9,1,103,7
