In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.formula.api as sm

import pprint as pp

%matplotlib inline

## Reading Data & Formatting

In [3]:
df = pd.read_csv("bus_data/cleaned_data/line15.csv", low_memory=False, header=None)
df.columns = ["Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame", 
              "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat", 
              "Delay", "BlockID", "VehicleID", "StopID", "AtStop"]

In [4]:
#Select all columns of type 'object'
object_columns = df.select_dtypes(['object']).columns

#Convert selected columns to type 'category'
for column in object_columns:
    df[column] = df[column].astype('category')
    
# Convert other features to categorical
for column in ['LineID', 'VehicleJourneyID', 'Congestion', 'BlockID', 'VehicleID', 'AtStop']:
    df[column] = df[column].astype('category')

In [5]:
# Add column for human readable time
df['HumanTime'] = pd.to_datetime(df['Timestamp'], unit='us')

In [6]:
# Add day of week column
df['Day'] = df['HumanTime'].dt.dayofweek

# Add hour of day column
df['Hour'] = df['HumanTime'].dt.hour

In [7]:
# Dropping irrelevant columns
df = df.drop('BlockID', 1)
df = df.drop('Operator', 1)
df = df.drop('Delay', 1)
df = df.drop('Congestion', 1)

## Separating journey patterns

In [8]:
patterns = df.JourneyPatternID.unique()

for pattern in patterns:
    print(pattern)

00150001
00151001
015A1001
015A0001
null
nan
015B0002
015B1001
015A0002
015B0001
015B1002
015B0003
00151002
00150002
029A1001
066A0001
00400001
056A1001


In [26]:
# Choosing only line 00150001, for a single day
# newdf = df[(df.JourneyPatternID == '00150001') & (df.TimeFrame == '2012-11-26')]
newdf = df[(df.JourneyPatternID == '00150001')]
newdf.head(100)

Unnamed: 0,Timestamp,LineID,Direction,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour
0,1352160000000000,15.0,0,00150001,2012-11-05,5826,-6.258584,53.340099,33210,4870,0,2012-11-06 00:00:00,1,0
3,1352160010000000,15.0,0,00150001,2012-11-05,5843,-6.323327,53.277756,33254,4869,0,2012-11-06 00:00:10,1,0
6,1352160019000000,15.0,0,00150001,2012-11-05,5826,-6.257967,53.342365,33210,4870,0,2012-11-06 00:00:19,1,0
9,1352160031000000,15.0,0,00150001,2012-11-05,5843,-6.327923,53.276974,33254,4869,0,2012-11-06 00:00:31,1,0
13,1352160039000000,15.0,0,00150001,2012-11-05,5826,-6.257433,53.342899,33210,4870,0,2012-11-06 00:00:39,1,0
18,1352160049000000,15.0,0,00150001,2012-11-05,5843,-6.331139,53.276196,33254,4870,0,2012-11-06 00:00:49,1,0
21,1352160059000000,15.0,0,00150001,2012-11-05,5826,-6.254167,53.342182,33210,4870,0,2012-11-06 00:00:59,1,0
23,1352160069000000,15.0,0,00150001,2012-11-05,5843,-6.331205,53.276012,33254,3007,0,2012-11-06 00:01:09,1,0
26,1352160079000000,15.0,0,00150001,2012-11-05,5826,-6.251433,53.342201,33210,4870,0,2012-11-06 00:01:19,1,0
28,1352160090000000,15.0,0,00150001,2012-11-05,5843,-6.331040,53.274563,33254,3007,0,2012-11-06 00:01:30,1,0


In [27]:
stops = newdf.StopID.unique()

for stop in stops:
    print(stop)

4870
4869
3007
6283
6282
6318
6319
7246
6320
4594
4595
4596
4563
1218
1270
1272
1273
1274
1275
1276
1219
1220
1221
664
665
666
1277
667
668
614
615
616
617
618
619
675
4415
4719
5190
348
403
746
844
1014
1015
1016
1017
1019
1020
1076
1077
1078
1081
1083
1085
1121
1079
1080
1122
1123
1124
1125
1127
1130
1140
1141
1142
1143
1144
1145
1018
1082
6326
6316
6335
1150
1164
4886
1154
1165
1151
6079
6285
6288
1155
299
497
790
6286
1153
4887
7245
1158
6317
6315
1166
946
4528
1152
7236
6287
1069
1157
6115


In [10]:
# drop duplicate rows
newdf = newdf.drop_duplicates(["TimeFrame", "VehicleJourneyID", "StopID"])

# drop rows where bus isn't at stop
newdf = newdf[(newdf.AtStop == 1) & (newdf.HumanTime.dt.hour == 4)]

In [13]:
# drop rows where bus isn't at stop
nightime = df[(df.HumanTime.dt.hour > 2) & (df.HumanTime.dt.hour < 6)]

In [14]:
nightime.tail()

Unnamed: 0,Timestamp,LineID,Direction,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour


In [15]:
# filter out routes which only have 1 stop. 
# This solves the problem of busses starting on the wrong side... 
newdf = newdf.groupby(['TimeFrame', 'VehicleJourneyID']).filter(lambda x: len(x) >= 3)

In [16]:
# Putting the first sightings of a vehiclejourneyid and timeframe combo timestamp into a dictionary

start_times = {}

# This gives you the first line anything has been seen by 
firstlines = newdf.groupby(["TimeFrame", "VehicleJourneyID"]).head(1)

# This iterates through them and assigns values to the dictionary
for index, row in firstlines.iterrows():
    start_times[row.TimeFrame, row.VehicleJourneyID] = {"time":row.Timestamp, "loc":[row.Lat, row.Lon]}

In [17]:
pp.pprint(start_times)

{}


In [18]:
locations = []

for x in start_times:
#     locations.append(start_times[x])
    print(start_times[x]["loc"][0], start_times[x]["loc"][1])

## Problem - Not all busses start in the same spot! 

(5148 2012-11-26) doesn't start on the north side. 

This is how it's supposed to be! The ones that do start on the northside are phony. 

No, because (5141, 2012-11-26) starts on the north side and hits a bunch of stops. Why do we have north south both doing the same journey?

5148 (starts in south) doesn't actually hit any stops except 1...
5150 is the same
5156
5158

Seems to be a pattern. The busses that start in the south don't actually service any stops. Why are they signalling that they're on the journeypattern? 

### Adding Runtime column

This column will represent the number of seconds elapsed since the bus started along its journey

In [22]:
def add_runtime(row, mydict):
    # Takes a row and a dictionary of start times
    # returns time elapsed (seconds) between that row's timestamp and the start of the line
    start = mydict[row.TimeFrame, row.VehicleJourneyID]["time"]
    current = row.Timestamp
    return (current - start) // 1000000

newdf['Runtime'] = ""

# Applies this function to the newdf
newdf['Runtime'] = newdf.apply(lambda row: add_runtime(row, start_times),axis=1)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [13]:
# test = newdf[newdf.]
test = newdf[(newdf.VehicleJourneyID == 5326) & (newdf.TimeFrame == '2012-11-26')]
test

Unnamed: 0,Timestamp,LineID,Direction,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour,Runtime
1849385,1353960900000000,15.0,0,150001,2012-11-26,5326,-6.151467,53.402832,33453,6318,1,2012-11-26 20:15:00,0,20,0
1849929,1353961340000000,15.0,0,150001,2012-11-26,5326,-6.162333,53.401539,33453,6320,1,2012-11-26 20:22:20,0,20,440
1850015,1353961420000000,15.0,0,150001,2012-11-26,5326,-6.163521,53.399288,33453,4594,1,2012-11-26 20:23:40,0,20,520
1850187,1353961580000000,15.0,0,150001,2012-11-26,5326,-6.181091,53.401699,33453,4563,1,2012-11-26 20:26:20,0,20,680
1850306,1353961699000000,15.0,0,150001,2012-11-26,5326,-6.191873,53.392952,33453,1272,1,2012-11-26 20:28:19,0,20,799
1850385,1353961779000000,15.0,0,150001,2012-11-26,5326,-6.195288,53.391106,33453,1273,1,2012-11-26 20:29:39,0,20,879
1850430,1353961820000000,15.0,0,150001,2012-11-26,5326,-6.198,53.388981,33453,1274,1,2012-11-26 20:30:20,0,20,920
1850759,1353962139000000,15.0,0,150001,2012-11-26,5326,-6.227001,53.368011,33453,667,1,2012-11-26 20:35:39,0,20,1239
1850888,1353962260000000,15.0,0,150001,2012-11-26,5326,-6.233564,53.363689,33453,614,1,2012-11-26 20:37:40,0,20,1360
1850976,1353962341000000,15.0,0,150001,2012-11-26,5326,-6.239722,53.359943,33453,616,1,2012-11-26 20:39:01,0,20,1441


### Saving newdf to a dataframe


In [14]:
newdf.to_csv("bus_data/cleaned_data/line15_00150001.csv")