In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.formula.api as sm

import pprint as pp

%matplotlib inline

## Reading Data & Formatting

In [15]:
# Column names for the header-less SIRI export, in file order
SIRI_COLUMNS = [
    "Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame",
    "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat",
    "Delay", "BlockID", "VehicleID", "StopID", "AtStop",
]

# The file has no header row, so read it raw and then label the columns
df = pd.read_csv("bus_data/Dcc/siri.20121106.csv", header=None, low_memory=False)
df.columns = SIRI_COLUMNS

In [16]:
# Columns that pandas parsed as text ('object' dtype)
object_columns = df.select_dtypes(['object']).columns

# Text columns first, then the coded features, are all stored as 'category'
to_category = list(object_columns) + ['Congestion', 'BlockID', 'VehicleID', 'AtStop']
for column in to_category:
    df[column] = df[column].astype('category')

# LineID and VehicleJourneyID are identifiers, so keep them as plain strings
for column in ('LineID', 'VehicleJourneyID'):
    df[column] = df[column].astype('str')

In [17]:
# Timestamp is interpreted as microseconds since the epoch (unit='us');
# HumanTime holds the same instant as a readable datetime
df['HumanTime'] = pd.to_datetime(df.Timestamp, unit='us')

In [18]:
# Calendar features derived from HumanTime via the datetime accessor
human_time = df['HumanTime'].dt

# Day: day-of-week number; Hour: hour of the day
df['Day'] = human_time.dayofweek
df['Hour'] = human_time.hour

In [19]:
# Drop the columns that are not used in the rest of the analysis.
# `axis` is passed by keyword: the positional form `df.drop(label, 1)` was
# deprecated and is rejected by pandas 2.x. One call replaces five copies.
df = df.drop(['BlockID', 'Direction', 'Operator', 'Delay', 'Congestion'], axis=1)

### Separate LineID & JourneyPatternID

In [20]:
def get_line(string):
    """Extract the line identifier from a combined journey-pattern code.

    The first four characters of ``string`` are taken as the line part and
    leading zeros are stripped, e.g. ``"00150001"`` -> ``"15"``.

    Returns None (after printing "Error!") when ``string`` has four or fewer
    characters, or is not a string at all (e.g. a NaN standing in for a
    missing value) instead of raising TypeError from ``len()``.
    """
    if isinstance(string, str) and len(string) > 4:
        return string[:4].lstrip("0")
    print("Error!")
    return None

def get_journey(string):
    """Extract the journey-pattern suffix from a combined journey-pattern code.

    The last four characters of ``string`` are returned unchanged, e.g.
    ``"00150001"`` -> ``"0001"``.

    Returns None (after printing "Error!") when ``string`` has four or fewer
    characters, or is not a string at all (e.g. a NaN standing in for a
    missing value) instead of raising TypeError from ``len()``.
    """
    if isinstance(string, str) and len(string) > 4:
        return string[-4:]
    print("Error!")
    return None
        
# Split the combined code into its two parts. LineID is derived first because
# the second assignment overwrites JourneyPatternID with its four-character suffix.
journey_codes = df['JourneyPatternID']
df['LineID'] = journey_codes.apply(get_line)
df['JourneyPatternID'] = journey_codes.apply(get_journey)

df.head(1000)

Error!
Error!


Setting NaNs in `categories` is deprecated and will be removed in a future version of pandas.
  ordered=self.ordered)


Unnamed: 0,Timestamp,LineID,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour
0,1352160000000000,15,0001,2012-11-05,5826,-6.258584,53.340099,33210,4870,0,2012-11-06 00:00:00,1,0
1,1352160000000000,46A,1002,2012-11-05,7267,-6.259093,53.345425,36024,794,0,2012-11-06 00:00:00,1,0
2,1352160000000000,14,0001,2012-11-05,6206,-6.257329,53.287521,33325,1047,0,2012-11-06 00:00:00,1,0
3,1352160002000000,41B,0002,2012-11-05,61,-6.264167,53.453217,33631,3874,1,2012-11-06 00:00:02,1,0
4,1352160002000000,,,2012-11-05,1116,-6.171050,53.259201,33137,3283,0,2012-11-06 00:00:02,1,0
5,1352160002000000,39A,1002,2012-11-05,3795,-6.262447,53.346767,36060,1479,0,2012-11-06 00:00:02,1,0
6,1352160002000000,65,0001,2012-11-05,4004,-6.594641,53.129776,38004,7283,0,2012-11-06 00:00:02,1,0
7,1352160002000000,40D,1001,2012-11-05,2466,-6.258850,53.362499,33274,52,0,2012-11-06 00:00:02,1,0
8,1352160002000000,,,2012-11-05,5076,-6.261073,53.352112,43035,4725,0,2012-11-06 00:00:02,1,0
9,1352160002000000,11,1002,2012-11-05,5241,-6.230217,53.323002,33462,320,0,2012-11-06 00:00:02,1,0


## Separating journey patterns

In [22]:
# Distinct line identifiers and journey-pattern suffixes present in the data
lines = df['LineID'].unique()
patterns = df['JourneyPatternID'].unique()

# for line in lines:
#     print(line)
    
# print()

# for pattern in patterns:
#     print(pattern)

In [23]:
# drop rows where bus isn't at stop
# df = df[(df.AtStop == 1)]

# drop duplicate rows
# df = df.drop_duplicates(["TimeFrame", "VehicleJourneyID", "StopID"])

In [24]:
# Keep only journeys (TimeFrame + VehicleJourneyID pairs) with at least three rows;
# journeys with one or two rows are discarded.
# Original rationale: this works around buses that start on the wrong side.
journey_groups = df.groupby(['TimeFrame', 'VehicleJourneyID'])
df = journey_groups.filter(lambda journey: len(journey) >= 3)

In [25]:
# Record when and where each journey (TimeFrame + VehicleJourneyID) is first seen.

# First row of every (TimeFrame, VehicleJourneyID) group, in the frame's row order
firstlines = df.groupby(["TimeFrame", "VehicleJourneyID"]).head(1)

# Map (TimeFrame, VehicleJourneyID) -> {"time": first Timestamp, "loc": [Lat, Lon]}
start_times = {
    (sighting.TimeFrame, sighting.VehicleJourneyID): {
        "time": sighting.Timestamp,
        "loc": [sighting.Lat, sighting.Lon],
    }
    for _, sighting in firstlines.iterrows()
}

### Adding Runtime column

This column will represent the number of seconds elapsed since the bus started along its journey

In [26]:
def add_runtime(row, mydict):
    """Return whole seconds elapsed between a row and the start of its journey.

    ``row`` must expose ``TimeFrame``, ``VehicleJourneyID`` and ``Timestamp``;
    ``mydict`` maps ``(TimeFrame, VehicleJourneyID)`` to a dict whose ``"time"``
    entry is the journey's first timestamp. The difference is floor-divided by
    1,000,000 to turn it into seconds.
    """
    journey_key = (row.TimeFrame, row.VehicleJourneyID)
    started_at = mydict[journey_key]["time"]
    elapsed = row.Timestamp - started_at
    return elapsed // 1000000

# Create the Runtime column with an empty-string placeholder
df['Runtime'] = ""

# Fill it row by row: seconds since the first sighting of that row's journey
df['Runtime'] = df.apply(add_runtime, axis=1, args=(start_times,))

In [27]:
# Preview the first 100 rows to check the newly added Runtime column
df.head(100)

Unnamed: 0,Timestamp,LineID,JourneyPatternID,TimeFrame,VehicleJourneyID,Lon,Lat,VehicleID,StopID,AtStop,HumanTime,Day,Hour,Runtime
52,1352160004000000,40,0001,2012-11-05,6401,-6.398231,53.345596,33606,2125,1,2012-11-06 00:00:04,1,0,0
94,1352160008000000,25A,0001,2012-11-05,3416,-6.430205,53.340366,33360,4607,1,2012-11-06 00:00:08,1,0,0
210,1352160025000000,38,0001,2012-11-05,1269,-6.378301,53.392982,33563,1816,1,2012-11-06 00:00:25,1,0,0
249,1352160031000000,13,0007,2012-11-05,4169,-6.401314,53.317501,33351,4667,1,2012-11-06 00:00:31,1,0,0
253,1352160031000000,65B,0001,2012-11-05,4018,-6.342323,53.275558,33485,4872,1,2012-11-06 00:00:31,1,0,0
281,1352160033000000,39,0002,2012-11-05,3810,-6.418562,53.393379,36059,1877,1,2012-11-06 00:00:33,1,0,0
294,1352160039000000,14,0001,2012-11-05,6206,-6.262346,53.287159,33325,1049,1,2012-11-06 00:00:39,1,0,0
303,1352160039000000,184,0001,2012-11-05,1197,-6.089990,53.132530,38093,7351,1,2012-11-06 00:00:39,1,0,0
411,1352160045000000,40,0001,2012-11-05,6401,-6.398444,53.343945,33606,2126,1,2012-11-06 00:00:45,1,0,41
523,1352160063000000,41C,0001,2012-11-05,126,-6.244390,53.452393,33626,3707,1,2012-11-06 00:01:03,1,0,0


### Finding Stops on a route

In [None]:
# Select every row belonging to line 15. LineID holds strings (get_line returns
# a slice of the pattern code), so the comparison uses "15": an integer 15
# never equals a string and would produce an empty frame.
df_15 = df[df.LineID == "15"]

### Saving newdf to a CSV file


In [14]:
# Write the frame to a CSV file under bus_data/cleaned_data/.
# NOTE(review): `newdf` is not assigned in any cell shown above (the preceding
# cell builds `df_15`), and this cell's execution count is out of sequence.
# Unless `newdf` is defined elsewhere, this raises NameError on a fresh
# Restart & Run All — confirm which frame is meant to be saved.
newdf.to_csv("bus_data/cleaned_data/line15_00150001.csv")