In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statsmodels.formula.api as sm

%matplotlib inline

## Reading Data & Formatting

In [81]:
# Column labels for the raw export, in file order. The file is read with
# header=None (its first line is treated as data), so the labels are attached
# after loading.
column_names = [
    "Timestamp", "LineID", "Direction", "JourneyPatternID", "TimeFrame",
    "VehicleJourneyID", "Operator", "Congestion", "Lon", "Lat",
    "Delay", "BlockID", "VehicleID", "StopID", "AtStop",
]

df = pd.read_csv(
    "bus_data/cleaned_data/line15.csv",
    header=None,
    low_memory=False,
)
df.columns = column_names

In [90]:
# Columns that pandas loaded with the generic 'object' dtype
object_columns = df.select_dtypes(include=['object']).columns

# Further features to store as categorical, whatever dtype they were loaded with
extra_categorical = ['LineID', 'VehicleJourneyID', 'Congestion', 'BlockID', 'VehicleID', 'AtStop']

# Cast the object columns first, then the explicit list, to 'category'.
# A column that appears in both lists is simply cast a second time.
for column in list(object_columns) + extra_categorical:
    df[column] = df[column].astype('category')

In [91]:
# Add a human-readable datetime column derived from the raw 'Timestamp' column.
# unit='us' tells pandas to interpret the values as microseconds since the epoch.
# The source frame must be `df`: the preceding cells never define a `timedf`,
# so referencing it fails with NameError on a fresh kernel.
df['HumanTime'] = pd.to_datetime(df['Timestamp'], unit='us')

In [138]:
# Drop the columns that are not needed for the rest of the analysis.
# One call with an explicit axis= keyword replaces four chained drops:
# passing the axis positionally (df.drop(label, 1)) is rejected by pandas >= 2.0,
# while the keyword form works on old and new versions alike.
irrelevant_columns = ['BlockID', 'Operator', 'Delay', 'Congestion']
df = df.drop(irrelevant_columns, axis=1)

## Separating journey patterns

In [139]:
# Distinct JourneyPatternID values present in the data
patterns = df['JourneyPatternID'].unique()

# List them one per line for inspection
for pattern in patterns:
    print(pattern)

00150001
00151001
015A1001
015A0001
null
nan
015B0002
015B1001
015A0002
015B0001
015B1002
015B0003
00151002
00150002
029A1001
066A0001
00400001
056A1001


In [256]:
# Restrict to journey pattern 00150001 on a single day (2012-11-26),
# keeping only the rows where the bus is at a stop (AtStop == 1)
is_pattern = df['JourneyPatternID'] == '00150001'
is_day = df['TimeFrame'] == '2012-11-26'
is_at_stop = df['AtStop'] == 1

newdf = df[is_pattern & is_day & is_at_stop]
newdf.shape

(9042, 12)

In [257]:
# Keep only the first row for each (TimeFrame, VehicleJourneyID, StopID) combination
journey_stop_key = ["TimeFrame", "VehicleJourneyID", "StopID"]
newdf = newdf.drop_duplicates(subset=journey_stop_key, keep="first")

In [258]:
# One group per journey, keyed by (TimeFrame, VehicleJourneyID).
# VehicleJourneyID is cast to 'category' in the formatting cell (TimeFrame too,
# if it was loaded as object dtype). observed=True restricts the groups to key
# combinations that actually occur in newdf, and avoids relying on the default
# for categorical keys, which recent pandas versions warn about.
groups = newdf.groupby(["TimeFrame", "VehicleJourneyID"], observed=True)

### Problem: journeys don't all start at the same stop.
#### This means we'll have to find some other way of identifying the start of a journey. Maybe the first row for each VehicleJourneyID with AtStop == 0?

In [260]:
# Print the StopID of the first row of every journey group, to check whether
# all journeys begin at the same stop.
for name, group in groups:
    # Guard: grouping on categorical keys may yield empty groups, which have
    # no first row to read.
    if group.empty:
        continue
    # Positional lookup of the first row. This avoids mutating the group with
    # reset_index(inplace=True) just to make the label 0 exist.
    start = group['StopID'].iloc[0]
    print(start)

6318
7246
6320
6282
7246
6282
4594
6319
6318
6282
6318
6282
6318
6282
6320
6318
6282
4594
6318
6282
6320
6282
6318
6318
6318
4594
6318
6318
6282
6319
6282
6318
6318
6282
6320
6282
6318
6282
6318
6282
4594
6282
6319
6282
4596
6282
6320
6318
6282
4594
6282
4563
6282
6318
6282
6320
6318
6282
6318
6282
6318
6282
6318
6282
4594
6282
6318
6282
1270
6318
6282
4563
6318
6319
6282
6318
6282
6318
6318
6282
6319
6282
6318
6319
6282
6318
6318
6282
6319
6282
6318
7246
1273
6282
6318
6282
6318
6282
6318
6318
6282
6318
6318
6318
6282
6318
6282
6318
6282
6282
6282
6318
6282
6318
6282
6318
6282
4595
6282
6318
6282
6318
6282
6318
6318
6282
5190
6282
6318
6282
6318
6282
7246
6282
6318
6318
6282
6318
4594
6282
7246
6282
6318
6282
6318
6318
6318
6318
675
