In [6]:
import pandas as pd
import os

In [7]:
def read_data(filename):
    """Load one headerless bus-data CSV and label its fifteen columns.

    Parameters
    ----------
    filename
        Forwarded unchanged as the first argument of ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file with its columns renamed, in order, to the names
        listed in ``column_names`` below.

    Notes
    -----
    The names are assigned after parsing, so pandas raises a length-mismatch
    error if the file does not contain exactly fifteen columns.
    """
    column_names = [
        "Timestamp", "LineID", "JourneyPatternID", "TimeFrame",
        "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID",
        "AtStop", "HumanTime", "Day", "Hour", "JourneyGroup", "Runtime",
    ]

    # header=None: the first row of the file is data, not column labels.
    frame = pd.read_csv(filename, header=None, low_memory=False)
    frame.columns = column_names
    return frame

In [8]:
# Gather the (StopID, LineID, JourneyPatternID) columns from every file in the
# cleaned-data directory into a single frame.
#
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copies all previously accumulated rows on every iteration.
data_dir = 'bus_data/clean_data5'
stop_columns = ['StopID', 'LineID', 'JourneyPatternID']

# sorted() makes the row order reproducible; os.listdir returns entries in
# arbitrary order.
stop_frames = [
    read_data(os.path.join(data_dir, filename))[stop_columns]
    for filename in sorted(os.listdir(data_dir))
]

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the expected columns when the directory holds no files.
stops = pd.concat(stop_frames) if stop_frames else pd.DataFrame(columns=stop_columns)

In [9]:
backup = stops.copy()
stops.shape

(14799437, 3)

In [10]:
final_stops = stops.drop_duplicates()

In [11]:
final_stops.head()

Unnamed: 0,StopID,LineID,JourneyPatternID
0,381.0,1,1001.0
1,382.0,1,1001.0
2,4451.0,1,1001.0
3,383.0,1,1001.0
4,384.0,1,1001.0


In [13]:
# Some stops may serve different buses in different directions
final_stops.shape

(11438, 3)

In [14]:
final_stops = final_stops.astype({'StopID':'int64','LineID':str,'JourneyPatternID':'int64'})

In [15]:
# Casting LineID to string leaves float-style strings, so cut the text at the first "."
def remove_float(string):
    """Return the part of `string` that precedes its first "." character.

    A string without a "." is returned unchanged.
    """
    head, _, _ = string.partition(".")
    return head
# Apply remove_float to every LineID value.
final_stops["LineID"] = final_stops["LineID"].apply(remove_float)

In [16]:
final_stops.head()

Unnamed: 0,StopID,LineID,JourneyPatternID
0,381,1,1001
1,382,1,1001
2,4451,1,1001
3,383,1,1001
4,384,1,1001


In [17]:
final_stops.dtypes

StopID               int64
LineID              object
JourneyPatternID     int64
dtype: object

In [18]:
#has no effect
final_stops = final_stops.drop_duplicates()

In [19]:
final_stops.shape

(11438, 3)

In [20]:
#TEST
final_stops[(final_stops.LineID=="1")]

Unnamed: 0,StopID,LineID,JourneyPatternID
0,381,1,1001
1,382,1,1001
2,4451,1,1001
3,383,1,1001
4,384,1,1001
5,385,1,1001
6,387,1,1001
7,389,1,1001
8,393,1,1001
9,371,1,1001


In [21]:
final_stops = final_stops.sort_values('StopID')

In [22]:
# Renumber the sorted rows 0..n-1. drop=True discards the old index directly
# instead of materialising it as an 'index' column and dropping it afterwards;
# the former drop('index', 1) passed axis positionally, which pandas 2.0 rejects.
our_stops = final_stops.reset_index(drop=True)

In [24]:
our_stops.sort_values(['LineID','JourneyPatternID','StopID'], axis=0)

Unnamed: 0,StopID,LineID,JourneyPatternID
136,44,1,1
143,45,1,1
150,46,1,1
162,47,1,1
168,48,1,1
179,49,1,1
194,50,1,1
216,51,1,1
224,52,1,1
351,119,1,1


In [25]:
our_stops = our_stops.reset_index()

In [26]:
# Remove the 'index' column by keyword: pandas 2.0 no longer accepts the axis
# as a positional argument to DataFrame.drop.
our_stops = our_stops.drop(columns='index')

In [27]:
our_stops

Unnamed: 0,StopID,LineID,JourneyPatternID
0,2,46A,1001
1,2,46E,1001
2,2,38B,1
3,2,38A,1
4,2,38,1
5,3,120,1
6,3,122,1001
7,4,9,1001
8,6,4,1001
9,7,40,1001


In [36]:
our_stops.StopID.value_counts()

786     23
1445    22
794     22
1477    21
1444    21
1479    21
747     21
1478    20
1476    16
748     16
400     16
497     16
768     16
793     16
614     15
675     15
406     15
523     15
1358    15
315     15
792     15
515     14
618     14
4384    14
52      14
619     14
51      14
763     14
617     14
616     14
        ..
6359     1
2261     1
212      1
6343     1
4294     1
6335     1
4286     1
6319     1
4438     1
4446     1
4654     1
2405     1
2589     1
4606     1
500      1
4590     1
2533     1
468      1
4558     1
4534     1
4526     1
4518     1
2461     1
4502     1
2453     1
4494     1
2445     1
2429     1
2421     1
2049     1
Name: StopID, dtype: int64

In [30]:
our_stops[(our_stops.LineID=="1") & (our_stops.JourneyPatternID==1001)]

Unnamed: 0,StopID,LineID,JourneyPatternID
20,10,1,1001
25,12,1,1001
36,14,1,1001
45,15,1,1001
54,17,1,1001
67,18,1,1001
77,19,1,1001
85,21,1,1001
294,85,1,1001
518,203,1,1001


## Read in Schedule Data

In [32]:
schedule = pd.read_csv("bus_data/routestations3.csv",index_col=0, low_memory=False, encoding="ISO-8859-1")

In [33]:
schedule.head()

Unnamed: 0,LineID,JourneyPatternID,stop_sequence,StopID,stop_headsign,Name,Name without locality,Lat,Long
0,1,1,1,226,Sandymount,Shanard Road,Shanard Avenue,53.39114,-6.262185
1,1,1,2,228,Sandymount,Shanliss Rd,Oldtown Avenue,53.391852,-6.259796
2,1,1,3,229,Sandymount,Shanliss Rd,Oldtown Road,53.391399,-6.256521
3,1,1,4,227,Sandymount,Shanliss Rd,Shanliss Drive,53.391143,-6.251314
4,1,1,5,230,Sandymount,Shanliss Rd,Shanowen Road,53.389888,-6.24905


In [34]:
schedule.shape

(11567, 9)

In [35]:
# routestations = pd.merge(schedule, our_stops, on='StopID')
# routestations.shape

In [37]:
# Inner-join the schedule with the observed stops; a row survives only when
# StopID, JourneyPatternID and LineID all match.
merge_keys = ['StopID','JourneyPatternID','LineID']
routestations = schedule.merge(our_stops, how='inner', on=merge_keys)
routestations.shape

(11338, 9)

In [38]:
# schedule[schedule.LineID=="1"]

In [39]:
# routestations[routestations.LineID=="1"]

In [40]:
routestations = routestations.sort_values(['LineID','JourneyPatternID','stop_sequence'], axis=0)

In [41]:
routestations.head()

Unnamed: 0,LineID,JourneyPatternID,stop_sequence,StopID,stop_headsign,Name,Name without locality,Lat,Long
0,1,1,1,226,Sandymount,Shanard Road,Shanard Avenue,53.39114,-6.262185
1,1,1,2,228,Sandymount,Shanliss Rd,Oldtown Avenue,53.391852,-6.259796
2,1,1,3,229,Sandymount,Shanliss Rd,Oldtown Road,53.391399,-6.256521
3,1,1,4,227,Sandymount,Shanliss Rd,Shanliss Drive,53.391143,-6.251314
4,1,1,5,230,Sandymount,Shanliss Rd,Shanowen Road,53.389888,-6.24905


In [42]:
routestations.to_csv("common_routestations3.csv")

In [43]:
#TEST ROUTE
route = routestations[(routestations.LineID=="15B")&(routestations.JourneyPatternID==1)]

In [44]:
route

Unnamed: 0,LineID,JourneyPatternID,stop_sequence,StopID,stop_headsign,Name,Name without locality,Lat,Long
2068,15B,1,3,7371,Stocking Ave,Rogerson's Quay,Lime Street,53.346314,-6.242981
2069,15B,1,4,7221,Stocking Ave,City Quay,Creighton Street,53.346547,-6.245991
2070,15B,1,5,399,Stocking Ave,Pearse Street,Westland Row,53.343848,-6.248264
2071,15B,1,6,400,Stocking Ave,Pearse Street,Hawkins Street,53.344759,-6.252778
2072,15B,1,7,348,Stocking Ave,College Street,D'Olier Street,53.345202,-6.258497
2073,15B,1,8,403,Stocking Ave,Nassau Street,South Frederick Street,53.342713,-6.256691
2074,15B,1,9,746,Stocking Ave,Kildare Street,Natural History Museum,53.340126,-6.255565
2075,15B,1,10,844,Stocking Ave,St. Stephen's Green,Loreto Convent,53.33717,-6.256212
2076,15B,1,11,1014,Stocking Ave,Earlsfort Tce,Leeson Street,53.335676,-6.257339
2077,15B,1,12,1015,Stocking Ave,Adelaide Road,Earlsfort Terrace,53.33273,-6.259936


# Extract Common Stops

In [45]:
common_stops = routestations[['StopID','Name','Name without locality','Lat','Long']]

In [46]:
common_stops.shape

(11338, 5)

In [47]:
common_stops = common_stops.drop_duplicates()

In [48]:
common_stops = common_stops.sort_values('StopID')

In [49]:
common_stops.head()

Unnamed: 0,StopID,Name,Name without locality,Lat,Long
5360,2,Parnell Square,Rotunda Hospital,53.352244,-6.263693
639,3,Parnell Square,Rotunda Hospital,53.352308,-6.263781
11305,4,Parnell Square,Rotunda Hospital,53.352565,-6.264161
6041,6,Parnell Square,Rotunda Hospital,53.35274,-6.264439
1026,7,Parnell Square,Rotunda Hospital,53.352832,-6.264556


In [50]:
# Renumber rows 0..n-1 after sorting. drop=True discards the old index in one
# step; the former drop('index', 1) passed axis positionally, which pandas 2.0
# rejects.
common_stops = common_stops.reset_index(drop=True)
common_stops

Unnamed: 0,StopID,Name,Name without locality,Lat,Long
0,2,Parnell Square,Rotunda Hospital,53.352244,-6.263693
1,3,Parnell Square,Rotunda Hospital,53.352308,-6.263781
2,4,Parnell Square,Rotunda Hospital,53.352565,-6.264161
3,6,Parnell Square,Rotunda Hospital,53.352740,-6.264439
4,7,Parnell Square,Rotunda Hospital,53.352832,-6.264556
5,8,Parnell Square,Rotunda Hospital,53.353263,-6.265169
6,10,Parnell Square,Rotunda Hospital,53.353383,-6.265389
7,11,Dorset St,St. Joseph's Parade,53.357098,-6.264367
8,12,Dorset St,St. Joseph's Parade,53.356787,-6.264620
9,14,Dorset St,Eccles Place,53.358540,-6.262731


In [51]:
common_stops.shape

(4585, 5)

In [52]:
common_stops.to_csv('common_stops3.csv')

# Extract Headsigns

In [57]:
lines = routestations[['LineID','JourneyPatternID','stop_headsign']]

In [58]:
# Keep one row per distinct (LineID, JourneyPatternID, stop_headsign) and
# renumber the rows. reset_index(drop=True) replaces the reset_index() +
# drop('index', 1) pair, whose positional axis argument pandas 2.0 rejects.
lines = lines.drop_duplicates().reset_index(drop=True)
lines.head()

Unnamed: 0,LineID,JourneyPatternID,stop_headsign
0,1,1,Sandymount
1,1,1001,Shanard Road
2,102,1,Airport
3,102,1001,Station Rd
4,104,1,Shanard Road


In [59]:
lines

Unnamed: 0,LineID,JourneyPatternID,stop_headsign
0,1,1,Sandymount
1,1,1001,Shanard Road
2,102,1,Airport
3,102,1001,Station Rd
4,104,1,Shanard Road
5,104,1001,Clontarf Road
6,11,1,Sandyford I.E.
7,11,1001,Wadelai Pk
8,111,1,Dun Laoghaire
9,111,1001,Loughlinstown


In [60]:
lines.to_csv('common_lines3.csv')

# Read in Timetable
Use only the lines for which we have data.

In [168]:
timetable = pd.read_csv("bus_data/static_data/simple_timetable.csv",index_col=0)

In [169]:
timetable.head()

Unnamed: 0,Service,Departure,Direction,LineID
0,saturday,5:45,From Dublin Airport Towards Sutton Station,102
1,saturday,6:45,From Dublin Airport Towards Sutton Station,102
2,saturday,7:10,From Dublin Airport Towards Sutton Station,102
3,saturday,7:20,From Dublin Airport Towards Sutton Station,102
4,saturday,7:05,From Dublin Airport Towards Sutton Station,102


In [170]:
timetable.columns

Index(['Service', 'Departure', 'Direction', 'LineID'], dtype='object')

In [171]:
timetable.dtypes

Service      object
Departure    object
Direction    object
LineID       object
dtype: object

In [172]:
# Trim any ASCII letters from both ends of each departure string so that only
# the time text remains in the new 'Time' column.
alphas = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

timetable['Time'] = timetable['Departure'].map(lambda departure: departure.strip(alphas))

In [173]:
# timetable['Time']

In [174]:
#Clean Cells
def five_length(string):
    """Return `string` trimmed of surrounding whitespace and zero-padded to five characters.

    A four-character value such as "5:45" becomes "05:45"; any other length is
    returned as-is (after trimming).

    Whitespace is removed *before* the length test. Previously a value like
    " 5:45" was already five characters long, so it skipped the padding; once
    whitespace was stripped later it was four characters and failed the
    five-character filter, silently dropping a valid departure.

    Parameters
    ----------
    string : str
        Raw time text.

    Returns
    -------
    str
        The trimmed text, with a leading "0" added when it is four characters long.
    """
    cleaned = string.strip()
    # ensure leading zero
    if len(cleaned) == 4:
        return "0" + cleaned
    return cleaned
    
# Run five_length over every value in the Time column.
timetable["Time"] = timetable["Time"].apply(five_length)

In [175]:
# Get rid of leading and trailing whitespace in each time string
timetable["Time"] = timetable["Time"].map(lambda text: text.strip())

In [176]:
# timetable.head()

In [177]:
# Time should be in format XX:XX, so keep only the rows whose time string is
# exactly five characters long.
timetable = timetable.loc[timetable['Time'].map(len).eq(5)]

In [178]:
# Drop the 'Departure' column by keyword: pandas 2.0 no longer accepts the
# axis as a positional argument to DataFrame.drop.
timetable = timetable.drop(columns='Departure')

In [179]:
#Convert to datetime necessary?
# timetable['Time'] = pd.to_datetime(timetable['Time'])

In [180]:
# timetable.Time
# for i in timetable.Time:
#     if len(i)!=5:
#         print("Heck")

timetable.Time

0     05:45
1     06:45
2     07:10
3     07:20
4     07:05
5     07:25
6     08:20
7     08:05
8     08:25
9     09:00
10    09:25
11    10:15
12    10:45
13    11:20
14    12:00
15    12:20
16    12:40
17    13:05
18    13:40
19    14:15
20    14:40
21    15:10
22    15:40
23    16:10
24    16:35
25    17:10
26    17:35
27    18:05
28    18:40
29    19:00
      ...  
40    15:45
41    16:00
42    16:10
43    16:20
44    16:30
45    16:40
46    16:50
47    17:00
48    17:10
49    17:25
50    17:40
51    17:55
52    18:10
53    18:25
54    18:40
55    18:55
56    19:10
57    19:20
58    19:40
59    20:00
60    20:20
61    20:40
62    21:00
63    21:20
64    21:40
65    22:00
66    22:20
67    22:40
68    23:00
69    23:20
Name: Time, dtype: object

In [181]:
#Make LineIDs upper case
def make_upper(string):
    """Return `string` converted to upper case."""
    return string.upper()

# Upper-case every LineID value via make_upper.
timetable["LineID"] = timetable["LineID"].apply(make_upper)

In [182]:
# Inner join on LineID alone: keeps only timetable rows whose LineID also
# appears in `lines`, and attaches JourneyPatternID and stop_headsign.
# NOTE(review): `lines` can hold several rows per LineID (one per distinct
# JourneyPatternID / stop_headsign), so each departure is repeated for every
# such row regardless of the timetable's Direction — confirm whether Direction
# should be matched to a JourneyPatternID before merging.
common_timetable = pd.merge(timetable,lines,on=['LineID'])

In [183]:
common_timetable.LineID.nunique()
# common_timetable.LineID.unique()

107

In [184]:
timetable.LineID.nunique()
# timetable.LineID.unique()

130

In [185]:
lines.LineID.nunique()
# lines.LineID.unique()

116

In [189]:
common_timetable[(common_timetable.LineID == "102")&(common_timetable.Service == "saturday")&(common_timetable.JourneyPatternID == 1)]

Unnamed: 0,Service,Direction,LineID,Time,JourneyPatternID,stop_headsign
0,saturday,From Dublin Airport Towards Sutton Station,102,05:45,1,Airport
2,saturday,From Dublin Airport Towards Sutton Station,102,06:45,1,Airport
4,saturday,From Dublin Airport Towards Sutton Station,102,07:10,1,Airport
6,saturday,From Dublin Airport Towards Sutton Station,102,07:20,1,Airport
8,saturday,From Dublin Airport Towards Sutton Station,102,07:05,1,Airport
10,saturday,From Dublin Airport Towards Sutton Station,102,07:25,1,Airport
12,saturday,From Dublin Airport Towards Sutton Station,102,08:20,1,Airport
14,saturday,From Dublin Airport Towards Sutton Station,102,08:05,1,Airport
16,saturday,From Dublin Airport Towards Sutton Station,102,08:25,1,Airport
18,saturday,From Dublin Airport Towards Sutton Station,102,09:00,1,Airport


In [190]:
common_timetable.to_csv('common_timetable.csv')