In [1]:
import pandas as pd
import os


import matplotlib.pyplot as plt
%matplotlib inline

### Static Variables

In [2]:
# Bus line identifier. NOTE(review): none of the visible cells read this value
# before `line` is reassigned to '15' in a later cell.
line = "46A"

### Functions


In [3]:
# def read_file(csv_path):
#     # Reading df from file
#     df = pd.read_csv(csv_path, low_memory=False, header=None)
#     df.columns = ["Timestamp", "LineID", "JourneyPatternID", "TimeFrame", 
#                   "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID", 
#                   "AtStop", "HumanTime", "Day", "Hour", "Runtime"]

#     #convert StopID to string
#     df['StopID'] = df['StopID'].astype('str')
    
#     return df


In [35]:
def read_file(filename):
    """Load one headerless bus-data CSV into a labelled DataFrame.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pd.read_csv``. The data must have no header
        row and exactly 15 columns, otherwise the column assignment raises.

    Returns
    -------
    pandas.DataFrame
        The file contents with the 15 column names applied and ``LineID``
        converted to strings.
    """
    column_names = [
        "Timestamp", "LineID", "JourneyPatternID", "TimeFrame",
        "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID",
        "AtStop", "HumanTime", "Day", "Hour", "JourneyGroup", "Runtime",
    ]

    frame = pd.read_csv(filename, header=None, low_memory=False)
    # Names are assigned after the read so a wrong column count fails loudly.
    frame.columns = column_names

    # Line identifiers such as "46A" are not numeric; keep them all as text.
    frame["LineID"] = frame["LineID"].astype("str")
    return frame

In [36]:
def extract_pattern_order(df):
    """Rank the stops in ``df`` by their mean ``Runtime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``StopID``, ``Runtime``, ``LineID`` and
        ``JourneyPatternID`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per stop with columns ``StopID``, ``Runtime`` (the mean),
        ``Order`` (0-based rank by ascending mean runtime), ``LineID`` and
        ``JourneyPatternID`` (taken from the first row seen for that stop).
    """
    # Mean runtime recorded at each stop, smallest first.
    mean_runtime = df.groupby("StopID")["Runtime"].mean()
    ranked = mean_runtime.sort_values().reset_index()

    # After the reset, the positional index is the stop's rank.
    ranked["Order"] = ranked.index

    # Attach the line / pattern identifiers, one row per stop.
    stop_ids = df[["StopID", "LineID", "JourneyPatternID"]].drop_duplicates("StopID")
    return ranked.merge(stop_ids, on="StopID")
    
    

In [37]:
def get_location(df):
    """Return the most frequently recorded (Lon, Lat) pair for every stop.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``StopID``, ``Lon`` and ``Lat`` columns. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``StopID`` with float64 ``Lon`` and ``Lat`` columns
        holding the coordinate pair seen most often for that stop. An
        empty input yields an empty frame with those three columns.
    """
    if df.empty:
        return pd.DataFrame(columns=['StopID', 'Lon', 'Lat'])

    # Encode each coordinate pair as a single "lon_lat" key so the pair can
    # be counted as one value. Kept in a local Series so the caller's frame
    # does not gain a helper column.
    location = df['Lon'].astype(str) + "_" + df['Lat'].astype(str)

    # Most frequent key per stop (value_counts sorts by descending count).
    most_common = location.groupby(df['StopID']).agg(
        lambda keys: keys.value_counts().index[0]
    )

    # Split the key back into its two numeric parts. `n` and `expand` are
    # passed by keyword: current pandas rejects the positional form and no
    # longer allows tuple-unpacking of the `.str` accessor.
    coords = most_common.str.split('_', n=1, expand=True).astype('float64')
    coords.columns = ['Lon', 'Lat']

    # The groupby key becomes the index named 'StopID'; make it a column.
    return coords.reset_index()

In [38]:
def extract_line_order(path):
    """Build the ordered stop table for every journey pattern in one file.

    Parameters
    ----------
    path : str
        CSV file passed to ``read_file``.

    Returns
    -------
    pandas.DataFrame
        The per-pattern results of ``extract_pattern_order`` stacked in the
        order the patterns first appear in the file, with a fresh
        0..n-1 index. Empty if the file yields no patterns.

    Notes
    -----
    Stop coordinates (``get_location``) are not merged in here.
    """
    df = read_file(path)

    # Collect one frame per pattern and concatenate once. Growing a frame
    # with repeated concat calls re-copies the data each time, and seeding
    # the loop with an empty DataFrame relies on deprecated concat behaviour.
    pattern_frames = [
        extract_pattern_order(df[df.JourneyPatternID == pattern])
        for pattern in df['JourneyPatternID'].unique()
    ]

    if not pattern_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(pattern_frames, ignore_index=True)

In [39]:
def main(directory):
    """Build the combined stop-order table for every CSV in a directory.

    Parameters
    ----------
    directory : str
        Directory to scan. Only entries whose name ends in ".csv" are
        processed; a trailing slash on the path is fine.

    Returns
    -------
    pandas.DataFrame
        The results of ``extract_line_order`` for each file, stacked and
        with fully duplicated rows removed. Empty if no CSV file is found.
    """
    line_frames = []

    # os.listdir gives no ordering guarantee; sort so repeated runs process
    # the files, and therefore emit the rows, in the same order.
    for readfile in sorted(os.listdir(directory)):
        if readfile.endswith(".csv"):
            print("Opening line", readfile)
            # os.path.join avoids a doubled separator when `directory`
            # already ends with one.
            path = os.path.join(directory, readfile)
            line_frames.append(extract_line_order(path))
    print("Finished!")

    if not line_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(line_frames, ignore_index=True).drop_duplicates()

### Testing Functions


In [54]:
# dir_path = "bus_data/clean_location_data/"

# df = main(dir_path)

# df.to_csv("bus_data/static_data2/route_stops_all.csv")

Opening line 1.csv
Opening line 104.csv
Opening line 11.csv
Opening line 111.csv
Opening line 114.csv
Opening line 116.csv
Opening line 118.csv
Opening line 120.csv
Opening line 123.csv
Opening line 13.csv
Opening line 130.csv
Opening line 14.csv
Opening line 140.csv
Opening line 142.csv
Opening line 145.csv
Opening line 14C.csv
Opening line 15.csv
Opening line 150.csv
Opening line 151.csv
Opening line 15A.csv
Opening line 16.csv
Opening line 161.csv
Opening line 16C.csv
Opening line 17A.csv
Opening line 184.csv
Opening line 185.csv
Opening line 220.csv
Opening line 236.csv
Opening line 239.csv
Opening line 25.csv
Opening line 25A.csv
Opening line 25X.csv
Opening line 26.csv
Opening line 27.csv
Opening line 270.csv
Opening line 27A.csv
Opening line 27X.csv
Opening line 29A.csv
Opening line 31.csv
Opening line 31A.csv
Opening line 31B.csv
Opening line 32.csv
Opening line 32A.csv
Opening line 32B.csv
Opening line 32X.csv
Opening line 33.csv
Opening line 33A.csv
Opening line 33B.csv
Openi

In [55]:
# Select the line to inspect and load its raw records with read_file.
# NOTE(review): this rebinds the module-level names `line` and `df`, so any
# later cell that uses `df` sees this single-line frame.
line = '15'

df = read_file("bus_data/clean_location_data/" + line + ".csv")


In [57]:
# Keep only the rows whose JourneyPatternID equals 1.
df_1 = df[df.JourneyPatternID == 1]

In [58]:
# extracting stop order from mean time to reach the stop
runtimes = df_1.groupby("StopID").Runtime.mean()
order = runtimes.sort_values().to_frame()
# order.reset_index(level=0, inplace=True)
# order['Order'] = order.index

# # merging these with LineID and JourneyPatternID
# line_df = df[["StopID", "LineID", "JourneyPatternID"]].drop_duplicates("StopID")
# final_df = pd.merge(order, line_df, on='StopID')

# return final_df

In [59]:
# Display the stops sorted by mean runtime.
# NOTE(review): in the recorded output the first nine stops all have a mean
# Runtime of 0.0, so their relative order is not determined by Runtime.
order

Unnamed: 0_level_0,Runtime
StopID,Unnamed: 1_level_1
299,0.000000
6335,0.000000
6326,0.000000
497,0.000000
4887,0.000000
4886,0.000000
1164,0.000000
1155,0.000000
664,0.000000
6318,4.588441


In [None]:
	StopID	stop_sequence
89016	6318	1
89017	6319	2
89018	7246	3
89019	6320	4
89020	4594	5
89021	4595	6
89022	4596	7
89023	4563	8
89024	1218	9
89025	1270	10
89026	1272	11
89027	1273	12
89028	1274	13
89029	1275	14
89030	1276	15
89031	1277	16
89032	1219	17
89033	1220	18
89034	1221	19

In [53]:
# Rows for line '15', journey pattern 1.
# NOTE(review): the recorded output below has StopID/Runtime/Order columns and
# this cell's execution count (53) is lower than that of the cell that assigns
# `df` from read_file (55). Presumably `df` held the output of main() when
# this ran; on a top-to-bottom run it would be the raw frame instead. Confirm.
line_15 = df[(df.LineID == '15') & (df.JourneyPatternID == 1)]

# line_15 = line_15[["StopID", "Order"]]
line_15

Unnamed: 0,StopID,Runtime,Order,LineID,JourneyPatternID
1547,299,0.000000,0,15,1
1548,6335,0.000000,1,15,1
1549,6326,0.000000,2,15,1
1550,497,0.000000,3,15,1
1551,4887,0.000000,4,15,1
1552,4886,0.000000,5,15,1
1553,1164,0.000000,6,15,1
1554,1155,0.000000,7,15,1
1555,664,0.000000,8,15,1
1556,6318,4.588441,9,15,1


In [None]:
# Load the 2013 scheduled-data CSV, decoding it as ISO-8859-1.
# NOTE(review): this cell has no execution count, and the file's columns are
# not shown anywhere in the notebook.
schedule = pd.read_csv("bus_data/dublinbus_scheduledData2013csv.csv", low_memory=False, encoding="ISO-8859-1")

In [52]:
# Shape of line_15 after dropping fully duplicated rows.
line_15.drop_duplicates().shape

(83, 5)

In [None]:
# NOTE(review): this statement looks unfinished. It indexes `schedule` with
# itself instead of a boolean condition, and the cell was never executed.
# Presumably the intent was to keep only the line-15 rows; the column to
# filter on is not visible here. TODO confirm and complete.
schedule_15 = schedule[schedule]