In [34]:
import pandas as pd
import os


import matplotlib.pyplot as plt
%matplotlib inline

### Static Variables

In [35]:
# Line identifier kept as a notebook-level value.
# NOTE(review): none of the functions visible in this notebook read it
# (get_common_stops binds its own local `line`), and a later cell rebinds
# it to '15' - confirm whether this cell is still needed.
line = "46A"

### Functions


In [36]:
def read_schedule(schedule_file):
    """Load the schedule CSV into a DataFrame.

    The file is decoded as ISO-8859-1. ``low_memory=False`` makes pandas
    infer each column's type from the whole file instead of chunk by chunk.

    Parameters
    ----------
    schedule_file
        Path (or any other input accepted by ``pandas.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The parsed file; no cleaning is applied here.
    """
    return pd.read_csv(schedule_file, encoding="ISO-8859-1", low_memory=False)


In [46]:
def clean_schedule(schedule):
    """Derive stop/route keys from the schedule and drop unusable rows.

    Steps, in order:

    1. Drop every row that contains a null in any column.
    2. ``StopID``: the last four characters of ``stop_id``, as int64.
    3. ``LineID``: the text between the first two hyphens of ``trip_id``.
    4. ``Direction``: the last character of ``trip_id``; only rows with
       "I" or "O" are kept.
    5. ``JourneyPatternID``: 1001 for "I" and 1 for "O" (int64).
    6. Drop rows whose ``Name without locality`` contains "set down",
       "Set Down", "Fake" or "Virtual", or whose ``Name`` contains "Fake"
       or "Virtual".
    7. Rename the ``Long`` column to ``Lon``.

    All other columns are retained.

    Parameters
    ----------
    schedule : pandas.DataFrame
        Must provide the columns ``stop_id``, ``trip_id``, ``Name`` and
        ``Name without locality``. It is not modified; the result is a
        new frame.

    Returns
    -------
    pandas.DataFrame
        The cleaned schedule.
    """
    # `axis` must be passed by keyword (positional use is rejected by
    # pandas >= 2.0). Copying keeps the caller's frame untouched and makes
    # the column assignments below unambiguous.
    cleaned = schedule.dropna(axis=0).copy()

    cleaned["StopID"] = cleaned["stop_id"].str[-4:].astype("int64")

    # Raw string: a hyphen needs no escaping outside a character class.
    # expand=False returns a Series rather than a one-column DataFrame.
    cleaned["LineID"] = cleaned["trip_id"].str.extract(r"-(.*?)-", expand=False)

    cleaned["Direction"] = cleaned["trip_id"].str[-1:]

    # Only the inbound ("I") / outbound ("O") journey patterns are kept.
    cleaned = cleaned[cleaned["Direction"].isin(["I", "O"])].copy()
    cleaned["JourneyPatternID"] = (
        cleaned["Direction"].map({"I": 1001, "O": 1}).astype("int64")
    )

    # "set down" / "Set Down" are only looked for in the short name,
    # exactly as before; "Fake" / "Virtual" are looked for in both names.
    placeholder_short_name = cleaned["Name without locality"].str.contains(
        r"set down|Set Down|Fake|Virtual"
    )
    placeholder_full_name = cleaned["Name"].str.contains(r"Fake|Virtual")
    cleaned = cleaned[~(placeholder_short_name | placeholder_full_name)]

    return cleaned.rename(columns={"Long": "Lon"})

In [47]:
def get_common_stops(df, schedule):
    """Keep only the rows of ``df`` whose stop also occurs in the schedule.

    The line and journey pattern are read from the first row of ``df``;
    the schedule is narrowed to that line/pattern before the stop sets
    are compared. The number of stops present in only one of the two
    sets is printed as a diagnostic.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``LineID``, ``JourneyPatternID`` and ``StopID`` columns and
        at least one row.
    schedule : pandas.DataFrame
        Needs ``LineID``, ``JourneyPatternID`` and ``StopID`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``StopID`` is in both stop sets.
    """
    line_id = str(df['LineID'].iloc[0])
    pattern_id = df['JourneyPatternID'].iloc[0]

    on_route = (schedule.LineID == line_id) & (schedule.JourneyPatternID == pattern_id)
    scheduled_stops = set(schedule[on_route].StopID.unique())
    observed_stops = set(df.StopID.unique())

    mismatched = observed_stops.symmetric_difference(scheduled_stops)
    print("Difference", line_id, pattern_id, len(mismatched))

    shared_stops = observed_stops.intersection(scheduled_stops)
    return df[df.StopID.isin(shared_stops)]
    

In [48]:
def read_file(filename):
    """Load one CSV file and make sure its ``LineID`` column holds strings.

    The first row of the file is used as the header (pandas default), so
    the file must already carry column names, including ``LineID``.

    Parameters
    ----------
    filename
        Path (or any other input accepted by ``pandas.read_csv``).

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``LineID`` cast to ``str``.
    """
    frame = pd.read_csv(filename, low_memory=False)
    return frame.assign(LineID=frame['LineID'].astype('str'))

In [49]:
def extract_pattern_order(df):
    """Rank the stops in ``df`` by their mean ``Runtime``.

    Stops are sorted by ascending mean ``Runtime``; ``Order`` is the
    0-based position of each stop in that sorted list. ``LineID`` and
    ``JourneyPatternID`` are taken from the first row seen for each stop.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``StopID``, ``Runtime``, ``LineID`` and ``JourneyPatternID``.

    Returns
    -------
    pandas.DataFrame
        One row per stop with the columns ``StopID``, ``Runtime`` (mean),
        ``Order``, ``LineID`` and ``JourneyPatternID``.
    """
    mean_runtime = df.groupby("StopID")["Runtime"].mean()

    # Sorting by mean runtime gives the stop sequence; the fresh integer
    # index produced by reset_index doubles as the rank.
    ranked = mean_runtime.sort_values().reset_index()
    ranked['Order'] = ranked.index

    route_keys = df.loc[:, ["StopID", "LineID", "JourneyPatternID"]]
    route_keys = route_keys.drop_duplicates(subset="StopID")

    return ranked.merge(route_keys, on='StopID')
    
    

In [50]:
# def get_location(df):
    
#     # Concat location data
#     df['location'] = ""
#     df['location'] = df['Lon'].astype(str) + "_" + df['Lat'].astype(str)
    
#     # Extracting the most frequent locations for each stop
#     groups = df.groupby('StopID')
#     loc_df = groups['location'].agg(lambda x:x.value_counts().index[0]).to_frame()
#     loc_df.reset_index(level=0, inplace=True)

#     # Converting location back to float columns
#     loc_df['Lon'], loc_df['Lat'] = loc_df['location'].str.split('_', 1).str
#     loc_df['Lon'], loc_df['Lat'] = loc_df['Lon'].astype('float64'), loc_df['Lat'].astype('float64')

#     # drop concatenated column 'location'
#     loc_df = loc_df.drop('location', axis=1)
#     return loc_df

In [51]:
def extract_line_order(path, schedule):
    """Build the stop order for every journey pattern found in one file.

    The file is loaded with ``read_file``. For each distinct
    ``JourneyPatternID`` the rows are restricted to stops shared with the
    schedule (``get_common_stops``) and ranked by mean runtime
    (``extract_pattern_order``).

    Parameters
    ----------
    path
        CSV file to load.
    schedule : pandas.DataFrame
        Cleaned schedule, passed through to ``get_common_stops``.

    Returns
    -------
    pandas.DataFrame
        The per-pattern results stacked with a fresh integer index; an
        empty frame when the file contains no journey patterns.
    """
    df = read_file(path)

    # Collect the pieces and concatenate once: growing a frame inside the
    # loop re-copies every earlier row on each pass and relies on
    # concatenating with an empty seed frame.
    per_pattern = []
    for pattern in df['JourneyPatternID'].unique():
        pattern_df = df[df.JourneyPatternID == pattern]
        pattern_df = get_common_stops(pattern_df, schedule)
        per_pattern.append(extract_pattern_order(pattern_df))

    if not per_pattern:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(per_pattern, ignore_index=True)

In [52]:
def main(directory, schedule_file):
    """Compute the stop order for every ``.csv`` file in ``directory``.

    The schedule is loaded and cleaned once, then each CSV file is
    processed with ``extract_line_order``. Progress is printed per file.

    Parameters
    ----------
    directory : str
        Folder scanned (non-recursively) for files ending in ``.csv``.
    schedule_file
        Schedule CSV handed to ``read_schedule``.

    Returns
    -------
    pandas.DataFrame
        All per-file results stacked together with exact duplicate rows
        removed; an empty frame when no ``.csv`` file is found.
    """
    schedule = clean_schedule(read_schedule(schedule_file))

    line_frames = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and therefore the output) reproducible.
    for readfile in sorted(os.listdir(directory)):
        if not readfile.endswith(".csv"):
            continue
        print("Opening line", readfile)
        # os.path.join copes with `directory` given with or without a
        # trailing separator.
        path = os.path.join(directory, readfile)
        line_frames.append(extract_line_order(path, schedule))

    print("Finished!")

    if not line_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(line_frames, ignore_index=True).drop_duplicates()

In [53]:
def get_unique_routes(df):
    """Return the distinct (``LineID``, ``JourneyPatternID``) pairs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``LineID`` and ``JourneyPatternID`` columns.

    Returns
    -------
    pandas.DataFrame
        Those two columns only, one row per distinct pair, keeping the
        index label of each pair's first occurrence.
    """
    route_columns = ['LineID', 'JourneyPatternID']
    return df.loc[:, route_columns].drop_duplicates()

### Testing Functions


In [54]:
# Inputs: folder of per-line CSV files and the schedule file.
dir_path = "bus_data/clean_data_final/"
schedule_file = 'bus_data/dublinbus_scheduledData2013csv.csv'
output_dir = "bus_data/static_data3/"

# DataFrame.to_csv does not create missing folders. Create the target
# before the processing run so a missing folder cannot discard the
# results at the final write.
os.makedirs(output_dir, exist_ok=True)

df = main(dir_path, schedule_file)
routes = get_unique_routes(df)

df.to_csv(output_dir + "route_stops_all.csv")
routes.to_csv(output_dir + "routes_all.csv")



Opening line 1.csv
Difference 1 1001 1
Difference 1 1 0
Opening line 102.csv
Difference 102 1001 2
Difference 102 1 0
Opening line 104.csv
Difference 104 1 0
Difference 104 1001 2
Opening line 11.csv
Difference 11 1001 2
Difference 11 1 4
Opening line 111.csv
Difference 111 1001 0
Difference 111 1 1
Opening line 114.csv
Difference 114 1001 1
Difference 114 1 1
Opening line 116.csv
Difference 116 1001 7
Difference 116 1 6
Opening line 118.csv
Difference 118 1001 8
Opening line 120.csv
Difference 120 1 12
Difference 120 1001 12
Opening line 122.csv
Difference 122 1 0
Difference 122 1001 0
Opening line 123.csv
Difference 123 1001 1
Difference 123 1 1
Opening line 13.csv
Difference 13 1001 2
Difference 13 1 3
Opening line 130.csv
Difference 130 1001 0
Difference 130 1 0
Opening line 14.csv
Difference 14 1 5
Difference 14 1001 2
Opening line 140.csv
Difference 140 1 4
Difference 140 1001 2
Opening line 142.csv
Difference 142 1001 2
Difference 142 1 0
Opening line 145.csv
Difference 145 1001

In [27]:
# Load the CSV for line 15 from the clean_data_rough folder for inspection.
# NOTE(review): this rebinds the notebook-level names `line` and `df`;
# `df` was previously the result of main(), so cells that use `df`
# depend on which of the two cells ran last.
line = '15'

df = read_file("bus_data/clean_data_rough/" + line + ".csv")


In [28]:
# Keep only the rows whose JourneyPatternID equals 1.
df_1 = df[df.JourneyPatternID == 1]

In [29]:
# Scratch copy of the first two statements of extract_pattern_order,
# applied to df_1: mean Runtime per StopID, sorted ascending, as a
# one-column frame indexed by StopID.
# extracting stop order from mean time to reach the stop
runtimes = df_1.groupby("StopID").Runtime.mean()
order = runtimes.sort_values().to_frame()
# The remaining steps of extract_pattern_order are left disabled below.
# order.reset_index(level=0, inplace=True)
# order['Order'] = order.index

# # merging these with LineID and JourneyPatternID
# line_df = df[["StopID", "LineID", "JourneyPatternID"]].drop_duplicates("StopID")
# final_df = pd.merge(order, line_df, on='StopID')

# return final_df

In [30]:
# Display the stops sorted by mean Runtime for inspection.
order

Unnamed: 0_level_0,Runtime
StopID,Unnamed: 1_level_1
6318,0.941985
6319,159.966214
7246,184.777260
6320,261.104751
4594,346.595810
4595,436.844461
4596,478.071462
4563,549.546701
1218,625.869832
1270,691.943562


In [None]:
	StopID	stop_sequence
89016	6318	1
89017	6319	2
89018	7246	3
89019	6320	4
89020	4594	5
89021	4595	6
89022	4596	7
89023	4563	8
89024	1218	9
89025	1270	10
89026	1272	11
89027	1273	12
89028	1274	13
89029	1275	14
89030	1276	15
89031	1277	16
89032	1219	17
89033	1220	18
89034	1221	19

In [53]:
# Rows for line '15', journey pattern 1.
# NOTE(review): the output recorded for this cell has the columns
# StopID, Runtime, Order, LineID and JourneyPatternID, so `df` here
# appears to be the result of main(), not the file loaded by the
# read_file cell above - confirm the intended execution order.
line_15 = df[(df.LineID == '15') & (df.JourneyPatternID == 1)]

# line_15 = line_15[["StopID", "Order"]]
line_15

Unnamed: 0,StopID,Runtime,Order,LineID,JourneyPatternID
1547,299,0.000000,0,15,1
1548,6335,0.000000,1,15,1
1549,6326,0.000000,2,15,1
1550,497,0.000000,3,15,1
1551,4887,0.000000,4,15,1
1552,4886,0.000000,5,15,1
1553,1164,0.000000,6,15,1
1554,1155,0.000000,7,15,1
1555,664,0.000000,8,15,1
1556,6318,4.588441,9,15,1


In [None]:
# Load the schedule file without any cleaning.
# NOTE(review): this repeats the read_csv call inside read_schedule with
# the same options; calling read_schedule would avoid the duplication.
schedule = pd.read_csv("bus_data/dublinbus_scheduledData2013csv.csv", low_memory=False, encoding="ISO-8859-1")

In [52]:
# (rows, columns) of line_15 after removing exact duplicate rows.
line_15.drop_duplicates().shape

(83, 5)

In [None]:
# NOTE(review): unfinished cell - the frame is indexed with itself, which
# is not a row filter. Presumably a condition selecting line 15 was
# intended (compare line_15 above); note that the unprocessed schedule has
# no LineID column until clean_schedule adds it. TODO: supply the intended
# condition.
schedule_15 = schedule[schedule]