In [58]:
import pandas as pd
import os


import matplotlib.pyplot as plt
%matplotlib inline

### Static Variables

In [59]:
# Bus line identifier kept as a notebook-level constant.
# NOTE(review): none of the function or run cells visible in this notebook reference it.
line = "46A"

### Functions


In [60]:
def read_file(csv_path):
    """Load a header-less bus-data CSV and label its columns.

    Parameters
    ----------
    csv_path : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with the fourteen column names assigned below and
        ``StopID`` cast to string.
    """
    column_names = [
        "Timestamp", "LineID", "JourneyPatternID", "TimeFrame",
        "VehicleJourneyID", "Lon", "Lat", "VehicleID", "StopID",
        "AtStop", "HumanTime", "Day", "Hour", "Runtime",
    ]

    # The file carries no header row, so the names are attached afterwards.
    frame = pd.read_csv(csv_path, low_memory=False, header=None)
    frame.columns = column_names

    # Stop identifiers are handled as strings downstream.
    frame['StopID'] = frame['StopID'].astype('str')

    return frame


In [61]:
def extract_pattern_order(df):
    """Rank the stops of one journey pattern by their mean runtime.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least ``StopID``, ``Runtime``, ``LineID`` and
        ``JourneyPatternID`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per retained stop with columns ``StopID``, ``Runtime`` (mean),
        ``Order`` (0-based rank by ascending mean runtime), ``LineID`` and
        ``JourneyPatternID``.
    """
    # Stops that occur only once in the data are discarded.
    repeated = df.groupby('StopID').filter(lambda rows: len(rows) > 1)

    # Ascending mean runtime per stop defines the stop order.
    mean_runtime = repeated.groupby("StopID").Runtime.mean().sort_values()
    stop_order = mean_runtime.to_frame().reset_index(level=0)
    stop_order['Order'] = stop_order.index

    # Attach the line / pattern identifiers (first occurrence per stop).
    stop_meta = repeated[["StopID", "LineID", "JourneyPatternID"]].drop_duplicates("StopID")

    return pd.merge(stop_order, stop_meta, on='StopID')
    
    

In [None]:
def get_location(df):
    """Return the most frequently observed coordinates for each stop.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with ``StopID``, ``Lon`` and ``Lat`` columns. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``StopID`` (sorted by ``StopID``) with float64 ``Lon`` and
        ``Lat`` columns holding the (Lon, Lat) pair seen most often for that
        stop. Ties go to the pair that appears first in ``df``. Rows whose
        ``Lon`` or ``Lat`` is missing are not counted.
    """
    # Count every (stop, lon, lat) combination; sort=False keeps the groups in
    # order of first appearance so the stable sort below breaks ties that way.
    pair_counts = (
        df.groupby(['StopID', 'Lon', 'Lat'], sort=False)
          .size()
          .reset_index(name='_count')
    )

    # Highest count first, then keep the top pair of each stop.
    most_frequent = (
        pair_counts.sort_values('_count', ascending=False, kind='mergesort')
                   .drop_duplicates('StopID')
    )

    # Present the result ordered by stop, with a clean 0..n-1 index.
    loc_df = (
        most_frequent.sort_values('StopID', kind='mergesort')
                     .drop('_count', axis=1)
                     .reset_index(drop=True)
    )

    loc_df['Lon'] = loc_df['Lon'].astype('float64')
    loc_df['Lat'] = loc_df['Lat'].astype('float64')
    return loc_df

In [62]:
def extract_line_order(path):
    """Compute the stop order of every journey pattern in one line's CSV.

    Parameters
    ----------
    path : str
        Path of the CSV file, passed to ``read_file``.

    Returns
    -------
    pandas.DataFrame
        The per-pattern results of ``extract_pattern_order`` stacked with a
        fresh 0..n-1 index, or an empty DataFrame when the file yields no
        journey patterns.
    """
    df = read_file(path)

    # One frame per journey pattern; sort=False keeps the patterns in order of
    # first appearance, and rows with a missing pattern id form no group.
    pattern_frames = [
        extract_pattern_order(pattern_df)
        for _, pattern_df in df.groupby('JourneyPatternID', sort=False)
    ]

    if not pattern_frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame on every pattern.
    return pd.concat(pattern_frames, ignore_index=True)

In [69]:
def main(directory):
    """Build the combined stop-order table for every CSV file in a directory.

    Parameters
    ----------
    directory : str
        Directory to scan; only entries whose name ends in ``.csv`` are read.

    Returns
    -------
    pandas.DataFrame
        The results of ``extract_line_order`` for each file, stacked and with
        fully duplicated rows removed. Empty when no CSV file is found.
    """
    line_frames = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and which duplicate drop_duplicates keeps) reproducible.
    for readfile in sorted(os.listdir(directory)):
        if readfile.endswith(".csv"):
            print("Opening line", readfile)
            # os.path.join copes with a directory given with or without a
            # trailing separator.
            path = os.path.join(directory, readfile)
            line_frames.append(extract_line_order(path))
    print("Finished!")

    if not line_frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()

    # Concatenate once instead of re-copying a growing frame on every file.
    master = pd.concat(line_frames, ignore_index=True)

    return master.drop_duplicates()

### Running Functions

In [70]:
# Directory scanned by main() for per-line CSV files.
dir_path = "bus_data/line_data/"

# Build the combined stop-order table from every .csv file found in dir_path.
df = main(dir_path)

Opening line 1.csv
Opening line 102.csv
Opening line 104.csv
Opening line 11.csv
Opening line 111.csv
Opening line 114.csv
Opening line 116.csv
Opening line 118.csv
Opening line 120.csv
Opening line 122.csv
Opening line 123.csv
Opening line 13.csv
Opening line 130.csv
Opening line 14.csv
Opening line 140.csv
Opening line 142.csv
Opening line 145.csv
Opening line 14C.csv
Opening line 15.csv
Opening line 150.csv
Opening line 151.csv
Opening line 15A.csv
Opening line 15B.csv
Opening line 16.csv
Opening line 161.csv
Opening line 16C.csv
Opening line 17.csv
Opening line 17A.csv
Opening line 18.csv
Opening line 184.csv
Opening line 185.csv
Opening line 220.csv
Opening line 236.csv
Opening line 238.csv
Opening line 239.csv
Opening line 25.csv
Opening line 25A.csv
Opening line 25B.csv
Opening line 25X.csv
Opening line 26.csv
Opening line 27.csv
Opening line 270.csv
Opening line 27A.csv
Opening line 27B.csv
Opening line 27X.csv
Opening line 29A.csv
Opening line 31.csv
Opening line 31A.csv
Openi

Unnamed: 0,StopID,Runtime,Order,LineID,JourneyPatternID
0,265,0.0,0,1,2
1,381,0.0,1,1,2
2,271,130.159892,2,1,2
3,340,295.424165,3,1,2
4,350,513.606061,4,1,2


In [76]:
# Preview the first rows of the combined stop-order table.
df.head()

Unnamed: 0,StopID,Runtime,Order,LineID,JourneyPatternID
0,265,0.0,0,1,2
1,381,0.0,1,1,2
2,271,130.159892,2,1,2
3,340,295.424165,3,1,2
4,350,513.606061,4,1,2


In [77]:
# Persist the combined stop-order table.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if consumers of this file do not expect it (confirm first).
df.to_csv("bus_data/static_data/route_stops_all.csv")