In [None]:
import service_profile as sp
import numpy as np
from datetime import datetime, time
from collections import defaultdict
import pandas as pd
import concurrent.futures

In [None]:
# Input files used by the cells below; both are relative paths, so they are
# resolved against the notebook's working directory.
ROUTE = "lta_scheduled_bus_routes_for_feb2017.csv"  # loaded via sp.Route.from_csv
EZLINK = "ezlink-201702-bus.csv"  # loaded via pd.read_csv

In [None]:
# Keyword arguments forwarded verbatim to sp.Route.from_csv.
# NOTE(review): presumably each key is a logical field name and each value the
# matching column header in the route CSV, with time_format describing how
# dt_from / dt_to are written -- confirm against service_profile.
route_schema = {
    "service": "service",
    "direction": "direction",
    "stop_code": "BusStopCode",
    "seq": "BusStopSequence",
    "km": "km",
    "dt_from": "dt_from",
    "dt_to": "dt_to",
    "time_format": '%d/%m/%Y',
}

# Date passed to Route.valid_for when selecting the route definitions to keep.
route_valid_for_date = datetime(2017, 2, 1)

# Weekday and peak-window settings (not referenced by the cells visible here).
days_of_interest = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
am_peak = {"start_time": time(7, 30), "end_time": time(9, 30)}
pm_peak = {"start_time": time(17, 0), "end_time": time(20, 0)}

# Load the route file, then narrow it with valid_for(route_valid_for_date).
route = sp.Route.from_csv(
    ROUTE, **route_schema
).valid_for(route_valid_for_date)

In [None]:
# Load the EZ-Link tap records.
# nrows=0 reads only the header row, so col_names holds the CSV's column labels
# (it is not referenced by the cells visible here).
col_names = pd.read_csv(EZLINK, nrows=0).columns

# Explicit dtype per column, passed to read_csv so pandas does not infer them.
ezlink_schema = dict(
    ALIGHTING_STOP_STN=str,
    BOARDING_STOP_STN=str,
    BUS_REG_NUM=str,
    Bus_Trip_Num=str,
    Direction=np.int64,
    JOURNEY_ID=np.int64,
    Ride_Distance=np.float64,
    TRAVEL_MODE=str,
    Year=np.int64,
    tap_in_time=str,
    tap_out_time=str,
    Srvc_Number=str,
    Date=str,
)

ezlink = pd.read_csv(EZLINK, dtype=ezlink_schema)

In [None]:
# Distinct values of the 'Date' column; iterated later for the per-day bus counts.
dates = ezlink['Date'].unique()

# Distinct (service number, direction) pairs present in the EZ-Link records,
# as a list of 2-tuples.
ezlink_bus_srvc = ezlink[['Srvc_Number', 'Direction']].drop_duplicates().apply(
    tuple, axis=1).tolist()

# Keep only pairs where both fields are present. read_csv stores a missing cell
# as NaN, never as None, so an `is not None` test would let every missing
# service number through; pd.notna rejects both NaN and None.
ezlink_bus_srvc = [
    item for item in ezlink_bus_srvc
    if pd.notna(item[0]) and pd.notna(item[1])
]

In [None]:
# For every (service, direction) pair found in the EZ-Link data that also has a
# route definition, build a per-stop table holding the cumulative net number of
# passengers on board and a bus count, then concatenate all tables and write
# them to "cumulative_net_commuter_on_bus.csv".
data_frames = []
for item in ezlink_bus_srvc:
    service_of_interest = dict(service=item[0], direction=item[1])
    service_route = route.for_service(**service_of_interest)
    if service_route.dataframe.empty:
        # no route definition for this service/direction
        continue

    # column names
    stop_code = service_route.col("stop_code")
    seq = service_route.col("seq")
    seq_source = service_route.col("seq") + "_source"
    seq_destination = service_route.col("seq") + "_destination"

    # Explicit copy: the subset gets a new column and has stop codes rewritten
    # below, which must not be attempted on a view of `ezlink`.
    ezlink_sub = ezlink[(ezlink['Srvc_Number'] == item[0])
                        & (ezlink['Direction'] == item[1])].copy()
    if ezlink_sub.empty:
        continue

    # One row per tap record -> passenger count per boarding/alighting pair.
    ezlink_sub['pax'] = 1
    ods = ezlink_sub.groupby(
        ['BOARDING_STOP_STN', 'ALIGHTING_STOP_STN']).agg({
            'pax': 'sum'
        }).reset_index()
    ods.columns = ["source", "destination", "pax"]

    # find out the sequence for source and destination
    route_df = (service_route.dataframe.set_index(stop_code)[[seq]])

    joined = (ods.set_index("source").join(route_df, how="left"))
    joined.columns = ["destination", "pax", "source_seq"]
    # workaround for bug where index name disappears after join
    joined.index = joined.index.rename("source")
    joined = joined.reset_index()

    # find seq for destination
    joined = (joined.set_index("destination").join(route_df, how="left"))
    joined.columns = ["source", "pax", "source_seq", "destination_seq"]
    # workaround for bug where index name disappears after join
    joined.index = joined.index.rename("destination")
    joined = joined.reset_index()

    # A stop that occurs several times on the route yields several candidate
    # rows per source/destination pair after the joins. Drop candidates that do
    # not move forward along the route (this also drops pairs whose stops are
    # not on the route, since their sequence is NaN), then keep the candidate
    # with the fewest stops travelled for each pair. Grouping must be on
    # (source, destination) only: grouping on the sequence columns as well puts
    # every candidate in its own group, so nothing is ever discarded and the
    # same pax is added to the matrix once per candidate.
    joined["stops_travelled"] = (joined["destination_seq"] -
                                 joined["source_seq"])
    joined = joined.loc[joined["stops_travelled"] > 0]
    joined = joined.loc[joined.groupby(
        ["source", "destination"])["stops_travelled"].idxmin()]

    # Stop labels in route_df order; used as both axes of the
    # origin-destination matrix (so pairs with 0 occurrences are included).
    idx = list(route_df.index)
    mat_sz = len(idx)

    # Check for bus calling at the same bus stop during different segments of
    # the trip (loop services) & rename bus stops accordingly.
    # dup_entries maps stop code -> list of positions in idx, for codes that
    # appear more than once.
    dup_entries = defaultdict(list)
    for i, entry in enumerate(idx):
        dup_entries[entry].append(i)
    dup_entries = {k: v for k, v in dup_entries.items() if len(v) > 1}

    for key, value in dup_entries.items():
        if len(value) == 2:
            # Two occurrences: every boarding at the stop is relabelled "_O"
            # and every alighting "_D", in the OD table, the raw records and
            # the axis labels alike.
            joined.loc[joined['source'] == key, 'source'] = key + "_O"
            joined.loc[joined['destination'] == key,
                       'destination'] = key + "_D"
            ezlink_sub['BOARDING_STOP_STN'] = ezlink_sub[
                'BOARDING_STOP_STN'].where(
                    ezlink_sub['BOARDING_STOP_STN'] != key, key + "_O")
            ezlink_sub['ALIGHTING_STOP_STN'] = ezlink_sub[
                'ALIGHTING_STOP_STN'].where(
                    ezlink_sub['ALIGHTING_STOP_STN'] != key, key + "_D")
            idx[value[0]] = idx[value[0]] + "_O"
            idx[value[1]] = idx[value[1]] + "_D"
        else:
            # Three or more occurrences: only the first three positions are
            # handled, labelled "_O", "_I" and "_D" in route order.
            # NOTE(review): comparing a sequence number with `position + 1`
            # assumes the sequence column is 1-based, contiguous and that
            # route_df is in sequence order -- confirm.
            joined.loc[joined['source_seq'] == (value[0] + 1),
                       'source'] = key + "_O"
            joined.loc[joined['source_seq'] == (value[1] + 1),
                       'source'] = key + "_I"

            # Relabel raw boardings at this stop according to where the
            # alighting stop sits relative to the second occurrence.
            # NOTE(review): idx.index raises ValueError when the alighting
            # stop is not one of the route's stop labels -- confirm the data
            # cannot contain such records.
            indices = ezlink_sub.index[
                ezlink_sub['BOARDING_STOP_STN'] == key].tolist()
            for i in indices:
                location = idx.index(ezlink_sub.loc[i, 'ALIGHTING_STOP_STN'])
                if location <= value[1]:
                    ezlink_sub.loc[i, 'BOARDING_STOP_STN'] = key + "_O"
                else:
                    ezlink_sub.loc[i, 'BOARDING_STOP_STN'] = key + "_I"

            # The axis labels are renamed only after the lookup above, which
            # still searches for the unsuffixed code.
            idx[value[0]] = idx[value[0]] + "_O"
            idx[value[1]] = idx[value[1]] + "_I"

            joined.loc[joined['destination_seq'] == (value[1] + 1),
                       'destination'] = key + "_I"
            joined.loc[joined['destination_seq'] == (value[2] + 1),
                       'destination'] = key + "_D"

            # Relabel raw alightings at this stop according to where the
            # (already relabelled) boarding stop sits.
            indices = ezlink_sub.index[
                ezlink_sub['ALIGHTING_STOP_STN'] == key].tolist()
            for i in indices:
                location = idx.index(ezlink_sub.loc[i, 'BOARDING_STOP_STN'])
                if location <= value[1]:
                    ezlink_sub.loc[i, 'ALIGHTING_STOP_STN'] = key + "_I"
                else:
                    ezlink_sub.loc[i, 'ALIGHTING_STOP_STN'] = key + "_D"

            idx[value[2]] = idx[value[2]] + "_D"

    # Per stop, summed over all dates: the number of distinct
    # (BUS_REG_NUM, Bus_Trip_Num) pairs seen that day among boardings or among
    # alightings at the stop, whichever is larger.
    # np.zeros with the builtin int replaces pd.np / np.int, which no longer
    # exist in current pandas / NumPy.
    bus_count = np.zeros(mat_sz, dtype=int)
    for date in dates:
        tmp = ezlink_sub[(ezlink_sub['Date'] == date)]
        for i, stop in enumerate(idx):
            df1 = tmp[tmp['BOARDING_STOP_STN'] == stop]
            df2 = tmp[tmp['ALIGHTING_STOP_STN'] == stop]
            bus_count[i] = bus_count[i] + max(
                df1[['BUS_REG_NUM', 'Bus_Trip_Num']].drop_duplicates().shape[0],
                df2[['BUS_REG_NUM', 'Bus_Trip_Num']].drop_duplicates().shape[0])

    # Collapse to a single number: the mean of the per-stop totals.
    bus_count = np.mean(bus_count)

    # Square origin (rows) x destination (columns) matrix of passenger counts.
    I = pd.Index(idx, name="")
    C = pd.Index(idx, name="")
    route_df_mat = pd.DataFrame(
        np.zeros((mat_sz, mat_sz), dtype=int), index=I, columns=C)

    # Based on counts in joined, add the counts to the matrix
    for row in joined.itertuples(index=False):
        route_df_mat.loc[row.source, row.destination] += row.pax
    boarding_total = route_df_mat.sum(axis=1).values
    alighting_total = route_df_mat.sum(axis=0).values

    # Running passenger load: starts at the boardings of the first stop, then
    # adds (boardings - alightings) at each following stop.
    net_change = boarding_total - alighting_total
    net_change[0] = boarding_total[0]

    route_dist_series = pd.DataFrame({
        "service": np.repeat(item[0], mat_sz),
        "direction": np.repeat(item[1], mat_sz),
        "bus stop code": idx,
        "seq": list(range(mat_sz)),
        "bus count": np.repeat(bus_count, mat_sz),
        "Net Passengers on Bus at BusStop": np.cumsum(net_change)
    })

    data_frames.append(route_dist_series)

# pd.concat raises ValueError when no service produced a table.
route_dist_series_all = pd.concat(data_frames)

route_dist_series_all.to_csv("cumulative_net_commuter_on_bus.csv", index=False)