## Import required libraries:

In [28]:
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

# Send Bokeh output to the notebook so that show() renders plots inline
output_notebook()

## Load the Excel data:

In [2]:
# Agency metadata, taken from the 'TC AgencyList' sheet
ta = pd.read_excel(
    'data/meta/Transit_Agencies_for_Visualization.xlsx',
    sheet_name='TC AgencyList',
)

# A list of sheet names makes read_excel return a dict of DataFrames,
# keyed by sheet name (one entry per indicator)
ntd22_raw = pd.read_excel(
    'data/ntd/TS2.2TimeSeriesSysWideOpexpSvc_2.xlsx',
    sheet_name=[
        'UPT', 'VRM', 'FARES', 'VRH',
        'Total OE', 'DRM', 'VOMS', 'PMT',
    ],
)

# Only the 'UPT' sheet of this workbook is used; it is split by 'Mode' later
ntd21_raw = pd.read_excel(
    'data/ntd/TS2.1TimeSeriesOpExpSvcModeTOS_3.xlsx',
    sheet_name='UPT',
)

print('Data successfully loaded from Excel')

Data successfully loaded from Excel


## Filter data:

In [3]:
# Discard rows of the agency table in which every column is empty
ta = ta.dropna(how='all')

# Keep only the NTD rows that actually carry an NTD ID
ntd21_raw = ntd21_raw.dropna(subset=['NTD ID'])
ntd22 = {
    sheet: frame.dropna(subset=['NTD ID'])
    for sheet, frame in ntd22_raw.items()
}

# Populated further down with the per-mode subsets of ntd21_raw
ntd21 = {}

def filterByMode(df, modes):
    """Return the rows of ``df`` whose 'Mode' value is one of ``modes``.

    df    -- DataFrame with a 'Mode' column
    modes -- collection of mode codes to keep
    """
    wanted = df['Mode'].isin(modes)
    return df.loc[wanted]

# Split the by-mode table into a bus group and a rail group of mode codes
bus_modes = ['MB', 'RB', 'CB', 'TB']
rail_modes = ['CC', 'CR', 'HR', 'LR', 'MG', 'SR', 'YR']
ntd21['bus'] = filterByMode(ntd21_raw, bus_modes)
ntd21['rail'] = filterByMode(ntd21_raw, rail_modes)

# Where 'Project ID' is missing, fall back to the "Other" primary Project ID,
# then store the combined column as 32-bit integers
project_ids = ta['Project ID'].combine_first(ta['"Other" primary Project ID'])
ta['Project ID'] = project_ids.astype('int32')

# Columns removed from the NTD tables before merging
col22 = [
    'Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
    'Reporter Type', 'City', 'State', 'Census Year', 'Primary UZA Name',
    'UZA', 'UZA Area SQ Miles', 'UZA Population', '2017 Status',
]
col21 = [
    'Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
    'Reporter Type', 'City', 'State', 'Census Year', 'UZA Name', 'Mode',
    'Service', 'Mode Status', 'UZA', 'UZA Area SQ Miles', 'UZA Population',
    '2017 Status',
]

# Columns removed from the agency table before merging
ta_clean = ta.drop(columns=[
    'ShowIndividual',
    '"Other" primary Project ID',
    'Primary UZA',
    'UZA Name',
    'Agency Name',
    'Reporter Acronym',
])

# One trimmed table per key: the mode groups first, then one per indicator sheet
datasets = {mode: frame.drop(columns=col21) for mode, frame in ntd21.items()}
datasets.update(
    {sheet: frame.drop(columns=col22) for sheet, frame in ntd22.items()}
)

print('Loaded and cleaned data for: ' + str(datasets.keys()))

Loaded and cleaned data for: dict_keys(['bus', 'rail', 'UPT', 'VRM', 'FARES', 'VRH', 'Total OE', 'DRM', 'VOMS', 'PMT'])


## Merge the data with TA metadata:

In [16]:
def ntd_merge(df, name, meta=None):
    """Join one NTD table onto the agency metadata and total it per project.

    Parameters
    ----------
    df : pandas.DataFrame
        NTD table containing an 'NTD ID' column.
    name : str
        Name given to the returned Series.
    meta : pandas.DataFrame, optional
        Agency table containing 'NTD ID' and 'Project ID' columns. Defaults to
        the notebook-level ``ta_clean``.

    Returns
    -------
    pandas.Series
        Per-project sums of every remaining column of the merged frame,
        stacked so the index is (Project ID, original column label) and
        named ``name``.
    """
    if meta is None:
        meta = ta_clean

    # Left join: every agency in `meta` is kept, matched or not
    merged = pd.merge(meta, df, how='left', on='NTD ID')
    per_project = merged.drop(columns=['NTD ID']).groupby('Project ID')

    # NOTE(review): sum() skips missing values, so a project with no data in a
    # column totals 0 rather than NaN -- confirm this is intended.
    return per_project.sum().stack().rename(name)

stacks = {}

for label, frame in datasets.items():
    merged = ntd_merge(frame, label)
    # The second index level holds the year labels; keep 2006 onwards
    year_labels = merged.index.get_level_values(1)
    stacks[label] = merged[year_labels.astype(int) >= 2006]

print('Created stacks for ' + str(stacks.keys()))

Created stacks for dict_keys(['bus', 'rail', 'UPT', 'VRM', 'FARES', 'VRH', 'Total OE', 'DRM', 'VOMS', 'PMT'])


## Calculate derived values:

In [17]:
# Each derived indicator is an element-wise ratio of two stacked series,
# stored under (and named after) its own key.

# Average fare: fare revenue per trip
stacks['fares'] = stacks['FARES'].div(stacks['UPT']).rename('fares')

# Average speed: revenue miles per revenue hour
stacks['speed'] = stacks['VRM'].div(stacks['VRH']).rename('speed')

# Farebox recovery: fare revenue over total operating expenses
stacks['recovery'] = stacks['FARES'].div(stacks['Total OE']).rename('recovery')

# Vehicle revenue miles per ride
stacks['vrm_per_ride'] = stacks['VRM'].div(stacks['UPT']).rename('vrm_per_ride')

# Average headway: DRM over the speed computed above, divided by VOMS
stacks['headways'] = (
    stacks['DRM'].div(stacks['speed']).div(stacks['VOMS']).rename('headways')
)

# Average trip length: passenger miles per trip
stacks['trip_length'] = stacks['PMT'].div(stacks['UPT']).rename('trip_length')

# The raw indicators below stay in `stacks`; their removal is disabled
# del stacks['FARES']
# del stacks['VRH']
# del stacks['DRM']
# del stacks['VOMS']
# del stacks['PMT']

print('Calculated values for ' + str(stacks.keys()))

Calculated values for dict_keys(['bus', 'rail', 'UPT', 'VRM', 'FARES', 'VRH', 'Total OE', 'DRM', 'VOMS', 'PMT', 'fares', 'speed', 'recovery', 'vrm_per_ride', 'headways', 'trip_length'])


## Plotting line graphs:

In [57]:
# UPT per project over time, on a log-scaled y axis.
# NOTE(review): zero totals cannot be drawn on a log axis -- confirm whether
# such points should be masked first.
p = figure(title="UPT by project", x_axis_label='Year', y_axis_label='UPT',
           y_axis_type='log')

# One thin line per project. groupby(level=0) only yields project IDs that
# still have rows, whereas index.levels[0] may also list IDs with none left.
for project_id, upt in stacks['UPT'].groupby(level=0):
    # Cast the year labels to int so the x axis is numeric even when the
    # labels are strings
    years = upt.index.get_level_values(1).astype(int)
    p.line(years.values, upt.values, line_width=1)

# show the results
show(p)

## Line graph for rate of change:

In [55]:
# Year-over-year percentage change in UPT per project, clipped to +/-50 %
c = figure(title="UPT rate of change", x_axis_label='Year',
           y_axis_label='% change in UPT', y_range=[-50, 50])

# groupby(level=0) only yields project IDs that still have rows, whereas
# index.levels[0] may also list IDs with none left.
for project_id, upt in stacks['UPT'].groupby(level=0):
    # Cast the year labels to int so the x axis is numeric even when the
    # labels are strings
    years = upt.index.get_level_values(1).astype(int)
    change_pct = upt.pct_change() * 100
    c.line(years.values, change_pct.values, line_width=0.5, line_alpha=0.5)

show(c)

## Concatenate and export:

In [7]:
# Put every stacked series side by side (one column per indicator) and write
# the table to CSV, labelling the two index levels 'id' and 'year'
combined = pd.concat(list(stacks.values()), axis=1)
combined.to_csv('data/output/ntd.csv', index_label=['id', 'year'])