## Import required libraries:

In [33]:
import pandas as pd

## Load the Excel data:

In [145]:
# Agency list; later joined to the NTD tables on 'NTD ID'
ta = pd.read_excel(
    'data/meta/Transit_Agencies_for_Visualization.xlsx',
    sheet_name='TC AgencyList',
)

# Passing a list of sheet names makes read_excel return a dict of
# DataFrames keyed by sheet name
ntd22_raw = pd.read_excel(
    'data/ntd/TS2.2TimeSeriesSysWideOpexpSvc_2.xlsx',
    sheet_name=[
        'UPT', 'VRM', 'FARES', 'VRH', 'Total OE', 'DRM', 'VOMS', 'PMT',
    ],
)

# Only the 'UPT' sheet of the by-mode workbook is read
ntd21_raw = pd.read_excel(
    'data/ntd/TS2.1TimeSeriesOpExpSvcModeTOS_3.xlsx',
    sheet_name='UPT',
)

## Process data:

In [146]:
# Drop agency rows that are completely empty
ta = ta.dropna(how='all')

# Keep only the NTD records that actually carry an NTD ID
ntd21_raw = ntd21_raw.dropna(subset=['NTD ID'])

# ntd21 is filled later (per mode group); ntd22 holds one cleaned
# DataFrame per sheet that was loaded into ntd22_raw
ntd21 = {}
ntd22 = {
    sheet: frame.dropna(subset=['NTD ID'])
    for sheet, frame in ntd22_raw.items()
}

def filterByMode(df, modes):
    """Return the rows of ``df`` whose 'Mode' value is listed in ``modes``.

    df: DataFrame that has a 'Mode' column.
    modes: collection of mode codes to keep.
    """
    keep = df['Mode'].isin(modes)
    return df.loc[keep]

# Mode codes that make up the bus and rail groups
bus_modes = ['MB', 'RB', 'CB', 'TB']
rail_modes = ['CC', 'CR', 'HR', 'LR', 'MG', 'SR', 'YR']

# Split the by-mode data into one subset per group
ntd21.update({
    'bus': filterByMode(ntd21_raw, bus_modes),
    'rail': filterByMode(ntd21_raw, rail_modes),
})

# Use 'Project ID' where present, otherwise fall back to the
# '"Other" primary Project ID' column, then store the result as int32
ta['Project ID'] = (
    ta['Project ID']
    .combine_first(ta['"Other" primary Project ID'])
    .astype('int32')
)

# Columns removed from the ntd22 sheets before merging
col22 = [
    'Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
    'Reporter Type', 'City', 'State', 'Census Year', 'Primary UZA Name',
    'UZA', 'UZA Area SQ Miles', 'UZA Population', '2017 Status',
]

# Columns removed from the ntd21 (by-mode) tables before merging
col21 = [
    'Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
    'Reporter Type', 'City', 'State', 'Census Year', 'UZA Name', 'Mode',
    'Service', 'Mode Status', 'UZA', 'UZA Area SQ Miles', 'UZA Population',
    '2017 Status',
]

# Agency table without the columns that are not needed downstream
ta_clean = ta.drop(columns=[
    'ShowIndividual',
    '"Other" primary Project ID',
    'Primary UZA',
    'UZA Name',
    'Agency Name',
    'Reporter Acronym',
])

# Collect every NTD table, minus its unused columns, in one dict:
# the by-mode tables go in first, then the ntd22 sheets
datasets = {key: frame.drop(columns=col21) for key, frame in ntd21.items()}
datasets.update(
    (key, frame.drop(columns=col22)) for key, frame in ntd22.items()
)
    
# Merge the data
def ntd_merge(df, name):
    """Aggregate an NTD table per project and return it as a named Series.

    ``df`` is left-joined onto the module-level ``ta_clean`` on 'NTD ID'.
    The 'NTD ID' column is then dropped, the remaining columns are summed
    per 'Project ID', and the result is stacked into a Series that is cast
    to int64 and named ``name``.

    df: DataFrame with an 'NTD ID' column.
    name: name given to the returned Series.
    """
    # Attach the NTD values to each agency row (agencies without a match
    # are kept because of the left join)
    joined = ta_clean.merge(df, how='left', on='NTD ID')

    # One row of totals per project
    totals = joined.drop(columns=['NTD ID']).groupby('Project ID').sum()

    # Move the column labels into a second index level
    return totals.stack().astype('int64').rename(name)

# Replace every table in `datasets` with its per-project merged Series.
# dict.iteritems() does not exist on Python 3, so iterate over a snapshot
# of items() instead; the snapshot also keeps the loop independent of the
# assignments made to the dict inside it.
for name, df in list(datasets.items()):
    datasets[name] = ntd_merge(df, name)
    
# Average fares: FARES per UPT
datasets['fares'] = (datasets['FARES'] / datasets['UPT']).rename('fares')

# Average speed: VRM per VRH
datasets['speed'] = (datasets['VRM'] / datasets['VRH']).rename('speed')

# Farebox recovery: FARES as a share of Total OE
datasets['recovery'] = (
    datasets['FARES'] / datasets['Total OE']
).rename('recovery')

# Vehicle revenue miles per ride: VRM per UPT
datasets['vrm_per_ride'] = (
    datasets['VRM'] / datasets['UPT']
).rename('vrm_per_ride')

# Average headways: (DRM / speed) / VOMS; relies on 'speed' computed above
datasets['headways'] = (
    (datasets['DRM'] / datasets['speed']) / datasets['VOMS']
).rename('headways')

# Average trip length: PMT per UPT
datasets['trip_length'] = (
    datasets['PMT'] / datasets['UPT']
).rename('trip_length')

# Remove the raw indicators that were only needed to derive the ratios
del (
    datasets['FARES'],
    datasets['VRH'],
    datasets['Total OE'],
    datasets['DRM'],
    datasets['VOMS'],
    datasets['PMT'],
)

## Export to CSV:

In [147]:
# Put every remaining Series side by side (one column each) and write the
# table to disk; the two index levels are labelled 'id' and 'year'
(
    pd.concat(datasets.values(), axis=1)
    .to_csv('data/output/ntd.csv', index_label=['id', 'year'])
)