## Import required libraries:

In [10]:
import pandas as pd

## Load the Excel data:

In [None]:
# Agency list: the 'TC AgencyList' sheet of the visualization workbook.
ta = pd.read_excel(
    'data/meta/Transit_Agencies_for_Visualization.xlsx',
    sheet_name='TC AgencyList',
)

# NTD time series: the 'UPT' sheet of the TS2.2 and TS2.1 workbooks.
ntd22 = pd.read_excel(
    'data/ntd/TS2.2TimeSeriesSysWideOpexpSvc_2.xlsx',
    sheet_name='UPT',
)
ntd21 = pd.read_excel(
    'data/ntd/TS2.1TimeSeriesOpExpSvcModeTOS_3.xlsx',
    sheet_name='UPT',
)

## Filter data:

In [14]:
# Agency list: discard only the rows in which every cell is empty.
ta = ta.dropna(how='all')

# NTD tables: discard the rows that have no 'NTD ID'.
ntd22, ntd21 = (
    frame.dropna(subset=['NTD ID'])
    for frame in (ntd22, ntd21)
)

def filterByMode(df, modes):
    """Return the rows of ``df`` whose 'Mode' value is one of ``modes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a 'Mode' column.
    modes : list-like
        Mode values to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index and columns.
    """
    keep = df['Mode'].isin(modes)
    return df.loc[keep]

# Mode codes that make up the bus and rail subsets
bus_modes = ['MB', 'RB', 'CB', 'TB']
rail_modes = ['CC', 'CR', 'HR', 'LR', 'MG', 'SR', 'YR']

# Split the by-mode NTD table (ntd21) into the two subsets
bus = filterByMode(ntd21, bus_modes)
rail = filterByMode(ntd21, rail_modes)

# NOTE(review): the output recorded for this cell shows `rail` as an empty
# DataFrame -- confirm the 'Mode' values in the sheet match these codes.
print(rail)

Empty DataFrame
Columns: [Last Report Year, NTD ID, Legacy NTD ID, Agency Name, Agency Status, Reporter Type, City, State, Census Year, UZA Name, UZA, UZA Area SQ Miles, UZA Population, 2017 Status, Mode, Service, Mode Status, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
Index: []

[0 rows x 44 columns]


## Clean the data:

In [11]:
# Fill gaps in 'Project ID' from the '"Other" primary Project ID' column,
# then store the combined IDs as 32-bit integers.
project_ids = ta['Project ID'].combine_first(ta['"Other" primary Project ID'])
ta['Project ID'] = project_ids.astype('int32')

# Columns removed from the TS2.2 table (ntd22)
col22 = [
    'Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
    'Reporter Type', 'City', 'State', 'Census Year', 'Primary UZA Name',
    'UZA', 'UZA Area SQ Miles', 'UZA Population', '2017 Status',
]

# Columns removed from the TS2.1 subsets (bus / rail); this list also
# drops the 'Mode', 'Service' and 'Mode Status' columns.
col21 = [
    'Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
    'Reporter Type', 'City', 'State', 'Census Year', 'UZA Name', 'Mode', 'Service',
    'Mode Status', 'UZA', 'UZA Area SQ Miles', 'UZA Population', '2017 Status',
]

# Columns removed from the agency list
ta_unused = [
    'ShowIndividual', '"Other" primary Project ID', 'Primary UZA', 'UZA Name',
    'Agency Name', 'Reporter Acronym',
]

ta_clean = ta.drop(ta_unused, axis=1)
total_clean = ntd22.drop(col22, axis=1)
bus_clean = bus.drop(col21, axis=1)
rail_clean = rail.drop(col21, axis=1)

## Merge the data:

In [12]:
def ntd_merge(df, name):
    """Aggregate an NTD table by project and return it as a long Series.

    Left-joins ``df`` onto the module-level ``ta_clean`` frame on 'NTD ID',
    drops that key, sums the remaining columns within each 'Project ID',
    and stacks the summed columns into the inner index level.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing an 'NTD ID' column plus the columns to sum.
    name : str
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        int64 values indexed by ('Project ID', original column label).
        Nothing is written to disk here; the caller does the export.
    """
    joined = ta_clean.merge(df, how='left', on='NTD ID')
    totals = joined.drop(columns=['NTD ID']).groupby('Project ID').sum()
    return totals.stack().astype('int64').rename(name)
    
# Cleaned NTD tables, keyed by the column name each one gets in the output.
datasets = {
    # `total_clean` is the cleaned ntd22 table; the previous `upt_clean`
    # is not defined anywhere in this notebook and raised NameError on a
    # fresh kernel.
    'total': total_clean,
    'bus': bus_clean,
    'rail': rail_clean
}

# dict.items() is used because dict.iteritems() does not exist in Python 3.
stacks = [ntd_merge(df, name) for name, df in datasets.items()]

# Put the series side by side (one column per dataset) and write them out,
# labelling the two index levels 'id' and 'year'.
pd.concat(stacks, axis=1).to_csv('data/output/ntd.csv', index_label=['id', 'year'])