## Import required libraries:

In [10]:
import pandas as pd

## Load the excel data:

In [11]:
ta = pd.read_excel('data/meta/Transit_Agencies_for_Visualization.xlsx', sheet_name='TC AgencyList')
upt = pd.read_excel('data/ntd/TS2.2TimeSeriesSysWideOpexpSvc_2.xlsx', sheet_name='UPT')
bus = pd.read_excel('data/ntd/TS2.1TimeSeriesOpExpSvcModeTOS_3.xlsx', sheet_name='UPT')

## Filter data:

In [12]:
# Remove missing NTD ID's
ta = ta.dropna(how='all')
upt = upt.dropna(subset=['NTD ID'])
bus = bus.dropna(subset=['NTD ID'])

# Filter bus data by required modes
modes = ['MB', 'RB', 'CB', 'TB']
bus_filtered = bus[bus['Mode'].isin(modes)]

## Clean the data:

In [32]:
# Combine project ID's
ta['Project ID'] = ta['Project ID'].combine_first(ta['"Other" primary Project ID']).astype('int32')
print(ta)

# Drop unused columns
ta_clean = ta.drop(columns=['ShowIndividual', '"Other" primary Project ID', 'Primary UZA', 'UZA Name',
                            'Agency Name', 'Reporter Acronym'])
upt_clean = upt.drop(columns=['Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
                              'Reporter Type', 'City', 'State', 'Census Year', 'Primary UZA Name', 
                              'UZA', 'UZA Area SQ Miles', 'UZA Population', '2017 Status'])
bus_clean = bus_filtered.drop(columns=['Last Report Year', 'Legacy NTD ID', 'Agency Name', 'Agency Status',
                              'Reporter Type', 'City', 'State', 'Census Year', 'UZA Name', 'Mode', 'Service',
                              'Mode Status', 'UZA', 'UZA Area SQ Miles', 'UZA Population', '2017 Status'])

     ShowIndividual  Project ID  "Other" primary Project ID  Primary UZA  \
0               1.0           1                         NaN          NaN   
1               0.0           1                         1.0          1.0   
2               1.0           2                         NaN          1.0   
3               0.0           1                         1.0          1.0   
4               1.0           3                         NaN          1.0   
5               1.0           4                         NaN          1.0   
6               1.0           5                         NaN          1.0   
7               1.0           6                         NaN          1.0   
8               1.0           7                         NaN          1.0   
9               1.0           8                         NaN          1.0   
10              1.0           9                         NaN          1.0   
11              0.0           9                         9.0          1.0   
12          

## Merge the data:

In [34]:
def ntd_merge(df, name):    
    # Merge dataframes
    merge = pd.merge(ta_clean, df, how='left', on='NTD ID')
    group = merge.drop(columns=['NTD ID']).groupby('Project ID')
    
    # Sum, stack, and export to CSV
    stack = group.sum().stack()
    stack.astype('int64').to_csv('data/output/' + name + '.csv', index_label=['id', 'year'], header=[name])
    print('Dataset ' + name + ' written')
    
ntd_merge(upt_clean, 'upt')
ntd_merge(bus_clean, 'bus')

Dataset upt written
Dataset bus written
