In [1]:
import pandas as pd
import glob

## Process 2022 Data

In [2]:
raw_dat = pd.read_csv("../data dump/Capital Expenses_2022.csv")

In [3]:
raw_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1859 entries, 0 to 1858
Data columns (total 38 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Agency                                            1859 non-null   object 
 1   City                                              1845 non-null   object 
 2   State                                             1859 non-null   object 
 3   NTD ID                                            1859 non-null   int64  
 4   Organization Type                                 1859 non-null   object 
 5   Reporter Type                                     1859 non-null   object 
 6   Report Year                                       1859 non-null   int64  
 7   UACE Code                                         1270 non-null   float64
 8   UZA Name                                          1272 non-null   object 
 9   Primary UZA Populat

In [4]:
# Remove all "Questionable" columns from the dataset.
dat = raw_dat.drop(columns=raw_dat.columns[["Questionable" in name for name in raw_dat.columns]])
# Remove the "reporter" suffix from the entries in reporter type.
dat['Reporter Type'] = [words[0] for words in (item.split() for item in raw_dat['Reporter Type'])]
# Remove the "Org Type" column from the dataset.
dat = dat.drop(columns=["Organization Type"])
# Remove the "Reduced Reporter Expenses Column" as it is a duplicate.
dat = dat.drop(columns=["Reduced Reporter Expenses"])
# Remove columns that do not exist in older datasets.
dat = dat.drop(columns=['UACE Code', 'UZA Name', 'Mode Name', 'Capital Use Type'])

# Change datatypes on dollar and population columns to numeric.
def to_num(col_name:str, dat_frame:pd.DataFrame) -> list:
    return [num.replace(',', '') if type(num) == str else num for num in dat_frame[col_name]]

for col in dat.columns[11:]:
    dat[col] = to_num(col, dat)
    dat = dat.astype({col: 'float64'})

dat["Primary UZA Population"] = to_num("Primary UZA Population", dat)
dat = dat.astype({"Primary UZA Population": 'float64'})

In [5]:
# Is there a place where these column names are explained fully?
# Yes. https://data.transportation.gov/Public-Transit/2022-NTD-Annual-Data-Capital-Expenses-by-Capital-U/fphd-jyyj/about_data
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1859 entries, 0 to 1858
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Agency                               1859 non-null   object 
 1   City                                 1845 non-null   object 
 2   State                                1859 non-null   object 
 3   NTD ID                               1859 non-null   int64  
 4   Reporter Type                        1859 non-null   object 
 5   Report Year                          1859 non-null   int64  
 6   Primary UZA Population               1270 non-null   float64
 7   Agency VOMS                          1859 non-null   object 
 8   Mode                                 1859 non-null   object 
 9   TOS                                  1859 non-null   object 
 10  Mode VOMS                            1841 non-null   object 
 11  Guideway                      

## Process 2016 - 2021 Data

In [6]:
raw_21 = pd.read_excel("../data dump/Capital Expenses_2021.xlsx", sheet_name='Total Capital Expenses by Mode')

In [7]:
raw_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3787 entries, 0 to 3786
Data columns (total 38 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Agency                                            3787 non-null   object 
 1   City                                              3763 non-null   object 
 2   State                                             3787 non-null   object 
 3   Legacy NTD ID                                     3428 non-null   object 
 4   NTD ID                                            3787 non-null   object 
 5   Organization Type                                 3787 non-null   object 
 6   Reporter Type                                     3787 non-null   object 
 7   Primary UZA Population                            3787 non-null   int64  
 8   Agency 
VOMS                                      3787 non-null   int64  
 9   Mode               

In [8]:
def process_2021_2016(raw:pd.DataFrame, year:int) -> pd.DataFrame:
    '''Read raw data and return standardized data frame.

    Takes a data frame in the format of the Capital Expenses data from 2016 to 2021
    and removes unneeded columns, trims excess wording in "Reporter Type" field, and
    adds the "Report Year" field. Does not modify data types in the data frame.

    Arguments:
        - raw (pd.DataFrame): The data frame containing the raw data.
        - year (int): The year the observations in the data frame are from.
    
    Returns:
        pd.DataFrame: The processed data frame with the standard columns.
    '''
    # Clean interior and exterior whitespace.
    orig = raw.columns
    clean = [str(name).strip().replace('\n','') for name in orig]
    # One data set has "Name" instead of "Agency"
    clean = ['Agency' if name == 'Name' else name for name in clean]
    data = raw.rename(columns=dict(zip(orig, clean)))
    # Remove "Questionable" columns from data.
    data = data.drop(columns=data.columns[["Questionable" in name for name in data.columns]])
    # Drop unneeded columns.
    data = data.drop(columns=['Organization Type', 'Legacy NTD ID', 'Reduced Reporter Expenses'])
    # Drop all columns after total.
    names = data.columns
    after_tot = names.get_loc('Total') + 1
    if after_tot < len(names):
        data = data.drop(columns=names[after_tot:])
    # Remove the "reporter" suffix from the entries in reporter type.
    data['Reporter Type'] = [words[0] for words in (str(item).split() for item in data['Reporter Type'])]
    # Add the report year column.
    data.insert(loc=5, column='Report Year', value=[year]*len(data['NTD ID']))
    # Return
    return data

In [9]:
files = glob.glob("../data dump/Capital Expenses_20*.xls*")
frames = [0]*(len(files)+1)
for i, file in enumerate(files):
    raw = pd.read_excel(file, sheet_name='Total Capital Expenses by Mode')
    frames[i] = process_2021_2016(raw, int(file.split(sep='_')[1][0:4]))
frames[-1] = dat
all_dat = pd.concat(frames)
all_dat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25356 entries, 0 to 1858
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Agency                               25322 non-null  object 
 1   City                                 25279 non-null  object 
 2   State                                25322 non-null  object 
 3   NTD ID                               25322 non-null  object 
 4   Reporter Type                        25356 non-null  object 
 5   Report Year                          25356 non-null  int64  
 6   Primary UZA Population               24733 non-null  float64
 7   Agency VOMS                          25322 non-null  object 
 8   Mode                                 25117 non-null  object 
 9   TOS                                  25117 non-null  object 
 10  Mode VOMS                            25304 non-null  object 
 11  Guideway                          

In [12]:
all_dat = all_dat.loc[all_dat['Total'].notna(),:]
all_dat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25322 entries, 0 to 1858
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Agency                               25322 non-null  object 
 1   City                                 25279 non-null  object 
 2   State                                25322 non-null  object 
 3   NTD ID                               25322 non-null  object 
 4   Reporter Type                        25322 non-null  object 
 5   Report Year                          25322 non-null  int64  
 6   Primary UZA Population               24733 non-null  float64
 7   Agency VOMS                          25322 non-null  object 
 8   Mode                                 25117 non-null  object 
 9   TOS                                  25117 non-null  object 
 10  Mode VOMS                            25304 non-null  object 
 11  Guideway                          

In [13]:
all_dat.to_csv('../data/NTD_Capital_Expenditures.csv', index=False)