In [58]:
# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [59]:
# Set the working directory to the project root that holds the data files.
# The location differs per machine, so it can be supplied through the
# GOAL1_DASHBOARD_DIR environment variable; when that variable is unset the
# original author's local path is used as the fallback.
# The chdir is kept because the data files are read with paths relative to
# this directory.
project_dir = os.environ.get(
    'GOAL1_DASHBOARD_DIR',
    'C:\\Users\\belincoln\\Documents\\! CBP\\!User Fees\\!! Goal 1 Dashboards')
os.chdir(project_dir)

# Load the raw collections workbook (path is relative to the working directory set above).
# Works well for Jupyter Notebooks, can be configured in Spyder using file explorer.
collections = pd.read_excel(os.path.join('Source Emails & Source Files','Files','Collections',
                                         'COBRA_Air','Collections cc495 - FY13 - FY18.xls'))

In [60]:
# Data cleaning, step 1: discard rows, then columns, that are entirely NaN.
collections = (
    collections
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

In [61]:
# Skip the first two rows and keep four columns by position: column 1
# (the Period column) plus the last, third-from-last and fourth-from-last
# columns (the collection columns).
kept_column_positions = [1, -1, -3, -4]
collections = collections.iloc[2:, kept_column_positions]

In [62]:
# Use the values of the first remaining row as the column headers.
header_row = collections.iloc[0, :]
collections.columns = header_row

In [63]:
# Make sure all columns have a name: label the first column 'Period'.
# The labels are rebuilt and assigned as a new Index instead of writing into
# `collections.columns.values`; an Index is meant to be immutable, and mutating
# its backing array in place can leave pandas' cached label lookups stale.
column_labels = collections.columns.tolist()
column_labels[0] = 'Period'
collections.columns = pd.Index(column_labels, dtype=object, name=collections.columns.name)

# Delete the first two rows
collections = collections.iloc[2:, :]

In [64]:
# Keep only rows that have a Period value. Per the original author's note this
# removes the per-company sum rows (to avoid double counting collections) as
# well as any remaining NaN rows.
collections = collections.dropna(subset=['Period'])

In [65]:
# Total collected per row = penalty + interest + principal amounts.
amount_columns = ['Applied Penalty Amount', 'Applied Interest Amount',
                  'Applied Principal Amount']
collections['Collections'] = collections[amount_columns].sum(axis=1)

In [66]:
# Tidy the Period labels: first strip any trailing tilde characters (unclear
# why they exist in the source data), then strip any trailing spaces.
period_without_tilde = collections['Period'].str.rstrip('~')
collections['Period'] = period_without_tilde.str.rstrip(' ')

In [67]:
# Aggregate across all companies: one row per collection Period, values summed.
period_key = collections['Period']
collections = collections.groupby(period_key).sum()

In [68]:
# Remove audit payments: rows whose period label contains a literal asterisk.
# The pattern is a raw string so the regex escape `\*` is not an invalid
# Python string escape (a SyntaxWarning on recent Python versions).
# .copy() makes the filtered frame independent, so the column assignments
# below do not operate on a slice of the previous frame.
is_audit_payment = collections.index.str.contains(r"\*")
collections = collections[~is_audit_payment].copy()

# Add an additional column that shows remittance period (independent of year):
# the part of the index label that precedes the first occurrence of '20'.
collections['Remittance Period'] = collections.index.str.split('20').str[0]


# Create Calendar Year Column: the segment of the index label that follows
# the first ')'.
collections['Calendar Year'] = collections.index.str.split(')').str[1]
# Turn Years into integers
collections['Calendar Year'] = collections['Calendar Year'].astype(int)

# Keep only the calendar years that are part of the analysis
years = [2012,2013,2014,2015,2016,2017,2018]
collections = collections[collections['Calendar Year'].isin(years)]

In [69]:
# Remove collection data for which we don't have workload data.
# Each entry is a regular expression matched against the index labels; raw
# strings are used so the escaped parentheses `\(` / `\)` are not invalid
# Python string escapes (a SyntaxWarning on recent Python versions).
searchfor = [r'Qtr 01 \(Jan-Mar\) 2012', r'Qtr 02 \(Apr-Jun\) 2012',
             r'Qtr 03 \(Jul-Sept\) 2012', r'Qtr 04 \(Oct-Dec\) 2018']
# Join the patterns into one alternation and drop every matching row
collections = collections[~collections.index.str.contains('|'.join(searchfor))]


In [81]:
#%% Read Workload Data
# The workbook path is relative to the current working directory.
workload_path = os.path.join('Source Emails & Source Files', 'Files', 'Workload',
                             'COBRA_Air', 'fy13-18 stats by_Month National.xlsx')
workload = pd.read_excel(workload_path)

In [82]:
# Display the workload frame exactly as read from the workbook
workload

Unnamed: 0,Line,Data Id,Data Elements - National,Total,10/1/2012,11/1/2012,12/1/2012,1/1/2013,2/1/2013,3/1/2013,...,12/1/2017,1/1/2018,2/1/2018,3/1/2018,4/1/2018,5/1/2018,6/1/2018,7/1/2018,8/1/2018,9/1/2018
0,1,TRS00001,Trucks,,,,,,,,...,,,,,,,,,,
1,2,,NATIONAL,68590564.0,957580.0,899386.0,760640.0,896682.0,845914.0,898600.0,...,881009.0,992129.0,939048.0,1041046.0,1044131.0,1085237.0,1027005.0,996786.0,1061363.0,974040.0
2,3,TRS00004,Trains,,,,,,,,...,,,,,,,,,,
3,4,,NATIONAL,223977.0,3083.0,3069.0,3015.0,3032.0,2813.0,3219.0,...,2719.0,2787.0,2588.0,2945.0,2880.0,2997.0,2862.0,2836.0,2824.0,2918.0
4,5,AIR00001,Commercial Aircraft,,,,,,,,...,,,,,,,,,,
5,6,,NATIONAL,4284454.0,51720.0,52719.0,57967.0,57428.0,51674.0,61229.0,...,63149.0,61163.0,55043.0,64310.0,62354.0,63855.0,65951.0,68752.0,66915.0,57820.0
6,7,TEC00011,Private Aircraft,,,,,,,,...,,,,,,,,,,
7,8,,NATIONAL,695468.0,8461.0,9586.0,9122.0,9077.0,8583.0,11226.0,...,8691.0,8796.0,8406.0,10850.0,9977.0,9337.0,10145.0,10886.0,9661.0,8056.0
8,9,TRS00005,Vessels,,,,,,,,...,,,,,,,,,,
9,10,,NATIONAL,343428.0,4847.0,4859.0,4783.0,4514.0,4060.0,4594.0,...,4732.0,4425.0,4366.0,4864.0,4668.0,5104.0,5207.0,5321.0,5256.0,4899.0


In [83]:
# Take the row at position 13 as a Series indexed by column label; this is the
# Comm Aircraft Pax/Crew Combined (ECAR) workload metric from the PPAE file.
ecar_row = workload.iloc[13]
# Drop the first four entries (unnecessary and non-numeric columns)
workload = ecar_row.iloc[4:]

In [84]:
# Convert the Series into a one-column DataFrame whose column is named "Workload".
workload = workload.to_frame(name='Workload')

In [85]:
# Display the single-column Workload frame (indexed by the month labels)
workload

Unnamed: 0,Workload
10/1/2012,6.38716e+06
11/1/2012,6.0335e+06
12/1/2012,6.62188e+06
1/1/2013,6.85186e+06
2/1/2013,5.70149e+06
...,...
5/1/2018,9.14076e+06
6/1/2018,9.98753e+06
7/1/2018,1.15051e+07
8/1/2018,1.10381e+07


In [73]:
# Split each index label on '/' once, then take the first piece as the Month
# and the third piece as the Calendar Year (both remain strings).
date_parts = workload.index.str.split('/')
workload['Month'] = date_parts.str[0]
workload['Calendar Year'] = date_parts.str[2]


In [74]:
# Keep only rows whose calendar year (stored as text) is part of the analysis
years = [str(year) for year in range(2012, 2019)]
in_analysis = workload['Calendar Year'].isin(years)
workload = workload[in_analysis]

In [75]:
# Build the Remittance Period column: months '1'-'3' map to quarter 1,
# '4'-'6' to quarter 2, '7'-'9' to quarter 3 and '10'-'12' to quarter 4.
# Any month value outside '1'..'12' is labelled 'error'.
quarter_labels = ['Qtr 01 (Jan-Mar)', 'Qtr 02 (Apr-Jun)',
                  'Qtr 03 (Jul-Sept)', 'Qtr 04 (Oct-Dec)']
month_numbers = range(1, 13)
conditions = [workload['Month'] == str(month) for month in month_numbers]
choices = [quarter_labels[(month - 1) // 3] for month in month_numbers]
workload['Remittance Period'] = np.select(conditions, choices, default='error')

In [76]:
# Relabel the rows as "<Remittance Period> <Calendar Year>" so the index can be
# matched against the collections index when merging.
merge_key = workload['Remittance Period'] + ' ' + workload['Calendar Year']
workload.index = merge_key
# The helper columns are no longer needed once the index is built
workload = workload.drop(columns=['Calendar Year', 'Month', 'Remittance Period'])


In [77]:
# Add up the monthly rows that share the same remittance-period index label
rows_by_period = workload.groupby(workload.index)
workload = rows_by_period.sum()

In [78]:
# Inner-join workload and collections on their indexes, keeping only the
# period labels present in both frames.
workload_collections = workload.merge(collections, how='inner',
                                      left_index=True, right_index=True)
#%%

In [79]:
# Pairwise correlation of the three numeric measures.
# The merged frame also carries the text column 'Remittance Period'; recent
# pandas versions no longer drop non-numeric columns from corr() silently and
# raise instead, so the measures are selected explicitly and cast to float.
measure_columns = ['Workload', 'Collections', 'Calendar Year']
corr = workload_collections[measure_columns].astype(float).corr()

In [80]:
# Display the correlation matrix computed in the previous cell
corr

Unnamed: 0,Workload,Collections,Calendar Year
Workload,1.0,-0.146586,0.721411
Collections,-0.146586,1.0,0.17791
Calendar Year,0.721411,0.17791,1.0
