In [114]:
# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [115]:
# Move into the project folder so that the relative spreadsheet paths used in this
# notebook resolve. The folder defaults to the original author's location and can be
# overridden per machine through the GOAL1_DASHBOARD_DIR environment variable, so the
# code does not have to be edited to run the notebook somewhere else.
project_dir = os.environ.get(
    'GOAL1_DASHBOARD_DIR',
    'C:\\Users\\belincoln\\Documents\\! CBP\\!User Fees\\!! Goal 1 Dashboards')
os.chdir(project_dir)

# Load the raw collections workbook; the path is relative to the folder selected above.
# (In Spyder the working directory can alternatively be set from the file explorer.)
collections = pd.read_excel(os.path.join('Source Emails & Source Files','Files','Collections',
                                         'COBRA_Air','Collections cc495 - FY13 - FY18.xls'))

In [116]:
# Data cleaning starts here.
# Discard every row, and then every column, that holds nothing but missing values.
collections = (
    collections
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)

In [117]:
# Skip the two leading rows and keep four columns by position: the second column
# (named 'Period' further down) followed by the last, third-from-last and
# fourth-from-last columns, which hold the collection amounts.
kept_columns = [1, -1, -3, -4]
collections = collections.iloc[2:, kept_columns]

In [118]:
# Use the values of the first remaining row as the column labels.
header_row = collections.iloc[0]
collections.columns = header_row

In [119]:
# Make sure every column has a usable name: the first column becomes 'Period'.
# The label list is rebuilt and reassigned instead of being written through
# `collections.columns.values`, because a pandas Index is meant to be immutable and
# in-place writes to its backing array are unsupported (they fail outright when that
# array is read-only).
header = list(collections.columns)
header[0] = 'Period'
collections.columns = pd.Index(header, name=collections.columns.name)

# Drop the two leading rows; the first of them is the header row whose values were
# just used as the column labels.
collections = collections.iloc[2:,:]

In [120]:
# Keep only rows that carry a Period value. Per-company subtotal rows have no Period,
# so removing them avoids double counting collections; any other rows with a missing
# Period are removed at the same time.
required_columns = ['Period']
collections = collections.dropna(subset=required_columns, axis=0)

In [121]:
# Total collected per row = penalty + interest + principal.
amount_columns = ['Applied Penalty Amount', 'Applied Interest Amount',
                  'Applied Principal Amount']
collections['Collections'] = collections[amount_columns].sum(axis = 1)

In [122]:
# Tidy the Period labels: first strip trailing tildes (their origin in the source
# file is unclear), then strip the trailing spaces that remain. The two steps are
# kept in this order on purpose.
period_without_tilde = collections['Period'].str.rstrip('~')
collections['Period'] = period_without_tilde.str.rstrip(' ')

In [123]:
# Total the collections per collection period across all companies. The result is
# indexed by Period.
# Only the 'Collections' column is aggregated, and it is selected explicitly: the
# recorded output of the merged table shows that 'Collections' was the only column
# to survive this step, which older pandas achieved by silently dropping columns it
# could not treat as numeric. Newer pandas releases no longer drop such columns, so
# the selection is spelled out to keep the result the same on any version.
collections = collections.groupby('Period')[['Collections']].sum()

In [124]:
# Remove audit payments, whose period labels contain a literal asterisk.
# The pattern is a raw string so that the backslash reaches the regex engine as
# written instead of relying on Python's deprecated pass-through of unknown escape
# sequences. `.copy()` makes the filtered result an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
collections = collections[~collections.index.str.contains(r"\*")].copy()

# Add a column with the remittance period independent of year: the part of the label
# that precedes the first '20'.
# NOTE(review): for labels such as 'Qtr 01 (Jan-Mar) 2013' this appears to keep the
# space in front of the year as a trailing space - confirm whether consumers of this
# column want it stripped.
collections['Remittance Period'] = collections.index.str.split('20').str[0]


# Create the Calendar Year column from the text after the closing parenthesis
collections['Calendar Year'] = collections.index.str.split(')').str[1]
# Turn the year strings into integers
collections['Calendar Year'] = collections['Calendar Year'].astype(int)

# Keep only the calendar years that are part of the analysis
years = [2012,2013,2014,2015,2016,2017,2018]
collections = collections[collections['Calendar Year'].isin(years)]

In [125]:
# Remove collection periods for which there is no workload data.
# The labels are matched as regular expressions, so the parentheses are escaped; raw
# strings keep those backslashes intact without relying on Python's deprecated
# pass-through of unknown escape sequences such as `\(`.
searchfor = [r'Qtr 01 \(Jan-Mar\) 2012', r'Qtr 02 \(Apr-Jun\) 2012',
             r'Qtr 03 \(Jul-Sept\) 2012', r'Qtr 04 \(Oct-Dec\) 2018']
collections = collections[~collections.index.str.contains('|'.join(searchfor))]


In [126]:
#%% Read Workload Data
# The workbook path is relative to the current working directory.
workload_path = os.path.join('Source Emails & Source Files', 'Files', 'Workload',
                             'COBRA_Air', 'fy13-18 stats by_Month National.xlsx')
workload = pd.read_excel(workload_path)

In [127]:
# Pull the single row (position 13) that holds the
# "Comm Aircraft Pax/Crew Combined (ECAR)" workload metric in the PPAE file ...
ecar_row = workload.iloc[13]
# ... and skip its first four entries, which are unnecessary / non-numeric.
workload = ecar_row.iloc[4:]

In [128]:
# Convert the Series into a one-column DataFrame whose column is named after the
# workload metric.
workload = workload.to_frame(name='Comm Aircraft Pax/Crew combined (ECAR)')

In [129]:
# Split each index label on '/' once and take the first piece as the month and the
# third piece as the calendar year (both stay strings).
# NOTE(review): this presumes labels shaped like 'M/D/YYYY' - confirm against the workbook.
date_parts = workload.index.str.split('/')
workload['Month'] = date_parts.str[0]
workload['Calendar Year'] = date_parts.str[2]


In [130]:
# Keep only the calendar years covered by the analysis (2012 through 2018). The years
# are strings here because 'Calendar Year' was cut out of the text labels.
years = [str(year) for year in range(2012, 2019)]
in_scope = workload['Calendar Year'].isin(years)
workload = workload[in_scope]

In [131]:
# Derive the remittance period (calendar quarter) from the month number.
quarter_labels = ['Qtr 01 (Jan-Mar)', 'Qtr 02 (Apr-Jun)',
                  'Qtr 03 (Jul-Sept)', 'Qtr 04 (Oct-Dec)']
month_numbers = range(1, 13)
# One boolean mask per month ('1' .. '12'; months are stored as strings) ...
conditions = [workload['Month'] == str(month) for month in month_numbers]
# ... paired with the label of the quarter that month falls in.
choices = [quarter_labels[(month - 1) // 3] for month in month_numbers]
# Any month value outside '1'..'12' is labelled 'error'.
workload['Remittance Period'] = np.select(conditions, choices, default='error')

In [132]:
# Re-label the rows as "<remittance period> <calendar year>", the same label format
# the collections table is indexed by, so the two tables can be merged on their index.
period_labels = workload['Remittance Period'] + ' ' + workload['Calendar Year']
workload.index = period_labels
# The helper columns are no longer needed once they are encoded in the index.
workload = workload.drop(['Calendar Year', 'Month', 'Remittance Period'], axis=1)


In [133]:
# Add up all rows that share an index label, giving one total per remittance period.
workload = workload.groupby(level=0).sum()

In [134]:
# Combine workload and collections on their shared period index. The inner join keeps
# only periods that are present in both tables.
workload_collections = workload.merge(collections, how='inner',
                                      left_index=True, right_index=True)
#%%

In [135]:
# Pairwise correlation between the numeric columns of the merged table.
# The numeric columns are selected explicitly: the table also holds the text column
# 'Remittance Period', which older pandas skipped silently inside corr() but newer
# releases refuse to process, raising an error instead.
corr = workload_collections.select_dtypes(include=[np.number]).corr()

In [136]:
# Display the correlation matrix. In the recorded output, workload and collections
# are weakly negatively correlated (about -0.15), while workload correlates with
# calendar year at about 0.72.
# NOTE(review): the negative workload/collections figure is surprising - possibly a
# timing offset between activity and remittance; confirm before drawing conclusions.
corr

Unnamed: 0,Comm Aircraft Pax/Crew combined (ECAR),Collections,Calendar Year
Comm Aircraft Pax/Crew combined (ECAR),1.0,-0.146586,0.721411
Collections,-0.146586,1.0,0.17791
Calendar Year,0.721411,0.17791,1.0


In [138]:
# Display the merged table: one row per remittance period and year, with the workload
# total, the collections total, the remittance period label and the calendar year.
workload_collections

Unnamed: 0,Comm Aircraft Pax/Crew combined (ECAR),Collections,Remittance Period,Calendar Year
Qtr 01 (Jan-Mar) 2013,19922659.0,144614200.0,Qtr 01 (Jan-Mar),2013
Qtr 01 (Jan-Mar) 2014,20465041.0,151966900.0,Qtr 01 (Jan-Mar),2014
Qtr 01 (Jan-Mar) 2015,21503062.0,162693600.0,Qtr 01 (Jan-Mar),2015
Qtr 01 (Jan-Mar) 2016,23208448.0,168357000.0,Qtr 01 (Jan-Mar),2016
Qtr 01 (Jan-Mar) 2017,23782842.0,182099400.0,Qtr 01 (Jan-Mar),2017
Qtr 01 (Jan-Mar) 2018,25067750.0,199308500.0,Qtr 01 (Jan-Mar),2018
Qtr 02 (Apr-Jun) 2013,21933564.0,136927300.0,Qtr 02 (Apr-Jun),2013
Qtr 02 (Apr-Jun) 2014,23136714.0,142836700.0,Qtr 02 (Apr-Jun),2014
Qtr 02 (Apr-Jun) 2015,24050999.0,154588000.0,Qtr 02 (Apr-Jun),2015
Qtr 02 (Apr-Jun) 2016,25184172.0,162974400.0,Qtr 02 (Apr-Jun),2016


In [139]:
# Bind a second name to the merged table. This is plain assignment, not a copy: both
# names refer to the same DataFrame object.
workload_collections2 = workload_collections

In [140]:
# Persist the table with IPython's %store magic so that it survives this kernel;
# another notebook can load it back with `%store -r workload_collections2`.
%store workload_collections2

Stored 'workload_collections2' (DataFrame)
