In [1]:
# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [20]:
# first set the working directory. This code will be changed based on the relative location of the data files 
# on the local drive of the computer executing the command. 
os.chdir('C:\\Users\\belincoln\\Documents\\! CBP\\!User Fees\\!! Goal 1 Dashboards')

# Works well for Jupyter Notebooks, can be configured in Spyder using file explorer. 
collections = pd.read_excel(os.path.join('Source Emails & Source Files','Files','Collections',
                                         'COBRA_Air','Collections cc495 - FY13 - FY18.xls'))

In [21]:
# Begin Data Cleaning
# delete columns and rows that contain only na
collections = collections.dropna(axis=0, how = 'all')
collections = collections.dropna(axis=1, how = 'all')

In [22]:
# Delete the first two rows and only keep the Period Column and Collection Columns
collections = collections.iloc[2:,[1,-1,-3,-4]]

In [23]:
# Make first row column headers
collections.columns = collections.iloc[0,:]

In [24]:
# Make sure all columns have a name! Rename first column 'Period'
# Delete first two rows
collections.columns.values[0] = 'Period'

collections = collections.iloc[2:,:]

In [25]:
# Delete rows that contain sums for each company (don't want to double count collections)
# Also delete additional NaNs
collections = collections.dropna(axis = 0, subset = ['Period'])

In [26]:
# Sum Interest, Penalty, and Principal Collection Amounts
collections['Collections'] = collections[['Applied Penalty Amount', 'Applied Interest Amount',
       'Applied Principal Amount']].sum(axis = 1)

In [27]:
# remove tilda from index (unclear why it exists in the first place)
collections['Period'] = collections['Period'].str.rstrip('~')
# remove extra space from index
collections['Period'] = collections['Period'].str.rstrip(' ')

In [28]:
# Groups Collections based on Collection Period, across all companies
collections = collections.groupby(collections['Period']).sum()

In [29]:
# Remove audit payments
collections= collections[~collections.index.str.contains("\*")]

# Add an additional column that shows remittance period (independent of year)
collections['Remittance Period'] = collections.index.str.split('20').str[0]


# Create Calendar Year Column
collections['Calendar Year'] = collections.index.str.split(')').str[1]
# Turn Years into integers
collections['Calendar Year'] = collections['Calendar Year'].astype(int)

# Filter on years not a part of analysis
years = [2012,2013,2014,2015,2016,2017,2018]
collections = collections[collections['Calendar Year'].isin(years)]

In [30]:
#remove collection data for which we don't have workload data
searchfor = ['Qtr 01 \(Jan-Mar\) 2012','Qtr 02 \(Apr-Jun\) 2012', 'Qtr 03 \(Jul-Sept\) 2012','Qtr 04 \(Oct-Dec\) 2018']
collections = collections[~collections.index.str.contains('|'.join(searchfor))]


In [46]:
#%% Read Workload Data
workload = pd.read_excel(os.path.join('Source Emails & Source Files','Files','Workload',
                                      'COBRA_Air','FY09-fy18-passenger data air and cruise.xlsx'))

In [47]:
# Drop Cruise and Total Rows
workload.drop([7,8],axis = 0,inplace=True)

In [48]:
names = workload.iloc[[0,2,4],[0,1]]
workload.iloc[[1,3,5],[0,1]] = names

In [49]:
workload.iloc[[1,3,5],[0,1]] = names

In [50]:
workload.iloc[:,[0,1]] = workload.iloc[:,[0,1]].ffill(axis = 0)

In [51]:
workload = workload.iloc[[1,3,5],:]

In [52]:
workload

Unnamed: 0,Data Id,Data Elements - National,10/1/2008,11/1/2008,12/1/2008,1/1/2009,2/1/2009,3/1/2009,4/1/2009,5/1/2009,...,12/1/2017,1/1/2018,2/1/2018,3/1/2018,4/1/2018,5/1/2018,6/1/2018,7/1/2018,8/1/2018,9/1/2018
1,USC00011,Commercial Aircraft Pax,5437977.0,4986218.0,5446923.0,5628757.0,4590526.0,5649832.0,5848890.0,5324342.0,...,7794441.0,8235566.0,6785954.0,8600528.0,8662139.0,8608627.0,9165631.0,10615039.0,10164128.0,8102772.0
3,MPP00013,Private Aircraft Passengers,31531.0,32307.0,29799.0,36279.0,27772.0,33705.0,29347.0,20400.0,...,22546.0,25712.0,23464.0,30079.0,26404.0,22555.0,24388.0,26656.0,23076.0,20025.0
5,USCP0011,Preclear Air Ports Pax/Crew combined (Calc),,,,,,,,,...,1596994.0,1565813.0,1530305.0,1884844.0,1745714.0,1680540.0,1845968.0,2073376.0,2034479.0,1651825.0


In [18]:
# Turn Series into a dataframe. Rename the first column "Workload"
workload = workload.to_frame()
workload.columns = ['Workload']

AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [76]:
# Create Calendar Year and Month Columns
workload['Month'] = workload.index.str.split('/').str[0]
workload['Calendar Year'] = workload.index.str.split('/').str[2]


In [77]:
workload.tail(30)

Unnamed: 0,Workload,Month,Calendar Year
4/1/2016,9123240.0,4,2016
5/1/2016,9368263.0,5,2016
6/1/2016,10054967.0,6,2016
7/1/2016,11932474.0,7,2016
8/1/2016,11469905.0,8,2016
9/1/2016,9201844.0,9,2016
10/1/2016,9066087.0,10,2016
11/1/2016,8221747.0,11,2016
12/1/2016,9105626.0,12,2016
1/1/2017,9639506.0,1,2017


In [78]:
# Filter on years not a part of analysis
years = ['2012','2013','2014','2015','2016','2017','2018']
workload = workload[workload['Calendar Year'].isin(years)]

In [79]:
type(workload.index[0])

str

In [80]:
# Build out Remittance Period Columns
conditions = [(workload['Month'] == '1'), (workload['Month'] == '2'), (workload['Month'] == '3'), 
              (workload['Month'] == '4'), (workload['Month'] == '5'), (workload['Month'] == '6'),
              (workload['Month'] == '7'),(workload['Month'] == '8'),(workload['Month'] == '9'),
              (workload['Month'] == '10'),(workload['Month'] == '11'),(workload['Month'] == '12')] 
choices = ['Qtr 01 (Jan-Mar)','Qtr 01 (Jan-Mar)','Qtr 01 (Jan-Mar)',
           'Qtr 02 (Apr-Jun)','Qtr 02 (Apr-Jun)','Qtr 02 (Apr-Jun)',
           'Qtr 03 (Jul-Sept)','Qtr 03 (Jul-Sept)', 'Qtr 03 (Jul-Sept)',
           'Qtr 04 (Oct-Dec)','Qtr 04 (Oct-Dec)','Qtr 04 (Oct-Dec)']
workload['Remittance Period'] = np.select(conditions, choices, default='error')

In [81]:
# Reset index to Remittance Period plus Calendar Year to merge with Collection data
workload.index = workload['Remittance Period'] + ' ' + workload['Calendar Year']
# Drop unnecssary columns
workload.drop(['Calendar Year', 'Month','Remittance Period'], inplace = True, axis=1)


In [82]:
workload.tail(30)

Unnamed: 0,Workload
Qtr 02 (Apr-Jun) 2016,9123240.0
Qtr 02 (Apr-Jun) 2016,9368263.0
Qtr 02 (Apr-Jun) 2016,10054967.0
Qtr 03 (Jul-Sept) 2016,11932474.0
Qtr 03 (Jul-Sept) 2016,11469905.0
Qtr 03 (Jul-Sept) 2016,9201844.0
Qtr 04 (Oct-Dec) 2016,9066087.0
Qtr 04 (Oct-Dec) 2016,8221747.0
Qtr 04 (Oct-Dec) 2016,9105626.0
Qtr 01 (Jan-Mar) 2017,9639506.0


In [83]:
# Sum on Remittance Period
workload = workload.groupby(workload.index).sum()

In [84]:
workload_collections = pd.merge(workload,collections,how = 'inner', left_index = True, right_index = True)
#%%

In [85]:
workload_collections

Unnamed: 0,Workload,Collections,Remittance Period,Calendar Year
Qtr 01 (Jan-Mar) 2013,22791685.0,144614200.0,Qtr 01 (Jan-Mar),2013
Qtr 01 (Jan-Mar) 2014,23466161.0,151966900.0,Qtr 01 (Jan-Mar),2014
Qtr 01 (Jan-Mar) 2015,24847847.0,162693600.0,Qtr 01 (Jan-Mar),2015
Qtr 01 (Jan-Mar) 2016,26567808.0,168357000.0,Qtr 01 (Jan-Mar),2016
Qtr 01 (Jan-Mar) 2017,27209789.0,182099400.0,Qtr 01 (Jan-Mar),2017
Qtr 01 (Jan-Mar) 2018,28682265.0,199308500.0,Qtr 01 (Jan-Mar),2018
Qtr 02 (Apr-Jun) 2013,24654960.0,136927300.0,Qtr 02 (Apr-Jun),2013
Qtr 02 (Apr-Jun) 2014,26239343.0,142836700.0,Qtr 02 (Apr-Jun),2014
Qtr 02 (Apr-Jun) 2015,27355676.0,154588000.0,Qtr 02 (Apr-Jun),2015
Qtr 02 (Apr-Jun) 2016,28546470.0,162974400.0,Qtr 02 (Apr-Jun),2016


In [86]:
corr = workload_collections.corr()

In [87]:
corr

Unnamed: 0,Workload,Collections,Calendar Year
Workload,1.0,-0.101463,0.738833
Collections,-0.101463,1.0,0.17791
Calendar Year,0.738833,0.17791,1.0
