In [192]:
# Import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [193]:
# Set the working directory to the folder that contains 'Source Emails & Source Files'.
# The location differs from machine to machine, so it can be overridden with the
# GOAL1_DASHBOARD_DIR environment variable; when that variable is not set, the
# original local path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get(
    'GOAL1_DASHBOARD_DIR',
    'C:\\Users\\belincoln\\Documents\\! CBP\\!User Fees\\!! Goal 1 Dashboards'))

# Read the collections workbook through a path relative to the working directory.
# Works well for Jupyter Notebooks; in Spyder the working directory can also be
# configured using the file explorer.
collections = pd.read_excel(os.path.join('Source Emails & Source Files', 'Files', 'Collections',
                                         'COBRA_Air', 'Collections cc495 - FY13 - FY18.xls'))

In [194]:
# Begin data cleaning: discard every row, and then every column, that holds only NaN
collections = (
    collections
    .dropna(axis='index', how='all')
    .dropna(axis='columns', how='all')
)

In [195]:
# Skip the first two rows and retain only the Period column (position 1) plus the
# collection columns, which are addressed by their position from the right-hand edge
kept_positions = [1, -1, -3, -4]
collections = collections.iloc[2:, kept_positions]

In [196]:
# Use the values of the first remaining row as the column headers
header_row = collections.iloc[0]
collections.columns = header_row

In [197]:
# Make sure all columns have a name: the first header is renamed 'Period'.
# The labels are rebuilt as a fresh Index instead of being written into
# `collections.columns.values` in place. An Index is meant to be immutable, and
# writing through `.values` bypasses that: the Index's cached label lookup can be
# left stale, so a later `collections['Period']` is not guaranteed to find the column.
header_labels = list(collections.columns)
header_labels[0] = 'Period'
collections.columns = pd.Index(header_labels,
                               dtype=collections.columns.dtype,
                               name=collections.columns.name)

# Delete the first two rows (the row whose values were used as headers and the one after it)
collections = collections.iloc[2:, :]

In [198]:
# Rows without a Period value are the per-company sum rows (keeping them would
# double count collections) and leftover NaN rows -- remove them all
collections = collections.dropna(subset=['Period'], axis='index')

In [199]:
# Total collected per row = penalty + interest + principal amounts
amount_columns = ['Applied Penalty Amount', 'Applied Interest Amount',
                  'Applied Principal Amount']
collections['Collections'] = collections[amount_columns].sum(axis='columns')

In [200]:
# Strip trailing tildes from the Period labels (unclear why they exist in the
# first place), then strip any trailing spaces that remain
period_labels = collections['Period'].str.rstrip('~')
collections['Period'] = period_labels.str.rstrip(' ')

In [201]:
# Aggregate across all companies: one summed row per collection Period
period_key = collections['Period']
collections = collections.groupby(period_key).sum()

In [202]:
# Remove audit payments: periods whose label contains a literal asterisk.
# The pattern is a raw string because '\*' in an ordinary string literal is an
# invalid escape sequence (Python warns about it). `.copy()` makes the filtered
# frame independent, so the column assignments below do not raise
# SettingWithCopyWarning.
is_audit = collections.index.str.contains(r"\*")
collections = collections[~is_audit].copy()

# Add an additional column that shows remittance period (independent of year):
# the part of the label that precedes the first '20'.
# NOTE(review): for labels shaped like 'Qtr 01 (Jan-Mar) 2012' this keeps the space
# in front of the year, whereas the workload frame's 'Remittance Period' is built
# without a trailing space -- confirm whether the two are meant to match.
collections['Remittance Period'] = collections.index.str.split('20').str[0]

# Create Calendar Year column from the text that follows ')'
collections['Calendar Year'] = collections.index.str.split(')').str[1]
# Turn years into integers
collections['Calendar Year'] = collections['Calendar Year'].astype(int)

# Keep only the calendar years that are part of the analysis
years = [2012, 2013, 2014, 2015, 2016, 2017, 2018]
collections = collections[collections['Calendar Year'].isin(years)]

In [229]:
# Remove collection data for which we don't have workload data.
# The patterns are regular expressions, so the parentheses must be escaped; raw
# strings carry the backslashes through without relying on invalid escape
# sequences ('\(' and '\)') in ordinary string literals, which Python warns about.
searchfor = [r'Qtr 01 \(Jan-Mar\) 2012', r'Qtr 02 \(Apr-Jun\) 2012',
             r'Qtr 03 \(Jul-Sept\) 2012', r'Qtr 04 \(Oct-Dec\) 2018']
collections = collections[~collections.index.str.contains('|'.join(searchfor))]


In [204]:
#%% Read Workload Data
# Path is relative to the current working directory
workload_path = os.path.join('Source Emails & Source Files', 'Files', 'Workload',
                             'COBRA_Air', 'fy13-18 stats by_Month National.xlsx')
workload = pd.read_excel(workload_path)

In [205]:
# Take the row at position 13: the Comm Aircraft Pax/Crew Combined (ECAR)
# workload metric in the PPAE file
ecar_row = workload.iloc[13]
# Skip the first four entries (unnecessary and non-numeric columns)
workload = ecar_row.iloc[4:]

In [206]:
# Convert the Series into a one-column DataFrame whose column is called "Workload"
workload = workload.to_frame(name='Workload')

In [207]:
# Split the 'M/D/YYYY' style index labels once, then take the month (first piece)
# and the calendar year (third piece)
date_parts = workload.index.str.split('/')
workload['Month'] = date_parts.str[0]
workload['Calendar Year'] = date_parts.str[2]


In [208]:
# Keep only the calendar years that are part of the analysis
# (years are compared as strings here, matching the 'Calendar Year' column)
years = [str(year) for year in range(2012, 2019)]
in_analysis = workload['Calendar Year'].isin(years)
workload = workload[in_analysis]

In [209]:
# Build out the Remittance Period column: months '1'..'12' map, three at a time,
# onto the four quarter labels; any other Month value is labelled 'error'
quarter_names = ['Qtr 01 (Jan-Mar)', 'Qtr 02 (Apr-Jun)',
                 'Qtr 03 (Jul-Sept)', 'Qtr 04 (Oct-Dec)']
conditions = [workload['Month'] == str(month) for month in range(1, 13)]
choices = [quarter_names[position // 3] for position in range(12)]
workload['Remittance Period'] = np.select(conditions, choices, default='error')

In [210]:
# Re-index by "<Remittance Period> <Calendar Year>" so the labels line up with
# the collections data for the merge
period_labels = workload['Remittance Period'] + ' ' + workload['Calendar Year']
workload.index = period_labels
# Drop the helper columns that are no longer necessary
workload = workload.drop(columns=['Calendar Year', 'Month', 'Remittance Period'])


In [211]:
# Collapse the monthly rows: one summed row per remittance-period label in the index
workload = workload.groupby(level=0).sum()

In [212]:
# Recover the remittance period (the label text up to and including ')') from the index.
# A plain ')' is used as the separator: a single-character pattern is split on
# literally, so no regex escaping -- and no invalid '\)' escape sequence in an
# ordinary string literal -- is needed.
workload['Remittance Period'] = workload.index.str.split(')').str[0] + ")"

In [190]:
# Display the number of rows in the workload frame
workload.shape[0]

24

In [25]:
#Clean Workload Df
# NOTE(review): this cell carries execution count 25 while the surrounding cells
# are numbered 190-229, and its printed output lists 120 monthly labels
# (10/1/2008 - 9/1/2018) although the preceding cell reports len(workload) == 24.
# Presumably it was run against a differently shaped `workload` frame (e.g. a
# freshly loaded sheet) -- confirm before running the notebook top to bottom.
#remove cruise data
# (keeps only the first six rows)
workload = workload.iloc[0:6,:]
#select data we want to work with
# workload2: every other row (positions 0, 2, 4), first two columns only
workload2 = workload.iloc[::2,:2]
# workload3: rows at positions 1, 3, 5, all columns from position 2 onward
workload3 = workload.iloc[[1,3,5],2:]
#reset indices for merge
# (both pieces get a fresh 0..2 index so they align row-by-row in the concat)
workload2.reset_index(drop = True, inplace = True)
workload3.reset_index(drop = True, inplace = True)

# concat dataframes to get cleaned workload df
# (side by side: descriptor columns from workload2, remaining columns from workload3)
workload = pd.concat([workload2, workload3], axis=1, sort=False)
#sum Data Element and Workload ID columns into one descriptor
# (string concatenation of the first two columns, separated by ': ')
workload['Workload Element'] = workload.iloc[:,0]+': '+workload.iloc[:,1]
# Reverse the column order so 'Workload Element' comes first ...
workload = workload.iloc[:,::-1]
# ... then drop the last two columns, which after the reversal are the two
# original descriptor columns that were merged into 'Workload Element'
workload = workload.iloc[:,:-2]
#%%
# Transpose so the former columns become rows
workload = workload.transpose()
#promote first row to column headers and drop first row
# (after the transpose the first row is the 'Workload Element' row)
workload.columns = workload.iloc[0]
workload = workload.iloc[1:]

# Split the 'M/D/YYYY' style index labels once; Month is the first piece and
# Calendar Year is the third piece with a single trailing space appended
date_parts = workload.index.str.split('/')
workload['Month'] = date_parts.str[0]
workload['Calendar Year'] = date_parts.str[2] + " "
print(workload['Calendar Year'])
# Keep only the years that are part of the analysis (labels carry the trailing space)
years = [str(year) + ' ' for year in range(2012, 2019)]
in_analysis = workload['Calendar Year'].isin(years)
workload = workload[in_analysis]

#%%
# Build out the Period column: each month string maps to its remittance period,
# with the third quarter split into Jul-Aug ('03a') and Sept ('03b');
# any other Month value is labelled 'error'
month_periods = [
    ('1', 'Qtr 01 (Jan-Mar)'), ('2', 'Qtr 01 (Jan-Mar)'), ('3', 'Qtr 01 (Jan-Mar)'),
    ('4', 'Qtr 02 (Apr-Jun)'), ('5', 'Qtr 02 (Apr-Jun)'), ('6', 'Qtr 02 (Apr-Jun)'),
    ('7', 'Qtr 03a (Jul-Aug)'), ('8', 'Qtr 03a (Jul-Aug)'), ('9', 'Qtr 03b (Sept)'),
    ('10', 'Qtr 04 (Oct-Dec)'), ('11', 'Qtr 04 (Oct-Dec)'), ('12', 'Qtr 04 (Oct-Dec)'),
]
conditions = [workload['Month'] == month for month, period in month_periods]
choices = [period for month, period in month_periods]
workload['Period'] = np.select(conditions, choices, default='error')


#%%

# Match the Period column to Collections by appending the calendar year to the period label
full_period = workload['Period'] + ' ' + workload['Calendar Year']
workload['Period'] = full_period
# Index by Period, drop the helper columns that are no longer necessary,
# and sum all rows that share the same Period
workload = (
    workload
    .set_index('Period')
    .drop(columns=['Calendar Year', 'Month'])
    .groupby(level=0)
    .sum()
)



#%%
#remove non FY2013-2018 data
#workload = workload.iloc[1:,:]
#collections_475 = collections_475.iloc[1:-1,:]

#%%
#remove non FY2013-2018 data
#searchfor = ['Qtr 02 \(Apr-Jun\) 2012', 'Qtr 03a \(Jul-Aug\) 2012','Qtr 03b \(Sept\) 2012']
#collections_475 = collections_475[~collections_475.index.str.contains('|'.join(searchfor))]
#workload = workload[~workload.index.str.contains('|'.join(searchfor))]

# Add a 'Workload' column holding the row-wise total of all workload columns
workload['Workload'] = workload.sum(axis='columns')

#%%

9/1/2018     2018 
8/1/2018     2018 
7/1/2018     2018 
6/1/2018     2018 
5/1/2018     2018 
             ...  
2/1/2009     2009 
1/1/2009     2009 
12/1/2008    2008 
11/1/2008    2008 
10/1/2008    2008 
Name: Calendar Year, Length: 120, dtype: object


In [None]:
# Inner-join workload and collections on their index labels; only labels present
# in both frames are kept
workload_collections = workload.merge(collections, how='inner',
                                      left_index=True, right_index=True)
#%%