In [2]:
# Import the necessary libraries 
# Recall highly correlated column (base and exercised options value) is our highly corr. column
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np
import pandas as pd

# Plotting library
from matplotlib import pyplot

# import function to split data into train and test dfs
from sklearn.model_selection import train_test_split

# Optimization module in scipy
from scipy import optimize

# Import logistic regression model from SK Learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Import glob as glob, used to consolidate data files
import glob

In [3]:
os.chdir('C:\\Users\\belincoln\\repos\\BudgetPredict')
# Set working directory to the data folder so you can correctly read in the csv files
%cd data
%cd test_data

C:\Users\belincoln\repos\BudgetPredict\data
C:\Users\belincoln\repos\BudgetPredict\data\test_data


In [4]:
# read all csv files within the test_data folder. This should be 2008-2018 (11 years), of DHS contract award data
all_files = glob.glob("*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, header = 0, usecols = ['contract_transaction_unique_key',
                        'contract_award_unique_key',
                        'federal_action_obligation','total_dollars_obligated', 'base_and_exercised_options_value', 
                        'current_total_value_of_award', 'base_and_all_options_value','potential_total_value_of_award',
                        'action_date'],
                 dtype = {'contract_transaction_unique_key':'str','contract_transaction_unique_key':'str',
                          'federal_action_obligation': 'float','total_dollars_obligated': 'float', 
                          'base_and_exercised_options_value': 'float', 'current_total_value_of_award': 'float', 
                          'base_and_all_options_value': 'float','potential_total_value_of_award': 'float',
                          'action_date':'str'})
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)


In [5]:
# This is what we are using for our rate of Execution
df['Percent Cumulatively Obligated over potential total value of award'] = df['total_dollars_obligated'] / df['potential_total_value_of_award']

# Create Indicator Variable
df['Indicator'] = df['federal_action_obligation']<-1000

In [6]:
df = df.fillna(0)
# Convert action date to pd.datetime
df['action_date'] = pd.to_datetime(df['action_date'])
# set index to each contract and date
df.set_index(['contract_award_unique_key','action_date'], inplace = True)
df.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,contract_transaction_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,Percent Cumulatively Obligated over potential total value of award,Indicator
contract_award_unique_key,action_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CONT_AWD_00001_7022_HSFE0408A0017_7022,2008-02-15,7022_7022_00001_0_HSFE0408A0017_0,15000.0,0.0,15000.0,0.0,0.0,0.0,0.0,False
CONT_AWD_00001_7022_HSFE0408A0035_7022,2008-02-20,7022_7022_00001_0_HSFE0408A0035_0,10000.0,0.0,10000.0,0.0,0.0,0.0,0.0,False
CONT_AWD_00001_7022_HSFE0408A0066_7022,2008-05-15,7022_7022_00001_0_HSFE0408A0066_0,5000.0,0.0,5000.0,0.0,0.0,0.0,0.0,False
CONT_AWD_00001_7022_HSFE0408A0067_7022,2008-03-31,7022_7022_00001_0_HSFE0408A0067_0,1500.0,0.0,1500.0,0.0,0.0,0.0,0.0,False
CONT_AWD_00001_7022_HSFE0408A0067_7022,2012-07-27,7022_7022_00001_P00001_HSFE0408A0067_0,-30.0,0.0,-30.0,0.0,1470.0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...
CONT_IDV_TPDFIGBPA100001_2036,2010-10-26,2036_-NONE-_TPDFIGBPA100001_T501_-NONE-_-NONE-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
CONT_IDV_TPDFIGBPA100001_2036,2010-10-26,2036_-NONE-_TPDFIGBPA100001_500_-NONE-_-NONE-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
CONT_IDV_WRO02005_7012,2017-05-31,7012_-NONE-_WRO02005_XXX9_-NONE-_-NONE-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
CONT_IDV_WRO9812_7012,2017-05-31,7012_-NONE-_WRO9812_XXX9_-NONE-_-NONE-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [7]:
# We want to only focus on execution rate (for now). I'm not sure which column is the best indicator of execution rate
# we can follow up with Eric. I'm going to stick with Percent Cumulatively Obligated over potential total value of award
# as wequal to the execution rate.
drop_columns = [column for column in df.columns if column != 'Percent Cumulatively Obligated over potential total value of award']
df.drop(drop_columns,axis = 1, inplace=True)

df.rename(columns = {'Percent Cumulatively Obligated over potential total value of award':'execution_rate'},inplace = True)

In [8]:
df.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,execution_rate
contract_award_unique_key,action_date,Unnamed: 2_level_1
CONT_AWD_HSBP20080015201839_7014_GS21F0031U_4730,2008-09-30,0.0
CONT_IDV_HSHQDC06C00065_7001,2008-09-30,0.0
CONT_AWD_HSCG2808P7ABEC9_7008_-NONE-_-NONE-,2008-09-30,0.0
CONT_IDV_HSHQDC08C00190_7001,2008-09-30,0.0


In [10]:
contracts = [contract for contract, date in list(df.index)]
unique_k = set(contracts)
print('This is the number of unique Contracts: ' + str(len(unique_k)))
print('This is the number of transactions: ' + str(len(contracts)))

This is the number of unique Contracts: 485094
This is the number of transactions: 1020089


# Selecting contracts w/ 5 or more transactions from 2008-2018

In [32]:
# I grouped the dataframe by contract and used the .count(method)
# to determine the number of entries. I then used an inequality to get a boolean series of the number of contracts 
# that satisfied the condition of at least 5 transactions.
test = df.groupby(level='contract_award_unique_key')['execution_rate'].count() >= 5
test = test[test==True]
# This is the list of contracts that have more than 5 transactions. 
highlighted_transactions = test.index



### Utilize FY19 Data as Test Data

In [20]:
os.chdir('C:\\Users\\belincoln\\repos\\BudgetPredict')

In [21]:
%cd data

C:\Users\belincoln\repos\BudgetPredict\data


In [22]:
filename = 'FY2019_070_Contracts_Full_20200110_1.csv'

In [23]:
test_df = pd.read_csv(filename, header = 0, usecols = ['contract_transaction_unique_key',
                        'contract_award_unique_key',
                        'federal_action_obligation','total_dollars_obligated', 'base_and_exercised_options_value', 
                        'current_total_value_of_award', 'base_and_all_options_value','potential_total_value_of_award',
                        'action_date'],
                 dtype = {'contract_transaction_unique_key':'str','contract_transaction_unique_key':'str',
                          'federal_action_obligation': 'float','total_dollars_obligated': 'float', 
                          'base_and_exercised_options_value': 'float', 'current_total_value_of_award': 'float', 
                          'base_and_all_options_value': 'float','potential_total_value_of_award': 'float',
                          'action_date':'str'})

In [24]:
print('This is the # of transactions is FY19 for DHS: ' + str(len(test_df)))

This is the # of transactions is FY19 for DHS: 66533


In [25]:
test_df = test_df.fillna(0)
# Convert action date to pd.datetime
test_df['action_date'] = pd.to_datetime(test_df['action_date'])
# set index to each contract and date
test_df.set_index(['contract_award_unique_key','action_date'], inplace = True)
test_df.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,contract_transaction_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award
contract_award_unique_key,action_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CONT_AWD_70B01C18C00000037_7014_-NONE-_-NONE-,2019-04-16,7014_-NONE-_70B01C18C00000037_P00001_-NONE-_0,30704.07,351950.14,30704.07,351950.14,30704.07,351950.14
CONT_AWD_70B01C18C00000048_7014_-NONE-_-NONE-,2019-05-07,7014_-NONE-_70B01C18C00000048_P00001_-NONE-_0,1333004.00,2653059.87,1333004.00,2653059.87,0.00,3999439.65
CONT_AWD_70B01C18C00000052_7014_-NONE-_-NONE-,2018-10-16,7014_-NONE-_70B01C18C00000052_P00001_-NONE-_0,49500.00,99000.00,49500.00,99000.00,0.00,201000.00
CONT_AWD_70B01C18C00000052_7014_-NONE-_-NONE-,2018-11-05,7014_-NONE-_70B01C18C00000052_P00002_-NONE-_0,0.00,99000.00,0.00,99000.00,0.00,201000.00
CONT_AWD_70B01C18C00000052_7014_-NONE-_-NONE-,2019-03-20,7014_-NONE-_70B01C18C00000052_P00003_-NONE-_0,51000.00,150000.00,51000.00,150000.00,0.00,201000.00
...,...,...,...,...,...,...,...,...
CONT_IDV_HSTS0517ACT4006_7013,2019-02-15,7013_-NONE-_HSTS0517ACT4006_P00003_-NONE-_-NONE-,0.00,0.00,0.00,0.00,0.00,983471.00
CONT_IDV_HSTS0517DOTD331_7013,2019-06-03,7013_-NONE-_HSTS0517DOTD331_P00002_-NONE-_-NONE-,0.00,0.00,0.00,0.00,0.00,2500000.00
CONT_IDV_HSTS0517DOTD331_7013,2019-08-27,7013_-NONE-_HSTS0517DOTD331_P00003_-NONE-_-NONE-,0.00,0.00,0.00,0.00,0.00,2500000.00
CONT_IDV_HSTS0517DPHY018_7013,2019-07-10,7013_-NONE-_HSTS0517DPHY018_P00002_-NONE-_-NONE-,0.00,0.00,0.00,0.00,0.00,2500000.00


In [26]:
# We want to only focus on execution rate (for now). I'm not sure which column is the best indicator of execution rate
# we can follow up with Eric. I'm going to stick with Percent Cumulatively Obligated over potential total value of award
# as wequal to the execution rate.
drop_columns = [column for column in test_df.columns if column != 'Percent Cumulatively Obligated over potential total value of award']
test_df.drop(drop_columns,axis = 1, inplace=True)

test_df.rename(columns = {'Percent Cumulatively Obligated over potential total value of award':'execution_rate'},inplace = True)

In [27]:
contracts = [contract for contract, date in list(test_df.index)]
unique_k_fy19 = set(contracts)
print('This is the number of unique Contracts: ' + str(len(unique_k_fy19)))
print('This is the number of transactions: ' + str(len(contracts)))

This is the number of unique Contracts: 44314
This is the number of transactions: 66533


In [28]:
type(unique_k_fy19)

set

In [29]:
highlighted_transactions = set(highlighted_transactions)

In [30]:
len(unique_k_fy19.intersection(highlighted_transactions))

4022

In [34]:
test_df = unique_k_fy19.intersection(highlighted_transactions)

NameError: name 'unique_k_fy19' is not defined