In [20]:
import os
import requests
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from scipy import optimize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import glob
import json

In [21]:
# read all csv files within the test_data folder. This should be 2008-2018 (11 years), of DHS contract award data
all_files = glob.glob("../data/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, header = 0, usecols = ['contract_transaction_unique_key',
                        'contract_award_unique_key',
                        'federal_action_obligation','total_dollars_obligated', 'base_and_exercised_options_value', 
                        'current_total_value_of_award', 'base_and_all_options_value','potential_total_value_of_award',
                        'action_date'],
                 dtype = {'contract_transaction_unique_key':'str','contract_transaction_unique_key':'str',
                          'federal_action_obligation': 'float','total_dollars_obligated': 'float', 
                          'base_and_exercised_options_value': 'float', 'current_total_value_of_award': 'float', 
                          'base_and_all_options_value': 'float','potential_total_value_of_award': 'float',
                          'action_date':'str'})
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)


In [22]:
df.head()

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date
0,7014_4730_HSBP20080015201839_0_GS21F0031U_0,CONT_AWD_HSBP20080015201839_7014_GS21F0031U_4730,7830.75,,7830.75,,7830.75,,2008-09-30
1,7001_-NONE-_HSHQDC06C00065_P00034_-NONE-_-NONE-,CONT_IDV_HSHQDC06C00065_7001,0.0,,0.0,,0.0,,2008-09-30
2,7008_-NONE-_HSCG2808P7ABEC9_1_-NONE-_0,CONT_AWD_HSCG2808P7ABEC9_7008_-NONE-_-NONE-,-122.0,,-122.0,,-122.0,,2008-09-30
3,7001_-NONE-_HSHQDC08C00190_0_-NONE-_-NONE-,CONT_IDV_HSHQDC08C00190_7001,492248.0,,0.0,,492248.0,,2008-09-30
4,7008_-NONE-_HSCG4508P6K4ZHT_0_-NONE-_0,CONT_AWD_HSCG4508P6K4ZHT_7008_-NONE-_-NONE-,25218.0,,25218.0,,25218.0,,2008-09-30


In [23]:
# This is what we are using for our rate of Execution
df['Percent Cumulatively Obligated over potential total value of award'] = df['total_dollars_obligated'] / df['potential_total_value_of_award']

# Create Indicator Variable
df['Indicator'] = df['federal_action_obligation']<-1000

##Separate contracts that have a deobligation

In [24]:
#Separate contracts that have a deobligation
df_deob = df.copy().loc[df['Indicator'] == 1]

In [25]:
#Get first deobligation date of contracts

df_first_deob = df_deob.copy()
df_first_deob = df_first_deob[['contract_award_unique_key', 'action_date']]
df_first_deob = df_first_deob.groupby(['contract_award_unique_key'], as_index = False).min()
df_first_deob.rename(columns = {'action_date':'first_deob_date'}, inplace = True)
df_first_deob['first_deob_date'] = pd.to_datetime(df_first_deob['first_deob_date'])


In [26]:
#Add indicator flag for contracts that ever have deobligation
df_deob.drop(columns = [c for c in df_deob.columns if c not in ['Indicator', 'contract_award_unique_key']], inplace = True)
df_deob = df_deob.drop_duplicates()
df_deob.rename(columns = {'Indicator': 'Contract_deob'}, inplace = True)
df_ever_deob = df.merge(df_deob, how = 'left', right_on = 'contract_award_unique_key', left_on = 'contract_award_unique_key')
df_ever_deob.rename(columns = {'Indicator': 'Transaction_Deob'}, inplace = True)
df_ever_deob['Contract_deob'] = df_ever_deob['Contract_deob'].fillna(False)
df_ever_deob.head()

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date,Percent Cumulatively Obligated over potential total value of award,Transaction_Deob,Contract_deob
0,7014_4730_HSBP20080015201839_0_GS21F0031U_0,CONT_AWD_HSBP20080015201839_7014_GS21F0031U_4730,7830.75,,7830.75,,7830.75,,2008-09-30,,False,False
1,7001_-NONE-_HSHQDC06C00065_P00034_-NONE-_-NONE-,CONT_IDV_HSHQDC06C00065_7001,0.0,,0.0,,0.0,,2008-09-30,,False,False
2,7008_-NONE-_HSCG2808P7ABEC9_1_-NONE-_0,CONT_AWD_HSCG2808P7ABEC9_7008_-NONE-_-NONE-,-122.0,,-122.0,,-122.0,,2008-09-30,,False,False
3,7001_-NONE-_HSHQDC08C00190_0_-NONE-_-NONE-,CONT_IDV_HSHQDC08C00190_7001,492248.0,,0.0,,492248.0,,2008-09-30,,False,False
4,7008_-NONE-_HSCG4508P6K4ZHT_0_-NONE-_0,CONT_AWD_HSCG4508P6K4ZHT_7008_-NONE-_-NONE-,25218.0,,25218.0,,25218.0,,2008-09-30,,False,False


In [27]:
#Merge first deobligation date
contract_df = df_ever_deob.merge(df_first_deob, how = 'left', right_on = 'contract_award_unique_key', left_on = 'contract_award_unique_key')


In [28]:
#test cell, there should be both types of values in both indicator columns
print(contract_df['Contract_deob'].value_counts())
print(contract_df['Transaction_Deob'].value_counts())
print(df_deob.shape)

False    780742
True     306030
Name: Contract_deob, dtype: int64
False    1003521
True       83251
Name: Transaction_Deob, dtype: int64
(70069, 2)


In [29]:
#removing rows after first deobligation
contract_df['action_date'] = pd.to_datetime(contract_df['action_date'])
contract_df['first_deob_date'] = pd.to_datetime(contract_df['first_deob_date'])
contract_df = contract_df[contract_df['action_date'] <= contract_df['first_deob_date']]

In [30]:
#Test cell
dftest3 = contract_df[contract_df['contract_award_unique_key'] == 'CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000'].sort_values(by = 'action_date')

dftest3.head(10)

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date,Percent Cumulatively Obligated over potential total value of award,Transaction_Deob,Contract_deob,first_deob_date
178258,7013_8000_HSTS0310JCIO071_0_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,3885103.94,,3885103.94,,10223807.69,,2010-09-30,,False,True,2012-06-21
305104,7013_8000_HSTS0310JCIO071_P00001_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,3344677.82,,3344677.82,,0.0,,2011-06-16,,False,True,2012-06-21
405575,7013_8000_HSTS0310JCIO071_P00002_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,2994025.93,,2994025.93,,0.0,,2012-05-04,,False,True,2012-06-21
394646,7013_8000_HSTS0310JCIO071_P00003_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,-402426.96,,-402426.96,,-402426.96,,2012-06-21,,True,True,2012-06-21


In [31]:
#Get first action date of all contracts

df_first_date = contract_df.copy()
df_first_date = df_first_date[['contract_award_unique_key', 'action_date']]
df_first_date = df_first_date.groupby(['contract_award_unique_key'] , as_index = False).min()
df_first_date.rename(columns = {'action_date': 'first_date'}, inplace = True)
df_first_date.head()

Unnamed: 0,contract_award_unique_key,first_date
0,CONT_AWD_00003_7022_HSFEHQ08D0105_7022,2009-03-18
1,CONT_AWD_00004_7022_HSFE0408A0017_7022,2008-05-14
2,CONT_AWD_0001_2036_TPDFIGBPA100001_2036,2010-10-01
3,CONT_AWD_0001_7022_HSFE0609A6334_7022,2009-03-13
4,CONT_AWD_0001_7022_HSFEHQ09A0170_7022,2008-12-31


In [32]:
#test cell
df_test_first_date = df_first_date.copy()

df_test_first_date = df_first_date[df_first_date['contract_award_unique_key'] == 'CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000']
df_test_first_date.head()


Unnamed: 0,contract_award_unique_key,first_date
67072,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,2010-09-30


In [33]:
contract_df = contract_df.merge(df_first_date, how = 'left', right_on = 'contract_award_unique_key', left_on = 'contract_award_unique_key')
contract_df['action_date'] = pd.to_datetime(contract_df['action_date'])
contract_df['first_date'] = pd.to_datetime(contract_df['first_date'])
contract_df['days_since_first_action'] = contract_df['action_date'] - contract_df['first_date']

In [34]:
#test cell
contract_df_test = contract_df.copy()
contract_df_test = contract_df_test[contract_df_test['contract_award_unique_key'] == 'CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000'].sort_values(by = 'action_date')
contract_df_test.head()

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date,Percent Cumulatively Obligated over potential total value of award,Transaction_Deob,Contract_deob,first_deob_date,first_date,days_since_first_action
47024,7013_8000_HSTS0310JCIO071_0_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,3885103.94,,3885103.94,,10223807.69,,2010-09-30,,False,True,2012-06-21,2010-09-30,0 days
80947,7013_8000_HSTS0310JCIO071_P00001_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,3344677.82,,3344677.82,,0.0,,2011-06-16,,False,True,2012-06-21,2010-09-30,259 days
108552,7013_8000_HSTS0310JCIO071_P00002_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,2994025.93,,2994025.93,,0.0,,2012-05-04,,False,True,2012-06-21,2010-09-30,582 days
105726,7013_8000_HSTS0310JCIO071_P00003_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,-402426.96,,-402426.96,,-402426.96,,2012-06-21,,True,True,2012-06-21,2010-09-30,630 days


In [35]:

#Contract_df now has the contract/transaction deobligation flags and days since the first action date

In [36]:
# Adding in transaction number
g = contract_df.groupby(['contract_award_unique_key'])

contract_df['transaction_num'] = g['action_date'].rank(method = 'min')
contract_df.drop(columns = ['first_deob_date'], inplace = True)



In [42]:
contract_df.copy()

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date,Percent Cumulatively Obligated over potential total value of award,Transaction_Deob,Contract_deob,first_date,days_since_first_action,transaction_num
0,7009_4730_HSSS0108F0077_1_GS35F0503M_0,CONT_AWD_HSSS0108F0077_7009_GS35F0503M_4730,28000.00,,28000.00,,28000.00,,2008-09-30,,False,True,2008-01-22,252 days,2.0
1,7001_-NONE-_HSHQDC08P00224_0_-NONE-_0,CONT_AWD_HSHQDC08P00224_7001_-NONE-_-NONE-,3600.00,,3600.00,,3600.00,,2008-09-30,,False,True,2008-09-30,0 days,1.0
2,7012_4730_HSCEC708F00050_P00002_GS07F0306L_0,CONT_AWD_HSCEC708F00050_7012_GS07F0306L_4730,0.00,,1000.00,,1000.00,,2008-09-30,,False,True,2008-04-01,182 days,3.0
3,7001_7001_HSHQDC08J00375_P00005_HSHQDC06D00048_0,CONT_AWD_HSHQDC08J00375_7001_HSHQDC06D00048_7001,-177065.92,,-177065.92,,-177065.92,,2008-09-30,,True,True,2008-09-30,0 days,1.0
4,7014_7014_HSBP1108J24245_0_HSBP1108D01979_0,CONT_AWD_HSBP1108J24245_7014_HSBP1108D01979_7014,383090.00,,383090.00,,383090.00,,2008-09-30,,False,True,2008-09-30,0 days,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244454,7015_7001_HSFLGL17J00367_P00003_HSHQDC15D00015_0,CONT_AWD_HSFLGL17J00367_7015_HSHQDC15D00015_7001,-7821.65,93097.63,-7821.65,93097.63,-7821.65,93097.63,2018-10-01,1.000000,True,True,2017-06-26,462 days,4.0
244455,7001_7001_70RFP119FRE200001_0_HSHQE214D00004_0,CONT_AWD_70RFP119FRE200001_7001_HSHQE214D00004...,174186.50,4124725.00,174186.50,4124725.00,174186.50,4124725.00,2018-10-01,1.000000,False,True,2018-10-01,0 days,1.0
244456,7001_7001_70RFP419FRE700019_0_HSHQC715D00001_0,CONT_AWD_70RFP419FRE700019_7001_HSHQC715D00001...,15329.08,37471.08,15329.08,37471.08,15329.08,37471.08,2018-10-01,1.000000,False,True,2018-10-01,0 days,1.0
244457,7001_7001_70RDAD19FR0000011_0_HSHQDC17D00014_0,CONT_AWD_70RDAD19FR0000011_7001_HSHQDC17D00014...,0.00,3000000.00,8662057.05,3000000.00,13433211.56,7771154.51,2018-10-01,0.386043,False,True,2018-10-01,0 days,1.0


In [41]:
#test cell
testdf = contract_df.copy()
testdf = testdf[testdf['contract_award_unique_key'] == 'CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000'].sort_values(by = 'transaction_num')
testdf.head(10)

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date,Percent Cumulatively Obligated over potential total value of award,Transaction_Deob,Contract_deob,first_date,days_since_first_action,transaction_num
47024,7013_8000_HSTS0310JCIO071_0_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,3885103.94,,3885103.94,,10223807.69,,2010-09-30,,False,True,2010-09-30,0 days,1.0
80947,7013_8000_HSTS0310JCIO071_P00001_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,3344677.82,,3344677.82,,0.0,,2011-06-16,,False,True,2010-09-30,259 days,2.0
108552,7013_8000_HSTS0310JCIO071_P00002_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,2994025.93,,2994025.93,,0.0,,2012-05-04,,False,True,2010-09-30,582 days,3.0
105726,7013_8000_HSTS0310JCIO071_P00003_NNG07DA46B_0,CONT_AWD_HSTS0310JCIO071_7013_NNG07DA46B_8000,-402426.96,,-402426.96,,-402426.96,,2012-06-21,,True,True,2010-09-30,630 days,4.0


## End Mike's changes 05/14/2020
## Beginning of Bennett's Edits

In [43]:
from sklearn.model_selection import train_test_split

In [49]:
# Split contracts (not transactions!) into a test train set
contracts_train, contracts_test = train_test_split(contract_df['contract_award_unique_key'], test_size = .2)

In [57]:
# creat new df to test
train_df = contract_df[contract_df['contract_award_unique_key'].isin(contracts_train)]

In [59]:
train_df.isnull()

Unnamed: 0,contract_transaction_unique_key,contract_award_unique_key,federal_action_obligation,total_dollars_obligated,base_and_exercised_options_value,current_total_value_of_award,base_and_all_options_value,potential_total_value_of_award,action_date,Percent Cumulatively Obligated over potential total value of award,Transaction_Deob,Contract_deob,first_date,days_since_first_action,transaction_num
0,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False
1,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False
2,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False
3,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False
4,False,False,False,True,False,True,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244454,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
244455,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
244456,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
244457,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


## Model prep


In [39]:
model_df = contract_df.copy()
model_df.drop(columns = ['contract_transaction_unique_key', 'contract_award_unique_key', 'first_date', 'action_date'], inplace = True)

model_df['Transaction_Deob'] = model_df['Transaction_Deob'].astype(int)
# model_df['Contract_deob'] = model_df['Contract_deob'].astype(int)
model_df['days_since_first_action'] = model_df['days_since_first_action'].astype(int)
model_df['days_since_first_action'] = model_df['days_since_first_action'].apply(lambda x: x/(60 * 60 * 24 * 1000000000))

# model_df.head()
model_df = model_df.fillna(0)
print('row ct before removing potential values of 0', model_df.shape[0])

model_df_new = model_df[model_df['potential_total_value_of_award'] != 0]

print('row ct before after potential values of 0', model_df_new.shape[0])
print('OK, don\'t wanna do that')

model_df.drop(columns = 'Percent Cumulatively Obligated over potential total value of award', inplace = True)



print(model_df.max(axis = 0))

print(model_df['Contract_deob'].value_counts())
print(model_df['Transaction_Deob'].value_counts())


TypeError: cannot astype a timedelta from [timedelta64[ns]] to [int32]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
labels = np.array(model_df['Contract_deob'])
features = model_df.copy()
features.drop(columns = ['Contract_deob'], inplace = True)
features_list = list(features.columns)
features = np.array(features)

print('Features going into model: \n', features_list)


In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

In [None]:
print(test_features.shape)
print(features.shape)
print(train_features.shape)
print(test_labels.shape)

print(np.min(labels))

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

rf.fit(train_features, train_labels)

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors

errors = abs(predictions - test_labels)

print(len(errors))

num_errors = len(np.where(errors == 1))

print(num_errors)

conf_matrix = np.column_stack((predictions, test_labels))

print(conf_matrix)
print(type(conf_matrix))

results_df = pd.DataFrame(conf_matrix, columns = ['Pred', 'Actual'])

# results_df.rename(columns = {'0': 'Pred', '1': 'Actual'}, inplace = True)
results_df.head()

results_df.groupby(['Pred', 'Actual']).count()

# print('test labels: \n', test_labels)
# test_labels_1 = np.where(test_labels == 1)
# print(len(test_labels_1))

# # Print out the mean absolute error (mae)
# print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')


In [None]:
print(np.min(test_labels))
print(np.min(train_labels))

In [None]:
df = df.fillna(0)
# Convert action date to pd.datetime
df['action_date'] = pd.to_datetime(df['action_date'])
# set index to each contract and date
df.set_index(['contract_award_unique_key','action_date'], inplace = True)
df.sort_index()

In [None]:
# We want to only focus on execution rate (for now). I'm not sure which column is the best indicator of execution rate
# we can follow up with Eric. I'm going to stick with Percent Cumulatively Obligated over potential total value of award
# as wequal to the execution rate.
drop_columns = [column for column in df.columns if column != 'Percent Cumulatively Obligated over potential total value of award']
df.drop(drop_columns,axis = 1, inplace=True)

df.rename(columns = {'Percent Cumulatively Obligated over potential total value of award':'execution_rate'},inplace = True)

In [None]:
df.head(4)

In [None]:
contracts = [contract for contract, date in list(df.index)]
unique_k = set(contracts)
print('This is the number of unique Contracts: ' + str(len(unique_k)))
print('This is the number of transactions: ' + str(len(contracts)))

# Selecting contracts w/ 5 or more transactions from 2008-2018

In [None]:
# I grouped the dataframe by contract and used the .count(method)
# to determine the number of entries. I then used an inequality to get a boolean series of the number of contracts 
# that satisfied the condition of at least 5 transactions.
test = df.groupby(level='contract_award_unique_key')['execution_rate'].count() >= 5
test = test[test==True]
# This is the list of contracts that have more than 5 transactions. 
highlighted_transactions = test.index



### Utilize FY19 Data as Test Data

In [None]:
os.chdir('C:\\Users\\belincoln\\repos\\BudgetPredict')

In [None]:
%cd data

In [None]:
filename = 'FY2019_070_Contracts_Full_20200110_1.csv'

In [None]:
test_df = pd.read_csv(filename, header = 0, usecols = ['contract_transaction_unique_key',
                        'contract_award_unique_key',
                        'federal_action_obligation','total_dollars_obligated', 'base_and_exercised_options_value', 
                        'current_total_value_of_award', 'base_and_all_options_value','potential_total_value_of_award',
                        'action_date'],
                 dtype = {'contract_transaction_unique_key':'str','contract_transaction_unique_key':'str',
                          'federal_action_obligation': 'float','total_dollars_obligated': 'float', 
                          'base_and_exercised_options_value': 'float', 'current_total_value_of_award': 'float', 
                          'base_and_all_options_value': 'float','potential_total_value_of_award': 'float',
                          'action_date':'str'})

In [None]:
print('This is the # of transactions is FY19 for DHS: ' + str(len(test_df)))

In [None]:
test_df = test_df.fillna(0)
# Convert action date to pd.datetime
test_df['action_date'] = pd.to_datetime(test_df['action_date'])
# set index to each contract and date
test_df.set_index(['contract_award_unique_key','action_date'], inplace = True)
test_df.sort_index()
fy19_df = test_df.copy()

In [None]:
# We want to only focus on execution rate (for now). I'm not sure which column is the best indicator of execution rate
# we can follow up with Eric. I'm going to stick with Percent Cumulatively Obligated over potential total value of award
# as wequal to the execution rate.
drop_columns = [column for column in test_df.columns if column != 'Percent Cumulatively Obligated over potential total value of award']
test_df.drop(drop_columns,axis = 1, inplace=True)

test_df.rename(columns = {'Percent Cumulatively Obligated over potential total value of award':'execution_rate'},inplace = True)

In [None]:
contracts = [contract for contract, date in list(test_df.index)]
unique_k_fy19 = set(contracts)
print('This is the number of unique Contracts: ' + str(len(unique_k_fy19)))
print('This is the number of transactions: ' + str(len(contracts)))

In [None]:
type(unique_k_fy19)

In [None]:
highlighted_transactions = set(highlighted_transactions)

In [None]:
len(unique_k_fy19.intersection(highlighted_transactions))

In [None]:
mask = list(unique_k_fy19.intersection(highlighted_transactions))

In [None]:
fy19_df.index.get_level_values('contract_award_unique_key')

In [None]:
highlighted_fy19 = fy19_df.loc[(list(highlighted_transactions), slice(None)),:]

In [None]:
highlighted_fy19.head()

In [None]:
highlighted_fy19.index.get_level_values(0).unique()

In [None]:
highlighted_fy19

In [None]:
highlighted_fy19.drop('base_and_exercised_options_value',axis = 1, inplace = True)

In [None]:
df = highlighted_fy19

In [None]:
# Create 3 new features for analysis
df['Percent awarded over potential total awarded'] = df['current_total_value_of_award'] / df['potential_total_value_of_award']
df['Percent Cumulatively Obligated over potential total value of award'] = df['total_dollars_obligated'] / df['potential_total_value_of_award']
df['Percent Cumulatively Obligated over total value already awarded'] = df['total_dollars_obligated'] / df['current_total_value_of_award']

# Create Indicator Variable
df['Indicator'] = df['federal_action_obligation']<-1000

# set index to each transaction key
df.set_index('contract_transaction_unique_key', inplace = True)

In [None]:
df.drop('federal_action_obligation', axis =1, inplace = True)
df = df.fillna(0)

In [None]:
# This is the percentage of De-Obligations in the sample (before test/train split)
print('Total number of De-Obligations: ' +str(df['Indicator'].sum()))
print('Percentage of De-Obligations: '+str(df['Indicator'].sum()/len(df)))

In [None]:
# Create matrix of feature variables and our dependent variable
X, y =  df.iloc[:,1:-1], df.loc[:,'Indicator']

# convert y to np.array of 0s and 1s
y = np.array(y.astype(int))

In [None]:
#train the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [None]:
X_test.shape

In [None]:
# Setup the data matrix appropriately, and add ones for the intercept term
m_train, n_train = X_train.shape
m_test, n_test = X_test.shape
# Add intercept term to X
X_train = np.concatenate([np.ones((m_train, 1)), X_train], axis=1)
X_test = np.concatenate([np.ones((m_test, 1)), X_test], axis=1)



In [None]:
log_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Coefficents to multiply features by
log_model.coef_

In [None]:
theta = log_model.coef_[0]

In [None]:
print(theta[0])

In [None]:
def sigmoid(z):
    """
    Compute sigmoid function given the input z.
    
    Parameters
    ----------
    z : array_like
        The input to the sigmoid function. This can be a 1-D vector 
        or a 2-D matrix. 
    
    Returns
    -------
    g : array_like
        The computed sigmoid function. g has the same shape as z, since
        the sigmoid is computed element-wise on z.
        
    Instructions
    ------------
    Compute the sigmoid of each value of z (z can be a matrix, vector or scalar).
    """
    # convert input to a numpy array
    z = np.array(z)
    
    # You need to return the following variables correctly 
    g = np.zeros(z.shape)

    temp = 1 + np.power(np.e,-z)
    g = 1 / temp
    

    return g

In [None]:
pred = sigmoid(np.dot(X_test,theta))

In [None]:
fig = pyplot.figure
pyplot.scatter(np.arange(len(pred)), pred)

In [None]:
# Now we will test our model using the test_set
pred = sigmoid(np.dot(X_test,theta))

In [None]:
# Convert probabilty of de-obligation into prediction
pred = pred > .5
# Convert to int
pred = np.array(pred.astype(int))

In [None]:
test = pred == y_test

In [None]:
test.sum()/len(test)

In [None]:
# Really what we are looking for here 
# is how many times a de-obligation occured and we were able to predict it?

In [None]:
# Also percentage of false positives: how many times did we predict a de-obligation and 
# it did not occur?

In [None]:
data = {'predicted value' : pred, 'test value': y_test}
testdf = pd.DataFrame(data)

In [None]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:


cm = np.array(confusion_matrix(pred, y_test, labels = [1,0]))
confusion = pd.DataFrame(cm, index=[['Predicted', 'Predicted'], ['Large De-Obligation','Obligation']],
                         columns = [['Test Results', 'Test Results'],['Large De-Obligation','Obligation']])
confusion


In [None]:
confusion.iloc[0].sum()+confusion.iloc[1].sum()

In [None]:
correct_preds = testdf[testdf['test value'] == 1]
correct_preds2 = correct_preds[correct_preds['predicted value'] ==1]
print('Size of Test set: ' + str(len(testdf)))
print('Size of Train set: ' + str(len(X_train)))
print('Percentage of correct predictions (Accuracey): ' + str((confusion.iloc[0,0] + confusion.iloc[1,1])/len(testdf)))
print('Percentage of significant de-obligations: ' + str(len(correct_preds)/len(testdf)))
print('Percentage of de-obligations correctly predicted: ' + str(len(correct_preds2)/len(correct_preds)))

In [None]:
# Rate of False Negatives: A de-obligation occurs, but the model did not predict it. 
false_neg = testdf[testdf['predicted value'] == 0]
false_neg2 = false_neg[false_neg['test value'] == 1]
print('Number of False Negatives : ' + str(len(false_neg2)))
print('Rate of False Negative: ' + str(len(false_neg2)/len(testdf)))

In [None]:
# Show the rate of False Positives: Times the model predicted a de-obligation, yet there wasn't one

false_pos = testdf[testdf['predicted value'] == 1]
false_pos2 = false_pos[false_pos['test value'] == 0]

print('Number of False Positives: ' + str(len(false_pos2)))
print('Rate of False Positives: ' + str(len(false_pos2)/len(testdf)))