In [163]:
#%store -r df

In [164]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np
import pandas as pd

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

In [165]:
# tells matplotlib to embed plots within the notebook
#%matplotlib inline

In [186]:
os.chdir('C:\\Users\\belincoln\\repos\\BudgetPredict')
# Set working directory to the data folder so you can correctly read in the csv files
%cd data

C:\Users\belincoln\repos\BudgetPredict\data


In [187]:
# Read data from DHS Contracts
df = pd.read_csv('FY2019_070_Contracts_Full_20200110_1.csv', header = 0, usecols = ['contract_transaction_unique_key', 
                        'federal_action_obligation','total_dollars_obligated', 'base_and_exercised_options_value', 
                        'current_total_value_of_award', 'base_and_all_options_value','potential_total_value_of_award'],
                 dtype = {'contract_transaction_unique_key':'str','federal_action_obligation': 'float',
                        'total_dollars_obligated': 'float', 'base_and_exercised_options_value': 'float', 
                        'current_total_value_of_award': 'float', 'base_and_all_options_value': 'float',
                        'potential_total_value_of_award': 'float'})

In [188]:
# Create 3 new features for analysis
df['Percent awarded over potential total awarded'] = df['current_total_value_of_award'] / df['potential_total_value_of_award']
df['Percent Cumulatively Obligated over potential total value of award'] = df['total_dollars_obligated'] / df['potential_total_value_of_award']
df['Percent Cumulatively Obligated over total value already awarded'] = df['total_dollars_obligated'] / df['current_total_value_of_award']

# Create Indicator Variable
df['Indicator'] = df['federal_action_obligation']<-1000

# set index to each transaction key
df.set_index('contract_transaction_unique_key', inplace = True)

# Data Dictionary for Column Headers from USASPENDING.GOV

#### federal_action_obligation: 

Amount of Federal government’s obligation, de-obligation, or liability for an award transaction.

#### total_dollars_obligated: This doesn't make sense to me

This is a system generated element providing the sum of all the amounts entered in the "Action Obligation" field for a particular PIID and Agency. Example: Contract has 9 Modifications under "Transaction Number" as '1' and 9 modifications with the same PIID under "Transaction Number" as '2'. The base contracts and all the modifications have "Action Obligation" as $10 each. The value for the field "Total Obligated Amount" when the either of the bases or the modification is retrieved through atom feeds will be $200 ($100 under Transaction Number 1 + $100 under Transaction Number 2). "Total Obligated Amount" is generated irrespective of the "Transaction Number" on the Awards.

#### base_and_exercised_options_value
The change (from this transaction only) to the current contract value (i.e., the base contract and any options that have been exercised).

#### current_total_value_of_award

Total amount obligated to date on an award. For a contract, this amount includes the base and exercised options. For a non-loan financial assistance award (AssistanceType ≠ 07 or 08), this is the sum of all the FederalActionObligation values in transactions with the same AwardingSubTierAgencyCode and FAIN (for RecordType = 2 or 3) or AwardingSubTierAgencyCode and URI (for RecordType = 1). For a loan award (AssistanceType = 07 or 08), this is the sum of all OriginalLoanSubsidyCost values in transactions with the same AwardingSubTierAgencyCode and FAIN (for RecordType = 2 or 3) or AwardingSubTierAgencyCode and URI (for RecordType = 1). In the subaward data context, this element refers to the total amount obligated to date on the prime award.

#### base_and_all_options_value

The change (from this transaction only) to the potential contract value (i.e., the base contract and any exercised or unexercised options).

#### potential_total_value_of_award

Total amount that could be obligated on a contract, if the base and all options are exercised.


In [169]:
print(df.min())

# Why do these two have negative values you think?
    # Because it was a deobligation!

# df['base_and_all_options_value'][df['base_and_all_options_value'] < 0]





federal_action_obligation                                             -75532042.2
total_dollars_obligated                                                       0.0
base_and_exercised_options_value                                      -75532042.2
current_total_value_of_award                                                  0.0
base_and_all_options_value                                           -108425811.2
potential_total_value_of_award                                                0.0
Percent awarded over potential total awarded                                  0.0
Percent Cumulatively Obligated over potential total value of award            0.0
Percent Cumulatively Obligated over total value already awarded               0.0
Indicator                                                                     0.0
dtype: float64


In [170]:
# Is total_dollars_obligated < current_total_value_of_award < potential_value_of_award?
test = df['total_dollars_obligated'] >= df['current_total_value_of_award'] 

# test if this is true for all transactions
print(not any(test))

# shows the percentage of transactions where this inequality does nto hold
print((len(df) - test.sum()) / len(df))

# Show indexes where this inequality does not hold
df[test == False].index

False
0.09850750755264305


Index(['7012_7012_HSCEMS16F00057_P00005_HSCEMS12A00018_0',
       '7008_-NONE-_HSCG3816DL00004_P00009_-NONE-_-NONE-',
       '7013_-NONE-_HSTS0514AMED077_P00001_-NONE-_-NONE-',
       '7008_-NONE-_70Z03818DB2000003_P00002_-NONE-_-NONE-',
       '7008_-NONE-_70Z03819DB2000001_0_-NONE-_-NONE-',
       '7008_-NONE-_70Z03819DM0000001_0_-NONE-_-NONE-',
       '7012_-NONE-_HSCEDM17D00009_P00008_-NONE-_-NONE-',
       '7008_-NONE-_HSCG2317DPXC002_P00003_-NONE-_-NONE-',
       '7008_-NONE-_HSCG3815D202051_P00005_-NONE-_-NONE-',
       '7008_-NONE-_HSCG4016D60308_P00007_-NONE-_-NONE-',
       ...
       '7022_-NONE-_HSFE4014A0168_P00005_-NONE-_-NONE-',
       '7022_-NONE-_HSFE4014A0184_P00005_-NONE-_-NONE-',
       '7001_7001_70RFP119FRE400037_0_70RFP118DE4000005_0',
       '7022_-NONE-_HSFELA15A0084_P00004_-NONE-_-NONE-',
       '7001_-NONE-_HSHQDC17A00012_P00002_-NONE-_-NONE-',
       '7012_7001_HSCETE15F00007_P00033_HSHQDC13A00024_0',
       '7008_-NONE-_70Z04019D57050B00_0_-NONE-_-NONE-',
 

In [171]:
# Is total_dollars_obligated < current_total_value_of_award < potential_value_of_award?
test2 = df['current_total_value_of_award'] <= df['potential_total_value_of_award']

# test if this is true for all transactions
print(not any(test2))

# shows the percentage of transactions where this inequality does nto hold
print((len(df) - test2.sum()) / len(df))

# Show indexes where this inequality does not hold
df[test2 == False].index

False
0.04829182510934424


Index(['7008_-NONE-_HSCG3816DL00004_P00009_-NONE-_-NONE-',
       '7013_-NONE-_HSTS0514AMED077_P00001_-NONE-_-NONE-',
       '7008_-NONE-_70Z03818DB2000003_P00002_-NONE-_-NONE-',
       '7008_-NONE-_70Z03819DB2000001_0_-NONE-_-NONE-',
       '7008_-NONE-_70Z03819DM0000001_0_-NONE-_-NONE-',
       '7012_-NONE-_HSCEDM17D00009_P00008_-NONE-_-NONE-',
       '7008_-NONE-_HSCG2317DPXC002_P00003_-NONE-_-NONE-',
       '7008_-NONE-_HSCG3815D202051_P00005_-NONE-_-NONE-',
       '7008_-NONE-_HSCG4016D60308_P00007_-NONE-_-NONE-',
       '7015_-NONE-_HSFLAR16D00001_P00006_-NONE-_-NONE-',
       ...
       '7012_-NONE-_70CDCR18D00000004_P00002_-NONE-_-NONE-',
       '7008_-NONE-_HSCG8817DPMV093_P00004_-NONE-_-NONE-',
       '7008_-NONE-_HSCG8817DPMV093_P00005_-NONE-_-NONE-',
       '7008_-NONE-_70Z08419DBHQ00400_P00001_-NONE-_-NONE-',
       '7022_-NONE-_HSFE4014A0167_P00005_-NONE-_-NONE-',
       '7022_-NONE-_HSFE4014A0168_P00005_-NONE-_-NONE-',
       '7022_-NONE-_HSFE4014A0184_P00005_-NONE-_-NON

# Beginning of Regression Analysis

In [172]:
df.columns

Index(['federal_action_obligation', 'total_dollars_obligated',
       'base_and_exercised_options_value', 'current_total_value_of_award',
       'base_and_all_options_value', 'potential_total_value_of_award',
       'Percent awarded over potential total awarded',
       'Percent Cumulatively Obligated over potential total value of award',
       'Percent Cumulatively Obligated over total value already awarded',
       'Indicator'],
      dtype='object')

In [173]:
df.drop('federal_action_obligation', axis =1, inplace = True)

In [174]:
X, y =  df.iloc[:,:-1].values, df.loc[:,'Indicator']

In [185]:
y.shape

(66533,)

In [175]:
# This produces a scatter plot with each of the features, 
# but does not cover each permutation of features
# 10 min runtime, I should rethink this

'''
features = len(df.columns) - 2
for i in range(features):
    fig = pyplot.figure()
    mask  = y == 1
    pyplot.plot(X[mask].iloc[:,i], X[mask].iloc[:, i+1], 'k*', lw=2, ms=10)
    pyplot.plot(X[~mask].iloc[:, i], X[~mask].iloc[:, i+1], 'ko', mfc='y', ms=8, mec='k', mew=1)
   
    # add axes labels
    pyplot.xlabel(df.columns[i])
    pyplot.ylabel(df.columns[i+1])
    pyplot.legend(['Sweep', 'Not a Sweep'])
    pass
'''

"\nfeatures = len(df.columns) - 2\nfor i in range(features):\n    fig = pyplot.figure()\n    mask  = y == 1\n    pyplot.plot(X[mask].iloc[:,i], X[mask].iloc[:, i+1], 'k*', lw=2, ms=10)\n    pyplot.plot(X[~mask].iloc[:, i], X[~mask].iloc[:, i+1], 'ko', mfc='y', ms=8, mec='k', mew=1)\n   \n    # add axes labels\n    pyplot.xlabel(df.columns[i])\n    pyplot.ylabel(df.columns[i+1])\n    pyplot.legend(['Sweep', 'Not a Sweep'])\n    pass\n"

<a id="section1"></a>

#### Sigmoid function

the logistic regression hypothesis is defined as:

$$ h_\theta(x) = g(\theta^T x)$$

where function $g$ is the sigmoid function. The sigmoid function is defined as: 

$$g(z) = \frac{1}{1+e^{-z}}$$.

 For large positive values of `x`, the sigmoid should be close to 1, while for large negative values, the sigmoid should be close to 0. Evaluating `sigmoid(0)` should give you exactly 0.5. 
 
 This is used for classification models (rather than regresssion models).
 
<a id="sigmoid"></a>

In [176]:
def sigmoid(z):
    """
    Compute sigmoid function given the input z.
    
    Parameters
    ----------
    z : array_like
        The input to the sigmoid function. This can be a 1-D vector 
        or a 2-D matrix. 
    
    Returns
    -------
    g : array_like
        The computed sigmoid function. g has the same shape as z, since
        the sigmoid is computed element-wise on z.
        
    Instructions
    ------------
    Compute the sigmoid of each value of z (z can be a matrix, vector or scalar).
    """
    # convert input to a numpy array
    z = np.array(z)
    
    # You need to return the following variables correctly 
    g = np.zeros(z.shape)

    temp = 1 + np.power(np.e,-z)
    g = 1 / temp
    

    return g

In [177]:
# Setup the data matrix appropriately, and add ones for the intercept term
m, n = X.shape
# Add intercept term to X
X = np.concatenate([np.ones((m, 1)), X], axis=1)

Develop a function `costFunction` to return the cost and gradient. Recall that the cost function in logistic regression is

$$ J(\theta) = \frac{1}{m} \sum_{i=1}^{m} \left[ -y^{(i)} \log\left(h_\theta\left( x^{(i)} \right) \right) - \left( 1 - y^{(i)}\right) \log \left( 1 - h_\theta\left( x^{(i)} \right) \right) \right]$$

and the gradient (derivative) of the cost is a vector of the same length as $\theta$ where the $j^{th}$
element (for $j = 0, 1, \cdots , n$) is defined as follows:

$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m \left( h_\theta \left( x^{(i)} \right) - y^{(i)} \right) x_j^{(i)} $$

Note that while this gradient looks identical to the linear regression gradient, the formula is actually different because linear and logistic regression have different definitions of $h_\theta(x)$.


In [178]:
def costFunction(theta, X, y):
    """
    Compute cost and gradient for logistic regression. 
    
    Parameters
    ----------
    theta : array_like
        The parameters for logistic regression. This a vector
        of shape (n+1, ).
    
    X : array_like
        The input dataset of shape (m x n+1) where m is the total number
        of data points and n is the number of features. We assume the 
        intercept has already been added to the input.
    
    y : arra_like
        Labels for the input. This is a vector of shape (m, ).
    
    Returns
    -------
    J : float
        The computed value for the cost function. 
    
    grad : array_like
        A vector of shape (n+1, ) which is the gradient of the cost
        function with respect to theta, at the current values of theta.
        
    Instructions
    ------------
    Compute the cost of a particular choice of theta. You should set J to 
    the cost. Compute the partial derivatives and set grad to the partial
    derivatives of the cost w.r.t. each parameter in theta.
    """
    # Initialize some useful values
    m = y.size  # number of training examples

    # You need to return the following variables correctly 
    J = 0
    grad = np.zeros(theta.shape)

    # ====================== YOUR CODE HERE ======================

    h = sigmoid(np.dot(X,theta))
    temp = -y*np.log(h) - (1-y)*np.log(1-h)
    J = (1/m)*np.sum(temp)
    grad = (1/m)*np.dot(X.T,(h-y))
    # =============================================================
    return J, grad

In [179]:
# Initialize fitting parameters
initial_theta = np.zeros(n+1)

cost, grad = costFunction(initial_theta, X, y)
print(cost)

print('Cost at initial theta (zeros): {:.3f}'.format(cost))
print('Expected cost (approx): 0.693\n')

print('Gradient at initial theta (zeros):')
print('\t[{:.4f}, {:.4f}, {:.4f}]'.format(*grad))
print('Expected gradients (approx):\n\t[-0.1000, -12.0092, -11.2628]\n')

# Compute and display cost and gradient with non-zero theta
test_theta = np.array([-24, 0.2, 0.2])
cost, grad = costFunction(test_theta, X, y)

print('Cost at test theta: {:.3f}'.format(cost))
print('Expected cost (approx): 0.218\n')

print('Gradient at test theta:')
print('\t[{:.3f}, {:.3f}, {:.3f}]'.format(*grad))
print('Expected gradients (approx):\n\t[0.043, 2.566, 2.647]')

0.0
Cost at initial theta (zeros): 0.000
Expected cost (approx): 0.693

Gradient at initial theta (zeros):
	[nan, nan, nan]
Expected gradients (approx):
	[-0.1000, -12.0092, -11.2628]



ValueError: shapes (66533,9) and (3,) not aligned: 9 (dim 1) != 3 (dim 0)

In [None]:
y.shape()