In [1]:
# Standard library
import os
import sys
import warnings

# Third-party
import numpy as np
import pandas as pd
from IPython.display import HTML, display

# Widen the notebook container to 90% of the page
display(HTML("<style>.container { width:90% !important; }</style>"))

# Remove pandas' display limits so every column and every row is shown
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Suppress all Python warnings for the rest of the session
warnings.filterwarnings("ignore")

In [21]:
# Directory containing all raw data, and the directory the trimmed copies are written to
rootDir = '/Users/alia/Documents/Github/DoDContractApp/Raw Data/Service Contracts'
cleanDir = '/Users/alia/Documents/Github/DoDContractApp/Clean Data/Service Contracts'

# The only columns kept in the cleaned files, in the order they are written out
keep_cols = ['contract_award_unique_key',
             'total_obligated_amount',
             'award_base_action_date',
             'awarding_agency_name',
             'awarding_sub_agency_name',
             'awarding_office_name',
             'recipient_name',
             'primary_place_of_performance_state_code',
             'product_or_service_code_description',
             'dod_claimant_program_description',
             'type_of_contract_pricing',
             'award_type',
             'contract_bundling',
             'solicitation_procedures',
             'naics_code',
             'naics_description',
             'last_modified_date']

# Loop through rootDir and every subfolder beneath it
for dirName, subdirList, fileList in os.walk(rootDir):
    print('Found directory: %s' % dirName)

    for fname in fileList:
        # Skip anything that is not a CSV
        if not fname.endswith('.csv'):
            continue

        print('\t%s' % fname)
        filepath = os.path.join(dirName, fname)

        # usecols stops the parser from loading columns outside keep_cols.
        # It ignores the order of the list, so index with keep_cols again to
        # restore the intended column order.
        df = pd.read_csv(filepath, usecols=keep_cols)[keep_cols]

        # NOTE: the output name is just fname, so files with the same name in
        # different subfolders overwrite each other in cleanDir.
        os.makedirs(cleanDir, exist_ok=True)
        df.to_csv(os.path.join(cleanDir, fname), index=False)

Found directory: /Users/alia/Documents/Github/DoDContractApp/Raw Data/Service Contracts
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.csv
	FY2020.csv
	FY2021.csv
	FY2019.csv
	FY2018.csv


In [8]:
# Directory containing the cleaned per-fiscal-year files, and the directory plot data is written to
rootDir = '/Users/alia/Documents/Github/DoDContractApp/Clean Data/Service Contracts'
plotDir = '/Users/alia/Documents/Github/DoDContractApp/Clean Data/Plot Data'

# Columns to summarize; one output CSV is written per column
col_list = ['awarding_sub_agency_name',
            'awarding_office_name',
            'recipient_name',
            'primary_place_of_performance_state_code',
            'product_or_service_code_description',
            'dod_claimant_program_description',
            'type_of_contract_pricing',
            'award_type',
            'contract_bundling',
            'solicitation_procedures',
            'naics_description']


def summarize_top_groups(dat, col, yr, top_n=10):
    """
    Total 'total_obligated_amount' per value of `col` for one fiscal year.

    When there are more than `top_n` groups, only the `top_n` largest are kept and
    every remaining group is summed into a single 'OTHER' row. The state-code column
    is exempt and always keeps all of its groups.

    Input:
        dat   - dataframe containing `col` and 'total_obligated_amount' (pd.DataFrame)
        col   - name of the column to group by (str)
        yr    - fiscal year written into the 'fiscal_year' column (int)
        top_n - number of largest groups kept before collapsing the rest (int)
    Output:
        Dataframe with columns [col, 'total_obligated_amount', 'fiscal_year'].
        Collapsed results are sorted ascending by amount; uncollapsed results are
        sorted descending.
    """
    sub = (dat.groupby([col])[['total_obligated_amount']].sum()
              .reset_index()
              .sort_values('total_obligated_amount', ascending=False)
              .reset_index(drop=True))
    sub['fiscal_year'] = yr

    if len(sub) > top_n and col != 'primary_place_of_performance_state_code':
        # Everything after the first top_n rows goes into "OTHER". Starting the
        # slice at top_n (not top_n + 1) keeps the group ranked top_n + 1 from
        # being dropped from the totals.
        other_sum = sub['total_obligated_amount'].iloc[top_n:].sum()

        # Copy so the new row is added to an independent frame, not a slice view
        sub = sub.iloc[:top_n].copy()
        sub.loc[len(sub)] = ['OTHER', other_sum, yr]
        sub = sub.sort_values('total_obligated_amount', ascending=True)
        sub = sub.reset_index(drop=True)

    return sub


# Loop through rootDir and every subfolder beneath it
for dirName, subdirList, fileList in os.walk(rootDir):
    print('Found directory: %s' % dirName)

    # If there are files in the folder
    if len(fileList) > 0:

        for col in col_list:
            print(col)

            # Collect one summary per fiscal-year file and concatenate once at the end
            frames = []

            for fname in fileList:

                if fname.endswith('.csv'):
                    print('\t%s' % fname)
                    filepath = os.path.join(dirName, fname)

                    # Only the grouping column and the amount are needed for this summary
                    dat = pd.read_csv(filepath, usecols=[col, 'total_obligated_amount'])

                    # File names look like FY2016.csv; characters 2-5 are the year
                    yr = int(fname[2:6])

                    frames.append(summarize_top_groups(dat, col, yr))

            # pd.concat raises on an empty list, so fall back to an empty frame
            df = pd.concat(frames) if frames else pd.DataFrame()

            os.makedirs(plotDir, exist_ok=True)
            df.to_csv(os.path.join(plotDir, f'{col}.csv'), index=False)

Found directory: /Users/alia/Documents/Github/DoDContractApp/Clean Data/Service Contracts
awarding_sub_agency_name
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.csv
	FY2020.csv
	FY2021.csv
	FY2019.csv
	FY2018.csv
awarding_office_name
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.csv
	FY2020.csv
	FY2021.csv
	FY2019.csv
	FY2018.csv
recipient_name
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.csv
	FY2020.csv
	FY2021.csv
	FY2019.csv
	FY2018.csv
primary_place_of_performance_state_code
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.csv
	FY2020.csv
	FY2021.csv
	FY2019.csv
	FY2018.csv
product_or_service_code_description
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.csv
	FY2020.csv
	FY2021.csv
	FY2019.csv
	FY2018.csv
dod_claimant_program_description
	FY2016.csv
	FY2017.csv
	FY2015.csv
	FY2014.csv
	FY2013.csv
	FY2012.csv
	FY2022.

In [3]:
def get_data(fname):
    """
    Load one plot-data CSV for this project directly from its Github repository.

    Input: fname - name of the file, without the '.csv' extension (str)
    Output: contents of that CSV as parsed by pd.read_csv (pd.DataFrame)
    """
    # Raw-file link into the repository's "Clean Data/Plot Data" folder (spaces are written as %20)
    return pd.read_csv(
        f'https://github.com/abdelkaderalia/DoDContractApp/raw/main/Clean%20Data/Plot%20Data/{fname}.csv'
    )

In [4]:
# Fetch the agency comparison table from the repository and display it
df_agencies = get_data(fname='compare_agencies')
df_agencies

Unnamed: 0,agency,fiscal_year,spending
0,Department of Defense,2012,151413824648
1,Department of Defense,2013,132010946125
2,Department of Defense,2014,128651011788
3,Department of Defense,2015,120425333223
4,Department of Defense,2016,125523397057
5,Department of Defense,2017,132188208908
6,Department of Defense,2018,148701831637
7,Department of Defense,2019,160019506504
8,Department of Defense,2020,172578596498
9,Department of Defense,2021,160606305545
