In [None]:
import csv
import datetime
import json
import logging
import os
import socket
from decimal import *
from glob import glob

from bln.client import Client
from tqdm import tqdm

In [None]:
# Directory holding the input JSON files and the generated CSVs; send_files() uploads from it.
# Paths are built by plain string concatenation, so the trailing slash is required.
datadir = "data/"

In [None]:
# Reason codes mapped to their human-readable labels.
# Each code selects the input files whose names end in "_<code>.json", and both the
# code and its label are written to the "reasoncode" column of the output.
reasons = {
    "E": "Terminate for default",
    "F": "Terminate for convenience",
    "K": "Close out",
    "N": "Legal contract cancellation",
    "X": "Terminate for cause"
}

In [None]:
from importlib import reload

# Reload the logging module before configuring it, then set up the root logger
# to emit everything from DEBUG upward with a time-of-day prefix.
reload(logging)
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    datefmt='%I:%M:%S',
    level=logging.DEBUG,
)
logger = logging.getLogger()
logger.setLevel('DEBUG')

In [None]:
# Depending on the value of contract_type, one of these prefixes is attached to many field names.
# Stripping the prefix lets the per-type groups of fields merge into one shared set of columns.

prefixes = {
    "IDV": "content__IDV__",
    "AWARD": "content__award__",
    "OTHERTRANSACTIONAWARD": "content__OtherTransactionAward__contractDetail__",
    "OTHERTRANSACTIONIDV": "content__OtherTransactionIDV__contractDetail__",
}

In [None]:
# Field names to keep, one per line, written without the contract-type prefix.
# When this text is parsed into `shortwanted`, anything after a "#" is ignored and
# entries of three characters or fewer are dropped.
# NOTE(review): the three lines starting with "ref_idv_" contain "': '" and look like
# pasted dictionary fragments; as written they presumably never match a field name --
# confirm the intended names.
roughshortwanted = """
title
contract_type
# link__rel
# link__type
link__href
modified
# content
# content__type
relevantContractDates__signedDate
relevantContractDates__effectiveDate
relevantContractDates__currentCompletionDate
relevantContractDates__ultimateCompletionDate
relevantContractDates__solicitationDate


dollarValues__obligatedAmount
dollarValues__baseAndAllOptionsValue
totalDollarValues__totalObligatedAmount
totalDollarValues__totalBaseAndAllOptionsValue
# placeOfPerformance__principalPlaceOfPerformance
awardID__awardContractID__agencyID
awardID__awardContractID__agencyID__name
awardID__awardContractID__PIID
awardID__awardContractID__modNumber
awardID__awardContractID__transactionNumber
ref_idv_agency': 'content__award__awardID__referencedIDVID__agencyID
ref_idv_procurement_id': 'content__award__awardID__referencedIDVID__PIID
ref_idv_modification_num': 'content__award__awardID__referencedIDVID__modNumber

purchaserInformation__contractingOfficeID
purchaserInformation__contractingOfficeID__name
placeOfPerformance__principalPlaceOfPerformance__stateCode
placeOfPerformance__principalPlaceOfPerformance__stateCode__name
placeOfPerformance__principalPlaceOfPerformance__countryCode
placeOfPerformance__principalPlaceOfPerformance__countryCode__name
placeOfPerformance__principalPlaceOfPerformance__locationCode
placeOfPerformance__placeOfPerformanceZIPCode
placeOfPerformance__placeOfPerformanceZIPCode__county
placeOfPerformance__placeOfPerformanceZIPCode__city
placeOfPerformance__placeOfPerformanceCongressionalDistrict
purchaserInformation__contractingOfficeAgencyID
purchaserInformation__contractingOfficeAgencyID__name
purchaserInformation__contractingOfficeAgencyID__departmentID
purchaserInformation__contractingOfficeAgencyID__departmentName
purchaserInformation__contractingOfficeID
purchaserInformation__contractingOfficeID__name
contractData__contractActionType
contractData__contractActionType__description
contractData__reasonForModification
contractData__reasonForModification__description
contractData__descriptionOfContractRequirement
competition__extentCompeted
competition__extentCompeted__description
# transactionInformation
transactionInformation__createdBy
transactionInformation__createdDate
transactionInformation__lastModifiedBy
transactionInformation__lastModifiedDate
transactionInformation__status
transactionInformation__status__description
transactionInformation__closedStatus
vendor__vendorHeader__vendorDoingAsBusinessName
# placeOfPerformance__principalPlaceOfPerformance
contractData__solicitationID
# vendor__vendorSiteDetails__vendorLocation
# vendor__vendorSiteDetails__entityIdentifiers
# vendor__vendorSiteDetails__entityIdentifiers__vendorUEIInformation
# vendor__vendorSiteDetails__entityIdentifiers__vendorUEIInformation__UEI
vendor__vendorSiteDetails__entityIdentifiers__cageCode
vendor__vendorSiteDetails__entityIdentifiers__vendorUEIInformation__UEILegalBusinessName
vendor__vendorSiteDetails__entityIdentifiers__vendorUEIInformation__ultimateParentUEI
vendor__vendorSiteDetails__entityIdentifiers__vendorUEIInformation__ultimateParentUEIName
vendor__vendorSiteDetails__vendorLocation__streetAddress
vendor__vendorSiteDetails__vendorLocation__city
vendor__vendorSiteDetails__vendorLocation__state
vendor__vendorSiteDetails__vendorLocation__state__name
vendor__vendorSiteDetails__vendorLocation__ZIPCode
vendor__vendorSiteDetails__vendorLocation__ZIPCode__city
vendor__vendorSiteDetails__vendorLocation__countryCode
vendor__vendorSiteDetails__vendorLocation__countryCode__name
vendor__vendorSiteDetails__vendorLocation__phoneNo
vendor__vendorSiteDetails__vendorLocation__faxNo
vendor__vendorSiteDetails__vendorLocation__congressionalDistrictCode
vendor__vendorSiteDetails__vendorLocation__entityDataSource

productOrServiceInformation__productOrServiceCode
productOrServiceInformation__productOrServiceCode__description
productOrServiceInformation__principalNAICSCode
productOrServiceInformation__principalNAICSCode__description

# vendor__vendorSiteDetails
# vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isAlaskanNativeOwnedCorporationOrFirm
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isAmericanIndianOwned
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isIndianTribe
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isNativeHawaiianOwnedOrganizationOrFirm
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isTriballyOwnedFirm
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isSmallBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isVeteranOwned
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isServiceRelatedDisabledVeteranOwnedBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isWomenOwned
# vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isMinorityOwned
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isSubContinentAsianAmericanOwnedBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isAsianPacificAmericanOwnedBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isBlackAmericanOwnedBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isHispanicAmericanOwnedBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isNativeAmericanOwnedBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__isOtherMinorityOwned
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isVerySmallBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isWomenOwnedSmallBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isEconomicallyDisadvantagedWomenOwnedSmallBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isJointVentureWomenOwnedSmallBusiness
vendor__vendorSiteDetails__vendorSocioEconomicIndicators__isJointVentureEconomicallyDisadvantagedWomenOwnedSmallBusiness
# vendor__vendorSiteDetails__vendorBusinessTypes
vendor__vendorSiteDetails__vendorBusinessTypes__isCommunityDevelopedCorporationOwnedFirm
vendor__vendorSiteDetails__vendorBusinessTypes__isLaborSurplusAreaFirm
# vendor__vendorSiteDetails__vendorBusinessTypes__federalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__federalGovernment__isFederalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__federalGovernment__isFederallyFundedResearchAndDevelopmentCorp
vendor__vendorSiteDetails__vendorBusinessTypes__federalGovernment__isFederalGovernmentAgency
vendor__vendorSiteDetails__vendorBusinessTypes__isStateGovernment
# vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isCityLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isCountyLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isInterMunicipalLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isLocalGovernmentOwned
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isMunicipalityLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isSchoolDistrictLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__isTownshipLocalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__isTribalGovernment
vendor__vendorSiteDetails__vendorBusinessTypes__isForeignGovernment
# vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isCorporateEntityNotTaxExempt
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isCorporateEntityTaxExempt
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isPartnershipOrLimitedLiabilityPartnership
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isSolePropreitorship
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isSmallAgriculturalCooperative
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isInternationalOrganization
vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__isUSGovernmentEntity
# vendor__vendorSiteDetails__vendorLineOfBusiness
vendor__vendorSiteDetails__vendorLineOfBusiness__isCommunityDevelopmentCorporation
vendor__vendorSiteDetails__vendorLineOfBusiness__isDomesticShelter
vendor__vendorSiteDetails__vendorLineOfBusiness__isEducationalInstitution
vendor__vendorSiteDetails__vendorLineOfBusiness__isFoundation
vendor__vendorSiteDetails__vendorLineOfBusiness__isHospital
vendor__vendorSiteDetails__vendorLineOfBusiness__isManufacturerOfGoods
vendor__vendorSiteDetails__vendorLineOfBusiness__isVeterinaryHospital
vendor__vendorSiteDetails__vendorLineOfBusiness__isHispanicServicingInstitution
# vendor__vendorSiteDetails__vendorRelationshipWithFederalGovernment
vendor__vendorSiteDetails__vendorRelationshipWithFederalGovernment__receivesContracts
vendor__vendorSiteDetails__vendorRelationshipWithFederalGovernment__receivesGrants
vendor__vendorSiteDetails__vendorRelationshipWithFederalGovernment__receivesContractsAndGrants
# vendor__vendorSiteDetails__typeOfGovernmentEntity
vendor__vendorSiteDetails__typeOfGovernmentEntity__isAirportAuthority
vendor__vendorSiteDetails__typeOfGovernmentEntity__isCouncilOfGovernments
vendor__vendorSiteDetails__typeOfGovernmentEntity__isHousingAuthoritiesPublicOrTribal
vendor__vendorSiteDetails__typeOfGovernmentEntity__isInterstateEntity
vendor__vendorSiteDetails__typeOfGovernmentEntity__isPlanningCommission
vendor__vendorSiteDetails__typeOfGovernmentEntity__isPortAuthority
vendor__vendorSiteDetails__typeOfGovernmentEntity__isTransitAuthority
# vendor__vendorSiteDetails__vendorOrganizationFactors
vendor__vendorSiteDetails__vendorOrganizationFactors__isSubchapterSCorporation
vendor__vendorSiteDetails__vendorOrganizationFactors__isLimitedLiabilityCorporation
vendor__vendorSiteDetails__vendorOrganizationFactors__isForeignOwnedAndLocated
# vendor__vendorSiteDetails__vendorOrganizationFactors__profitStructure
vendor__vendorSiteDetails__vendorOrganizationFactors__profitStructure__isForProfitOrganization
vendor__vendorSiteDetails__vendorOrganizationFactors__profitStructure__isNonprofitOrganization
vendor__vendorSiteDetails__vendorOrganizationFactors__profitStructure__isOtherNotForProfitOrganization
vendor__vendorSiteDetails__vendorOrganizationFactors__isShelteredWorkshop
# vendor__vendorSiteDetails__typeOfEducationalEntity
vendor__vendorSiteDetails__typeOfEducationalEntity__is1862LandGrantCollege
vendor__vendorSiteDetails__typeOfEducationalEntity__is1890LandGrantCollege
vendor__vendorSiteDetails__typeOfEducationalEntity__is1994LandGrantCollege
vendor__vendorSiteDetails__typeOfEducationalEntity__isHistoricallyBlackCollegeOrUniversity
vendor__vendorSiteDetails__typeOfEducationalEntity__isMinorityInstitution
vendor__vendorSiteDetails__typeOfEducationalEntity__isPrivateUniversityOrCollege
vendor__vendorSiteDetails__typeOfEducationalEntity__isSchoolOfForestry
vendor__vendorSiteDetails__typeOfEducationalEntity__isStateControlledInstitutionofHigherLearning
vendor__vendorSiteDetails__typeOfEducationalEntity__isTribalCollege
vendor__vendorSiteDetails__typeOfEducationalEntity__isVeterinaryCollege
vendor__vendorSiteDetails__typeOfEducationalEntity__isAlaskanNativeServicingInstitution
vendor__vendorSiteDetails__typeOfEducationalEntity__isNativeHawaiianServicingInstitution
# vendor__vendorSiteDetails__vendorCertifications
vendor__vendorSiteDetails__vendorCertifications__isDOTCertifiedDisadvantagedBusinessEnterprise
vendor__vendorSiteDetails__vendorCertifications__isSelfCertifiedSmallDisadvantagedBusiness
vendor__vendorSiteDetails__vendorCertifications__isSBACertifiedSmallDisadvantagedBusiness
vendor__vendorSiteDetails__vendorCertifications__isSBACertified8AProgramParticipant
vendor__vendorSiteDetails__vendorCertifications__isSelfCertifiedHUBZoneJointVenture
vendor__vendorSiteDetails__vendorCertifications__isSBACertifiedHUBZone
vendor__vendorSiteDetails__vendorCertifications__isSBACertified8AJointVenture

"""
# Strip "#" comments and surrounding whitespace from every line of the rough list,
# then keep only what is left when it is longer than three characters
# (this discards blank lines and fully commented-out rows).
uncommented_lines = (
    rawline.split("#")[0].strip() for rawline in roughshortwanted.splitlines()
)
shortwanted = [candidate for candidate in uncommented_lines if len(candidate) > 3]

In [None]:
def deeper_field_clean(text):
    """Return `text` with every known junk sub-prefix removed.

    These differ from the contract-type prefixes: they don't create distinct
    columns that need to be consolidated, they are just junk text that, for
    our purposes, unnecessarily lengthens column names.

    Each substring is removed wherever it occurs in `text`, in the order
    listed, so a more specific entry has to come before any shorter entry it
    starts with.
    """
    specific_cuts = (
        "vendor__vendorSiteDetails__entityIdentifiers__vendorUEIInformation__",
        "vendor__vendorSiteDetails__typeOfEducationalEntity__",
        "vendor__vendorSiteDetails__vendorCertifications__",
        "vendor__vendorSiteDetails__vendorOrganizationFactors__",
        "vendor__vendorSiteDetails__typeOfGovernmentEntity__",
        "vendor__vendorSiteDetails__vendorLineOfBusiness__",
        "vendor__vendorSiteDetails__vendorBusinessTypes__businessOrOrganizationType__",
        "vendor__vendorSiteDetails__vendorBusinessTypes__localGovernment__",
        "vendor__vendorSiteDetails__vendorBusinessTypes__federalGovernment__",
        "vendor__vendorSiteDetails__vendorBusinessTypes__",
        "vendor__vendorSiteDetails__vendorSocioEconomicIndicators__minorityOwned__",
        "content__award__vendor__vendorHeader__",
        "placeOfPerformance__",
    )
    # Deeper cuts: shorter, more general fragments applied after the specific ones
    deeper_cuts = (
        "vendor__vendorSiteDetails__vendorSocioEconomicIndicators__",
        "vendor__vendorSiteDetails__",
        "awardID__",
        "purchaserInformation__",
        "vendor__vendorHeader__",
    )
    cleaned = text
    for junk in specific_cuts + deeper_cuts:
        cleaned = cleaned.replace(junk, "")
    return cleaned

In [None]:
def in_production():
    """Report whether this run counts as a production run.

    Returns:
        True when the GITHUB_RUN_ID environment variable is set, or when this
        machine's hostname is one of the known production hosts; False otherwise.
    """
    production_hosts = ("mikelight", "racknerd-26f61a")
    # The hostname is only looked up when the environment variable is absent.
    return "GITHUB_RUN_ID" in os.environ or socket.gethostname() in production_hosts


In [None]:
def send_files():
    """Upload the contents of the data directory to the Big Local News project.

    A local file is uploaded when the project has no file of that name, or
    when its name ends in ".csv" (CSVs are always re-sent). The API token is
    read from the BLN_API_TOKEN environment variable.
    """
    # Start by seeing what we have locally, as names relative to the data directory
    local_names = [
        found.replace("\\", "/").replace(datadir, "")
        for found in glob(datadir + "*")
    ]

    token = os.environ["BLN_API_TOKEN"]
    bln = Client(token)
    project = bln.get_project_by_name("Federal contract cancellations")
    project_id = project["id"]

    # Everything already stored in the project, by name
    archived_files = {
        remote["name"]: remote["updatedAt"] for remote in project["files"]
    }

    files_to_send = [
        name
        for name in local_names
        if name not in archived_files or name.endswith(".csv")
    ]

    logger.debug(f"{len(archived_files):,} archived files found.")
    logger.debug(f"{len(files_to_send):,} files to send to Big Local News.")
    if files_to_send:
        for outgoing in tqdm(files_to_send):
            bln.upload_file(project_id, datadir + outgoing)
    return

In [None]:
def get_zip_lookup():
    """Build the ZIP/ZCTA lookup table, if it has not been built yet.

    Reads zip-lookup.csv from the current working directory and stores its
    rows in the module-level dictionary `ziplookup`, keyed by each row's
    `zip_code` value. Once the table exists, later calls return immediately.

    Arguments:
        None
    Returns:
        An empty tuple. The table itself is published as the global `ziplookup`.
    Raises:
        OSError (for example FileNotFoundError) if zip-lookup.csv cannot be opened.
        KeyError if a row has no `zip_code` column.
    Uses:
        zip-lookup.csv from mable-raw.csv via parse-zips.ipynb
    """
    global ziplookup
    if 'ziplookup' in globals():
        return ()        # ZIP lookup table already initialized

    logger.debug("ZIP lookup table being initialized")
    table = {}
    with open("zip-lookup.csv", "r", encoding="utf-8") as infile:
        for row in csv.DictReader(infile):
            table[row['zip_code']] = row
    # Publish only after a complete load: if reading fails part-way, no empty or
    # half-filled table is left behind for later calls to mistake for a finished one.
    ziplookup = table
    return ()

In [None]:
def add_county_details(row: dict):
    """Fill in the geographic columns of a record from its place-of-performance ZIP code.

    Nothing is changed when the row already has a `geo_fips` value other than
    "Unknown". Otherwise the first five characters of
    `placeOfPerformanceZIPCode` are looked up in the ZIP table; a missing ZIP
    code or an unknown ZIP sets all three geographic columns to "Unknown".

    Arguments:
        row, a dictionary
    Returns:
        row, still a dictionary (the same object, modified in place)
    """
    get_zip_lookup()            # Make sure the table has been read in

    # Already resolved: nothing to do
    if "geo_fips" in row and row['geo_fips'] != "Unknown":
        return row

    full_zip = row["placeOfPerformanceZIPCode"]
    # Lose the extension for 9-digit ZIP codes before looking up
    if full_zip and full_zip[0:5] in ziplookup:
        match = ziplookup[full_zip[0:5]]
        row['geo_fips'] = match['zip_fips']
        row['geo_county_name'] = match['zip_county_name']
        row['geo_zip_name'] = match['zip_place_name']
    else:
        # No ZIP code at all, or one we can't look up
        for column in ("geo_fips", "geo_county_name", "geo_zip_name"):
            row[column] = "Unknown"
    return row



In [None]:
def dedupe_by_contract_id(records):
    """Keep only the most recently modified record for each contract action.

    Records are grouped by contract ID and modification number. Each part of
    the key is read from the first of its accepted column names that holds a
    value, so both `contract_id` / `modification_number` and the
    `awardContractID__PIID` / `awardContractID__modNumber` columns are
    recognized. A record with no identifying value at all is never treated as
    a duplicate and is always kept.

    Arguments:
        records, an iterable of dictionaries; each needs a `modified` value
        formatted as "YYYY-MM-DD HH:MM:SS"
    Returns:
        a list holding one record per key, in order of first appearance; on a
        tie in `modified`, the record seen first wins
    Raises:
        KeyError if a record has no `modified` entry; ValueError or TypeError
        if that entry is not a string in the expected format.
    """
    from datetime import datetime

    # Accepted column names for each part of the key, in order of preference.
    key_aliases = (
        ("contract_id", "awardContractID__PIID"),
        ("modification_number", "awardContractID__modNumber"),
    )

    def first_present(record, names):
        """Return the first non-None value found under any of `names`."""
        for name in names:
            value = record.get(name)
            if value is not None:
                return value
        return None

    # key -> (parsed modified date, record); dicts keep first-insertion order
    newest = {}
    for position, record in enumerate(records):
        key = tuple(first_present(record, names) for names in key_aliases)
        if all(part is None for part in key):
            # Nothing identifies this record: give it a key of its own so that
            # unrelated records are not collapsed into a single row.
            key = key + (position,)

        # Parse the date for comparison; kept beside the record rather than
        # written into it, so the caller's dictionaries are left untouched.
        stamp = datetime.strptime(record['modified'], "%Y-%m-%d %H:%M:%S")

        held = newest.get(key)
        if held is None or stamp > held[0]:
            newest[key] = (stamp, record)

    return [record for _, record in newest.values()]

In [None]:
# Now, composite the JSONs into CSVs.

# Output names of the harvested fields: the wanted names with junk sub-prefixes removed.
extraheaders = [deeper_field_clean(item) for item in shortwanted]

# Summary columns that lead every row; most are filled from harvested fields further down.
leading_columns = [
    "state",
    "county",
    "filedate",
    "cancelled",
    "change",
    "business",
    "agency",
    "url",
    "contract_requirement",
    "general_service_description",
    "reasoncode",
    "filename",
]
# Columns that add_county_details() appends to each row.
geo_columns = ["geo_fips", "geo_county_name", "geo_zip_name"]
# Full header, fixed up front so it never depends on which record happened to be
# processed last. dict.fromkeys drops repeated names while keeping first-seen order.
output_columns = list(dict.fromkeys(leading_columns + extraheaders + geo_columns))
# Set membership is checked once per field of every entry.
wanted_fields = set(shortwanted)

for reason in reasons:
    locallist = []
    rawfilenames = sorted(glob(datadir + f"*_{reason}.json"))
    for rawfilename in tqdm(rawfilenames):
        basefilename = rawfilename.replace("\\", "/").replace(datadir, "")
        filedate = basefilename.split("contracts-")[-1].split("_")[0]
        with open(rawfilename, "r", encoding="utf-8") as infile:
            rawjson = json.load(infile)
        for entry in rawjson:
            try:
                prefix = prefixes[entry['contract_type']]
            except (KeyError, TypeError):
                # Missing or unrecognized contract type: there is no prefix to strip,
                # so log the entry and skip it rather than reuse the previous entry's prefix.
                logger.error(entry)
                continue

            # Every column starts out empty. The geo columns are deliberately left
            # out so that add_county_details() sees them as still needing a lookup.
            localdict = dict.fromkeys(leading_columns + extraheaders)
            localdict["filedate"] = filedate
            localdict["reasoncode"] = f"{reason}: {reasons[reason]}"
            localdict["filename"] = basefilename

            for field in entry:
                fieldshort = field.replace(prefix, "")
                if fieldshort in wanted_fields:
                    value = entry[field]
                    # Only text can be stripped; anything else is stored unchanged
                    localdict[deeper_field_clean(fieldshort)] = (
                        value.strip() if isinstance(value, str) else value
                    )
            localdict = add_county_details(localdict)

            # Now we need to fill in some blanks at the beginning
            localdict["state"] = localdict["principalPlaceOfPerformance__stateCode"]
            localdict["county"] = localdict["geo_county_name"]
            localdict["cancelled"] = localdict["modified"]
            localdict["change"] = localdict["dollarValues__obligatedAmount"]
            localdict["business"] = localdict["UEILegalBusinessName"]
            localdict["agency"] = localdict["awardContractID__agencyID__name"]
            localdict["url"] = localdict["link__href"]
            localdict["contract_requirement"] = localdict["contractData__descriptionOfContractRequirement"]
            localdict["general_service_description"] = localdict["productOrServiceInformation__productOrServiceCode__description"]
            locallist.append(localdict)

    # Now we need to deduplicate by contract ID
    logger.debug(f"Before deduping: {len(locallist):,} records")
    locallist = dedupe_by_contract_id(locallist)
    logger.debug(f"After deduping: {len(locallist):,} records")

    # Let's get some basic sorting in here: newest file date first
    locallist = sorted(locallist, key=lambda x: (x['filedate']), reverse=True)

    # A reason with no records still gets a header-only CSV
    with open(f"{datadir}collected_{reason}.csv", "w", encoding="utf-8", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=output_columns)
        writer.writeheader()
        writer.writerows(locallist)

    locallist = None
                

In [None]:
# Upload only when this is a production run. in_production has to be called here:
# the bare function object is always truthy, which would make the upload unconditional.
if in_production():
    logger.debug("We're in production")
    send_files()
