#### Daily Data Pull

In [1]:
# imports
import pandas as pd
import numpy as np
import requests
import time
import datetime
from nltk.tokenize import RegexpTokenizer

import os

In [2]:
# set display options 
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', None)

In [3]:
# define function to return minimum postedDate as string
def get_min_post_date(filename):
    # read in file of downloaded contract opportunities
    df = pd.read_csv(filename)
    # get min posted date
    min_date = pd.to_datetime(df['postedDate']).min()
    return min_date.strftime('%m/%d/%Y')

In [4]:
# define function to return maximum postedDate as string
def get_max_post_date(filename):
    # read in file of downloaded contract opportunities
    df = pd.read_csv(filename)
    # get min posted date
    max_date = pd.to_datetime(df['postedDate']).max()
    return max_date.strftime('%m/%d/%Y')

In [5]:
# define function to return today's date as string
def today():
    now = datetime.datetime.now()
    return now.strftime('%m/%d/%Y')

In [6]:
# pull contract information from api.sam.gov

# import api key from environment 
api_key = os.environ.get('beta_sam_gov_key')

# set base url
url = 'https://api.sam.gov/prod/opportunities/v1/search'

# create empty list to store results
result = []
# initialize counter
count = 0
# downloaded contract opportunities - file name
# file = './data/combined.csv'

# set postedTo date to today's date by calling today() function
postedTo = today()

# set posted from date
postedFrom = get_max_post_date('./data/combined.csv')

# for loop to pull contracts
for i in range(2):
    count += 1
    
    # do a get request
    req = requests.get(url,
                      params={
                          'api_key': api_key,
                          'postedFrom': postedFrom,
                          'postedTo': postedTo,
                          'limit': 1000,
                          'offset': (count-1) * 1000
                      })
    
    # add response to result list
    result.append(req)
    
    now = datetime.datetime.now()
    print('Time:', now.strftime("%Y-%m-%d %H:%M:%S"))
    time.sleep(5)
    
    
    
# source for datetime - https://www.w3resource.com/python-exercises/python-basic-exercise-3.php

Time: 2020-06-28 20:09:49
Time: 2020-06-28 20:10:13


In [7]:
# unpack list of json objects from response data
ops = []
for item in result:
    print(item.headers)
    ops.append(item.json())

{'Age': '23', 'Content-Type': 'application/hal+json', 'Date': 'Mon, 29 Jun 2020 00:09:49 GMT', 'Server': 'openresty', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers', 'Via': 'http/1.1 api-umbrella (ApacheTrafficServer [cMsSf ])', 'X-Cache': 'MISS', 'X-Forwarded-For': '74.96.156.35, 10.177.16.72, 10.177.52.86, 10.177.52.86', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Set-Cookie': 'citrix_ns_id=tJc98jUBMWiVzNAePWNcGjVXmt40001; Domain=.sam.gov; Path=/; Secure; HttpOnly', 'Cache-Control': 'private', 'Content-Encoding': 'gzip', 'Transfer-Encoding': 'chunked'}
{'Age': '19', 'Content-Type': 'application/hal+json', 'Date': 'Mon, 29 Jun 2020 00:10:13 GMT', 'Server': 'openresty', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers', 'Via': 'http/1.1 api-umbrella (ApacheTrafficServer [cMsSf ])', 'X-Cache': 'MISS', 'X-Forwarded-For': '74.96.156.35, 10.177.16.72, 10.177.54.205, 

In [8]:
# parse json objects
ls_data = []
for i in range(len(ops)):
    print(ops[i].keys())
    df = pd.DataFrame(ops[i]['opportunitiesData'])
    ls_data.append(df)
    data = pd.concat(ls_data)

dict_keys(['totalRecords', 'limit', 'offset', 'opportunitiesData', 'links'])
dict_keys(['totalRecords', 'limit', 'offset', 'opportunitiesData', 'links'])


In [9]:
# check how many new data points were downloaded
data.shape

(1576, 27)

In [10]:
data.tail(10)

Unnamed: 0,noticeId,title,solicitationNumber,department,subTier,office,postedDate,type,baseType,archiveType,archiveDate,typeOfSetAsideDescription,typeOfSetAside,responseDeadLine,naicsCode,classificationCode,active,award,pointOfContact,description,organizationType,officeAddress,placeOfPerformance,additionalInfoLink,uiLink,links,resourceLinks
566,028cf5870db945b99d937e4ded2a9d0a,J045--Chilled Water Coil replacement,36C24620Q0495_3,"VETERANS AFFAIRS, DEPARTMENT OF","VETERANS AFFAIRS, DEPARTMENT OF",246-NETWORK CONTRACTING OFFICE 6 (36C246),2020-06-26,Presolicitation,Sources Sought,autocustom,2020-08-26,Service-Disabled Veteran-Owned Small Business (SDVOSB) Set-Aside (FAR 19.14),SDVOSBC,2020-06-26T15:00:00-04:00,238220,J045,Yes,,"[{'fax': None, 'type': 'primary', 'email': 'breon.campbell@va.gov', 'phone': None, 'title': None, 'fullName': 'Breon Campbell breon.campbell@va.gov'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=028cf5870db945b99d937e4ded2a9d0a,OFFICE,"{'zipcode': '23667', 'city': 'HAMPTON', 'countryCode': 'USA', 'state': 'VA'}","{'city': {'code': '67000', 'name': 'Richmond'}, 'state': {'code': 'VA', 'name': 'Virginia'}, 'zip': '23249', 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/028cf5870db945b99d937e4ded2a9d0a/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=028cf5870db945b99d937e4ded2a9d0a&limit=1'}]",[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/9bbde7388f5746cfb066c777f5607109/download?api_key=null&token=]
567,0277e56e75db49a89ac6b70cde4ec7e8,Internal Combustion Lift Truck - Forklifts,W912L920R0024,DEPT OF DEFENSE,DEPT OF THE ARMY,W7M7 USPFO ACTIVITY IN ARNG,2020-06-26,Combined Synopsis/Solicitation,Combined Synopsis/Solicitation,auto15,2020-07-25,Total Small Business Set-Aside (FAR 19.5),SBA,2020-07-10T10:00:00-04:00,333924,2420,Yes,,"[{'fax': '', 'type': 'primary', 'email': 'brenda.j.simmons6.civ@mail.mil', 'phone': '3172473154', 'title': None, 'fullName': 'Brenda J. Simmons'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=0277e56e75db49a89ac6b70cde4ec7e8,OFFICE,"{'zipcode': '46241-4839', 'city': 'INDIANAPOLIS', 'countryCode': 'USA', 'state': 'IN'}","{'city': {'code': '36000', 'name': 'Indianapolis'}, 'state': {'code': 'IN', 'name': 'Indiana'}, 'zip': '46241', 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/0277e56e75db49a89ac6b70cde4ec7e8/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=0277e56e75db49a89ac6b70cde4ec7e8&limit=1'}]","[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/0cc756763a8c4791912ab16e0c8ac64a/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/a219ba96b57d473eafcebd1984bbd6ed/download?api_key=null&token=]"
568,027229651ce3463fba2a625c5e27da4f,B--North American Bat Monitoring Program (NABat) Surv,140P6220Q0015,"INTERIOR, DEPARTMENT OF THE",NATIONAL PARK SERVICE,MWR MISSOURI MABO(62000),2020-06-26,Solicitation,Solicitation,auto15,2020-07-16,Total Small Business Set-Aside (FAR 19.5),SBA,2020-07-01,541620,B,Yes,{'awardee': {'location': {}}},"[{'fax': None, 'type': 'primary', 'email': 'Kathryn_Logsdon@nps.gov', 'phone': None, 'title': None, 'fullName': 'Logsdon, Kathryn'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=027229651ce3463fba2a625c5e27da4f,OFFICE,"{'zipcode': '63102', 'city': 'SAINT LOUIS', 'countryCode': 'USA', 'state': 'MO'}","{'streetAddress': 'Ozark National Scenic Riverways, NABat Grid Cell Numbers , 5999, 10095, 1775, Van Buren, MO', 'zip': '63965-9603', 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/027229651ce3463fba2a625c5e27da4f/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=027229651ce3463fba2a625c5e27da4f&limit=1'}]","[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/7281abae448743f19712c3146d9af1fe/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/247abb2b10b54e5485f51d20fec15e08/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/f6baf5b1f5174692912d9bb82010ed60/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/815ef1f0b0c14deead8dace7c64484d4/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/fb23be4314264dfcad8dc3755bc28103/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/dc7aaa4f924742cb8209964fbfb763b8/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/401084b152c14bff80a3c9dc4adf0eb0/download?api_key=null&token=]"
569,024a12fcc9df439b81c44e839a33d264,Z--Roof Repairs SAN FRANCISCO MARITIME NATIONAL HISTO,140P8620Q0014,"INTERIOR, DEPARTMENT OF THE",NATIONAL PARK SERVICE,PWR GOGA(86000),2020-06-26,Award Notice,Solicitation,autocustom,2021-06-24,,,,238160,Z,Yes,"{'date': '2020-06-26', 'number': '140P8620P0072', 'amount': '109900', 'awardee': {'name': 'CRAWFORD R W LLC', 'location': {}, 'duns': '032447405', 'cageCode': '8DKP4'}}","[{'fax': None, 'type': 'primary', 'email': 'Gary_Kramer@nps.gov', 'phone': None, 'title': None, 'fullName': 'Kramer, Gary'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=024a12fcc9df439b81c44e839a33d264,OFFICE,"{'zipcode': '94123', 'city': 'SAN FRANCISCO', 'countryCode': 'USA', 'state': 'CA'}",{},,https://beta.sam.gov/opp/024a12fcc9df439b81c44e839a33d264/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=024a12fcc9df439b81c44e839a33d264&limit=1'}]",
570,01cfe01a9fe84379b57af8a8a58b8745,Card Box Assembly- Cold Gas (Solicitation),N0025320R0021,DEPT OF DEFENSE,DEPT OF THE NAVY,NAVAL UNDERSEA WARFARE CENTER,2020-06-26,Presolicitation,Presolicitation,auto15,2020-08-11,Total Small Business Set-Aside (FAR 19.5),SBA,2020-07-27T12:00:00-07:00,334418,5998,Yes,,"[{'fax': '', 'type': 'primary', 'email': 'nishanth.sukasi@navy.mil', 'phone': '3603962168', 'title': None, 'fullName': 'Nishanth Sukasi'}, {'fax': '', 'type': 'secondary', 'email': 'anita.moosmiller@navy.mil', 'phone': '3603158501', 'title': None, 'fullName': 'Anita Moosmiller'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=01cfe01a9fe84379b57af8a8a58b8745,OFFICE,"{'zipcode': '98345-7610', 'city': 'KEYPORT', 'countryCode': 'USA', 'state': 'WA'}","{'city': {'code': '35625', 'name': 'Keyport'}, 'state': {'code': 'WA', 'name': 'Washington'}, 'zip': '98345', 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/01cfe01a9fe84379b57af8a8a58b8745/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=01cfe01a9fe84379b57af8a8a58b8745&limit=1'}]","[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/522ca183c93647628b8f0d474b1b8c81/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/c43790737a4c4325a28f0351b6e77dba/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/04d97a667098462788aba57f62a47b59/download?api_key=null&token=]"
571,0139f71ab6a640ffb9cb63c43f174f15,"53--COVER,ACCESS",SPE5E820Q0282,DEPT OF DEFENSE,DEFENSE LOGISTICS AGENCY (DLA),DLA TROOP SUPPORT,2020-06-26,Combined Synopsis/Solicitation,Combined Synopsis/Solicitation,autocustom,2020-07-30,,,2020-06-30,332722,53,Yes,{'awardee': {'location': {}}},"[{'fax': None, 'type': 'primary', 'email': 'DibbsBSM@dla.mil', 'phone': None, 'title': None, 'fullName': 'Questions regarding this solicitation should be emailed to the buyer listed in block 5 of the solicitation document which can be found under the Additional Information link. If the Additional Information link does not work, please go to https://www.dibbs.'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=0139f71ab6a640ffb9cb63c43f174f15,OFFICE,"{'zipcode': '19111', 'city': 'PHILADELPHIA', 'countryCode': 'USA', 'state': 'PA'}",{},,https://beta.sam.gov/opp/0139f71ab6a640ffb9cb63c43f174f15/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=0139f71ab6a640ffb9cb63c43f174f15&limit=1'}]",
572,009937033cc44619a17974cb83c5a541,"48--VALVE,REGULATING,FL",SPE7M320U1036,DEPT OF DEFENSE,DEFENSE LOGISTICS AGENCY (DLA),DLA LAND AND MARITIME,2020-06-26,Award Notice,Award Notice,auto15,2020-07-11,,,,332912,48,Yes,"{'date': '2020-06-26', 'number': 'SPE7M320D60ZG', 'amount': '250000.00', 'awardee': {'name': 'KAMPI COMPONENTS CO., INC. 88 CANAL RD FAIRLESS HILLS 19030-4302 US', 'location': {}, 'duns': '122679228', 'cageCode': '7Z016'}}","[{'fax': None, 'type': 'primary', 'email': 'COLIN.BROWN@DLA.MIL', 'phone': None, 'title': None, 'fullName': 'Colin Brown614-692-8156'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=009937033cc44619a17974cb83c5a541,OFFICE,"{'zipcode': '43218-3990', 'city': 'COLUMBUS', 'countryCode': 'USA', 'state': 'OH'}",{},,https://beta.sam.gov/opp/009937033cc44619a17974cb83c5a541/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=009937033cc44619a17974cb83c5a541&limit=1'}]",
573,0079b1c6fb4f47939859d19f9bce0c57,"Hazardous Waste Removal and Disposal at Camp Pendleton, CA",SP450020R0019,DEPT OF DEFENSE,DEFENSE LOGISTICS AGENCY (DLA),DLA DISPOSTION SERVICE - EBS,2020-06-26,Solicitation,Solicitation,auto15,2020-07-18,,,2020-07-03T15:00:00-04:00,562211,S222,Yes,,"[{'fax': '', 'type': 'primary', 'email': 'robert.seekamp@dla.mil', 'phone': '2699617216', 'title': None, 'fullName': 'Robert Seekamp'}, {'fax': '', 'type': 'secondary', 'email': 'erik.rundquist@dla.mil', 'phone': '2699614891', 'title': None, 'fullName': 'Erik Rundquist'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=0079b1c6fb4f47939859d19f9bce0c57,OFFICE,"{'zipcode': '49037-3092', 'city': 'BATTLE CREEK', 'countryCode': 'USA', 'state': 'MI'}","{'city': {'code': '10550', 'name': 'Camp Pendleton'}, 'state': {'code': 'CA', 'name': 'California'}, 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/0079b1c6fb4f47939859d19f9bce0c57/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=0079b1c6fb4f47939859d19f9bce0c57&limit=1'}]","[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/d6d2a50160f64c469a0da38965fca813/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/c969a4cd29b847ccbb3a413ba7c92153/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/b1154b633c5144509c37cceb1c32000b/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/3794d891d6c24539b3bcf8a450d6be21/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/6be8452d35e14395a37ae20ef17c58af/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/e2a90b306a834b82bd3dd148835020a1/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/f894e5f6c99f4e5e88ed9dca055cfb7b/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/9a4be6d4d97c4d038938c36e3e2f45f4/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/5d3eb39e02944aa2b2ce9b0c25d42e7b/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/5aecec53fcb54a7aa9ab7abd63f93b9d/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/91417387c1bd434d9d6d5e3d5e652717/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/34e06d19a2fe45248c028457d0a3cb5f/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/1738819ee0fe4246aefc654cff1afc30/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/e0a128132f0c4b2286ddfbdad83773a8/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/28fef6e527ce4583b93cadcc0dcfd858/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/ff42f291f9e541869d2fa39e04c94b58/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/e3cdb6e45e1c4337adf834a972f8288a/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/9e93e051ddde4f7e927cd455ffdbc05b/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/67508a6f3ef0460488849860432a4836/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/998e03c2f163422994433b757e6f133b/download?api_key=null&token=]"
574,005626af1465469daf25f4a1cc3ad259,"Installation of Fence & Gates at the USARC in Ayers, Devens & Taunton, MA",W15QKN-20-Q-5241,DEPT OF DEFENSE,DEPT OF THE ARMY,W6QK ACC-PICA,2020-06-26,Solicitation,Solicitation,autocustom,2020-07-31,Total Small Business Set-Aside (FAR 19.5),SBA,2020-06-30T12:00:00-04:00,238990,Z2PZ,Yes,,"[{'fax': None, 'type': 'primary', 'email': 'deborah.e.terrell.civ@mail.mil', 'phone': '6095622093', 'title': None, 'fullName': 'Deborah E. Terrell'}, {'fax': None, 'type': 'secondary', 'email': 'sharon.wilson-emmons.civ@mail.mil', 'phone': '6095626405', 'title': None, 'fullName': 'Sharon Wilson-Emmons'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=005626af1465469daf25f4a1cc3ad259,OFFICE,"{'zipcode': '07806-5000', 'city': 'PICATINNY ARSENAL', 'countryCode': 'USA', 'state': 'NJ'}","{'city': {'code': '69170', 'name': 'Taunton'}, 'state': {'code': 'MA', 'name': 'Massachusetts'}, 'zip': '02780', 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/005626af1465469daf25f4a1cc3ad259/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=005626af1465469daf25f4a1cc3ad259&limit=1'}]","[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/110485a1c77e4a5f8cac4030b78326c7/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/ce37ed519d5641258a82f9ca32306dda/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/0124a437b13e45638fc754927c2ba2f8/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/0af2e61c270947ce81cc0a276ecdc028/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/6317dd132b6b4fc1829cd29db1b7d0b0/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/61ca71663dda414e973bf19cdd787243/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/5bc00d68fd0a4d14b89b3e191e24de06/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/3328e813aabd4baca70918fdc837ffe3/download?api_key=null&token=]"
575,00101a60204d480bb11b0ee8ab1d03dd,403 MXS Fall Restraint System,F2JTF80135AW01,DEPT OF DEFENSE,DEPT OF THE AIR FORCE,FA3010 81 CONS CC,2020-06-26,Combined Synopsis/Solicitation,Combined Synopsis/Solicitation,auto15,2020-07-18,,,2020-07-03T17:00:00-05:00,333923,3950,Yes,,"[{'fax': '', 'type': 'primary', 'email': 'john_andre.mabida@us.af.mil', 'phone': '228-377-8589', 'title': None, 'fullName': 'John Andre N. Mabida'}, {'fax': '', 'type': 'secondary', 'email': 'john_rexter.de_pedro@us.af.mil', 'phone': '228-377-1814', 'title': None, 'fullName': 'John Rexter De Pedro'}]",https://api.sam.gov/prod/opportunities/v1/noticedesc?noticeid=00101a60204d480bb11b0ee8ab1d03dd,OFFICE,"{'zipcode': '39534-2701', 'city': 'KEESLER AFB', 'countryCode': 'USA', 'state': 'MS'}","{'city': {'code': '6220', 'name': 'Biloxi'}, 'state': {'code': 'MS', 'name': 'Mississippi'}, 'zip': '39534', 'country': {'code': 'USA', 'name': 'UNITED STATES'}}",,https://beta.sam.gov/opp/00101a60204d480bb11b0ee8ab1d03dd/view,"[{'rel': 'self', 'href': 'https://api.sam.gov/prod/opportunities/v1/search?noticeid=00101a60204d480bb11b0ee8ab1d03dd&limit=1'}]","[https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/37ad445ffbba49cdbc37f0318694cdc3/download?api_key=null&token=, https://beta.sam.gov/api/prod/opps/v3/opportunities/resources/files/0687478e28bb4b21b9b9d6897621f0a3/download?api_key=null&token=]"


In [11]:
# check if there are duplicate documents in the newly pulled data
data.duplicated('noticeId').sum()

0

In [12]:
# if duplicates found, drop those
data.drop_duplicates('noticeId', inplace=True)

In [12]:
# save data into a new file
data.to_csv('./data/6_28_pull.csv', index=False)

**Note:** Because in Notebook 008 the combined.csv dataframe gets two additional columns -  'cleaned_titles' and 'date_posted', before concatenating the new pull with the combined.csv, we need a function that transforms the newly acquired data into the same shape.

In [13]:
# define function to tokenize a column 
# modified to return title cased titles

def tokenizer_function(column):
    """
    Takes in a text column
        tokenizes the text in each row
        using pattern [[a-zA-Z]\w+]
        which matches every lowercase and upperase character between a-z that are word characters
    Returns list of strings
    """
    
    # instantiate empty list of tokenized text
    texts = []
    
    # define tokenizer pattern
    pattern = '[a-zA-Z]\w+'
    # instantiate tokenizer
    tokenizer = RegexpTokenizer(pattern=pattern)
    
    # create for loop to tokenize each row and add the list of tokens to texts
    for text in column:
        tokens = tokenizer.tokenize(text)
        
        # transform tokens into lower case strings
        tokens = [token.title() for token in tokens]
        texts.append(' '.join(tokens))
    return texts

In [14]:
def transform_new_pull(new_data):
    """function
       accepts new_data
       adds a tokenized title column to new_data
       adds a column of posted date in datetime format to new_data
       returns transformed new_data as dataframe"""
    # create tokenized title column
    new_data['cleaned_titles'] = tokenizer_function(new_data['title'])
    
    # create date-posted column in datetime format
    new_data['date_posted'] = pd.to_datetime(new_data['postedDate'])
    
    return new_data

In [15]:
# transform new dataframe
data = transform_new_pull(data)

In [16]:
# save transformed dataframe into a csv file
data.to_csv('./data/6_28_pull_transformed.csv', index=False)

In [17]:
# read in both the combined dataframe and the new-tranformed dataframe
df1 = pd.read_csv('./data/combined.csv')
df2 = pd.read_csv('./data/6_28_pull_transformed.csv')

In [18]:
# concatenate the new dataframe with the combined dataframe
df = pd.concat([df1, df2])

In [19]:
# check new shape
df.shape

(66614, 29)

In [20]:
# overwrite the combined csv file with the newly combined dataframe
df.to_csv('./data/combined.csv', index=False)