## Data acquisition - NSF funding data 

## Step 1 : download files 

In [7]:
import requests
import datetime
import os 
import time 


def download_file(x_url, x_filename_local, timeout=60):
    '''Stream a remote file (e.g. a zip file of XML) to a local path.

    Parameters
    ----------
    x_url : str
        URL to download.
    x_filename_local : str
        Path of the local file to write (opened in 'wb' mode).
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot block
        the notebook forever. Defaults to 60 seconds.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, timeouts, or a 4xx/5xx HTTP status.
    OSError
        If the local file cannot be opened or written.

    TO DO : add proxy options
    '''
    # NOTE the stream=True parameter: the body is fetched chunk by chunk.
    # Using the response as a context manager releases the connection even
    # when writing fails half-way.
    with requests.get(x_url, stream=True, timeout=timeout) as r:
        # Check the status before opening the local file, so that an HTTP
        # error page is never saved under the archive's name.
        r.raise_for_status()
        with open(x_filename_local, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)


In [13]:
## N.B.: there is a problem with the NSF website.
## As of November 14, 2023 at 15:00, requesting
## https://www.nsf.gov/awardsearch/download?DownloadFileName=2001&All=true
## returns the error "This page is not available".

In [8]:
# Base URL of the download endpoint; the year and '&All=true' are appended below
xUrl_Path = 'https://www.nsf.gov/awardsearch/download?DownloadFileName='

# Destination folder
xFld_Dest = '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/__downloaded/02_NSF'


# One archive per year, 2007..2022 inclusive
for x_year in [str(x) for x in range(2007, 2023)]:
    x_UrlFile = xUrl_Path + x_year + '&All=true'
    # local file name, e.g. NSF_2007.zip
    xfile_name = 'NSF_' + x_year + '.zip'
    x_DestFileName = os.path.join(xFld_Dest, xfile_name)

    # pause between requests
    time.sleep(1)
    try:
        download_file(x_UrlFile, x_DestFileName)
    except Exception as x_err:
        # Best effort: report the failing year and carry on with the next one.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working and shows why the download failed.
        print('!!! error in:', x_year, '-', repr(x_err))

print('all done')


all done


## Step 2 : parse the downloaded data and save it into an SQLite database

In [8]:
import xml.etree.ElementTree as ET
import pandas as pd 
import re
import xmltodict
import os 
import time 
import zipfile

In [9]:
def is_valid_xml(xml_string):
    '''Tell whether ``xml_string`` can be parsed by ElementTree.

    Returns True when ``ET.fromstring`` succeeds and False when it raises
    ``ET.ParseError``; any other exception propagates to the caller.
    '''
    try:
        ET.fromstring(xml_string)
    except ET.ParseError:
        return False
    else:
        return True


def flatten_dict(xdict):
    '''Flatten a nested dictionary into a single-level dictionary.

    The input is normalised with ``pd.json_normalize`` and the first
    resulting record is returned as a plain dict. Only the first record
    is kept, so list input loses every element after the first.
    '''
    xrecords = pd.json_normalize(xdict).to_dict(orient='records')
    return xrecords[0]

def to_snake_case(s):
    '''Return ``s`` lower-cased, with '_' inserted at camelCase boundaries.

    A boundary is any position directly after a lowercase ASCII letter and
    directly before an uppercase one ('AwardTitle' -> 'award_title').
    Strings without such a boundary are only lower-cased
    ('CFDA_NUM' -> 'cfda_num').
    '''
    xhas_camel_boundary = re.search(r'[a-z][A-Z]', s) is not None
    if xhas_camel_boundary:
        s = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', s)
    return s.lower()



def nsf_process_record(xrecord_xml):
    '''Process one record of an NSF grant.

    Parameters
    ----------
    xrecord_xml : str or bytes
        The content of one award XML record.

    Returns
    -------
    tuple of three lists of dicts
        ``(grant_info, institutions, investigators)``.
        - grant_info: one dict with the atomic award fields (snake_case
          keys), 'award_instrument', and the flattened Organization /
          ProgramOfficer / ProgramElement fields.
        - institutions: one dict per Institution element, plus 'award_id'.
        - investigators: one dict per Investigator element, plus 'award_id'.
        All three lists are empty when the input is not valid XML or has
        no rootTag/Award element.
    '''

    #### RESULT _LISTS
    xlst_lst_dict_grant_info = []
    xlst_lst_dict_inst = []
    xlst_lst_dict_investigators = []

    # Invalid XML: nothing to extract.
    if not is_valid_xml(xrecord_xml):
        return xlst_lst_dict_grant_info, xlst_lst_dict_inst, xlst_lst_dict_investigators

    q1 = xmltodict.parse(xrecord_xml)
    # Guard the rootTag -> Award lookup: a missing or empty element would
    # otherwise raise AttributeError on the chained .get().
    xroot = q1.get('rootTag')
    q2 = xroot.get('Award') if isinstance(xroot, dict) else None
    if not isinstance(q2, dict):
        return xlst_lst_dict_grant_info, xlst_lst_dict_inst, xlst_lst_dict_investigators

    xid = q2.get('AwardID')

    # Fields copied as-is (a missing field is stored as None)
    xlst_fields_atomic = ['AwardID', 'AwardTitle', 'AbstractNarration',
                          'AwardEffectiveDate', 'AwardExpirationDate',
                          'MaxAmdLetterDate', 'MinAmdLetterDate',
                          'AwardAmount', 'AwardTotalIntnAmount',
                          'AGENCY', 'AWDG_AGCY_CODE', 'FUND_AGCY_CODE',
                          'CFDA_NUM', 'NSF_PAR_USE_FLAG', 'TRAN_TYPE']
    # Nested fields that are flattened into '<field>_<subfield>' columns
    xlst_fields_to_flatten = ['Organization', 'ProgramOfficer', 'ProgramElement']

    ### GRANT INFO
    xdict_grant_info = {}
    # add atomic values
    for xfield in xlst_fields_atomic:
        xfield_name = to_snake_case(xfield)
        xdict_grant_info[xfield_name] = q2.get(xfield)

    # add instrument (None when the element is missing or has no children)
    xinstrument = q2.get('AwardInstrument')
    xdict_grant_info['award_instrument'] = (
        xinstrument.get('Value') if isinstance(xinstrument, dict) else None
    )

    ## add fields to flatten
    # NOTE(review): if one of these elements is a list, flatten_dict keeps
    # only its first entry - confirm that this is intended.
    for xfield in xlst_fields_to_flatten:
        if q2.get(xfield):
            xfield_name = to_snake_case(xfield)
            xdict_flat = flatten_dict(q2.get(xfield))
            for xval in xdict_flat:
                # flattened keys are dotted paths; make them column-friendly
                xfield_name_2 = xval.lower().replace('.', '_')
                xdict_grant_info[xfield_name + '_' + xfield_name_2] = xdict_flat.get(xval)
    xlst_lst_dict_grant_info.append(xdict_grant_info)

    ### INSTITUTION
    if q2.get('Institution'):
        xinst_list = q2.get('Institution')
        # a single element is not wrapped in a list; normalise
        if not isinstance(xinst_list, list):
            xinst_list = [xinst_list]

        for xinst in xinst_list:
            xinst['award_id'] = xid
            xlst_lst_dict_inst.append(xinst)

    ### INVESTIGATOR
    if q2.get('Investigator'):
        xinv_list = q2.get('Investigator')
        if not isinstance(xinv_list, list):
            xinv_list = [xinv_list]

        for xinv in xinv_list:
            xinv['award_id'] = xid
            xlst_lst_dict_investigators.append(xinv)

    return xlst_lst_dict_grant_info, xlst_lst_dict_inst, xlst_lst_dict_investigators



def nsf_process_zip_file(xFile):
    '''
    Process one NSF zipped file.

    Every member of the archive is read and passed to
    ``nsf_process_record``; the per-record results are accumulated and
    converted to DataFrames once at the end.

    Parameters
    ----------
    xFile : str
        Path of the zip archive.

    Returns
    -------
    three dataframes :
    - df_grant : one row per grant-info dict
    - df_inst  : one row per institution dict
    - df_inv   : one row per investigator dict
    '''

    ## RESULT LISTS
    xRES_LST_GRANT = []
    xRES_LST_INST = []
    xRES_LST_RES = []

    with zipfile.ZipFile(xFile, "r") as f:
        for xname in f.namelist():
            xdata = f.read(xname)
            # Members that are not valid XML give three empty lists,
            # so they add nothing to the results.
            xgrant, xinst, xres = nsf_process_record(xdata)
            xRES_LST_GRANT.extend(xgrant)
            xRES_LST_INST.extend(xinst)
            xRES_LST_RES.extend(xres)

    df_grant = pd.DataFrame(xRES_LST_GRANT)
    df_inst = pd.DataFrame(xRES_LST_INST)
    df_inv = pd.DataFrame(xRES_LST_RES)

    return df_grant, df_inst, df_inv



In [15]:
## Output database - SQLite is used so the pipeline is easy to replicate

xPath_DB = '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/'
xDB = os.path.join(xPath_DB, 'grants_datasets.db')
xDBCon = 'sqlite:///{}'.format(xDB)


## Because of the problem with the website, the archives downloaded
## earlier are used instead of the staging download folder:
#xFld = '/home/mike/xTemp_data_infrastructure/_staging_funding_dataset/__downloaded/02_NSF/'

xFld = '/media/mike/DATA4T/_data_repository/__others/__funding_data/5000_NSF/'

# full path of every entry in the input folder (order as given by os.listdir)
xFiles = [os.path.join(xFld, xentry) for xentry in os.listdir(xFld)]
# debugging aid: pick a single random archive
#xFile_rnd = random.sample(xFiles, 1)[0]


In [16]:
# sanity check: number of entries found in the input folder
len(xFiles)

16

In [17]:
# time stamp before the run
print(time.ctime())
for xFile in xFiles:

    # debugging aid: restrict the run to the randomly chosen archive
    #if xFile != xFile_rnd:
    #    continue

    # file name without folder and '.zip', e.g. '2012'
    xyear = xFile.rsplit('/', 1)[-1].replace('.zip', '')

    print('processing:', xyear)

    # table names: nsf_<year>_grant_info / _grant_investigators / _grant_institutions
    xtab_name_suffix = 'nsf_' + xyear
    xtabname_grant = xtab_name_suffix + '_grant_info'
    xtabname_inv = xtab_name_suffix + '_grant_investigators'
    xtabname_inst = xtab_name_suffix + '_grant_institutions'

    dgrant, dinst, dinv = nsf_process_zip_file(xFile)

    ## save - each table is replaced if it already exists
    for xdf, xtabname in ((dgrant, xtabname_grant),
                          (dinst, xtabname_inst),
                          (dinv, xtabname_inv)):
        xdf.to_sql(name=xtabname, con=xDBCon, if_exists='replace', index=False, chunksize=10000)

print('all done!')

# time stamp after the run
print(time.ctime())

Tue Nov 14 15:41:26 2023
processing: 2012
processing: 2019
processing: 2021
processing: 2020
processing: 2007
processing: 2008
processing: 2013
processing: 2014
processing: 2016
processing: 2018
processing: 2009
processing: 2015
processing: 2022
processing: 2010
processing: 2011
processing: 2017
all done!
Tue Nov 14 15:44:33 2023
