# Process raw data into a structured format

Raw data is organized into folders named after fiscal years*. Each folder contains multiple XML files, and each XML file contains information about one award.
There is no detailed description of the award data besides the XML tree composition. Some tags are self-explanatory, while others required some investigation by thoroughly reading the NSF website.
Here is a non-exhaustive list of tags and their descriptions:

1. **AwardTitle:** Title of award
 
2. **AwardEffectiveDate:** Month,Day,Year when funding started

3. **AwardExpirationDate:** Month,Day,Year when funding ended

4. **AwardAmount:** Amount of money in USD awarded to date

5. **AwardInstrument:** Award type (Standard Grant, Continuing Grant,...)

6. **Organization:** NSF organization (Directorate and related Division) funding the grant

7. **Investigator:** name of supervisor(s) (Principal Investigator, Co-Principal Investigator,...), contact info,...

8. **Institution:** name of institution(s) receiving the award, phone number(s), address(es),...

9. **AwardID:** unique 7-digit identifier of the award

*Fiscal year Y starts on October 1st of year Y-1 and ends on September 30th of year Y.

XML files do not follow a fixed structure: tags may be missing, and new tags have been added over the years.
From 1960 to 2017, there are about 450,000 awards, which means as many files to read!

Therefore, one solution is to condense all that data into 2 JSON files: one containing "short" information (low byte size) and another one containing "long" information (basically just the award ID and the abstract).

In [1]:
import glob
import os
import time
import json
import warnings
import multiprocessing
from itertools import chain
from bs4 import BeautifulSoup

## Function to extract relevant xml tags

In [2]:
def _int_or_text(text, label):
    """
        Convert text to an integer when possible
        INPUT: text is the string read from a tag
               label names the field in the warning message
        RETURN the integer value, or text unchanged (after a UserWarning) when it is not a valid integer
    """
    try:
        return int(text)
    except ValueError:
        warnings.warn('Could NOT convert {} {} to an integer'.format(label, text), UserWarning)
        return text


def extract_xml_tag(input_xml, filename):
    """
        Parse the content of one award xml file and pull out the tags of interest
        INPUT: input_xml is the content read from a xml file
               filename is the path of that file, only used in the warning message
        RETURN two dictionaries keyed by award id, one for short information and one for the abstract.
               Both are empty when a required tag is missing (a UserWarning is issued).
               The abstract dictionary is also empty when the award has no abstract text.
    """
    # dictionaries to record data
    shortinfo = {}
    abstract = {}

    # make soup and extract tags
    input_soup = BeautifulSoup(input_xml, 'lxml-xml')

    # a missing tag makes find() return None, so the attribute access that follows raises:
    # any such failure means the file structure is corrupted.
    # Exception (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
    try:
        # award identification (kept as top level key)
        award_id = _int_or_text(input_soup.find('AwardID').text, 'award id')

        # dictionary for short information, keys are inserted in the order they are written out
        award_elements = {}

        # title, dates, amount
        award_elements['title'] = input_soup.find('AwardTitle').text
        award_elements['eff_date'] = input_soup.find('AwardEffectiveDate').text
        award_elements['exp_date'] = input_soup.find('AwardExpirationDate').text
        award_elements['amount'] = _int_or_text(input_soup.find('AwardAmount').text, 'awarded amount')

        # award type
        award_elements['award_instr'] = input_soup.find('AwardInstrument').find('Value').text

        # organization info
        org_tree = input_soup.find('Organization')
        award_elements['org_code'] = _int_or_text(org_tree.find('Code').text, 'org code')
        award_elements['org_direct'] = org_tree.find('Directorate').find('LongName').text
        award_elements['org_div'] = org_tree.find('Division').find('LongName').text

        # nsf officer who approved grant
        award_elements['nsf_officer'] = input_soup.find('ProgramOfficer').find('SignBlockName').text

        # record all investigators (direct children of the Award tag only)
        award_elements['Investigator'] = [
            {
                'FirstName': inv.find('FirstName').text,
                'LastName': inv.find('LastName').text,
                'Role': inv.find('RoleCode').text,
            }
            for inv in input_soup.Award.find_all('Investigator', recursive=False)
        ]

        # record all participating institutions
        award_elements['Institution'] = [
            {
                'Name': ins.find('Name').text,
                'StreetAddress': ins.find('StreetAddress').text,
                'City': ins.find('CityName').text,
                'State': ins.find('StateCode').text,
                'Country': ins.find('CountryName').text,
            }
            for ins in input_soup.Award.find_all('Institution', recursive=False)
        ]

        # record program elements (research qualifier)
        award_elements['ProgramElement'] = [
            {
                'Text': pe.find('Text').text,
                'Code': _int_or_text(pe.find('Code').text, 'Program element code'),
            }
            for pe in input_soup.Award.find_all('ProgramElement', recursive=False)
        ]

        # add award id as top level key, only once every required tag has been read
        shortinfo[award_id] = award_elements

        # the abstract is optional: the tag can be absent, or present without any text.
        # Neither case is a schema failure, the short information recorded above is kept.
        abstract_tag = input_soup.find('AbstractNarration')
        if abstract_tag is not None and abstract_tag.text:
            abstract[award_id] = abstract_tag.text
    except Exception:
        warnings.warn(
            'File {} does not comply with xml schema! It will be skipped'.format(os.path.basename(filename)),
            UserWarning)

    # return both dictionaries
    return shortinfo, abstract

In [3]:
def read_extract(file_list):
    """
        Read a batch of xml files and gather what extract_xml_tag returns for each of them
        INPUT: file_list is a list of file names
        RETURN two lists of dictionaries (short info first, abstracts second)
    """
    short_records = []
    abstract_records = []
    buckets = (short_records, abstract_records)

    for xml_path in file_list:
        # load the whole file in memory, the handle is closed before parsing starts
        with open(xml_path, encoding='utf-8') as handle:
            content = handle.read()

        # extraction yields (short info, abstract), paired here with the matching list
        extracted = extract_xml_tag(content, xml_path)
        for bucket, record in zip(buckets, extracted):
            # an empty dictionary means nothing was extracted, so it is left out
            if record:
                bucket.append(record)

    return short_records, abstract_records

## Prepare variables to loop over all XML files

In [4]:
# output JSON files live in the interim data folder
interim_dir = os.path.join(os.pardir, 'data', 'interim')
short_elements_output = os.path.join(interim_dir, 'test_short_element.json')
abstract_output = os.path.join(interim_dir, 'test_abstract.json')

# start from a clean slate: delete any output file left over from a previous run
for stale_output in (short_elements_output, abstract_output):
    if os.path.isfile(stale_output):
        os.remove(stale_output)

# number of worker processes (a quad core exposes 8 CPUs, at most 1 process per CPU)
NUM_PROCESS = 8

# how many files are handed to a worker in a single task
num_file_partition = 200

# running total of files read across all years
cumfilecount = 0

# years to process, 1960 through 2017 inclusive (REMINDER: start at 1960)
years = range(1960, 2018)

## Multiprocessing main loop

In [None]:
if __name__ == "__main__":
    # Walk through every year folder, extract the award data of its xml files in parallel
    # and append the result of each year to the two output files.

    # get start time of timer for processing time
    start_time = time.time()
    # create pool
    pool = multiprocessing.Pool(processes=NUM_PROCESS, maxtasksperchild=None)

    try:
        for y in years:

            # list all xml files in current folder
            xml_list = glob.glob(os.path.join(os.pardir, 'data', 'raw', str(y), '*.xml'))

            # number of files read in current folder
            filecount = len(xml_list)

            # a year folder can be missing or empty, then there is nothing to extract or write
            if xml_list:
                # partition list of files, each partition is one task for the pool
                # (the last one is most likely shorter than num_file_partition)
                intervals = range(0, filecount, num_file_partition)
                xml_partition = [xml_list[nfile:nfile + num_file_partition] for nfile in intervals]

                # feed pool with all files from current year.
                # map() returns ONE (short info list, abstract list) pair PER partition,
                # so the pairs are split by position rather than unpacked into two names.
                results = pool.map(read_extract, xml_partition)

                # unpack list of list
                short_element = list(chain.from_iterable(pair[0] for pair in results))
                abstract = list(chain.from_iterable(pair[1] for pair in results))

                # write one JSON array per year, one per line: the trailing newline keeps
                # the appended arrays separable (back to back arrays are not valid JSON)
                with open(short_elements_output, "a", encoding='utf-8') as f:
                    json.dump(short_element, f, ensure_ascii=False)
                    f.write('\n')

                with open(abstract_output, "a", encoding='utf-8') as f:
                    json.dump(abstract, f, ensure_ascii=False)
                    f.write('\n')

            # file counters
            cumfilecount += filecount

            # print progress
            print('\rYear {}, File #{:6d},Total File {:6d}'.format(y, filecount, cumfilecount), end='', flush=True)
    except BaseException:
        # on any failure (including an interrupt) stop the workers right away
        # instead of letting them work through the partitions still queued
        pool.terminate()
        raise
    else:
        # normal end: no more tasks will be submitted
        pool.close()
    finally:
        # wait for the worker processes to exit in both cases
        pool.join()

    # closing print statement
    print('\rYear {}, File #{:6d},Total File {:6d}'.format(y, filecount, cumfilecount), end='\n', flush=True)
    print("--- %s seconds ---" % (time.time() - start_time))