# Download and unzip data folders from NSF website

This is a script to download the zip file for each year from the [NSF website](https://www.nsf.gov/awardsearch/download.jsp).
After the download of a file is completed, the script extracts it into
a folder named after its year. Each year folder contains one XML file per award.

For this project, I downloaded data from 1960 to 2017, which represents approximately 1.5 GB (unzipped, in aggregate).

In [1]:
# imports necessary for zip file download
import requests
# part of standard library below
from zipfile import ZipFile
import re
import os

In [2]:
# first and last year (both inclusive) of award data to download
start_year, stop_year = 1960, 2017
# range() excludes its upper bound, so add one to include stop_year
years = range(start_year, stop_year + 1)

In [None]:
# Download the zip archive of every year in `years`, save it under
# ../data/raw and extract it into a folder named after the archive.
for y in years:

    # construct url for current year
    url = 'https://www.nsf.gov/awardsearch/download?DownloadFileName={}&All=true'.format(y)
    # retrieve data in memory at this url
    requested_year = requests.get(url)
    # fail on an HTTP error instead of saving an error page as a zip file
    requested_year.raise_for_status()

    # recover name of zipped folder (usually year.zip)
    # headers is a dictionary containing information about your request
    content_name = requested_year.headers.get('Content-Disposition')
    # raw string so the regex escapes are not interpreted by Python;
    # the dot is escaped so it only matches a literal '.'
    zip_name = re.findall(r'\w+\.zip', content_name)[0]

    # write zipped folder to disk locally (binary data located in request content)
    zipped_dir = os.path.join(os.pardir, 'data', 'raw', zip_name)
    with open(zipped_dir, 'wb') as f:
        # requested_year holds the response for the current year
        f.write(requested_year.content)

    # extract zip file
    with ZipFile(zipped_dir, 'r') as thiszip:
        # unzipped folder name
        folder_path = os.path.join(os.path.dirname(zipped_dir), zip_name.split('.')[0])
        # MUST filter list of files in zipped folder
        # In 2002 data, there is one named '0225630.xml\r' and '0225630.xml'
        # so use strip() to remove carriage return (\r)
        # Even if there are duplicates of an xml file, it will overwrite and keep one only
        xml_list = [xml.strip() for xml in thiszip.namelist()]
        # execute extraction
        thiszip.extractall(path=folder_path, members=xml_list)

#### Test on one year only

In [3]:
# single year used to walk through the download steps one at a time
y=2018
# construct url for current year
url = 'https://www.nsf.gov/awardsearch/download?DownloadFileName={}&All=true'.format(y)
# retrieve the data at this url in memory; later cells read `request`
request = requests.get(url)

In [4]:
# display the response headers; 'Content-Disposition' carries the zip file name
request.headers

{'Server': 'Apache-Coyote/1.1', 'Content-Disposition': 'attachment; filename="2018.zip"', 'Date': 'Thu, 24 May 2018 16:49:44 GMT', 'Strict-Transport-Security': 'max-age=31536000', 'Content-Length': '3378436', 'Content-Type': 'application/zip'}

In [5]:
# read the 'Content-Disposition' header, which contains the zip file name
# (.get returns None if the header is absent)
content_name = request.headers.get('Content-Disposition')

In [6]:
# display the raw header value
content_name

'attachment; filename="2018.zip"'

In [7]:
# extract the zip file name (e.g. '2018.zip') from the header value;
# raw string keeps the regex escapes intact and the escaped dot
# matches only a literal '.'
zip_name = re.findall(r'\w+\.zip', content_name)[0]

In [8]:
# display the extracted zip file name
zip_name

'2018.zip'

In [9]:
# save the downloaded archive under ../data/raw, keeping the server's file name
zipped_dir = os.path.join(os.pardir, 'data', 'raw', zip_name)
# the response body is binary, so open the target in 'wb' mode
with open(zipped_dir, 'wb') as out_file:
    out_file.write(request.content)

In [10]:
# unpack the downloaded archive into a folder named after the zip file
with ZipFile(zipped_dir, 'r') as archive:
    # destination: next to the zip file, named by the text before the first dot
    folder_path = os.path.join(os.path.dirname(zipped_dir), zip_name.partition('.')[0])
    # Member names have to be cleaned before extraction:
    # the 2002 archive lists both '0225630.xml\r' and '0225630.xml',
    # so strip() is applied to drop the carriage return (\r).
    # A duplicated name is simply extracted again, leaving a single copy.
    xml_list = []
    for member_name in archive.namelist():
        xml_list.append(member_name.strip())
    # run the extraction on the cleaned member list
    archive.extractall(path=folder_path, members=xml_list)