In [1]:
import io
import json
import os
import posixpath
import zipfile

import requests
import sagemaker

In [2]:
def upload_files_to_s3(endpoint, target_directory, session):
    """Upload every downloadable file listed for one endpoint.

    Fetches the download metadata, resolves the file paths listed for
    ``endpoint``, and hands each one to ``upload_file_to_s3`` in order,
    printing a progress line per file and a final ``done``.

    Args:
        endpoint: Endpoint name used to select the file list and passed on
            to the per-file upload.
        target_directory: Passed through unchanged to ``upload_file_to_s3``.
        session: Passed through unchanged to ``upload_file_to_s3``.
    """
    partition_urls = get_file_paths(get_download_metadata(), endpoint)
    for url in partition_urls:
        print('uploading file {} ...'.format(url))
        upload_file_to_s3(url, target_directory, endpoint, session)
    print('done')

In [3]:
def get_download_metadata(timeout=30):
    """Fetch the download index from api.fda.gov and return it as parsed JSON.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the notebook
            indefinitely.

    Returns:
        The decoded JSON body of ``https://api.fda.gov/download.json``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get('https://api.fda.gov/download.json', timeout=timeout)
    # Fail here on an error status rather than returning an error payload
    # that only breaks later when callers index into it.
    response.raise_for_status()
    return response.json()

In [4]:
def get_file_paths(download_metadata, endpoint):
    """Return the ``file`` value of every partition listed for an endpoint.

    Args:
        download_metadata: Nested mapping; the partitions are read from
            ``download_metadata['results']['drug'][endpoint]['partitions']``.
        endpoint: Key selecting the endpoint under ``results -> drug``.

    Returns:
        A list with one ``file`` entry per partition, in listing order.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    endpoint_entry = download_metadata['results']['drug'][endpoint]
    collected = []
    for partition in endpoint_entry['partitions']:
        collected.append(partition['file'])
    return collected

In [5]:
def upload_file_to_s3(source_path, target_directory, endpoint, session, timeout=60):
    """Download one zip archive and upload each of its members to S3.

    The archive is fetched into memory, and every name in it is uploaded to
    the session's default bucket under
    ``<target_directory>/<endpoint>/<member name>``.

    Args:
        source_path: URL of the zip archive to download.
        target_directory: Key prefix placed before ``endpoint``.
        endpoint: Endpoint name used as the second key component.
        session: Object providing ``default_bucket()`` and
            ``upload_string_as_file_body(body=..., bucket=..., key=...)``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        zipfile.BadZipFile: If the downloaded body is not a valid zip archive.
    """
    response = requests.get(source_path, timeout=timeout)
    # Surface HTTP errors directly; otherwise an error page would be fed to
    # ZipFile and show up as a misleading BadZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        for member_name in archive.namelist():
            session.upload_string_as_file_body(
                body=archive.read(member_name),
                bucket=session.default_bucket(),
                # Object keys are '/'-separated regardless of the local OS,
                # so join with posixpath instead of the platform os.path.
                key=posixpath.join(target_directory, endpoint, member_name),
            )

In [6]:
# Session object passed to every upload call; the uploads use its default bucket.
session = sagemaker.Session()

In [7]:
# Endpoint names to process; each is looked up under results -> drug in the download metadata.
endpoints = ['drugsfda', 'ndc', 'label']

In [8]:
# Upload every file of each endpoint under the key prefix data/raw/<endpoint>/.
for e in endpoints:
    upload_files_to_s3(e, 'data/raw', session)

uploading file https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip ...
done
uploading file https://download.open.fda.gov/drug/ndc/drug-ndc-0001-of-0001.json.zip ...
done
uploading file https://download.open.fda.gov/drug/label/drug-label-0001-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0002-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0003-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0004-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0005-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0006-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0007-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/drug-label-0008-of-0010.json.zip ...
uploading file https://download.open.fda.gov/drug/label/dr