In [1]:
import sagemaker
import boto3
import json
import pandas as pd
import os

In [2]:
def read_s3_files(client, bucket_name, prefix, columns):
    """Read the JSON files listed under an S3 prefix into one DataFrame.

    The keys come from ``get_keys``; each key is loaded with
    ``read_s3_file`` and the per-file frames are stacked in listing order.

    Args:
        client: S3 client handed through to ``get_keys`` / ``read_s3_file``.
        bucket_name: Bucket that holds the files.
        prefix: Key prefix used to select the files.
        columns: Column labels to keep from every file.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.
    """
    frames = [
        read_s3_file(client, bucket_name, object_key, columns)
        for object_key in get_keys(client, bucket_name, prefix)
    ]
    # ignore_index discards the per-file indexes so rows are numbered once.
    return pd.concat(frames, ignore_index=True)

In [3]:
def get_keys(client, bucket_name, prefix):
    """Return the keys of all objects stored under a prefix in an S3 bucket.

    A single ``list_objects_v2`` call returns at most 1,000 objects, so the
    listing is driven through the client's paginator to collect every page.

    Args:
        client: boto3 S3 client (must provide ``get_paginator``).
        bucket_name: Name of the bucket to list.
        prefix: Only keys starting with this prefix are returned.

    Returns:
        list[str]: Object keys in listing order; empty when nothing matches.
    """
    paginator = client.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # S3 omits 'Contents' from the response when no object matches.
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    return keys

In [4]:
def read_s3_file(client, bucket_name, key, columns):
    """Load one JSON object from S3 and return selected columns of its results.

    The object body is parsed as JSON, the records under its ``'results'``
    key are flattened with ``pd.json_normalize`` (nested fields become
    dot-separated column names), and only ``columns`` are kept.

    Args:
        client: S3 client used for the download.
        bucket_name: Bucket that holds the object.
        key: Key of the object to read.
        columns: Column labels to select from the flattened records.

    Returns:
        DataFrame with one row per record and exactly ``columns``.

    Raises:
        KeyError: If the payload has no ``'results'`` entry or a requested
            column is missing from the flattened records.
    """
    # Use the client that was passed in rather than a notebook-level global.
    response = client.get_object(Bucket=bucket_name, Key=key)
    content = response['Body'].read()
    data = json.loads(content)
    return pd.json_normalize(data['results'])[columns]

In [5]:
# SageMaker session; used below to look up the default bucket.
session = sagemaker.Session()

In [6]:
# boto3 S3 client that is passed to read_s3_files for every dataset below.
s3 = boto3.client('s3')

In [7]:
# Default bucket of the SageMaker session; the raw files are read from it.
bucket = session.default_bucket()

In [8]:
# Columns kept from the drugsfda files. Dotted names are nested fields as
# flattened by pd.json_normalize in read_s3_file (e.g. openfda -> spl_id).
drugsfda_columns = [
    'application_number',
    'openfda.spl_id',
    'openfda.spl_set_id',
    'openfda.product_ndc',
    'openfda.substance_name'
]

In [9]:
# Columns kept from the ndc files. Dotted names are nested fields as
# flattened by pd.json_normalize in read_s3_file (e.g. openfda -> pharm_class_cs).
ndc_columns = [
    'product_id',
    'product_ndc',
    'spl_id',
    'application_number',
    'dea_schedule',
    'dosage_form',
    'finished',
    'marketing_category',
    'marketing_start_date',
    'marketing_end_date',
    'openfda.pharm_class_cs',
    'openfda.pharm_class_epc',
    'openfda.pharm_class_pe',
    'openfda.pharm_class_moa',
    'pharm_class',
    'product_type',
    'route'
]

In [10]:
# Columns kept from the label files (all top-level fields of each record).
label_columns = [
    'id',
    'set_id',
    'version',
    'effective_time',
    'drug_interactions'
]

In [11]:
os.makedirs('data', exist_ok=True) # local output directory for the CSV exports; exist_ok makes re-running this cell safe

In [12]:
# Load the files listed under data/raw/drugsfda/ into one DataFrame,
# keeping only drugsfda_columns.
drugsfda = read_s3_files(s3, bucket, 'data/raw/drugsfda/', drugsfda_columns)

In [13]:
# Save locally as CSV; index=False leaves the row index out of the file.
drugsfda.to_csv('data/drugsfda.csv', index=False)

In [14]:
del drugsfda # drop the reference so the DataFrame's memory can be reclaimed before the next load

In [15]:
# Load the files listed under data/raw/ndc/ into one DataFrame,
# keeping only ndc_columns.
ndc = read_s3_files(s3, bucket, 'data/raw/ndc/', ndc_columns)

In [16]:
# Save locally as CSV; index=False leaves the row index out of the file.
ndc.to_csv('data/ndc.csv', index=False)

In [17]:
del ndc # drop the reference so the DataFrame's memory can be reclaimed before the next load

In [18]:
# Load the files listed under data/raw/label/ into one DataFrame,
# keeping only label_columns.
label = read_s3_files(s3, bucket, 'data/raw/label/', label_columns)

In [19]:
# Save locally as CSV; index=False leaves the row index out of the file.
label.to_csv('data/label.csv', index=False)

In [20]:
del label # drop the reference so the DataFrame's memory can be reclaimed