In [1]:
import sagemaker
import boto3
import json
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
def read_s3_files(client, bucket_name, prefix, columns):
    """Read every object under an S3 prefix into one combined DataFrame.

    Keys are listed with ``get_keys`` and each object is loaded with
    ``read_s3_file``; the per-object frames are then concatenated with a
    fresh 0..n-1 index.

    Args:
        client: S3 client forwarded to ``get_keys`` and ``read_s3_file``.
        bucket_name: Name of the bucket to read from.
        prefix: Key prefix selecting the objects to load.
        columns: Column labels to keep from every object.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of all objects.
    """
    frames = [
        read_s3_file(client, bucket_name, object_key, columns)
        for object_key in get_keys(client, bucket_name, prefix)
    ]
    return pd.concat(frames, ignore_index=True)

In [3]:
def get_keys(client, bucket_name, prefix):
    """List the keys of all objects stored under a prefix.

    ``list_objects_v2`` returns at most one page of results per call, so the
    listing is repeated with the continuation token until the response is no
    longer marked as truncated.

    Args:
        client: Object exposing a boto3-style ``list_objects_v2`` method.
        bucket_name: Name of the bucket to list.
        prefix: Key prefix used to filter the listing.

    Returns:
        A list of key strings in listing order; empty when nothing matches.
    """
    keys = []
    request = {'Bucket': bucket_name, 'Prefix': prefix}
    while True:
        response = client.list_objects_v2(**request)
        # 'Contents' is absent from the response when no object matches.
        keys.extend(o['Key'] for o in response.get('Contents', []))
        if not response.get('IsTruncated'):
            return keys
        # Ask for the next page on the following iteration.
        request['ContinuationToken'] = response['NextContinuationToken']

In [4]:
def read_s3_file(client, bucket_name, key, columns):
    """Load one JSON object from S3 and return selected columns as a DataFrame.

    The object body is parsed as JSON, the list stored under its ``results``
    key is flattened with ``pandas.json_normalize`` (nested keys become
    dot-separated column names), and only ``columns`` are kept.

    Args:
        client: Object exposing a boto3-style ``get_object`` method.
        bucket_name: Name of the bucket holding the object.
        key: Key of the object to read.
        columns: Column labels to select from the flattened records.

    Returns:
        A ``pandas.DataFrame`` with exactly ``columns``, in that order.

    Raises:
        KeyError: If the document has no ``results`` key or a requested
            column is not present in the flattened records.
    """
    # Use the client that was passed in rather than a notebook-level global,
    # so the function works regardless of cell execution order.
    response = client.get_object(Bucket=bucket_name, Key=key)
    content = response['Body'].read()
    data = json.loads(content)
    return pd.json_normalize(data['results'])[columns]

In [5]:
# SageMaker session; used below to look up the default S3 bucket.
session = sagemaker.Session()

In [6]:
# Low-level boto3 S3 client handed to the read_s3_files calls below.
s3 = boto3.client('s3')

In [7]:
# Default bucket of the SageMaker session; all raw data below is read from it.
bucket = session.default_bucket()

In [8]:
# Columns kept from the records under data/raw/label/. Dotted names are
# nested JSON keys as flattened by pd.json_normalize.
columns = [
    'id', 'set_id', 'effective_time',
    'openfda.substance_name',
    'controlled_substance',
]

In [9]:
# Load every object under data/raw/label/ into one DataFrame, keeping only `columns`.
label = read_s3_files(s3, bucket, 'data/raw/label/', columns)

In [10]:
# Preview the first rows of the label frame.
label.head()

Unnamed: 0,id,set_id,effective_time,openfda.substance_name,controlled_substance
0,4dcb8674-48f7-4229-993d-bca9795b1676,d1697161-ebe8-42b0-83e7-c9de4412546c,20210824,[GALANTAMINE HYDROBROMIDE],
1,7519ffff-75f4-4ede-9ee9-b75870506b80,a582d57f-d191-4ca5-b105-585b2d8a0e3e,20190204,"[AMPHETAMINE ASPARTATE MONOHYDRATE, AMPHETAMIN...",
2,7aa7efea-fa82-5a1f-e053-2a91aa0a5d7d,6401da8a-8cd5-72f6-e053-2991aa0ad5e0,20181114,"[OCTINOXATE, TITANIUM DIOXIDE, ZINC OXIDE]",
3,ad529830-ad3f-4973-95cf-1e3ed5d15d26,ad529830-ad3f-4973-95cf-1e3ed5d15d26,20220308,[PHENOBARBITAL],
4,c52f0d41-044f-4e59-8c06-7c799c48baf7,6eb6ddac-fa26-43b3-bf3e-5b7b58138fdc,20200731,"[FLUTICASONE PROPIONATE, SALMETEROL XINAFOATE]",


In [11]:
# Columns kept from the records under data/raw/ndc/. Dotted names are
# nested JSON keys as flattened by pd.json_normalize.
columns = [
    'product_id', 'product_ndc', 'spl_id', 'application_number',
    'dea_schedule', 'dosage_form', 'finished',
    'marketing_category', 'marketing_start_date', 'marketing_end_date',
    'openfda.pharm_class_cs', 'openfda.pharm_class_epc', 'openfda.pharm_class_pe',
    'pharm_class', 'product_type', 'route',
]

In [12]:
# Load every object under data/raw/ndc/ into one DataFrame, keeping only `columns`.
ndc = read_s3_files(s3, bucket, 'data/raw/ndc/', columns)

In [13]:
# Preview the first rows of the NDC frame.
ndc.head()

Unnamed: 0,product_id,product_ndc,spl_id,application_number,dea_schedule,dosage_form,finished,marketing_category,marketing_start_date,marketing_end_date,openfda.pharm_class_cs,openfda.pharm_class_epc,openfda.pharm_class_pe,pharm_class,product_type,route
0,0002-0800_662164fd-5ea0-4a08-bfd1-6b08bdd73342,0002-0800,662164fd-5ea0-4a08-bfd1-6b08bdd73342,BLA018781,,"INJECTION, SOLUTION",True,BLA,19870710,,[Insulin [CS]],[Insulin [EPC]],,,HUMAN OTC DRUG,[SUBCUTANEOUS]
1,72449-101_a0fc7c8c-133f-275d-e053-2995a90a6af6,72449-101,a0fc7c8c-133f-275d-e053-2995a90a6af6,part333E,,GEL,True,OTC MONOGRAPH NOT FINAL,20200316,,,,,,HUMAN OTC DRUG,[TOPICAL]
2,72476-025_cb05b9c6-c1bc-b210-e053-2a95a90afaea,72476-025,cb05b9c6-c1bc-b210-e053-2a95a90afaea,part358H,,SHAMPOO,True,OTC MONOGRAPH FINAL,20210826,,,,,,HUMAN OTC DRUG,[TOPICAL]
3,72476-102_af106cce-5feb-4fd9-e053-2a95a90aa378,72476-102,af106cce-5feb-4fd9-e053-2a95a90aa378,part340,,TABLET,True,OTC MONOGRAPH FINAL,20200801,,[Xanthines [CS]],"[Central Nervous System Stimulant [EPC], Methy...",[Central Nervous System Stimulation [PE]],"[Central Nervous System Stimulant [EPC], Centr...",HUMAN OTC DRUG,[ORAL]
4,72476-200_5dd1af8d-f774-4d57-b55f-b4059acf5654,72476-200,5dd1af8d-f774-4d57-b55f-b4059acf5654,part333E,,GEL,True,OTC MONOGRAPH NOT FINAL,20201006,,,,,,HUMAN OTC DRUG,[TOPICAL]


In [16]:
# Columns kept from the records under data/raw/drugsfda/. Dotted names are
# nested JSON keys as flattened by pd.json_normalize.
columns = [
    'openfda.spl_id', 'openfda.spl_set_id', 'openfda.substance_name',
    'application_number',
]

In [17]:
# Load every object under data/raw/drugsfda/ into one DataFrame, keeping only `columns`.
drugsfda = read_s3_files(s3, bucket, 'data/raw/drugsfda/', columns)

In [18]:
# Preview the first rows of the Drugs@FDA frame.
drugsfda.head()

Unnamed: 0,openfda.spl_id,openfda.spl_set_id,openfda.substance_name,application_number
0,,,,ANDA070137
1,,,,NDA013553
2,,,,NDA013601
3,,,,NDA013615
4,,,,ANDA070193
