# Load Health Authority Data into Amazon S3

---
## 1. Setup

In [None]:
%pip install -U -r requirements.txt

In [None]:
from src import helpers
import os

# Destination bucket is taken from the environment so the notebook carries no
# account-specific configuration; the value is None when S3_BUCKET_NAME is unset.
s3_bucket = os.getenv("S3_BUCKET_NAME")

# Leading component of every object key this notebook builds.
s3_document_folder = "drugs-at-fda"

---
## 2. Download Drugs@FDA metadata file

In [None]:
import json
import os
import tempfile
import zipfile
from urllib.request import urlretrieve

url = "https://download.open.fda.gov/drug/drugsfda/drug-drugsfda-0001-of-0001.json.zip"

# Records collected from every member of the archive. Initialised up front so
# it is always a list, even if the archive is empty or a member has no
# "results" key, and later cells can iterate over it unconditionally.
results = []

# Download into a temporary directory instead of an open NamedTemporaryFile:
# writing to such a file by name while it is still open is platform-dependent
# (it fails on Windows). The directory and the archive are removed on exit.
with tempfile.TemporaryDirectory() as tmp_dir:
    archive_path = os.path.join(tmp_dir, "drugsfda.json.zip")
    urlretrieve(url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as z:
        for filename in z.namelist():
            print(filename)
            with z.open(filename) as f:
                # Accumulate rather than overwrite, so no member's records are
                # lost if the archive ever holds more than one file.
                results.extend(json.load(f).get("results") or [])

---
## 3. Download PDFs and generate Kendra metadata files

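Specify which drugs to include. Each name must exactly match (including case) the first brand name listed for the drug in the Drugs@FDA metadata.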

In [None]:
# Drugs whose documents are loaded. The loading cell compares each record's
# first brand name against this list with an exact, case-sensitive match, so
# entries must be spelled exactly as they appear in the metadata file.
drugs_to_include = [
    "DARZALEX",
    "DUPIXENT",
    "ELIQUIS",
    "EYLEA",
    "HUMIRA",
    "KEYTRUDA",
    "MOUNJARO",
    "OPDIVO",
    "OZEMPIC",
    "SKYRIZI",
    "STELARA",
    "TRULICITY",
    "ZEPBOUND",
]

Load the PDF and metadata files into S3

In [None]:
import os
import posixpath
import re
from tqdm import tqdm


result_count = 0  # every record read from the metadata file
drug_count = 0  # records whose drug name is in drugs_to_include
doc_count = 0  # documents written to S3 together with their metadata

# A "#", "&" or "?" in the parsed extension means the URL does not end in a
# plain file name; such entries are skipped. Compiled once, outside the loops.
non_file_extension = re.compile(r"[#&?]")

# Maps drug name -> URLs recorded for every document that was uploaded.
doc_urls = {}
for result in results:
    result_count += 1

    openfda = result.get("openfda")
    if not openfda or "brand_name" not in openfda:
        continue

    # Fall back through brand name, generic name and a placeholder. The
    # "or [None]" guards keep a missing or empty list from raising here.
    brand_names = openfda.get("brand_name") or [None]
    generic_names = openfda.get("generic_name") or [None]
    drug_name = brand_names[0] or generic_names[0] or "OTHER"
    if drug_name not in drugs_to_include:
        continue

    # A drug can appear in more than one record; keep the URLs collected from
    # earlier records instead of resetting the list each time.
    doc_urls.setdefault(drug_name, [])
    drug_count += 1
    for submission in result.get("submissions") or []:
        if "application_docs" not in submission:
            continue
        submission_id = (
            f'{submission.get("submission_type")}-{submission.get("submission_number")}'
        )
        for doc in tqdm(
            submission.get("application_docs"), desc=f"{drug_name} {submission_id}"
        ):
            # Best effort: a failure on one document is reported and the loop
            # moves on to the next one.
            try:
                doc_info = helpers.parse_fda_doc_info(doc)
                extension = doc_info.get("extension")

                if extension == ".cfm":
                    # NOTE(review): presumably a landing page that links to the
                    # real documents; parse_cfm returns the child documents to
                    # upload -- confirm against src/helpers.
                    for child_doc in helpers.parse_cfm(doc_info.get("url")):
                        try:
                            # Copy document to S3
                            child_doc_info = helpers.parse_fda_doc_info(child_doc)
                            # S3 keys always use "/" separators, so build them
                            # with posixpath rather than the OS-dependent os.path.
                            document_prefix = posixpath.join(
                                s3_document_folder,
                                drug_name,
                                submission_id,
                                child_doc_info.get("name"),
                            )

                            helpers.copy_url_to_s3(
                                child_doc_info.get("url"),
                                s3_bucket,
                                document_prefix,
                            )

                            # Copy metadata to S3
                            metadata = helpers.create_doc_metadata(
                                result,
                                submission,
                                child_doc_info,
                                s3_bucket,
                                document_prefix,
                                doc_info,
                            )

                            helpers.write_string_to_s3(
                                metadata,
                                s3_bucket,
                                document_prefix + ".metadata.json",
                            )
                            # NOTE(review): this records the parent .cfm URL,
                            # not the child document URL -- confirm which one
                            # urls.json is meant to contain.
                            doc_urls[drug_name].append(doc.get("url"))
                            doc_count += 1
                        except Exception as e:
                            print(f"{drug_name} {submission_id}: {e}")
                elif non_file_extension.search(extension):
                    continue
                else:
                    document_prefix = posixpath.join(
                        s3_document_folder,
                        drug_name,
                        submission_id,
                        doc_info.get("name"),
                    )
                    metadata = helpers.create_doc_metadata(
                        result, submission, doc_info, s3_bucket, document_prefix
                    )
                    helpers.copy_url_to_s3(doc.get("url"), s3_bucket, document_prefix)
                    helpers.write_string_to_s3(
                        metadata,
                        s3_bucket,
                        document_prefix + ".metadata.json",
                    )
                    doc_urls[drug_name].append(doc.get("url"))
                    doc_count += 1

            except Exception as e:
                print(f"{drug_name} {submission_id}: {e}")

print(f"{result_count} Drugs@FDA records processed")
print(f"{drug_count} drugs examined")
print(f"{doc_count} documents added")

(Optional) Capture the list of PDF URLs

In [None]:
import json

# Drop duplicate URLs per drug. Sorting makes urls.json identical from run to
# run; the iteration order of a set of strings is not stable across runs.
doc_urls = {drug: sorted(set(urls)) for drug, urls in doc_urls.items()}

# Explicit encoding so the file does not depend on the platform default.
with open("urls.json", "w", encoding="utf-8") as f:
    json.dump(doc_urls, f)