# FDA NDC ETL Pipeline
- API website: https://open.fda.gov/data/downloads/
- Objective: Collect drug information from the FDA NDC (National Drug Code Directory) API

In [41]:
# Request the API and print the HTTP status code
import requests

# requests applies no timeout by default; set one so an unresponsive server
# cannot hang the notebook indefinitely.
response = requests.get("https://api.fda.gov/drug/ndc.json", timeout=30)
print(response.status_code)

200


In [42]:
import json

# Request the API; the timeout keeps the cell from hanging on an unresponsive server
response = requests.get("https://api.fda.gov/drug/ndc.json", timeout=30)

# Decode the body only after confirming success: an error response is not
# guaranteed to be JSON, so parsing it up front could raise before the status
# check ever runs.
if response.status_code == 200:
    data = response.json()
    # Show the 'meta' section of the payload as a sample of the data
    print(json.dumps(data['meta'], indent=4, sort_keys=True))
else:
    print('Failed to fetch data', response.status_code)

{
    "disclaimer": "Do not rely on openFDA to make decisions regarding medical care. While we make every effort to ensure that data is accurate, you should assume all results are unvalidated. We may limit or otherwise restrict your access to the API in line with our Terms of Service.",
    "last_updated": "2025-02-07",
    "license": "https://open.fda.gov/license/",
    "results": {
        "limit": 1,
        "skip": 0,
        "total": 128775
    },
    "terms": "https://open.fda.gov/terms/"
}


In [43]:
import os

import psycopg2
from dotenv import load_dotenv

# Read the database credentials from environment variables
# (load_dotenv() is called first so they can be supplied through a .env file)
load_dotenv()
db_password = os.getenv('db_password')
db_user = os.getenv('db_user')

# Open a session on the 'postgres' database of the local server
conn = psycopg2.connect(
    host='localhost',
    database='postgres',
    user=db_user,
    password=db_password,
)
cursor = conn.cursor()

In [44]:
# Reconnect to access the new database, reusing the credentials loaded earlier.
# conn and cursor are rebound to the new session.
conn = psycopg2.connect(
    host='localhost',
    database='Postgres 16 - Localhost - FDA-NDC-ETL_db',
    user=db_user,
    password=db_password,
)
cursor = conn.cursor()

In [45]:
# Create the destination table (IF NOT EXISTS makes re-running the cell safe).
# Executing inside `with conn:` commits when the statement succeeds and rolls
# back when it raises, so a failure here cannot leave the connection stuck in
# an aborted transaction. The connection itself stays open afterwards.
with conn:
    cursor.execute(
        '''
        CREATE TABLE IF NOT EXISTS FDA_Drugs_db (
            product_id VARCHAR PRIMARY KEY,
            product_ndc VARCHAR,
            generic_name TEXT,
            labeler_name TEXT,
            brand_name TEXT,
            active_ingredients VARCHAR,  -- This needs to be serialized
            finished BOOLEAN,
            packaging VARCHAR,  -- This needs to be serialized
            listing_expiration_date DATE,
            openfda VARCHAR,
            marketing_category VARCHAR,
            dosage_form VARCHAR,
            spl_id VARCHAR,
            product_type VARCHAR,
            route TEXT[],
            marketing_start_date DATE,
            brand_name_base VARCHAR,
            pharm_class VARCHAR  -- This needs to be serialized
        );
        '''
    )

In [46]:
# Print the field names of each item in 'results' as a bracketed list for easy viewing and editing
for result in data['results']:
    print(f"[{', '.join(result)}]")

# Manual extraction: break each item out field by field for SQL insertion
# instead of relying on pd.json_normalize.
# product_id is the table's primary key, so it is required and indexed directly.
# Every other field is read with .get() so that a record which omits it yields
# None instead of aborting the whole cell with KeyError.
# active_ingredients, packaging, openfda and pharm_class are nested values that
# would need serializing first, so they are not extracted here.
# NOTE: the names are rebound on every iteration; after the loop they hold the
# values of the last record only.
for result in data['results']:
    product_id = result['product_id']
    product_ndc = result.get('product_ndc')
    generic_name = result.get('generic_name')
    labeler_name = result.get('labeler_name')
    brand_name = result.get('brand_name')
    finished = result.get('finished')
    listing_expiration_date = result.get('listing_expiration_date')
    marketing_category = result.get('marketing_category')
    dosage_form = result.get('dosage_form')
    spl_id = result.get('spl_id')
    product_type = result.get('product_type')
    route = result.get('route')
    marketing_start_date = result.get('marketing_start_date')
    brand_name_base = result.get('brand_name_base')

[product_ndc, generic_name, labeler_name, brand_name, active_ingredients, finished, packaging, listing_expiration_date, openfda, marketing_category, dosage_form, spl_id, product_type, route, marketing_start_date, product_id, brand_name_base, pharm_class]


In [47]:
# Add data to the table.
# The rows are built directly from data['results'] so that every fetched record
# is inserted, not only the one whose values happened to be left in the
# per-field variables after the extraction loop.

# Field names in the same order as the column list / placeholders below
insert_fields = (
    'product_ndc',
    'generic_name',
    'labeler_name',
    'brand_name',
    'finished',
    'listing_expiration_date',
    'marketing_category',
    'dosage_form',
    'spl_id',
    'product_type',
    'route',
    'marketing_start_date',
    'product_id',
    'brand_name_base',
)

insert_sql = '''
    INSERT INTO FDA_Drugs_db (
        "product_ndc",
        "generic_name",
        "labeler_name",
        "brand_name",
        "finished",
        "listing_expiration_date",
        "marketing_category",
        "dosage_form",
        "spl_id",
        "product_type",
        "route",
        "marketing_start_date",
        "product_id",
        "brand_name_base"
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT ("product_id") DO NOTHING
'''

# .get() turns a field missing from a record into None (inserted as NULL)
# instead of raising KeyError part-way through the load.
rows = [
    tuple(record.get(field) for field in insert_fields)
    for record in data['results']
]

# `with conn:` commits the transaction when every insert succeeds and rolls it
# back if one fails, leaving the connection usable either way.
with conn:
    cursor.executemany(insert_sql, rows)

In [48]:
# Read back every row currently stored in the table
sql_query = '''
            SELECT * FROM FDA_Drugs_db
            '''

# Run the SELECT, then pull the complete result set into memory
cursor.execute(sql_query)
results = cursor.fetchall()

In [49]:
import pandas as pd

# The first entry of each cursor.description item is the column name;
# use those names to label the fetched rows
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(results, columns=column_names)

# Display the DataFrame
df

Unnamed: 0,product_id,product_ndc,generic_name,labeler_name,brand_name,active_ingredients,finished,packaging,listing_expiration_date,openfda,marketing_category,dosage_form,spl_id,product_type,route,marketing_start_date,brand_name_base,pharm_class
0,82429-126_08368bc6-272c-a9f2-e063-6394a90a3ab9,82429-126,"SULFACETAMIDE SODIUM, SULFUR",Gabar Health Sciences Corp.,Sodium Sulfacetamide 8% and Sulfur 4% Cleanser,,True,,2025-12-31,,UNAPPROVED DRUG OTHER,LIQUID,08368bc6-272c-a9f2-e063-6394a90a3ab9,HUMAN PRESCRIPTION DRUG,[TOPICAL],2023-04-05,Sodium Sulfacetamide 8% and Sulfur 4% Cleanser,
