# FDA NTC ETL Pipeline
- API website: https://open.fda.gov/data/downloads/
- Objective: Collect drug information from the FDA NDC (National Drug Code Directory) API

In [1]:
# Requesting the API and getting a status code
import requests
response = requests.get("https://api.fda.gov/drug/ndc.json")
print(response.status_code)

200


In [2]:
import json
import time
from dotenv import load_dotenv
import os

# load credentials
load_dotenv()
api_key = os.getenv('api_key')

limit = 1000 # I got this number by testing on Postman
all_results = []

# get the total number of results from the request
if response.status_code == 200:
    data = response.json()
    total_results = data['meta']['results']['total']

    # print out the count
    print(f"Total results: {total_results}")

else:
    print('Failed to fetch data', response.status_code)



# loop through all the data and add to it the 'all_results' open list 
for x in range(0,total_results,limit):
    if x >= total_results:
        break  # this is to prevent the issue of skipping past the results


    site_map = requests.get(f"https://api.fda.gov/drug/ndc.json?api_key={api_key}&limit={limit}&skip={x}")

    if site_map.status_code == 200:
        data = site_map.json()
        if 'results' in data:
            all_results.extend(data['results'])

    else:
        print(f'failed to get data starting at {x} with a status code of {site_map.status_code}')
        print(site_map.text)
        break

    # short delay to avoid rate limit
    time.sleep(1) 

# print out the count
print(f"Final total results: {len(all_results)}")

Total results: 128775
failed to get data starting at 26000 with a status code of 400
{
  "error": {
    "code": "BAD_REQUEST",
    "message": "Skip value must 25000 or less."
  }
}
Final total results: 26000


In [3]:
import psycopg2
from dotenv import load_dotenv
import os

# load credentials
load_dotenv()
db_user = os.getenv('db_user')
db_password = os.getenv('db_password')

# connect to the postgres database
conn = psycopg2.connect(
    host = 'localhost',
    database = 'postgres',
    user = db_user,
    password = db_password)

cursor = conn.cursor()

In [4]:
# reconnect to the access the new database
conn = psycopg2.connect(
    host = 'localhost',
    database = 'Postgres 16 - Localhost - FDA-NDC-ETL_db',
    user = db_user,
    password = db_password)

cursor = conn.cursor()

In [5]:
# create the table
cursor.execute(
'''
CREATE TABLE IF NOT EXISTS FDA_Drugs_db (
    product_id VARCHAR PRIMARY KEY,
    product_ndc VARCHAR,
    generic_name TEXT,
    labeler_name TEXT,
    brand_name TEXT,
    -- active_ingredients TEXT,  -- This needs to be serialized
    finished BOOLEAN,
    -- packaging VARCHAR,  -- This needs to be serialized
    listing_expiration_date DATE,
    -- openfda VARCHAR, -- This so far seems blank. Need to investigate
    marketing_category VARCHAR,
    dosage_form VARCHAR,
    spl_id VARCHAR,
    product_type VARCHAR,
    route TEXT[],
    marketing_start_date DATE,
    brand_name_base VARCHAR
    -- pharm_class VARCHAR  -- This needs to be serialized
                                        );
'''
)
# Commit
conn.commit()

In [6]:
# Using a manual approach for data extraction and handling to break out each item for SQL insertion instead of relying on pd.json_normalize
for result in all_results:
    product_ndc = result.get('product_ndc')
    generic_name = result.get('generic_name')
    labeler_name = result.get('labeler_name')
    brand_name = result.get('brand_name')
    active_ingredients = result.get('active_ingredients')  # this is a dictionary
    finished = result.get('finished')
    packaging = result.get('packaging')  # this is a dictionary
    listing_expiration_date = result.get('listing_expiration_date')
    openfda = result.get('openfda') # this is a dictionary
    marketing_category = result.get('marketing_category')
    dosage_form = result.get('dosage_form')
    spl_id = result.get('spl_id')
    product_type = result.get('product_type')
    route = result.get('route')
    marketing_start_date = result.get('marketing_start_date')
    product_id = result.get('product_id')
    brand_name_base = result.get('brand_name_base')
    pharm_class = result.get('pharm_class') # this is a dictionary


    # Add data to the table
    cursor.execute(
        '''
        INSERT INTO FDA_Drugs_db (
        "product_ndc",
        "generic_name",
        "labeler_name",
        "brand_name",
        "finished",
        "listing_expiration_date",
        "marketing_category",
        "dosage_form",
        "spl_id",
        "product_type",
        "route",
        "marketing_start_date",
        "product_id",
        "brand_name_base"
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT ("product_id") DO NOTHING
        ''', 
        (
            product_ndc,
            generic_name,
            labeler_name,
            brand_name,
            finished,
            listing_expiration_date,
            marketing_category,
            dosage_form,
            spl_id,
            product_type,
            route,
            marketing_start_date,
            product_id,
            brand_name_base
        )
    )

    # Commit the transaction
    conn.commit()

In [7]:
# add data to the table
sql_query = '''
            SELECT * FROM FDA_Drugs_db
            '''
    
# Execute the query
cursor.execute(sql_query)
# Fetch all the results
results = cursor.fetchall()

In [8]:
import pandas as pd

# Add column names to dataframe
column_names = [x[0] for x in cursor.description]
df = pd.DataFrame(results, columns=column_names)

# See results
df.head()

Unnamed: 0,product_id,product_ndc,generic_name,labeler_name,brand_name,finished,listing_expiration_date,marketing_category,dosage_form,spl_id,product_type,route,marketing_start_date,brand_name_base
0,82429-126_08368bc6-272c-a9f2-e063-6394a90a3ab9,82429-126,"SULFACETAMIDE SODIUM, SULFUR",Gabar Health Sciences Corp.,Sodium Sulfacetamide 8% and Sulfur 4% Cleanser,True,2025-12-31,UNAPPROVED DRUG OTHER,LIQUID,08368bc6-272c-a9f2-e063-6394a90a3ab9,HUMAN PRESCRIPTION DRUG,[TOPICAL],2023-04-05,Sodium Sulfacetamide 8% and Sulfur 4% Cleanser
1,82520-263_09e3474c-5795-44f0-e063-6294a90a8568,82520-263,ZINC OXIDE,"Caribbean Sol International, LLC",HAWAIIAN SOL Natural Sunscreen SPF-30,True,2025-12-31,OTC MONOGRAPH DRUG,SOLUTION,09e3474c-5795-44f0-e063-6294a90a8568,HUMAN OTC DRUG,[TOPICAL],2022-01-24,HAWAIIAN SOL Natural Sunscreen SPF-30
2,82706-010_0aeeedc8-dcb9-087d-e063-6394a90ad1f5,82706-010,"Acetaminophen, Caffeine",VIVUNT PHARMA LLC,NOTTS - Tension Headache,True,2025-12-31,OTC MONOGRAPH DRUG,TABLET,0aeeedc8-dcb9-087d-e063-6394a90ad1f5,HUMAN OTC DRUG,[ORAL],2023-10-30,NOTTS - Tension Headache
3,82804-065_dbed06f9-7758-4c96-9af0-1ed6103db7df,82804-065,Glimepiride,Proficient Rx LP,Glimepiride,True,2025-12-31,ANDA,TABLET,dbed06f9-7758-4c96-9af0-1ed6103db7df,HUMAN PRESCRIPTION DRUG,[ORAL],2012-06-29,Glimepiride
4,0574-0122_076638ec-85bd-4665-afc4-0af0c73076be,0574-0122,Activated charcoal,Padagis US LLC,EZ Char,True,,OTC MONOGRAPH NOT FINAL,PELLET,076638ec-85bd-4665-afc4-0af0c73076be,HUMAN OTC DRUG,[ORAL],2002-01-14,EZ Char
