In [2]:
import requests
import json
import time
from tqdm import tqdm


In [7]:
def download_all_fda_data(output_filename="enforcement_drug.json", max_results=5000):
    """
    Download drug enforcement records from the openFDA API and save them as JSON.

    Pages through ``https://api.fda.gov/drug/enforcement.json`` using the
    ``limit`` / ``skip`` query parameters, accumulates each page's ``results``
    list in memory, and writes the accumulated list to ``output_filename``.
    If a request fails part-way through, the records collected so far are
    still written and the final message says the download stopped early.

    Args:
        output_filename (str): The name of the file to save the JSON data.
        max_results (int): The maximum number of records to download.
                           Set to a lower number for testing (e.g., 500).

    Returns:
        None. The records are written to ``output_filename``.
    """
    API_URL = "https://api.fda.gov/drug/enforcement.json"
    # Page size per request.
    # NOTE(review): the per-request ceiling is defined by openFDA, not by this
    # code — confirm against the API docs before raising this value.
    LIMIT = 100
    # Seconds. Without a timeout, requests.get can block forever on a stalled
    # connection, which hangs the notebook cell.
    REQUEST_TIMEOUT = 30
    skip = 0

    all_drug_data = []
    stopped_on_error = False

    # Get the total number of records to set up the progress bar.
    # This is best-effort: on any failure we fall back to max_results.
    try:
        total_response = requests.get(
            API_URL, params={"limit": 1}, timeout=REQUEST_TIMEOUT
        )
        # Surface HTTP failures directly instead of as a KeyError on 'meta'.
        total_response.raise_for_status()
        total = total_response.json()['meta']['results']['total']
        print(f"Total records available in the API: {total}")
        # Use the smaller of the API total or the user-defined max_results
        records_to_fetch = min(total, max_results)
    except Exception as e:
        print(f"Could not determine total records. Error: {e}")
        records_to_fetch = max_results

    # A zero or negative cap means "fetch nothing"; keep the progress bar total sane.
    records_to_fetch = max(records_to_fetch, 0)

    # NOTE(review): openFDA is understood to cap how large `skip` may be; a
    # request past that cap would surface below as an HTTP error and stop the
    # loop with partial data — confirm against the API docs for large runs.
    with tqdm(total=records_to_fetch, desc="Downloading FDA Data") as pbar:
        while skip < records_to_fetch:
            # Never ask for more than what is still needed, so the run stops
            # exactly at max_results instead of rounding up to a full page.
            page_size = min(LIMIT, records_to_fetch - skip)
            params = {
                "limit": page_size,
                "skip": skip
            }

            try:
                response = requests.get(
                    API_URL, params=params, timeout=REQUEST_TIMEOUT
                )
                response.raise_for_status()
                data = response.json()

                batch_data = data.get("results") or []
                if not batch_data:
                    # No more results to fetch
                    print("\nNo more results found. Ending download.")
                    break

                all_drug_data.extend(batch_data)
                pbar.update(len(batch_data))
                # Advance by what was actually received, not by the page size.
                skip += len(batch_data)

                if len(batch_data) < page_size:
                    # A short page means the API has run out of records; stop
                    # here rather than sending one more request that would fail.
                    print("\nNo more results found. Ending download.")
                    break

                # Be polite to the API (no need to wait after the final page)
                if skip < records_to_fetch:
                    time.sleep(0.1)

            except requests.exceptions.HTTPError as http_err:
                print(f"\nHTTP error occurred: {http_err}. Stopping.")
                stopped_on_error = True
                break
            except Exception as err:
                print(f"\nAn error occurred: {err}. Stopping.")
                stopped_on_error = True
                break

    # Save the collected data to a file (also done after an error, so partial
    # progress is not lost).
    print(f"\nSaving {len(all_drug_data)} records to {output_filename}...")
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(all_drug_data, f, indent=4)

    if stopped_on_error:
        # Do not claim success when the loop was aborted by an exception.
        print(f"⚠️ Download stopped early. Partial data saved to {output_filename}.")
    else:
        print(f"✅ Download complete. Data saved to {output_filename}.")


In [8]:
if __name__ == "__main__":
    # Explicit cap on how many records this run pulls. The function's own
    # default for max_results is also 5000, so fetching more means passing a
    # larger value here rather than dropping the argument; expect the saved
    # JSON file to grow accordingly.
    download_all_fda_data(
        max_results=5000,
    )

Total records available in the API: 17134


Downloading FDA Data: 100%|█████████████████| 5000/5000 [01:04<00:00, 77.16it/s]



Saving 5000 records to enforcement_drug.json...
✅ Download complete. Data saved to enforcement_drug.json.
