In [1]:
# Notebook 1: Data Collection - GOVERNMENT SOURCES ONLY
# FDA (fda.gov) + MedlinePlus/NIH (nih.gov)

# %%
# Cell 1 - Setup
import sys
import os
sys.path.append('..')

import json
from pathlib import Path
import logging
from datetime import datetime

from src.collectors import FDACollector, MedlinePlusCollector

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

ROOT_DIR = Path('..').resolve()
RAW_DATA_DIR = ROOT_DIR / 'data' / 'raw'

print("="*80)
print("DATA COLLECTION - U.S. GOVERNMENT SOURCES ONLY")
print("="*80)
print(f"\nSources:")
print(f"  1. FDA (fda.gov) - Drug labels")
print(f"  2. MedlinePlus (NIH/nih.gov) - Health topics")
print(f"\nRoot directory: {ROOT_DIR}")
print(f"Raw data directory: {RAW_DATA_DIR}")

# %%
# Cell 2 - Collect FDA Drug Labels
print("\n" + "="*80)
print("1. FDA - U.S. FOOD & DRUG ADMINISTRATION")
print("   Source: https://open.fda.gov")
print("="*80)

fda_collector = FDACollector(RAW_DATA_DIR)

fda_conditions = [
    'diabetes',
    'hypertension',
    'heart disease',
    'high blood pressure',
    'asthma'
]

fda_files = []
for condition in fda_conditions:
    print(f"\n  Fetching drug labels for: {condition}")
    file = fda_collector.fetch_drug_labels(condition, limit=100)
    if file:
        fda_files.append(file)
        print(f"  ✓ Saved: {file.name}")

print(f"\n✓ Collected {len(fda_files)} FDA drug label sets")

# %%
# Cell 3 - Collect MedlinePlus Health Topics
print("\n" + "="*80)
print("2. MEDLINEPLUS - NATIONAL LIBRARY OF MEDICINE (NIH)")
print("   Source: https://medlineplus.gov")
print("="*80)

medline_collector = MedlinePlusCollector(RAW_DATA_DIR)

# Government health topics
medlineplus_topics = [
    ('https://medlineplus.gov/diabetes.html', 'Diabetes'),
    ('https://medlineplus.gov/diabetescomplications.html', 'Diabetes Complications'),
    ('https://medlineplus.gov/diabetesdiet.html', 'Diabetes Diet'),
    ('https://medlineplus.gov/diabetestype2.html', 'Type 2 Diabetes'),
    ('https://medlineplus.gov/highbloodpressure.html', 'High Blood Pressure'),
    ('https://medlineplus.gov/howtolowerbloodpressure.html', 'How to Lower Blood Pressure'),
    ('https://medlineplus.gov/heartdiseases.html', 'Heart Diseases'),
    ('https://medlineplus.gov/heartattack.html', 'Heart Attack'),
    ('https://medlineplus.gov/howtopreventheartdisease.html', 'Heart Disease Prevention'),
    ('https://medlineplus.gov/asthma.html', 'Asthma'),
    ('https://medlineplus.gov/cholesterol.html', 'Cholesterol'),
    ('https://medlineplus.gov/howtolowercholesterol.html', 'How to Lower Cholesterol'),
]

medline_files = medline_collector.fetch_multiple_topics(medlineplus_topics)

print(f"\n✓ Collected {len(medline_files)} MedlinePlus topics")

# %%
# Cell 4 - Generate Summary
print("\n" + "="*80)
print("DATA COLLECTION SUMMARY - GOVERNMENT SOURCES ONLY")
print("="*80)

summary = {
    'timestamp': datetime.now().isoformat(),
    'data_policy': 'U.S. Government sources only',
    'sources': {
        'FDA': {
            'full_name': 'U.S. Food and Drug Administration',
            'website': 'https://www.fda.gov',
            'api': 'https://open.fda.gov',
            'files': len(fda_files),
            'file_list': [f.name for f in fda_files]
        },
        'MedlinePlus': {
            'full_name': 'National Library of Medicine (NIH)',
            'website': 'https://medlineplus.gov',
            'operated_by': 'National Institutes of Health',
            'files': len(medline_files),
            'file_list': [f.name for f in medline_files]
        }
    },
    'total_files': len(fda_files) + len(medline_files)
}

print(f"\nCollection Date: {summary['timestamp']}")
print(f"\n✓ Total files collected: {summary['total_files']}")
print(f"\nBreakdown:")
print(f"  FDA Drug Labels:          {len(fda_files)} files")
print(f"  MedlinePlus Health Topics: {len(medline_files)} files")

print(f"\n✓ ALL SOURCES ARE U.S. GOVERNMENT (.gov domains)")

# Save summary
summary_path = RAW_DATA_DIR / 'collection_summary.json'
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\n✓ Summary saved to: {summary_path}")

print("\n" + "="*80)
print("✓ DATA COLLECTION COMPLETE!")
print("="*80)
print("\nWhat you have now:")
print("  ✅ Medication information (FDA)")
print("  ✅ Symptoms, causes, treatment (MedlinePlus/NIH)")
print("  ✅ Diet and lifestyle advice (MedlinePlus/NIH)")
print("  ✅ ALL from trusted U.S. Government sources")
print("\nNext: Run notebook 2 to process this data")
print("="*80)

# %%

DATA COLLECTION - U.S. GOVERNMENT SOURCES ONLY

Sources:
  1. FDA (fda.gov) - Drug labels
  2. MedlinePlus (NIH/nih.gov) - Health topics

Root directory: C:\Users\Boris\Desktop\code\multilingual-rag
Raw data directory: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw

1. FDA - U.S. FOOD & DRUG ADMINISTRATION
   Source: https://open.fda.gov

  Fetching drug labels for: diabetes


2025-10-16 03:50:32,850 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\diabetes_drugs.json


  ✓ Saved: diabetes_drugs.json

  Fetching drug labels for: hypertension


2025-10-16 03:50:35,037 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\hypertension_drugs.json


  ✓ Saved: hypertension_drugs.json

  Fetching drug labels for: heart disease


2025-10-16 03:50:37,100 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\heart_disease_drugs.json


  ✓ Saved: heart_disease_drugs.json

  Fetching drug labels for: high blood pressure


2025-10-16 03:50:38,997 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\high_blood_pressure_drugs.json


  ✓ Saved: high_blood_pressure_drugs.json

  Fetching drug labels for: asthma


2025-10-16 03:50:40,673 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\asthma_drugs.json
2025-10-16 03:50:40,677 - INFO - Collecting from MedlinePlus (NIH - U.S. Government)
2025-10-16 03:50:40,679 - INFO -   Fetching: Diabetes


  ✓ Saved: asthma_drugs.json

✓ Collected 5 FDA drug label sets

2. MEDLINEPLUS - NATIONAL LIBRARY OF MEDICINE (NIH)
   Source: https://medlineplus.gov


2025-10-16 03:50:41,213 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\medlineplus\diabetes.json
2025-10-16 03:50:41,214 - INFO - ✓ Scraped government source: Diabetes
2025-10-16 03:50:43,217 - INFO -   Fetching: Diabetes Complications
2025-10-16 03:50:43,620 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\medlineplus\diabetes_complications.json
2025-10-16 03:50:43,621 - INFO - ✓ Scraped government source: Diabetes Complications
2025-10-16 03:50:45,623 - INFO -   Fetching: Diabetes Diet
2025-10-16 03:50:46,619 - ERROR - Error scraping MedlinePlus Diabetes Diet: 404 Client Error: Not Found for url: https://medlineplus.gov/diabetesdiet.html
2025-10-16 03:50:46,621 - INFO -   Fetching: Type 2 Diabetes
2025-10-16 03:50:47,105 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\medlineplus\type_2_diabetes.json
2025-10-16 03:50:47,106 - INFO - ✓ Scraped government source: Type 2 Diabetes
2025-10-16 03:50:49,1


✓ Collected 10 MedlinePlus topics

DATA COLLECTION SUMMARY - GOVERNMENT SOURCES ONLY

Collection Date: 2025-10-16T03:51:07.656153

✓ Total files collected: 15

Breakdown:
  FDA Drug Labels:          5 files
  MedlinePlus Health Topics: 10 files

✓ ALL SOURCES ARE U.S. GOVERNMENT (.gov domains)

✓ Summary saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\collection_summary.json

✓ DATA COLLECTION COMPLETE!

What you have now:
  ✅ Medication information (FDA)
  ✅ Symptoms, causes, treatment (MedlinePlus/NIH)
  ✅ Diet and lifestyle advice (MedlinePlus/NIH)
  ✅ ALL from trusted U.S. Government sources

Next: Run notebook 2 to process this data
