In [1]:
# Cell 1 - Setup imports
import sys
import os
sys.path.append('..')  # Add parent directory to path

import json
import pandas as pd
from pathlib import Path
import logging
from datetime import datetime

# Import our custom collectors
from src.collectors import CDCCollector, FDACollector, PubMedCollector

# Route INFO-and-above log records to the notebook output with a timestamp prefix
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Project root is the parent of the current working directory;
# collected files go under <root>/data/raw
ROOT_DIR = Path('..').resolve()
RAW_DATA_DIR = ROOT_DIR.joinpath('data', 'raw')
print(f"Root directory: {ROOT_DIR}")
print(f"Raw data directory: {RAW_DATA_DIR}")

Root directory: C:\Users\Boris\Desktop\code\multilingual-rag
Raw data directory: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw


In [2]:
# Cell 2 - Collect CDC Data
# Initialize CDC collector
cdc_collector = CDCCollector(RAW_DATA_DIR)

# `limit` value passed to every CDC fetch call below
CDC_FETCH_LIMIT = 500

# (label used in the status messages, bound fetch method), in fetch order.
# NOTE(review): the saved output of the sample-display cell shows
# 'topic: Depression' rows in diabetes_stats.json and 38488 rows in
# vaccinations.json despite the limit - confirm the collector applies the
# topic filter and the limit as intended.
cdc_fetchers = [
    ("Diabetes", cdc_collector.fetch_diabetes_data),
    ("Vaccination", cdc_collector.fetch_vaccination_data),
    ("Heart disease", cdc_collector.fetch_heart_disease_data),
]

# Fetch different CDC datasets
print("Fetching CDC datasets...")
cdc_files = []
cdc_results = {}

for label, fetch in cdc_fetchers:
    saved_file = fetch(limit=CDC_FETCH_LIMIT)
    cdc_results[label] = saved_file
    if saved_file:
        cdc_files.append(saved_file)
        print(f"✓ {label} data saved to: {saved_file}")
    else:
        # A falsy result used to be skipped silently; make the gap visible
        print(f"✗ {label} data could not be fetched")

# Keep the per-dataset names available for any later cell that uses them
diabetes_file = cdc_results["Diabetes"]
vax_file = cdc_results["Vaccination"]
heart_file = cdc_results["Heart disease"]

Fetching CDC datasets...


2025-10-13 01:34:17,372 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\cdc\diabetes_stats.json


✓ Diabetes data saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\cdc\diabetes_stats.json


2025-10-13 01:34:26,495 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\cdc\vaccinations.json


✓ Vaccination data saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\cdc\vaccinations.json


2025-10-13 01:34:27,402 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\cdc\heart_disease.json


✓ Heart disease data saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\cdc\heart_disease.json


In [3]:
# Cell 3 - Collect FDA Data
# Create the FDA collector, handing it the raw-data directory
fda_collector = FDACollector(RAW_DATA_DIR)

# Request drug labels once per condition, remembering every file that was saved
print("\nFetching FDA drug data...")
conditions = ['diabetes', 'hypertension', 'heart disease']
fda_files = []

for condition in conditions:
    drug_file = fda_collector.fetch_drug_labels(condition, limit=50)
    if not drug_file:
        # Falsy result: nothing to record for this condition
        continue
    fda_files.append(drug_file)
    print(f"✓ {condition} drugs saved to: {drug_file}")


Fetching FDA drug data...


2025-10-13 01:34:29,149 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\diabetes_drugs.json


✓ diabetes drugs saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\diabetes_drugs.json


2025-10-13 01:34:30,895 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\hypertension_drugs.json


✓ hypertension drugs saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\hypertension_drugs.json


2025-10-13 01:34:32,645 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\heart_disease_drugs.json


✓ heart disease drugs saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\fda\heart_disease_drugs.json


In [4]:
# Cell 4 - Collect PubMed Data
# Create the PubMed collector, handing it the raw-data directory
pubmed_collector = PubMedCollector(RAW_DATA_DIR)

# Run one literature search per query, remembering every file that was saved
print("\nFetching PubMed articles...")
search_queries = [
    'diabetes treatment guidelines 2024',
    'cardiovascular disease prevention',
    'hypertension management'
]
pubmed_files = []

for query in search_queries:
    search_file = pubmed_collector.search_articles(query, max_results=30)
    if not search_file:
        # Falsy result: nothing to record for this query
        continue
    pubmed_files.append(search_file)
    print(f"✓ '{query}' saved to: {search_file}")


Fetching PubMed articles...


2025-10-13 01:34:33,311 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\pubmed\diabetes_treatment_guidelines_2024_search.json


✓ 'diabetes treatment guidelines 2024' saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\pubmed\diabetes_treatment_guidelines_2024_search.json


2025-10-13 01:34:33,994 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\pubmed\cardiovascular_disease_prevention_search.json


✓ 'cardiovascular disease prevention' saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\pubmed\cardiovascular_disease_prevention_search.json


2025-10-13 01:34:34,695 - INFO - Saved data to C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\pubmed\hypertension_management_search.json


✓ 'hypertension management' saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\pubmed\hypertension_management_search.json


In [5]:
# Cell 5 - Display Sample Data
def display_sample(file_path: Path, num_samples: int = 2):
    """Print a short preview of a collected JSON file.

    Prints the file name, its path (relative to ``ROOT_DIR`` when the file
    lives under it, otherwise exactly as given), its size in KB, and then up
    to ``num_samples`` records taken from the parsed JSON:

    * dict with a ``'results'`` key - records come from ``data['results']``
    * dict with a ``'data'`` key - records come from ``data['data']``
    * dict with an ``'esearchresult'`` key - only the ``count`` and the first
      five entries of ``idlist`` are printed, then the function returns
    * any other dict or scalar - the value itself is the single record
    * list - records are the first ``num_samples`` items

    For a dict record the first five key/value pairs are shown, each value
    clipped to 100 characters; any other record is shown as text clipped to
    200 characters. ``...`` is appended only when text was actually clipped.

    Args:
        file_path: JSON file to preview.
        num_samples: Maximum number of records to print.

    Returns:
        None; everything is written to stdout.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    def _clip(value, limit):
        # Render once, and mark truncation only when it really happened
        text = str(value)
        return text[:limit] + '...' if len(text) > limit else text

    # relative_to() raises ValueError for paths outside ROOT_DIR; a preview
    # helper should still work for such files, so show the path as given.
    try:
        shown_path = file_path.relative_to(ROOT_DIR)
    except ValueError:
        shown_path = file_path

    print(f"\n{'='*60}")
    print(f"File: {file_path.name}")
    print(f"Path: {shown_path}")
    print(f"Size: {file_path.stat().st_size / 1024:.2f} KB")
    print(f"{'-'*60}")

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Handle different JSON structures
    if isinstance(data, dict):
        print(f"Data type: Dictionary with keys: {list(data.keys())[:5]}")

        if 'results' in data:  # FDA format
            print(f"Number of results: {len(data['results'])}")
            samples = data['results'][:num_samples]
        elif 'data' in data:  # CDC download format
            print(f"Number of data rows: {len(data['data'])}")
            samples = data['data'][:num_samples]
        elif 'esearchresult' in data:  # PubMed format
            result = data['esearchresult']
            print(f"Number of IDs found: {result.get('count', 0)}")
            print(f"Sample IDs: {result.get('idlist', [])[:5]}")
            # Search results carry only IDs, so there are no records to list
            return
        else:
            samples = [data]
    elif isinstance(data, list):
        print(f"Data type: List with {len(data)} items")
        samples = data[:num_samples]
    else:
        samples = [data]

    # Display samples
    for i, sample in enumerate(samples, 1):
        print(f"\nSample {i}:")
        if isinstance(sample, dict):
            # Show first few key-value pairs
            for key, value in list(sample.items())[:5]:
                print(f"  {key}: {_clip(value, 100)}")
        else:
            print(f"  {_clip(sample, 200)}")

# Preview every file gathered by the CDC, FDA and PubMed cells
print("\n" + "="*70)
print("DATA SAMPLES")
print("="*70)

all_files = [*cdc_files, *fda_files, *pubmed_files]
for file_path in all_files:
    # Skip falsy entries and files that are not present on disk
    if not file_path or not file_path.exists():
        continue
    display_sample(file_path)


DATA SAMPLES

File: diabetes_stats.json
Path: data\raw\cdc\diabetes_stats.json
Size: 521.58 KB
------------------------------------------------------------
Data type: List with 500 items

Sample 1:
  year: 2023
  locationabbr: AK
  locationdesc: Alaska
  class: Chronic Health Indicators
  topic: Depression

Sample 2:
  year: 2023
  locationabbr: AK
  locationdesc: Alaska
  class: Chronic Health Indicators
  topic: Depression

File: vaccinations.json
Path: data\raw\cdc\vaccinations.json
Size: 67565.90 KB
------------------------------------------------------------
Data type: Dictionary with keys: ['meta', 'data']
Number of data rows: 38488

Sample 1:
  ['row-xzd8.wppa.wh7z', '00000000-0000-0000-D566-763F726B1BD5', 0, 1683869426, None, 1683869426, None, '{ }', '2021-10-30T00:00:00', '43', 'TX', '44779195', '2217800', '17258060', '25303335', None, '0'...

Sample 2:
  ['row-ag7q~rupr-e9ja', '00000000-0000-0000-C5F6-DDFF2457D6EA', 0, 1683869426, None, 1683869426, None, '{ }', '2021-10-30T0

In [6]:
# Cell 6 - Generate Summary
def generate_summary():
    """Summarise the JSON files under ``RAW_DATA_DIR``, print and save the result.

    Every immediate sub-directory of ``RAW_DATA_DIR`` is treated as one source.
    For each source the ``*.json`` files directly inside it are counted and
    their sizes added up. Entries of ``RAW_DATA_DIR`` that are not directories
    (for example a previously written ``collection_summary.json``) are ignored.

    Sources and file names are processed in sorted order so that the printed
    report and the saved JSON are identical from run to run and across
    platforms. The grand total is computed from the exact byte count and
    rounded once.

    Returns:
        dict with the keys ``'timestamp'`` (ISO-8601 string of the current
        local time), ``'sources'`` (source name -> ``{'files', 'size_mb',
        'file_list'}``), ``'total_files'`` (int) and ``'total_size_mb'``
        (float, rounded to two decimals).

    Raises:
        OSError: If ``RAW_DATA_DIR`` cannot be listed or the summary file
            cannot be written.

    Side effects:
        Prints the report to stdout and writes it to
        ``RAW_DATA_DIR / 'collection_summary.json'``.
    """
    bytes_per_mb = 1024 * 1024

    summary = {
        'timestamp': datetime.now().isoformat(),
        'sources': {},
        'total_files': 0,
        'total_size_mb': 0
    }
    total_bytes = 0

    # iterdir() and glob() yield entries in arbitrary order; sort both so the
    # summary is deterministic.
    for source_dir in sorted(RAW_DATA_DIR.iterdir()):
        if not source_dir.is_dir():
            continue

        files = sorted(source_dir.glob('*.json'))
        source_bytes = sum(f.stat().st_size for f in files)

        summary['sources'][source_dir.name] = {
            'files': len(files),
            'size_mb': round(source_bytes / bytes_per_mb, 2),
            'file_list': [f.name for f in files]
        }

        summary['total_files'] += len(files)
        total_bytes += source_bytes

    # Convert the exact byte total once instead of summing per-source floats
    summary['total_size_mb'] = round(total_bytes / bytes_per_mb, 2)

    # Display summary
    print("\n" + "="*70)
    print("DATA COLLECTION SUMMARY")
    print("="*70)
    print(f"Timestamp: {summary['timestamp']}")
    print(f"Total files collected: {summary['total_files']}")
    print(f"Total size: {summary['total_size_mb']} MB")
    print("\nBreakdown by source:")

    for source, info in summary['sources'].items():
        print(f"\n{source.upper()}:")
        print(f"  Files: {info['files']}")
        print(f"  Size: {info['size_mb']} MB")
        print(f"  Files: {', '.join(info['file_list'])}")

    # Save summary
    summary_path = RAW_DATA_DIR / 'collection_summary.json'
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)
    print(f"\n✓ Summary saved to: {summary_path}")

    return summary

# Build the collection summary (generate_summary prints it and writes it to
# RAW_DATA_DIR / 'collection_summary.json') and keep the returned dict in `summary`
summary = generate_summary()


DATA COLLECTION SUMMARY
Timestamp: 2025-10-13T01:34:35.425928
Total files collected: 17
Total size: 103.82 MB

Breakdown by source:

CDC:
  Files: 7
  Size: 70.06 MB
  Files: diabetes.json, diabetes_stats.json, hd_stroke_rates_trends.json, heart_disease.json, heart_disease_mortality.json, hypertension_cvd_mortality.json, vaccinations.json

FDA:
  Files: 6
  Size: 33.75 MB
  Files: asthma_drugs.json, diabetes_drugs.json, diabetes_drug_labels.json, heart disease_drugs.json, heart_disease_drugs.json, hypertension_drugs.json

PUBMED:
  Files: 4
  Size: 0.01 MB
  Files: cardiovascular_disease_prevention_search.json, diabetes_treatment_guidelines_2024_search.json, diabetes_treatment_guidelines_search.json, hypertension_management_search.json

✓ Summary saved to: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw\collection_summary.json
