In [1]:
# Cell 1 - Setup
import sys
import os
sys.path.append('..')

import json
import logging
from pathlib import Path
from typing import List
import pandas as pd
from tqdm import tqdm

from src.data_processor import HealthDataProcessor
from src.rag_system import MultilingualRAG
from langchain.schema import Document

# Emit INFO-and-above log records as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Project root is the parent of the current working directory, made absolute.
# NOTE(review): assumes the kernel is started one level below the project
# root (e.g. in a notebooks/ folder) — confirm before running elsewhere.
ROOT_DIR = Path('..').resolve()

# Data folders used by the cells below, all located under <root>/data.
RAW_DATA_DIR = ROOT_DIR.joinpath('data', 'raw')
PROCESSED_DATA_DIR = ROOT_DIR.joinpath('data', 'processed')
EMBEDDINGS_DIR = ROOT_DIR.joinpath('data', 'embeddings')

# Echo the resolved locations, one per line, so a wrong working directory
# is visible immediately.
print(
    f"Root directory: {ROOT_DIR}",
    f"Raw data directory: {RAW_DATA_DIR}",
    f"Processed data directory: {PROCESSED_DATA_DIR}",
    f"Embeddings directory: {EMBEDDINGS_DIR}",
    sep="\n",
)

Root directory: C:\Users\Boris\Desktop\code\multilingual-rag
Raw data directory: C:\Users\Boris\Desktop\code\multilingual-rag\data\raw
Processed data directory: C:\Users\Boris\Desktop\code\multilingual-rag\data\processed
Embeddings directory: C:\Users\Boris\Desktop\code\multilingual-rag\data\embeddings


In [2]:
# Cell 2 - Process Raw Data
# Build the processor that splits the raw files into chunks.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters — confirm against HealthDataProcessor.
processor = HealthDataProcessor(chunk_overlap=200, chunk_size=1000)

# Run every file found under the raw data directory through the processor.
print("Processing raw data files...")
all_documents = processor.process_all_files(RAW_DATA_DIR)

# Report how many chunks were produced, then preview the first one (if any).
print(f"\nTotal documents created: {len(all_documents)}")
print("Sample document:")
if all_documents:
    sample_doc = all_documents[0]
    print(f"Content: {sample_doc.page_content[:200]}...")
    print(f"Metadata: {sample_doc.metadata}")

2025-10-15 00:30:51,402 - INFO - Processing FDA (U.S. Gov): asthma_drugs.json
2025-10-15 00:30:51,498 - INFO - Processed 100 FDA records into 2847 chunks
2025-10-15 00:30:51,517 - INFO - Processing FDA (U.S. Gov): diabetes_drugs.json


Processing raw data files...


2025-10-15 00:30:51,781 - INFO - Processed 100 FDA records into 3373 chunks
2025-10-15 00:30:51,784 - INFO - Processing FDA (U.S. Gov): heart_disease_drugs.json
2025-10-15 00:30:51,947 - INFO - Processed 100 FDA records into 2674 chunks
2025-10-15 00:30:51,963 - INFO - Processing FDA (U.S. Gov): high_blood_pressure_drugs.json
2025-10-15 00:30:52,179 - INFO - Processed 100 FDA records into 2683 chunks
2025-10-15 00:30:52,202 - INFO - Processing FDA (U.S. Gov): hypertension_drugs.json
2025-10-15 00:30:52,377 - INFO - Processed 100 FDA records into 4365 chunks
2025-10-15 00:30:52,382 - INFO - Processing MedlinePlus (NIH): asthma.json
2025-10-15 00:30:52,395 - INFO - Processed 1 MedlinePlus topics into 10 chunks
2025-10-15 00:30:52,395 - INFO - Processing MedlinePlus (NIH): cholesterol.json
2025-10-15 00:30:52,424 - INFO - Processed 1 MedlinePlus topics into 13 chunks
2025-10-15 00:30:52,425 - INFO - Processing MedlinePlus (NIH): diabetes.json
2025-10-15 00:30:52,448 - INFO - Processed 1 M


Total documents created: 16036
Sample document:
Content: INDICATIONS AND USAGE: 1 INDICATIONS AND USAGE Naproxen tablets and naproxen sodium tablets are indicated for: the relief of the signs and symptoms of: • rheumatoid arthritis • osteoarthritis • ankylo...
Metadata: {'source': 'FDA', 'source_type': 'U.S. Government', 'file': 'asthma_drugs.json', 'drug_name': 'Naproxen'}


In [3]:
# Cell 3 - Save Processed Documents
# Make sure the output folder exists (no error if it is already there).
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Flatten each document into a plain dict so the list can be dumped as JSON.
docs_data = [
    {'content': document.page_content, 'metadata': document.metadata}
    for document in all_documents
]

# Write the records as pretty-printed UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII text readable instead of \u-escaped.
processed_path = PROCESSED_DATA_DIR / 'processed_documents.json'
with processed_path.open('w', encoding='utf-8') as f:
    json.dump(docs_data, f, indent=2, ensure_ascii=False)

# Summarise what was written and how large the file is on disk.
size_mb = processed_path.stat().st_size / (1024 * 1024)
print(f"Saved {len(docs_data)} processed documents to {processed_path}")
print(f"File size: {size_mb:.2f} MB")

Saved 16036 processed documents to C:\Users\Boris\Desktop\code\multilingual-rag\data\processed\processed_documents.json
File size: 15.68 MB


In [4]:
# Cell 4 - Create Vector Store with Multilingual Embeddings
print("\nCreating multilingual embedding vector store (no API key required)...")

# Configure the RAG wrapper: multilingual MiniLM sentence-transformers model,
# a FAISS store, and <embeddings dir>/multilingual as its persist directory.
rag_multilingual = MultilingualRAG(
    persist_directory=str(EMBEDDINGS_DIR.joinpath('multilingual')),
    vector_store_type='faiss',
    embedding_model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
)

# Embed and index every processed chunk. This is the slow step of the
# notebook: the recorded run spent roughly seven minutes here.
rag_multilingual.create_vector_store(all_documents)
print("✓ Multilingual vector store created")


Creating multilingual embedding vector store (no API key required)...


2025-10-15 00:30:57,773 - INFO - Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/502 [00:00<?, ?it/s]

2025-10-15 00:38:00,341 - INFO - Loading faiss with AVX2 support.
2025-10-15 00:38:00,341 - INFO - Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2025-10-15 00:38:00,341 - INFO - Loading faiss.
2025-10-15 00:38:01,040 - INFO - Successfully loaded faiss.
2025-10-15 00:38:02,359 - INFO - Created faiss vector store with 16036 documents


✓ Multilingual vector store created


In [5]:
# Cell 5 - Test Vector Store
# Smoke-test retrieval with the same question in two languages.
test_queries = [
    "What are the symptoms of diabetes?",
    "मधुमेह के लक्षण क्या हैं?",  # Hindi: diabetes symptoms
]

print("Testing retrieval with sample queries...\n")

for query in test_queries:
    print(f"Query: {query}")

    # Without a multilingual store in scope there is nothing to search;
    # emit the blank separator line and move on to the next query.
    if 'rag_multilingual' not in locals():
        print()
        continue

    # Fetch the two nearest chunks and preview the best match.
    docs = rag_multilingual.vector_store.similarity_search(query, k=2)
    print(f"  Retrieved {len(docs)} documents")
    if docs:
        print(f"  Top result: {docs[0].page_content[:100]}...")
    print()

Testing retrieval with sample queries...

Query: What are the symptoms of diabetes?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved 2 documents
  Top result: of diabetes?The symptoms of diabetes may include:Feeling very thirstyFeeling very hungryUrinating (p...

Query: मधुमेह के लक्षण क्या हैं?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved 2 documents
  Top result: of diabetes?The symptoms of diabetes may include:Feeling very thirstyFeeling very hungryUrinating (p...



In [7]:
# Cell 6 - Save Configuration
# Record what this notebook produced so later experiments can locate the
# vector store and know how the corpus was chunked.
experiment_config = {
    'data_stats': {
        'total_documents': len(all_documents),
        # NOTE(review): these two values are copied by hand from the
        # HealthDataProcessor(...) call in Cell 2 — keep them in sync.
        'chunk_size': 1000,
        'chunk_overlap': 200,
        # Fixed: the FDA label previously carried a stray trailing space
        # ('FDA (fda.gov) ') that was persisted into the config file.
        'sources': ['FDA (fda.gov)', 'MedlinePlus/NIH (nih.gov)']
    },
    'vector_stores': [
        {
            'name': 'multilingual',
            # NOTE(review): Cell 4 loads this model with the
            # 'sentence-transformers/' prefix; confirm consumers of this
            # config accept the short name.
            'model': 'paraphrase-multilingual-MiniLM-L12-v2',
            'type': 'faiss',
            'path': str(EMBEDDINGS_DIR / 'multilingual')
        }
    ],
    # NOTE(review): presumably the query languages targeted by the
    # experiments — confirm with the experiment notebooks.
    'languages': ['en', 'hi', 'zh', 'es', 'fr']
}

# Serialize once so the saved file and the printed preview are guaranteed to
# match. ensure_ascii=False mirrors how processed_documents.json is written
# in Cell 3 and keeps any non-ASCII characters (e.g. in the path) readable in
# the UTF-8 file instead of \u-escaped.
config_json = json.dumps(experiment_config, indent=2, ensure_ascii=False)

# Save configuration
config_path = PROCESSED_DATA_DIR / 'experiment_config.json'
with open(config_path, 'w', encoding='utf-8') as f:
    f.write(config_json)

print(f"Experiment configuration saved to {config_path}")
print(config_json)

Experiment configuration saved to C:\Users\Boris\Desktop\code\multilingual-rag\data\processed\experiment_config.json
{
  "data_stats": {
    "total_documents": 16036,
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "sources": [
      "FDA (fda.gov) ",
      "MedlinePlus/NIH (nih.gov)"
    ]
  },
  "vector_stores": [
    {
      "name": "multilingual",
      "model": "paraphrase-multilingual-MiniLM-L12-v2",
      "type": "faiss",
      "path": "C:\\Users\\Boris\\Desktop\\code\\multilingual-rag\\data\\embeddings\\multilingual"
    }
  ],
  "languages": [
    "en",
    "hi",
    "zh",
    "es",
    "fr"
  ]
}
