In [None]:
import os

import google.generativeai as genai

# SECURITY: never commit an API key with the notebook. The key is read from the
# GOOGLE_API_KEY environment variable instead; the key that used to be
# hardcoded on this line has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("GOOGLE_API_KEY is not set; export it before running this notebook.")
genai.configure(api_key=api_key)

print("Checking available Gemini models...")

# Listing is best-effort: any failure is printed instead of aborting the cell.
try:
    for model in genai.list_models():
        # Only Gemini-family entries are of interest here.
        if 'gemini' in model.name.lower():
            print(f" Found: {model.name}")
            print(f"   Supported methods: {model.supported_generation_methods}")
except Exception as e:
    print(f" Error listing models: {e}")

Checking available Gemini models...
 Found: models/gemini-2.5-flash
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
 Found: models/gemini-2.5-pro
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
 Found: models/gemini-2.0-flash-exp
   Supported methods: ['generateContent', 'countTokens', 'bidiGenerateContent']
 Found: models/gemini-2.0-flash
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
 Found: models/gemini-2.0-flash-001
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
 Found: models/gemini-2.0-flash-exp-image-generation
   Supported methods: ['generateContent', 'countTokens', 'bidiGenerateContent']
 Found: models/gemini-2.0-flash-lite-001
   Supported methods: ['generateContent', 'countTokens', 'createCachedContent', 'batchGenerateContent']
 Found: models/gemini-2.

In [None]:
# Step 1: Test with the actual available models
import os

import google.generativeai as genai

# SECURITY: never commit an API key with the notebook. The key is read from the
# GOOGLE_API_KEY environment variable instead; the key that used to be
# hardcoded on this line has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("GOOGLE_API_KEY is not set; export it before running this notebook.")
genai.configure(api_key=api_key)

print("Testing with available models...")

# Candidate models, tried in order until one of them answers.
available_models = [
    "models/gemini-2.0-flash",
    "models/gemini-2.0-flash-001",
    "models/gemini-pro-latest",
    "models/gemini-2.5-flash",
]

# Defined up front so the summary below cannot raise NameError when every
# candidate fails.
WORKING_MODEL = None

for model_name in available_models:
    try:
        print(f"Testing: {model_name}")
        model = genai.GenerativeModel(model_name)
        response = model.generate_content("Say 'Medical RAG System Active' in one sentence.")
        print(f"SUCCESS with {model_name}!")
        print(f"Response: {response.text}")
        WORKING_MODEL = model_name
        break
    except Exception as e:
        # Keep the message short; API errors can be very long.
        print(f" Failed with {model_name}: {str(e)[:100]}...")

if WORKING_MODEL is None:
    raise RuntimeError("None of the candidate Gemini models produced a response.")

print(f"\n Working model: {WORKING_MODEL}")

Testing with available models...
Testing: models/gemini-2.0-flash
SUCCESS with models/gemini-2.0-flash!
Response: Medical RAG System Active.


 Working model: models/gemini-2.0-flash


In [None]:
# Step 2: Configure the working model for our RAG project
import os

import google.generativeai as genai

# SECURITY: never commit an API key with the notebook. The key is read from the
# GOOGLE_API_KEY environment variable instead; the key that used to be
# hardcoded on this line has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("GOOGLE_API_KEY is not set; export it before running this notebook.")
genai.configure(api_key=api_key)

# Use the working model (gemini-2.0-flash)
MODEL_NAME = "models/gemini-2.0-flash"  # or use WORKING_MODEL from above

def query_gemini(prompt, context=""):
    """Ask the configured Gemini model a question and return the reply text.

    Args:
        prompt: The question (or full prompt) to send.
        context: Optional supporting text. When non-empty, the prompt is
            wrapped in a "Context / Question / Answer" template.

    Returns:
        The model's text reply, or the string "Error: <message>" when any
        exception is raised while building or sending the request.
    """
    try:
        gemini = genai.GenerativeModel(MODEL_NAME)
        full_prompt = prompt if not context else (
            f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer based on the context above:"
        )
        return gemini.generate_content(full_prompt).text
    except Exception as exc:
        # Failures are reported in-band so notebook cells keep running.
        return "Error: " + str(exc)

# Testing the function
print(" Testing our Gemini query function...")
test_response = query_gemini("What is the purpose of a RAG system?")
# query_gemini reports failures as an "Error: ..." string instead of raising,
# so check for that prefix before claiming success.
if test_response.startswith("Error:"):
    print(f" Test failed: {test_response}")
else:
    print(" Test successful!")
    print(f" Response: {test_response}")
print(f" Model: {MODEL_NAME}")

 Testing our Gemini query function...
 Test successful!
 Response: The purpose of a Retrieval-Augmented Generation (RAG) system is to **enhance the capabilities of Large Language Models (LLMs) by grounding them with external knowledge sources.**  In simpler terms, it helps LLMs provide more accurate, relevant, and up-to-date information by supplementing their internal training data with information retrieved from a knowledge base.

Here's a breakdown of the core purpose and benefits:

*   **Overcome LLM Limitations:** LLMs, while powerful, have limitations:
    *   **Knowledge Cutoff:**  They are trained on a specific dataset, meaning they lack awareness of events, facts, or data that occurred after their training period.
    *   **Hallucinations:** LLMs can sometimes generate factually incorrect or nonsensical information (hallucinations) due to biases or gaps in their training data.
    *   **Lack of Specificity:**  They might struggle to answer questions requiring specific, contextu

In [None]:
# Step 4: Load and explore your medical dataset
import zipfile
import pandas as pd
import os

print("Loading your medical dataset...")

# Extract the zip file
dataset_path = "/content/mtsamples.csv.zip"  # Colab-style absolute path; adjust when running elsewhere
extract_path = "medical_rag/data/"

try:
    with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
        # Remember what the archive itself contained. Later cells write other
        # CSV files into the same directory, so scanning the directory on a
        # re-run could pick up the wrong file.
        archive_members = zip_ref.namelist()
    print(" Dataset extracted successfully!")

    # List extracted files
    print("\n Extracted files:")
    for file in archive_members:
        print(f"   📄 {file}")

except Exception as e:
    # Nothing below can work without the extracted data, so report and stop.
    print(f" Extraction error: {e}")
    raise

# Find and load the CSV file that came from the archive
csv_files = [
    name for name in archive_members
    if name.lower().endswith('.csv') and not name.startswith('__MACOSX/')
]
if not csv_files:
    print(" No CSV file found in the archive")
    print(f"All files: {archive_members}")
    # Fail here rather than leaving `medical_df` undefined for later cells.
    raise FileNotFoundError(f"No CSV file found in {dataset_path}")

csv_file = csv_files[0]
csv_path = os.path.join(extract_path, csv_file)
medical_df = pd.read_csv(csv_path)
print(f"\n Loaded: {csv_file}")

Loading your medical dataset...
 Dataset extracted successfully!

 Extracted files:
   üìÑ mtsamples.csv

 Loaded: mtsamples.csv


In [None]:
# Step 5: Explore the medical dataset structure
print(" EXPLORING MEDICAL DATASET")
print("=" * 50)

print(f" Dataset shape: {medical_df.shape}")
print(f" Columns: {list(medical_df.columns)}")

print(f"\n First 3 rows:")
print(medical_df.head(3))

print(f"\n Dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
medical_df.info()

print(f"\n Basic statistics:")
print(medical_df.describe(include='all'))

# Check for missing values
print(f"\n Missing values:")
print(medical_df.isnull().sum())

# Check unique values in key columns
print(f"\n Unique values in categorical columns:")
for col in medical_df.columns:
    if medical_df[col].dtype == 'object':
        unique_count = medical_df[col].nunique()  # computed once, used twice below
        print(f"{col}: {unique_count} unique values")
        # Only enumerate the values for low-cardinality columns.
        if unique_count < 10:
            print(f"   Values: {medical_df[col].unique()}")

 EXPLORING MEDICAL DATASET
 Dataset shape: (4999, 6)
 Columns: ['Unnamed: 0', 'description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']

 First 3 rows:
   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   

       medical_specialty                                sample_name  \
0   Allergy / Immunology                         Allergic Rhinitis    
1             Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2             Bariatrics   Laparoscopic Gastric Bypass Consult - 1    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL HISTORY:, He has difficulty climb...   
2  HISTORY OF PRESENT ILLNESS: , I have seen ABC ...   

                                           

In [None]:
# Step 6: Preprocess the medical data for RAG
print(" PREPROCESSING MEDICAL DATA")
print("=" * 50)

# Cleaning data
def preprocess_medical_data(df, required_columns=None, length_column=None):
    """Clean a medical-records DataFrame and add simple length metadata.

    Steps: drop rows with missing values, strip whitespace from every text
    column (replacing values that become missing with ''), then add
    ``text_length`` and ``word_count`` columns.

    Args:
        df: Input DataFrame; it is not modified.
        required_columns: Columns that must be non-null for a row to be kept.
            ``None`` (default) keeps the original behaviour of dropping a row
            when *any* column is missing. Pass e.g. ``['transcription']`` to
            keep records that merely lack optional fields.
        length_column: Text column used for ``text_length`` / ``word_count``.
            ``None`` (default) uses the first detected text column, as before.

    Returns:
        ``(processed_df, text_columns)`` where ``text_columns`` lists the
        detected text columns in DataFrame order.

    Raises:
        KeyError: If ``required_columns`` or ``length_column`` names a column
            that does not exist.
    """
    # 1. Handle missing values
    print("1. Handling missing values...")
    initial_rows = len(df)
    # .copy() so the column assignments below never touch the caller's frame.
    processed_df = df.dropna(subset=required_columns).copy()
    print(f"   Removed {initial_rows - len(processed_df)} rows with missing values")

    # 2. Basic text cleaning
    print("2. Cleaning text data...")
    # Accept both classic object columns and pandas' dedicated string dtype.
    text_columns = [
        col for col in processed_df.columns
        if pd.api.types.is_object_dtype(processed_df[col])
        or pd.api.types.is_string_dtype(processed_df[col])
    ]
    for col in text_columns:
        # strip() yields NaN for missing/non-string entries; normalise to ''.
        processed_df[col] = processed_df[col].str.strip().fillna('')

    print(f"Text columns identified: {text_columns}")

    # 3. Add metadata
    print("3. Adding metadata...")
    if length_column is None and text_columns:
        length_column = text_columns[0]
    if length_column is None:
        # Previously this case crashed with IndexError on text_columns[0].
        print("   No text column available; skipping text_length/word_count")
    else:
        source = processed_df[length_column]
        processed_df['text_length'] = source.str.len()
        processed_df['word_count'] = source.str.split().str.len()

    return processed_df, text_columns

# Run the cleaning step on the raw frame; keeps both the cleaned frame and the
# list of detected text columns for the following cells.
medical_clean, text_cols = preprocess_medical_data(medical_df)

# Report the outcome, then show the first two cleaned rows.
print("\n".join([
    "\n Preprocessing complete!",
    f" Clean dataset shape: {medical_clean.shape}",
    f" Text columns: {text_cols}",
    "\n Sample of preprocessed data:",
]))
print(medical_clean.head(2))

 PREPROCESSING MEDICAL DATA
1. Handling missing values...
   Removed 1101 rows with missing values
2. Cleaning text data...
Text columns identified: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']
3. Adding metadata...

 Preprocessing complete!
 Clean dataset shape: (3898, 8)
 Text columns: ['description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']

 Sample of preprocessed data:
   Unnamed: 0                                        description  \
0           0  A 23-year-old white female presents with compl...   
1           1           Consult for laparoscopic gastric bypass.   

      medical_specialty                              sample_name  \
0  Allergy / Immunology                        Allergic Rhinitis   
1            Bariatrics  Laparoscopic Gastric Bypass Consult - 2   

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL HISTORY:, He has difficul

In [None]:
# Step 7: Identifying which column contains the medical transcriptions
print(" IDENTIFYING MEDICAL CONTENT COLUMN")
print("=" * 50)

# Profile each text column (sample value, mean character and word counts) to
# see which one holds the long-form transcriptions.
for col in text_cols:
    column_values = medical_clean[col]
    sample_text = "" if len(medical_clean) == 0 else column_values.iloc[0]
    print(f"\n Column: {col}")
    print(f"   Sample text: {str(sample_text)[:200]}...")
    mean_chars = column_values.str.len().mean()
    print(f"   Average length: {mean_chars:.0f} characters")
    mean_words = column_values.str.split().str.len().mean()
    print(f"   Average words: {mean_words:.0f} words")

# Column used as the RAG content source; falls back to the first text column
# when it is absent.
content_column = 'transcription'

if content_column not in medical_clean.columns:
    print(f"\n Column '{content_column}' not found. Available columns: {list(medical_clean.columns)}")
    content_column = text_cols[0]
    print(f"   Using '{content_column}' as fallback")
else:
    print(f"\n Using '{content_column}' as main content column")
    print(f"   Sample medical transcription:")
    sample = medical_clean[content_column].iloc[0]
    print(f"   {str(sample)[:500]}...")

 IDENTIFYING MEDICAL CONTENT COLUMN

 Column: description
   Sample text: A 23-year-old white female presents with complaint of allergies....
   Average length: 129 characters
   Average words: 18 words

 Column: medical_specialty
   Sample text: Allergy / Immunology...
   Average length: 14 characters
   Average words: 2 words

 Column: sample_name
   Sample text: Allergic Rhinitis...
   Average length: 25 characters
   Average words: 4 words

 Column: transcription
   Sample text: SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried ...
   Average length: 2673 characters
   Average words: 409 words

 Column: keywords
   Sample text: allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,...
   Average length: 224 characters
   Average words: 26 words

 Using 'trans

In [None]:
# Step 8: Save the cleaned dataset
print("SAVING PROCESSED DATA")
print("=" * 50)

# Persist the cleaned frame (without the pandas index) for later cells.
processed_file = "medical_rag/data/medical_data_processed.csv"
medical_clean.to_csv(processed_file, index=False)

print(f"   Saved processed data to: {processed_file}")
print("   Final dataset info:")
print(f"   Rows: {len(medical_clean)}")
print(f"   Columns: {len(medical_clean.columns)}")
print(f"   Main content column: '{content_column}'")

mean_content_length = medical_clean[content_column].str.len().mean()
print(f"   Average text length: {mean_content_length:.0f} chars")

# The specialty count is only available when that column exists.
if 'medical_specialty' in medical_clean.columns:
    specialty_summary = medical_clean['medical_specialty'].nunique()
else:
    specialty_summary = 'N/A'
print(f"   Medical specialties: {specialty_summary}")

SAVING PROCESSED DATA
   Saved processed data to: medical_rag/data/medical_data_processed.csv
   Final dataset info:
   Rows: 3898
   Columns: 8
   Main content column: 'transcription'
   Average text length: 2673 chars
   Medical specialties: 39


In [None]:
# Install the specific LangChain components we need
print("Installing required LangChain modules...")

!pip install -q "langchain>=0.1.0" "langchain-community>=0.0.10" "sentence-transformers" "faiss-cpu"

print("Installation complete!")

Installing required LangChain modules...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.5/2.5 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.6/23.6 MB[0m [31m110.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.0/1.0 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m64.7/64.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m50.9/50.9 kB[0m [31m4.4

In [None]:
# OPTIMIZED: Process medical data in batches to avoid RAM issues
print("OPTIMIZED PROCESSING - USING BATCHES")
print("=" * 50)

import pandas as pd
import re
import os

# Re-read the processed CSV, restricted to the columns the chunking step uses.
processed_file = "medical_rag/data/medical_data_processed.csv"
essential_columns = ['transcription', 'medical_specialty', 'description', 'sample_name', 'keywords']
medical_clean = pd.read_csv(processed_file, usecols=essential_columns)

print(f"Reloaded medical data (essential columns only): {medical_clean.shape}")

# Keep only the rows that actually have a transcription to split.
initial_count = len(medical_clean)
has_transcription = medical_clean['transcription'].notna()
medical_clean = medical_clean.loc[has_transcription]
removed_count = initial_count - len(medical_clean)
print(f" Removed {removed_count} rows with missing transcriptions")
print(f" Processing {len(medical_clean)} records")

# Simple Text Splitter with smaller chunks
class SimpleTextSplitter:
    """Character-window text splitter with overlap.

    Text is cut into windows of at most ``chunk_size`` characters. Each window
    is shortened to end at the last sentence break ('. '), newline or space
    found inside it, in that order of preference. Consecutive chunks share
    ``chunk_overlap`` characters.
    """

    def __init__(self, chunk_size=800, chunk_overlap=150):  # Small chunks for memory
        """Store the window parameters.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is negative or not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        """Split ``text`` into overlapping chunks.

        Args:
            text: The string to split.

        Returns:
            A list of chunks. Text no longer than ``chunk_size`` is returned
            unchanged as a single-element list.
        """
        if len(text) <= self.chunk_size:
            return [text]

        chunks = []
        start = 0
        text_length = len(text)

        while start < text_length:
            end = start + self.chunk_size
            if end >= text_length:
                # Remainder fits in one window: emit it and stop.
                chunks.append(text[start:])
                break

            # Only accept break points that lie beyond the overlap region.
            # A break closer to `start` would put the next window's start
            # (end - overlap) at or before the current one; the old code then
            # crept forward one character at a time, emitting junk chunks such
            # as "Dr.", "r.", ".".
            earliest_break = start + max(self.chunk_overlap, 1)
            for separator in ('. ', '\n', ' '):
                break_point = text.rfind(separator, earliest_break, end)
                if break_point != -1:
                    # Keep the first character of the separator in this chunk.
                    end = break_point + 1
                    break
            # If no separator qualifies, fall back to a hard cut at chunk_size.

            chunks.append(text[start:end])
            # end > start + chunk_overlap is guaranteed above, so this always
            # moves forward.
            start = end - self.chunk_overlap

        return chunks

# Process in smaller batches
def process_batches(dataframe, batch_size=500, chunk_size=800, chunk_overlap=150):
    """Split every transcription in ``dataframe`` into chunks, batch by batch.

    Args:
        dataframe: Frame with 'transcription', 'medical_specialty',
            'description', 'sample_name' and 'keywords' columns.
        batch_size: Number of rows handled per batch (must be positive).
        chunk_size: Maximum chunk length passed to SimpleTextSplitter.
        chunk_overlap: Overlap between chunks passed to SimpleTextSplitter.

    Returns:
        ``(all_chunks, chunk_metadata)``: parallel lists holding the chunk text
        and a metadata dict per chunk. ``original_index`` is the DataFrame
        index label of the source row. Transcriptions of 100 characters or
        fewer (after stripping) are skipped.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    import gc  # imported once here instead of on every loop iteration

    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    all_chunks = []
    chunk_metadata = []

    total_rows = len(dataframe)
    total_batches = (total_rows + batch_size - 1) // batch_size  # ceiling division
    text_splitter = SimpleTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for batch_num in range(total_batches):
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, total_rows)
        batch = dataframe.iloc[start_idx:end_idx]

        print(f"Processing batch {batch_num + 1}/{total_batches} (rows {start_idx}-{end_idx})")

        for idx, row in batch.iterrows():
            transcription = str(row['transcription']).strip()
            if len(transcription) > 100:  # Only substantial text
                chunks = text_splitter.split_text(transcription)

                for chunk_idx, chunk in enumerate(chunks):
                    all_chunks.append(chunk)
                    chunk_metadata.append({
                        'chunk_id': f"{idx}_{chunk_idx}",
                        'original_index': idx,
                        'chunk_index': chunk_idx,
                        'medical_specialty': row['medical_specialty'],
                        'description': row['description'],
                        'sample_name': row['sample_name'],
                        'keywords': row['keywords'],
                        'chunk_length': len(chunk),
                        'word_count': len(chunk.split())
                    })

        # Collect garbage every second batch, as before.
        if batch_num % 2 == 0:
            gc.collect()

    return all_chunks, chunk_metadata

print("\n SPLITTING MEDICAL TEXT INTO CHUNKS (BATCH PROCESSING)")
print("=" * 50)

# Chunk every record, 500 rows per batch.
all_chunks, chunk_metadata = process_batches(medical_clean, batch_size=500)

print(f"\n Created {len(all_chunks)} chunks from {len(medical_clean)} medical records")
chunks_per_record = len(all_chunks) / len(medical_clean)
print(f" Average chunks per record: {chunks_per_record:.1f}")

# Aggregate statistics over the per-chunk metadata.
chunk_df = pd.DataFrame(chunk_metadata)
print("\n Chunk Statistics:")
print(f"   Average chunk length: {chunk_df['chunk_length'].mean():.0f} characters")
print(f"   Average word count: {chunk_df['word_count'].mean():.0f} words")
print(f"   Medical specialties covered: {chunk_df['medical_specialty'].nunique()}")

# Preview at most the first two chunks.
print("\n Sample chunks:")
for i in range(min(2, len(all_chunks))):
    sample_meta = chunk_metadata[i]
    print(f"\n--- Chunk {i+1} ---")
    print(f"Specialty: {sample_meta['medical_specialty']}")
    print(f"Length: {sample_meta['chunk_length']} chars")
    print(f"Text: {all_chunks[i][:150]}...")

# The source frame is no longer needed once the chunks exist.
del medical_clean
import gc
gc.collect()

print("\n Memory optimized - ready for next steps!")

OPTIMIZED PROCESSING - USING BATCHES
Reloaded medical data (essential columns only): (3898, 5)
 Removed 0 rows with missing transcriptions
 Processing 3898 records

 SPLITTING MEDICAL TEXT INTO CHUNKS (BATCH PROCESSING)
Processing batch 1/8 (rows 0-500)
Processing batch 2/8 (rows 500-1000)
Processing batch 3/8 (rows 1000-1500)
Processing batch 4/8 (rows 1500-2000)
Processing batch 5/8 (rows 2000-2500)
Processing batch 6/8 (rows 2500-3000)
Processing batch 7/8 (rows 3000-3500)
Processing batch 8/8 (rows 3500-3898)

 Created 29713 chunks from 3898 medical records
 Average chunks per record: 7.6

 Chunk Statistics:
   Average chunk length: 455 characters
   Average word count: 70 words
   Medical specialties covered: 39

 Sample chunks:

--- Chunk 1 ---
Specialty: Allergy / Immunology
Length: 504 chars
Text: SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks...

--- Chunk 2 ---
Specialty: A

The optimized processing worked perfectly. We now have:

✅ 29,713 medical text chunks from 3,898 records
✅ Average 7.6 chunks per record
✅ 455 characters per chunk (good size for RAG)
✅ 39 medical specialties covered

Now let's continue with the remaining steps. Since we've optimized memory usage, the next steps should run smoothly.

**Step 10: Create Embeddings and Vector Store**

In [None]:
# Install FAISS and other required dependencies
print("Installing FAISS and dependencies...")

!pip install -q faiss-cpu sentence-transformers

print("Installation complete!")

Installing FAISS and dependencies...
Installation complete!


In [None]:
# Step 10: Create embeddings and FAISS vector store
print(" CREATING EMBEDDINGS AND VECTOR STORE")
print("=" * 50)

import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import os

print("Loading embedding model...")

# Lightweight sentence-embedding model, chosen to limit memory use.
embedding_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(embedding_model_id)

# Encode a single probe sentence to confirm the model works and to report the
# embedding width.
print(" Testing embedding model...")
test_embedding = embedding_model.encode(["Test medical embedding"])
probe_dimension = test_embedding.shape[1]
print(f" Embedding model working - vector dimension: {probe_dimension}")

print(f"\n Creating embeddings for {len(all_chunks)} chunks...")

# Process embeddings in batches to avoid memory issues
def create_embeddings_batch(texts, model, batch_size=500):
    """Encode ``texts`` with ``model`` in fixed-size batches.

    Args:
        texts: Sequence of strings to embed (must be non-empty).
        model: Object exposing ``encode(list_of_texts)`` that returns a 2-D
            array-like with one row per text.
        batch_size: Number of texts encoded per call (must be positive).

    Returns:
        A NumPy array with the batch results stacked row-wise, in input order.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    import gc  # imported once here instead of on every loop iteration

    if batch_size <= 0:
        # Previously surfaced as a ZeroDivisionError (or a silent empty loop).
        raise ValueError("batch_size must be a positive integer")
    if len(texts) == 0:
        # np.vstack([]) would fail with a much less helpful message.
        raise ValueError("texts is empty; nothing to embed")

    all_embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size  # ceiling division

    for i in range(total_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(texts))

        print(f"Processing batch {i+1}/{total_batches} ({start_idx}-{end_idx})")
        all_embeddings.append(model.encode(texts[start_idx:end_idx]))

        # Collect garbage every second batch, as before.
        if i % 2 == 0:
            gc.collect()

    return np.vstack(all_embeddings)

# Embed every chunk, 500 texts per encode call.
chunk_embeddings = create_embeddings_batch(all_chunks, embedding_model, batch_size=500)

print(f"Created embeddings: {chunk_embeddings.shape}")

# L2-normalise in place so that inner-product search behaves like cosine
# similarity.
faiss.normalize_L2(chunk_embeddings)

# Flat inner-product index sized to the embedding width.
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(chunk_embeddings)

print(" FAISS index created successfully!")
print(f" Index contains {index.ntotal} vectors of dimension {index.d}")

 CREATING EMBEDDINGS AND VECTOR STORE
Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

 Testing embedding model...
 Embedding model working - vector dimension: 384

 Creating embeddings for 29713 chunks...
Processing batch 1/60 (0-500)
Processing batch 2/60 (500-1000)
Processing batch 3/60 (1000-1500)
Processing batch 4/60 (1500-2000)
Processing batch 5/60 (2000-2500)
Processing batch 6/60 (2500-3000)
Processing batch 7/60 (3000-3500)
Processing batch 8/60 (3500-4000)
Processing batch 9/60 (4000-4500)
Processing batch 10/60 (4500-5000)
Processing batch 11/60 (5000-5500)
Processing batch 12/60 (5500-6000)
Processing batch 13/60 (6000-6500)
Processing batch 14/60 (6500-7000)
Processing batch 15/60 (7000-7500)
Processing batch 16/60 (7500-8000)
Processing batch 17/60 (8000-8500)
Processing batch 18/60 (8500-9000)
Processing batch 19/60 (9000-9500)
Processing batch 20/60 (9500-10000)
Processing batch 21/60 (10000-10500)
Processing batch 22/60 (10500-11000)
Processing batch 23/60 (11000-11500)
Processing batch 24/60 (11500-12000)
Processing batch 25/60 (12000-12500)
Processin

In [None]:
# Step 11: Save the vector store
print("SAVING VECTOR STORE")
print("=" * 50)

# Make sure the output directory exists.
vector_store_dir = "medical_rag/vector_store"
os.makedirs(vector_store_dir, exist_ok=True)

# Persist the FAISS index.
faiss.write_index(index, f"{vector_store_dir}/medical_faiss.index")

# Persist the chunk texts and metadata next to the index; the embeddings
# themselves are not pickled.
distinct_specialties = list({entry['medical_specialty'] for entry in chunk_metadata})
vector_store_data = {
    'chunks': all_chunks,
    'metadata': chunk_metadata,
    'embedding_model_name': 'sentence-transformers/all-MiniLM-L6-v2',
    'total_chunks': len(all_chunks),
    'medical_specialties': distinct_specialties
}

with open(f"{vector_store_dir}/vector_metadata.pkl", "wb") as f:
    pickle.dump(vector_store_data, f)

print(" Vector store saved successfully!")

# Also write the chunk metadata as CSV for easy inspection.
chunk_info_path = "medical_rag/data/chunk_metadata.csv"
chunk_info_df = pd.DataFrame(chunk_metadata)
chunk_info_df.to_csv(chunk_info_path, index=False)
print(f"Chunk metadata saved to: {chunk_info_path}")

# The embedding matrix has been added to the index; release it.
del chunk_embeddings
import gc
gc.collect()

specialty_total = len(vector_store_data['medical_specialties'])
print("\n VECTOR STORE SUMMARY:")
print(f"   Total chunks: {len(all_chunks)}")
print(f"   Vector dimension: {dimension}")
print(f"   Medical specialties: {specialty_total}")
print("   Saved location: medical_rag/vector_store/")

SAVING VECTOR STORE
 Vector store saved successfully!
Chunk metadata saved to: medical_rag/data/chunk_metadata.csv

 VECTOR STORE SUMMARY:
   Total chunks: 29713
   Vector dimension: 384
   Medical specialties: 39
   Saved location: medical_rag/vector_store/


In [None]:
# Step 12: Create Medical RAG Retrieval System
print(" CREATING MEDICAL RAG RETRIEVAL SYSTEM")
print("=" * 50)

class MedicalRAGSystem:
    """Semantic retriever over the saved medical chunk store.

    Loads a FAISS index plus the pickled chunk texts and metadata from
    ``vector_store_path`` and answers similarity queries using a
    SentenceTransformer model to encode the query.
    """

    def __init__(self, vector_store_path="medical_rag/vector_store"):
        """Load the embedding model and the vector store from disk."""
        self.vector_store_path = vector_store_path
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.load_vector_store()

    def load_vector_store(self):
        """Load the FAISS index and metadata.

        Sets ``self.index``, ``self.chunks`` and ``self.metadata``. Any error
        is printed and then re-raised.
        """
        try:
            self.index = faiss.read_index(f"{self.vector_store_path}/medical_faiss.index")
            # NOTE(security): pickle.load can execute arbitrary code. Only load
            # vector stores written by this notebook or another trusted source.
            with open(f"{self.vector_store_path}/vector_metadata.pkl", "rb") as f:
                data = pickle.load(f)
            self.chunks = data['chunks']
            self.metadata = data['metadata']
            print(" Vector store loaded successfully!")
            print(f"   Available chunks: {len(self.chunks)}")
            print(f"   Medical specialties: {len(set(m['medical_specialty'] for m in self.metadata))}")
        except Exception as e:
            print(f" Error loading vector store: {e}")
            raise

    def retrieve_similar_chunks(self, query, k=5, specialty_filter=None):
        """Retrieve similar medical chunks for a query.

        Args:
            query: Free-text query.
            k: Maximum number of results; values <= 0 yield an empty list.
            specialty_filter: Optional case-insensitive substring that the
                chunk's 'medical_specialty' must contain.

        Returns:
            Up to ``k`` dicts with 'content', 'metadata' and
            'similarity_score', in the order returned by the index search.
            Chunks with identical text are returned only once. Because only
            ``k*3`` candidates are searched, a filter may yield fewer than
            ``k`` results.
        """
        if k <= 0:
            return []

        # Encode query
        query_embedding = self.embedding_model.encode([query])
        faiss.normalize_L2(query_embedding)

        # Search (get extra for potential filtering)
        scores, indices = self.index.search(query_embedding, k*3)

        wanted_specialty = specialty_filter.lower() if specialty_filter else None
        results = []
        # De-duplicate on the chunk text: an index search never repeats an id,
        # but the same text can be stored under several ids, which previously
        # produced identical results side by side.
        seen_contents = set()

        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads with -1 when fewer neighbours exist than requested;
            # without the lower bound, -1 would index the last chunk.
            if not 0 <= idx < len(self.chunks):
                continue

            content = self.chunks[idx]
            if content in seen_contents:
                continue

            metadata = self.metadata[idx]
            # str() guards against non-string specialties (e.g. NaN).
            if wanted_specialty is not None and wanted_specialty not in str(metadata['medical_specialty']).lower():
                continue

            seen_contents.add(content)
            results.append({
                'content': content,
                'metadata': metadata,
                'similarity_score': float(score)
            })

            if len(results) >= k:
                break

        return results

# Build the retriever from the vector store saved on disk.
print(" Initializing Medical RAG System...")
medical_rag = MedicalRAGSystem()

# Smoke-test retrieval with a few sample questions.
print("\n TESTING RAG RETRIEVAL")
print("=" * 30)

test_queries = [
    "What are the symptoms of allergic rhinitis?",
    "How is asthma treated?",
    "What are common allergy medications?"
]

for query in test_queries:
    print(f"\n Query: '{query}'")
    results = medical_rag.retrieve_similar_chunks(query, k=2)
    print(f" Found {len(results)} relevant chunks:")

    for i, result in enumerate(results):
        specialty = result['metadata']['medical_specialty']
        score = result['similarity_score']
        # Single-line, 80-character preview of the chunk text.
        preview = result['content'][:80].replace('\n', ' ')
        print(f"   {i+1}. {specialty} (score: {score:.3f})\n      {preview}...")

 CREATING MEDICAL RAG RETRIEVAL SYSTEM
 Initializing Medical RAG System...
 Vector store loaded successfully!
   Available chunks: 29713
   Medical specialties: 39

 TESTING RAG RETRIEVAL

 Query: 'What are the symptoms of allergic rhinitis?'
 Found 2 relevant chunks:
   1. General Medicine (score: 0.561)
      mately 5 to 5:30 p.m.  He is involved in training purpose to how to sell managed...
   2. Sleep Medicine (score: 0.561)
      mately 5 to 5:30 p.m.  He is involved in training purpose to how to sell managed...

 Query: 'How is asthma treated?'
 Found 2 relevant chunks:
   1. Letters (score: 0.496)
      e is no smoke exposure there is a significant family history with both Abc's fat...
   2. Pediatrics - Neonatal (score: 0.496)
      e is no smoke exposure there is a significant family history with both Abc's fat...

 Query: 'What are common allergy medications?'
 Found 2 relevant chunks:
   1. Allergy / Immunology (score: 0.637)
      SUBJECTIVE:,  This 23-year-old white female

In [None]:
# Step 13: Integrate with Gemini for complete RAG pipeline
print(" INTEGRATING GEMINI FOR ANSWER GENERATION")
print("=" * 50)

import os

import google.generativeai as genai

# Configure Gemini
# SECURITY: never commit an API key with the notebook. The key is read from the
# GOOGLE_API_KEY environment variable instead; the key that used to be
# hardcoded on this line has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("GOOGLE_API_KEY is not set; export it before running this notebook.")
genai.configure(api_key=api_key)
MODEL_NAME = "models/gemini-2.0-flash"

def generate_medical_answer(query, context_chunks):
    """Answer ``query`` with Gemini, grounded in the retrieved chunks.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        context_chunks: Retrieved chunk dicts; each must provide ``content``
            and ``metadata['medical_specialty']``.

    Returns:
        The model's response text; a fixed fallback sentence when
        ``context_chunks`` is empty; or an ``"Error generating answer: ..."``
        string if the Gemini call raises.
    """
    if not context_chunks:
        return "I couldn't find relevant medical information to answer this question."

    # One labelled section per retrieved chunk, separated by blank lines.
    sources = []
    for number, chunk in enumerate(context_chunks, start=1):
        sources.append(
            f"--- SOURCE {number} (Medical Specialty: {chunk['metadata']['medical_specialty']}) ---\n{chunk['content']}"
        )
    context_text = "\n\n".join(sources)

    prompt = f"""You are a helpful medical assistant. Based ONLY on the following medical context from clinical notes, provide a accurate and helpful answer to the user's question.

MEDICAL CONTEXT:
{context_text}

USER QUESTION: {query}

IMPORTANT INSTRUCTIONS:
- Answer based ONLY on the provided medical context
- If the context doesn't contain relevant information, say "I cannot find specific information about this in the available medical records"
- Be precise and medically accurate
- Do not make up or hallucinate information
- Mention which medical specialty the information comes from when relevant

ANSWER:"""

    # Any failure (network, quota, blocked response) is reported as text
    # rather than raised, so the calling loop keeps going.
    try:
        gemini = genai.GenerativeModel(MODEL_NAME)
        return gemini.generate_content(prompt).text
    except Exception as e:
        return f"Error generating answer: {str(e)}"

# End-to-end check: retrieve context for each sample question, then ask Gemini
print(" TESTING COMPLETE RAG PIPELINE")
print("=" * 30)

test_queries = [
    "What treatments are available for allergies?",
    "What are the symptoms of asthma?",
    "How is allergic rhinitis diagnosed?",
]

divider = "-" * 50

for query in test_queries:
    print(f"\n QUERY: {query}")
    print(divider)

    # Retrieval first ...
    retrieved_chunks = medical_rag.retrieve_similar_chunks(query, k=3)
    print(f" Retrieved {len(retrieved_chunks)} medical chunks")

    # ... then generation grounded in what was retrieved.
    answer = generate_medical_answer(query, retrieved_chunks)

    print(" ANSWER:")
    print(answer)
    print(divider)

# Summary statistics taken from the loaded vector store.
chunk_count = len(medical_rag.chunks)
specialty_count = len({m['medical_specialty'] for m in medical_rag.metadata})

print("\n MEDICAL RAG SYSTEM COMPLETE!")
print(" Text chunking ‚Üí  Vector store ‚Üí  Retrieval ‚Üí  Answer generation")
print(f" System contains {chunk_count} medical chunks across {specialty_count} specialties")

 INTEGRATING GEMINI FOR ANSWER GENERATION
 TESTING COMPLETE RAG PIPELINE

 QUERY: What treatments are available for allergies?
--------------------------------------------------
 Retrieved 3 medical chunks
 ANSWER:
From the Allergy/Immunology and SOAP/Chart/Progress Notes, the patient has tried Claritin, Zyrtec, and Allegra for allergies. Zyrtec and Claritin worked for a short time but then seemed to lose effectiveness. Allegra was used last summer and again two weeks ago, but it does not appear to be working very well. She has also used over-the-counter sprays but no prescription nasal sprays.

The current plan (from the SOAP/Chart/Progress Notes) is to try Zyrtec again instead of Allegra, with loratadine as another option. The patient was also given samples of Nasonex (two sprays in each nostril) for three weeks, and a prescription was written as well.

--------------------------------------------------

 QUERY: What are the symptoms of asthma?
---------------------------------------

In [None]:
# Step 1: Create the basic file structure
print("CREATING CORRECT FILE STRUCTURE")
print("=" * 50)

import os

# Folders the deployment needs; exist_ok makes this cell safe to re-run.
for folder in (".streamlit", "medical_rag/vector_store"):
    os.makedirs(folder, exist_ok=True)

print(" Directories created")

CREATING CORRECT FILE STRUCTURE
 Directories created


In [None]:
# Step 2: Create .streamlit/config.toml
# The file is assembled line by line; an empty entry is a blank line and the
# final empty entry yields the trailing newline.
config_content = "\n".join([
    "[server]",
    "headless = true",
    'address = "0.0.0.0"',
    "port = 8501",
    "",
    "[browser]",
    "gatherUsageStats = false",
    "",
    "[theme]",
    'primaryColor = "#1f77b4"',
    'backgroundColor = "#ffffff"',
    'secondaryBackgroundColor = "#f0f2f6"',
    'textColor = "#262730"',
    'font = "sans serif"',
    "",
])

# Requires the .streamlit/ directory to exist already.
with open(".streamlit/config.toml", "w") as config_file:
    config_file.write(config_content)
print(" Created: .streamlit/config.toml")

 Created: .streamlit/config.toml


In [None]:
# Step 3: Create requirements.txt
# One pinned requirement per line, each terminated by a newline.
pinned_packages = (
    "streamlit==1.28.0",
    "google-generativeai==0.3.2",
    "sentence-transformers==2.2.2",
    "faiss-cpu==1.7.4",
    "pandas==2.0.3",
    "numpy==1.24.3",
    "python-dotenv==1.0.0",
)
requirements_content = "".join(f"{pin}\n" for pin in pinned_packages)

with open("requirements.txt", "w") as requirements_file:
    requirements_file.write(requirements_content)
print(" Created: requirements.txt")

 Created: requirements.txt


In [None]:
# Step 4: Create medical_rag_system.py
# The string below is written verbatim to medical_rag_system.py, the module
# that app.py imports to perform retrieval.
rag_system_code = """import faiss
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
import os

class MedicalRAGSystem:
    '''Semantic retriever over a FAISS index of medical text chunks.'''

    def __init__(self, vector_store_path="medical_rag/vector_store"):
        self.vector_store_path = vector_store_path
        self.embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.load_vector_store()

    def load_vector_store(self):
        '''Load the index and chunk metadata from the first usable directory.

        Raises:
            Exception: when no candidate directory yields a loadable store.
                The message names the last underlying error, if there was one.
        '''
        # Drop repeated candidates while keeping their priority order.
        possible_paths = list(dict.fromkeys([
            self.vector_store_path,
            "medical_rag/vector_store",
            "./medical_rag/vector_store"
        ]))

        last_error = None
        for path in possible_paths:
            index_path = f"{path}/medical_faiss.index"
            metadata_path = f"{path}/vector_metadata.pkl"

            if not (os.path.exists(index_path) and os.path.exists(metadata_path)):
                continue

            try:
                index = faiss.read_index(index_path)
                # SECURITY: pickle can run arbitrary code while loading; only
                # ship a vector_metadata.pkl that you created yourself.
                with open(metadata_path, "rb") as f:
                    data = pickle.load(f)
                chunks = data['chunks']
                metadata = data['metadata']
            except Exception as e:
                # Remember why this candidate failed instead of hiding it.
                last_error = e
                continue

            # Assign only after everything loaded, so a half-read store is never kept.
            self.index = index
            self.chunks = chunks
            self.metadata = metadata
            print(f" Vector store loaded from: {path}")
            return

        reason = "Could not load vector store from any path"
        if last_error is not None:
            reason += f" (last error: {last_error})"
        raise Exception(f"Error loading vector store: {reason}")

    def retrieve_similar_chunks(self, query, k=5):
        '''Return up to k distinct chunks most similar to query.

        Each result is a dict with 'content', 'metadata' and
        'similarity_score'. Returns [] when k <= 0 or when retrieval fails.
        '''
        if k <= 0:
            return []

        try:
            query_embedding = self.embedding_model.encode([query])
            faiss.normalize_L2(query_embedding)

            # Over-fetch so duplicates can be dropped and k results still returned.
            scores, indices = self.index.search(query_embedding, k*3)

            results = []
            seen_contents = set()

            for score, idx in zip(scores[0], indices[0]):
                idx = int(idx)
                # FAISS pads with -1 when it has fewer neighbours than requested;
                # a negative index would silently select the last chunk.
                if idx < 0 or idx >= len(self.chunks):
                    continue

                # The same text can be indexed under several specialties, so
                # de-duplicate on the text itself rather than on the index.
                content = self.chunks[idx]
                if content in seen_contents:
                    continue
                seen_contents.add(content)

                results.append({
                    'content': content,
                    'metadata': self.metadata[idx],
                    'similarity_score': float(score)
                })

                if len(results) >= k:
                    break

            return results

        except Exception as e:
            print(f"Error in retrieval: {e}")
            return []
"""

with open("medical_rag_system.py", "w") as f:
    f.write(rag_system_code)
print(" Created: medical_rag_system.py")

 Created: medical_rag_system.py


In [None]:
# Step 5: Create app.py (main Streamlit app)
# The string below is written verbatim to app.py. Inside it, \' is a plain
# single quote and \\n becomes the two-character escape \n in the generated file.
app_code = '''import streamlit as st
import google.generativeai as genai
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import pandas as pd
import os
import sys

st.set_page_config(
    page_title="Medical RAG Assistant",
    page_icon="🏥",
    layout="wide"
)

st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .info-box {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
    }
    .source-box {
        background-color: #e8f4fd;
        padding: 0.5rem;
        border-radius: 0.3rem;
        margin: 0.5rem 0;
        border-left: 4px solid #1f77b4;
    }
</style>
""", unsafe_allow_html=True)

def initialize_rag_system():
    try:
        from medical_rag_system import MedicalRAGSystem
        rag_system = MedicalRAGSystem()
        return rag_system, None
    except Exception as e:
        return None, f"Error initializing RAG system: {str(e)}"

def generate_medical_answer(query, context_chunks, api_key):
    if not context_chunks:
        return "I couldn\'t find relevant medical information to answer this question in the available records."

    context_text = "\\n\\n".join([
        f"--- MEDICAL NOTE {i+1} (Specialty: {chunk[\'metadata\'][\'medical_specialty\']}) ---\\n{chunk[\'content\']}"
        for i, chunk in enumerate(context_chunks)
    ])

    prompt = f"""You are a medical research assistant. Answer the question based ONLY on the provided medical context from clinical notes.

MEDICAL CONTEXT:
{context_text}

QUESTION: {query}

IMPORTANT INSTRUCTIONS:
- Answer using ONLY the information from the medical context above
- If the context doesn\'t contain relevant information, say "I cannot find specific information about this in the available medical records"
- Be precise and medically accurate
- Do not make up or hallucinate information
- Mention which medical specialty the information comes from when relevant
- Keep answers concise but informative

ANSWER:"""

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel("models/gemini-2.0-flash")
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"Error generating answer: {str(e)}"

st.markdown(\'<div class="main-header">🏥 Medical RAG Assistant</div>\', unsafe_allow_html=True)
st.markdown("**Ask medical questions based on 3,898 clinical transcriptions across 39 medical specialties**")

with st.sidebar:
    st.header(" Configuration")

    api_key = st.text_input(
        "Google AI Studio API Key",
        type="password",
        help="Get free API key from https://aistudio.google.com/"
    )

    st.markdown(\'<div class="info-box">\', unsafe_allow_html=True)
    st.write("**How to get API Key:**")
    st.write("1. Go to [Google AI Studio](https://aistudio.google.com/)")
    st.write("2. Sign in with Google account")
    st.write("3. Click \'Get API Key\' and create new key")
    st.write("4. Paste the key here")
    st.markdown(\'</div>\', unsafe_allow_html=True)

    if st.button(" Initialize Medical RAG System", use_container_width=True):
        if not api_key:
            st.error("Please enter your Google AI Studio API key first")
        else:
            with st.spinner("Loading medical database..."):
                rag_system, error = initialize_rag_system()
                if rag_system:
                    st.session_state.rag_system = rag_system
                    st.session_state.api_key = api_key
                    st.success(" Medical RAG System Ready!")
                    st.write(f"• Medical chunks: {len(rag_system.chunks):,}")
                    st.write(f"• Specialties: {len(set(m[\'medical_specialty\'] for m in rag_system.metadata))}")
                    st.write(f"• Vector dimension: {rag_system.index.d}")
                else:
                    st.error(f" {error}")

if \'rag_system\' not in st.session_state:
    st.session_state.rag_system = None
if \'history\' not in st.session_state:
    st.session_state.history = []

if st.session_state.rag_system:
    st.header(" Medical Question & Answer")

    query = st.text_input(
        "Ask your medical question:",
        placeholder="e.g., What are common treatments for allergies? What symptoms indicate asthma?",
        key="query_input"
    )

    col1, col2 = st.columns([1, 4])
    with col1:
        num_chunks = st.slider("Sources to retrieve", 1, 5, 3)

    if query and st.session_state.get(\'api_key\'):
        # Streamlit re-runs this whole script on every widget interaction, so
        # only call Gemini (and record history) when the request has changed;
        # otherwise reuse the answer produced for the identical request.
        request = (query, num_chunks, st.session_state.api_key)
        if st.session_state.get(\'last_request\') != request:
            with st.spinner(" Searching medical database..."):
                chunks = st.session_state.rag_system.retrieve_similar_chunks(query, k=num_chunks)
                answer = generate_medical_answer(query, chunks, st.session_state.api_key)

            st.session_state.last_request = request
            st.session_state.last_answer = answer
            st.session_state.last_chunks = chunks
            st.session_state.history.append({
                \'query\': query,
                \'answer\': answer,
                \'chunks_used\': len(chunks),
                \'timestamp\': pd.Timestamp.now()
            })
        else:
            answer = st.session_state.last_answer
            chunks = st.session_state.last_chunks

        st.subheader(" Answer:")
        st.write(answer)

        with st.expander(f" View Source Documents ({len(chunks)} found)"):
            for i, chunk in enumerate(chunks):
                st.markdown(\'<div class="source-box">\', unsafe_allow_html=True)
                st.write(f"**Source {i+1}** | **Specialty:** {chunk[\'metadata\'][\'medical_specialty\']} | **Similarity Score:** {chunk[\'similarity_score\']:.3f}")
                st.write(f"**Content:** {chunk[\'content\'][:400]}...")
                st.markdown(\'</div>\', unsafe_allow_html=True)

    if st.session_state.history:
        st.subheader(" Recent Questions")
        for i, item in enumerate(reversed(st.session_state.history[-3:])):
            st.write(f"**Q:** {item[\'query\']}")
            st.write(f"**A:** {item[\'answer\'][:200]}...")
            st.write(f"*Sources used: {item[\'chunks_used\']}*")
            st.divider()

else:
    st.info(" Welcome! Please enter your Google AI Studio API key and initialize the system in the sidebar to start asking medical questions.")

with st.expander("ℹ️ System Information"):
    st.write("""
    **Medical RAG System Overview:**

    - **Data Source:** 3,898 clinical medical transcription records
    - **Medical Content:** 29,713 processed text chunks
    - **Specialties Covered:** 39 different medical specialties
    - **Search Technology:** FAISS vector similarity search
    - **AI Model:** Google Gemini for answer generation
    - **Key Feature:** Provides source citations for transparency

    **How it works:**
    1. Your question is converted to a vector embedding
    2. System finds the most similar medical text chunks
    3. Gemini generates an answer using only the retrieved context
    4. Sources are provided for verification

    **Note:** This system provides information from medical records but is not a substitute for professional medical advice.
    """)

st.markdown("---")
st.markdown("*Built with Streamlit, FAISS, and Google Gemini • Medical RAG System*")
'''

# The app source contains non-ASCII characters, so the encoding is explicit
# rather than dependent on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)
print(" Created: app.py")

 Created: app.py


In [None]:
# Step 6: Create README.md (Fixed)
# The README text is accumulated fragment by fragment and written once at the end.
print(" CREATING README.MD")
print("=" * 50)

readme_content = "# Medical RAG Assistant\n\n"
readme_content += "A Retrieval-Augmented Generation (RAG) system for medical question answering, built with Streamlit, FAISS, and Google Gemini.\n\n"
readme_content += "## 🏥 Features\n\n"
readme_content += "- **29,713 medical text chunks** from 3,898 clinical transcriptions\n"
readme_content += "- **39 medical specialties** covered\n"
readme_content += "- **Semantic search** using FAISS vector database\n"
readme_content += "- **AI-powered answers** using Google Gemini\n"
readme_content += "- **Source citation** for transparency\n"
readme_content += "- **Web interface** with Streamlit\n\n"
readme_content += "##  Quick Start\n\n"
readme_content += "1. **Get API Key**: Free from [Google AI Studio](https://aistudio.google.com/)\n"
readme_content += "2. **Enter API Key**: In the app sidebar\n"
readme_content += "3. **Initialize System**: Click \"Initialize Medical RAG System\"\n"
readme_content += "4. **Ask Questions**: Type your medical questions\n\n"
readme_content += "##  Project Structure\n\n"
readme_content += "```\n"
readme_content += "medical-rag-assistant/\n"
# Box-drawing characters render the directory tree.
readme_content += "├── app.py                          # Main Streamlit app\n"
readme_content += "├── medical_rag_system.py           # RAG system module\n"
readme_content += "├── requirements.txt                # Dependencies\n"
readme_content += "├── .streamlit/\n"
readme_content += "│   └── config.toml                 # Streamlit configuration\n"
readme_content += "└── medical_rag/\n"
readme_content += "    └── vector_store/               # Vector database\n"
readme_content += "        ├── medical_faiss.index\n"
readme_content += "        └── vector_metadata.pkl\n"
readme_content += "```\n\n"
readme_content += "##  Medical Disclaimer\n\n"
readme_content += "This system provides information from medical records for educational purposes only. It is not a substitute for professional medical advice.\n"

# The README contains non-ASCII characters, so the encoding is explicit rather
# than dependent on the platform default.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print(" Created: README.md")
print(" ALL FILES CREATED SUCCESSFULLY!")

 CREATING README.MD
 Created: README.md
 ALL FILES CREATED SUCCESSFULLY!


In [None]:
# Step 7: Check created files
# Reports the size of each deployment file and vector-store file (or flags it
# as missing), then prints an indented listing of the working directory.
print(" CHECKING CREATED FILES")
print("=" * 50)

import os

files_to_check = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md",
    ".streamlit/config.toml",
]

print(" Checking deployment files:")
for file in files_to_check:
    if not os.path.exists(file):
        print(f" {file} - MISSING")
        continue
    size = os.path.getsize(file) / 1024  # Size in KB
    print(f" {file} ({size:.1f} KB)")

print("\n Checking vector store files:")
vector_files = [
    "medical_rag/vector_store/medical_faiss.index",
    "medical_rag/vector_store/vector_metadata.pkl",
]

for file in vector_files:
    if not os.path.exists(file):
        print(f" {file} - MISSING (Critical for app to work)")
        continue
    size_mb = os.path.getsize(file) / (1024 * 1024)
    print(f" {file} ({size_mb:.1f} MB)")

print("\n FINAL FILE STRUCTURE:")
listed_suffixes = ('.py', '.txt', '.toml', '.md', '.index', '.pkl')
for root, dirs, files in os.walk("."):
    # Skip version-control and bytecode-cache directories.
    if '.git' in root or '__pycache__' in root:
        continue
    # Nesting depth equals the number of path separators in the walked path.
    level = root.count(os.sep)
    print(f"{'  ' * level}{os.path.basename(root)}/")
    sub_indent = '  ' * (level + 1)
    for file in files:
        if not file.endswith(listed_suffixes):
            continue
        size_kb = os.path.getsize(os.path.join(root, file)) / 1024
        print(f"{sub_indent}{file} ({size_kb:.1f} KB)")

 CHECKING CREATED FILES
 Checking deployment files:
 app.py (6.9 KB)
 medical_rag_system.py (2.5 KB)
 requirements.txt (0.1 KB)
 README.md (1.3 KB)
 .streamlit/config.toml (0.2 KB)

 Checking vector store files:
 medical_rag/vector_store/medical_faiss.index (43.5 MB)
 medical_rag/vector_store/vector_metadata.pkl (16.6 MB)

 FINAL FILE STRUCTURE:
./
  app.py (6.9 KB)
  README.md (1.3 KB)
  medical_rag_system.py (2.5 KB)
  requirements.txt (0.1 KB)
  .config/
    configurations/
    logs/
      2025.11.20/
  medical_rag/
    vector_store/
      vector_metadata.pkl (16957.9 KB)
      medical_faiss.index (44569.5 KB)
    data/
  .streamlit/
    config.toml (0.2 KB)
  sample_data/
    README.md (0.9 KB)


In [None]:
# Step 1: Create a zip file of all deployment files
print(" CREATING DEPLOYMENT PACKAGE")
print("=" * 50)

import os
import shutil
import tempfile

ARCHIVE_BASENAME = "medical_rag_assistant"
archive_filename = f"{ARCHIVE_BASENAME}.zip"

# Remove an archive left over from an earlier run so it is not packed into the new one.
if os.path.exists(archive_filename):
    os.remove(archive_filename)

# Build the archive in a scratch directory and move it into place afterwards.
# Writing the zip directly into the directory being archived makes
# make_archive pick up the half-written zip as one of its own inputs.
with tempfile.TemporaryDirectory() as staging_dir:
    staged_archive = shutil.make_archive(os.path.join(staging_dir, ARCHIVE_BASENAME), 'zip', '.')
    shutil.move(staged_archive, archive_filename)

print(" Created: medical_rag_assistant.zip")
print(" Download this zip file to your computer:")
print(" Files → medical_rag_assistant.zip → Download")

 CREATING DEPLOYMENT PACKAGE
 Created: medical_rag_assistant.zip
 Download this zip file to your computer:
 Files ‚Üí medical_rag_assistant.zip ‚Üí Download


In [None]:
import zipfile
import os

print(" CREATING ESSENTIAL DEPLOYMENT PACKAGE")
print("=" * 50)

output_zip_filename = "medical_rag_assistant_essential.zip"

# Top-level files of the deployment repo
files_to_include = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md",
]

# Stored under its own sub-folder inside the archive
streamlit_config_path = ".streamlit/config.toml"

# Folders added recursively, keeping their paths relative to the repo root
directories_to_include = [
    "medical_rag/vector_store"
]

# Every entry is stored under the same relative path it has on disk; anything
# missing is reported and skipped rather than aborting the build.
with zipfile.ZipFile(output_zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
    for file_path in files_to_include:
        if not os.path.exists(file_path):
            print(f"   Warning: File not found and skipped: {file_path}. Please ensure it exists.")
            continue
        archive.write(file_path, file_path)
        print(f"   Added file: {file_path}")

    if not os.path.exists(streamlit_config_path):
        print(f" Warning: File not found and skipped: {streamlit_config_path}. Please ensure it exists.")
    else:
        archive.write(streamlit_config_path, streamlit_config_path)
        print(f" Added file: {streamlit_config_path}")

    for dir_path in directories_to_include:
        if not os.path.isdir(dir_path):
            print(f"   Warning: Directory not found and skipped: {dir_path}. Please ensure it exists.")
            continue
        for root, _, files in os.walk(dir_path):
            for file in files:
                full_file_path = os.path.join(root, file)
                archive.write(full_file_path, full_file_path)
                print(f"   Added directory content: {full_file_path}")

print(f" Created essential package: {output_zip_filename}")
print(" Download this zip file to your computer:")
print(f" Files ‚Üí {output_zip_filename} ‚Üí Download")


 CREATING ESSENTIAL DEPLOYMENT PACKAGE
   Added file: app.py
   Added file: medical_rag_system.py
   Added file: requirements.txt
   Added file: README.md
 Added file: .streamlit/config.toml
   Added directory content: medical_rag/vector_store/vector_metadata.pkl
   Added directory content: medical_rag/vector_store/medical_faiss.index
 Created essential package: medical_rag_assistant_essential.zip
 Download this zip file to your computer:
 Files ‚Üí medical_rag_assistant_essential.zip ‚Üí Download


### 🛠️ Diagnostic: Verify File System State

Before re-attempting to create the zip file, let's explicitly list the files and directories to ensure everything is in place.

In [None]:
print(" VERIFYING FILE SYSTEM STATE BEFORE ZIPPING")
print("=" * 50)

import os

# What the deployment package is expected to contain
expected_root_files = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md",
]
expected_streamlit_dir = ".streamlit"
expected_streamlit_config = ".streamlit/config.toml"
expected_vector_store_dir = "medical_rag/vector_store"

# Flipped to False as soon as any expected item is missing.
all_found = True

print("\n--- Checking root files ---")
for f in expected_root_files:
    present = os.path.exists(f)
    print(f" Found: {f}" if present else f" NOT Found: {f}")
    all_found = all_found and present

print("\n--- Checking .streamlit directory ---")
if not os.path.isdir(expected_streamlit_dir):
    print(f" NOT Found directory: {expected_streamlit_dir}/")
    all_found = False
else:
    print(f" Found directory: {expected_streamlit_dir}/")
    config_present = os.path.exists(expected_streamlit_config)
    if config_present:
        print(f" Found config: {expected_streamlit_config}")
    else:
        print(f" NOT Found config: {expected_streamlit_config}")
    all_found = all_found and config_present

print("\n--- Checking medical_rag/vector_store directory ---")
if not os.path.isdir(expected_vector_store_dir):
    print(f" NOT Found directory: {expected_vector_store_dir}/")
    all_found = False
else:
    print(f" Found directory: {expected_vector_store_dir}/")
    vector_store_contents = os.listdir(expected_vector_store_dir)
    for item in vector_store_contents:
        print(f" Found item: {expected_vector_store_dir}/{item}")
    # An existing but empty vector store still counts as missing.
    if not vector_store_contents:
        print(f"  Directory is empty: {expected_vector_store_dir}/")
        all_found = False

print(
    "\n All essential files and directories appear to be present. Proceeding to re-zip."
    if all_found
    else "\n Some essential files/directories are missing. Please re-run previous steps to ensure they are created."
)



 VERIFYING FILE SYSTEM STATE BEFORE ZIPPING

--- Checking root files ---
 Found: app.py
 Found: medical_rag_system.py
 Found: requirements.txt
 Found: README.md

--- Checking .streamlit directory ---
 Found directory: .streamlit/
 Found config: .streamlit/config.toml

--- Checking medical_rag/vector_store directory ---
 Found directory: medical_rag/vector_store/
 Found item: medical_rag/vector_store/vector_metadata.pkl
 Found item: medical_rag/vector_store/medical_faiss.index

 All essential files and directories appear to be present. Proceeding to re-zip.


### 📦 Re-zipping Essential Deployment Package

Now, let's try creating the essential zip package again with the corrected path handling.

In [None]:
import zipfile
import os

print(" CREATING ESSENTIAL DEPLOYMENT PACKAGE (FIXED)")
print("=" * 50)

output_zip_filename = "medical_rag_assistant_essential.zip"

# Top-level files of the deployment repo
files_to_include = [
    "app.py",
    "medical_rag_system.py",
    "requirements.txt",
    "README.md",
]

# Streamlit configuration, stored under its own sub-folder
streamlit_config_path = ".streamlit/config.toml"

# Folders added recursively, keeping their paths relative to the repo root
directories_to_include = [
    "medical_rag/vector_store"
]

with zipfile.ZipFile(output_zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
    # The root files and the Streamlit config are handled identically: each is
    # stored under the same relative path it has on disk, or skipped with a warning.
    for file_path in [*files_to_include, streamlit_config_path]:
        if not os.path.exists(file_path):
            print(f"   Warning: File not found and skipped: {file_path}. Please ensure it exists.")
            continue
        archive.write(file_path, file_path)
        print(f"   Added file: {file_path}")

    # Directories are walked recursively, again keeping on-disk relative paths.
    for dir_path in directories_to_include:
        if not os.path.isdir(dir_path):
            print(f"   Warning: Directory not found and skipped: {dir_path}. Please ensure it exists.")
            continue
        for root, _, files in os.walk(dir_path):
            for file in files:
                full_file_path = os.path.join(root, file)
                archive.write(full_file_path, full_file_path)
                print(f"   Added directory content: {full_file_path}")

print(f" Created essential package: {output_zip_filename}")
print(" Download this zip file to your computer:")
print(f"   Files ‚Üí {output_zip_filename} ‚Üí Download")


 CREATING ESSENTIAL DEPLOYMENT PACKAGE (FIXED)
   Added file: app.py
   Added file: medical_rag_system.py
   Added file: requirements.txt
   Added file: README.md
   Added file: .streamlit/config.toml
   Added directory content: medical_rag/vector_store/vector_metadata.pkl
   Added directory content: medical_rag/vector_store/medical_faiss.index
 Created essential package: medical_rag_assistant_essential.zip
 Download this zip file to your computer:
   Files ‚Üí medical_rag_assistant_essential.zip ‚Üí Download


In [None]:
# Step 4: Streamlit Deployment Instructions
print(" STREAMLIT CLOUD DEPLOYMENT")
print("=" * 50)

deployment_steps = """
1. **GO TO STREAMLIT CLOUD:**
   - Visit: https://share.streamlit.io/
   - Sign in with your GitHub account

2. **CREATE NEW APP:**
   - Click "New app"
   - Repository: your-username/medical-rag-assistant
   - Branch: main
   - Main file path: app.py
   - Click "Deploy"

3. **WAIT FOR DEPLOYMENT:**
   - Initial deployment takes 2-5 minutes
   - Watch the logs for any errors
   - If successful, you'll get a URL like:
     https://medical-rag-assistant.streamlit.app/

4. **TEST YOUR APP:**
   - Open your app URL
   - In sidebar, enter Google AI Studio API key
   - Click "Initialize Medical RAG System"
   - Start asking medical questions!

 YOUR MEDICAL RAG ASSISTANT WILL BE LIVE!
"""

print(deployment_steps)

 STREAMLIT CLOUD DEPLOYMENT

1. **GO TO STREAMLIT CLOUD:**
   - Visit: https://share.streamlit.io/
   - Sign in with your GitHub account

2. **CREATE NEW APP:**
   - Click "New app"
   - Repository: your-username/medical-rag-assistant
   - Branch: main
   - Main file path: app.py
   - Click "Deploy"

3. **WAIT FOR DEPLOYMENT:**
   - Initial deployment takes 2-5 minutes
   - Watch the logs for any errors
   - If successful, you'll get a URL like:
     https://medical-rag-assistant.streamlit.app/

4. **TEST YOUR APP:**
   - Open your app URL
   - In sidebar, enter Google AI Studio API key
   - Click "Initialize Medical RAG System"
   - Start asking medical questions!

 YOUR MEDICAL RAG ASSISTANT WILL BE LIVE!



In [None]:
# Mount Google Drive at /content/drive. This relies on the google.colab
# package, so the cell only runs in an environment that provides it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
