In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U langchain-community
!pip install datasets
!pip install -U bitsandbytes
!pip install faiss-cpu
!pip install huggingface_hub
!pip install accelerate bitsandbytes

Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [None]:
from huggingface_hub import login

login(token="")


# raged

In [None]:
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from datasets import Dataset

# Set up paths
vector_store_path = "/content/drive/MyDrive/medical_rag_vectorstore_2k"

# Load the full dataset first
with open('/content/drive/MyDrive/llm/medqu.jsonl', 'r') as f:
    medical_data = [json.loads(line) for line in f]

# Take the same 2,000 document subset you used for fine-tuning
subset_data = medical_data[:2000]
print(f"Using {len(subset_data)} documents for RAG knowledge base")

# Extract text from the subset
documents = []
for item in subset_data:
    text = item.get('text', '')
    documents.append(text)

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)
chunks = text_splitter.create_documents(documents)
print(f"Created {len(chunks)} text chunks for embedding")

# Create and save embeddings for the 2,000 document subset
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Create and save vector store
vector_store = FAISS.from_documents(chunks, embeddings)
vector_store.save_local(vector_store_path)
print("Vector store created and saved successfully!")

Using 2000 documents for RAG knowledge base
Created 4670 text chunks for embedding


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store created and saved successfully!


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import torch
import os
import json

In [None]:
vector_store_path = "/content/drive/MyDrive/llm/medical_rag_vectorstore_2k"
dataset_path = "/content/drive/MyDrive/llm/medqu.jsonl"
base_model_id = "meta-llama/Llama-2-7b-hf"
adapter_path = "/content/drive/MyDrive/llm/medical_lora_adapter"

In [None]:
class MedicalRAG:
    def __init__(self, base_model_id, adapter_path, vector_store_path):
        print("Loading models and vector store...")

        # Load embeddings model
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Check if vector store exists, create it if it doesn't
        if not os.path.exists(os.path.join(vector_store_path, "index.faiss")):
            print(f"Vector store not found at {vector_store_path}. Creating new vector store...")
            self._create_vector_store(vector_store_path)

        # Load the vector store
        self.vector_store = FAISS.load_local(vector_store_path, self.embeddings,allow_dangerous_deserialization=True)
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3}
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_id)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load fine-tuned model
        self.base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            device_map="auto",
            torch_dtype=torch.float16,
        load_in_4bit=True

        )
        self.model = PeftModel.from_pretrained(self.base_model, adapter_path)
        print("Models and vector store loaded successfully!")

    def _create_vector_store(self, vector_store_path):
        # Ensure directory exists
        os.makedirs(vector_store_path, exist_ok=True)

        # Load the dataset
        with open(dataset_path, 'r') as f:
            medical_data = [json.loads(line) for line in f]

        # Use the first 2000 documents (same subset used for fine-tuning)
        subset_data = medical_data[:2000]
        print(f"Using {len(subset_data)} documents for RAG knowledge base")

        # Extract text from the subset
        documents = []
        for item in subset_data:
            text = item.get('text', '')
            documents.append(text)

        # Split into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )
        chunks = text_splitter.create_documents(documents)
        print(f"Created {len(chunks)} text chunks for embedding")

        # Create vector store
        vector_store = FAISS.from_documents(chunks, self.embeddings)
        vector_store.save_local(vector_store_path)
        print(f"Vector store created and saved to {vector_store_path}")

    def answer_question(self, query):
        # Retrieve relevant documents
        docs = self.retriever.get_relevant_documents(query)
        context = "\n\n".join([doc.page_content for doc in docs])

        # Create prompt with retrieved context
        prompt = f"""<s>[INST] <<SYS>>
Provide accurate, concise answers to medical questions based on the following information:

{context}
<</SYS>>

Question: {query} [/INST]"""

        # Generate response
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs.input_ids,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.split("[/INST]")[-1].strip()

In [None]:
import pickle

# Create and save the MedicalRAG object
def setup_medical_rag():
    base_model_id = "meta-llama/Llama-2-7b-hf"
    adapter_path = "/content/drive/MyDrive/llm/medical_lora_adapter"
    vector_store_path = "/content/drive/MyDrive/llm/medical_rag_vectorstore_2k"

    # Create the RAG system
    med_rag = MedicalRAG(base_model_id, adapter_path, vector_store_path)

    # Save the system state (except the model which is too large)
    # We'll just save the paths and reload when needed
    rag_config = {
        'base_model_id': base_model_id,
        'adapter_path': adapter_path,
        'vector_store_path': vector_store_path
    }

    with open('/content/drive/MyDrive/llm/med_rag_config.pkl', 'wb') as f:
        pickle.dump(rag_config, f)

    return med_rag

# Create the system once
med_rag = setup_medical_rag()

Loading models and vector store...
Vector store not found at /content/drive/MyDrive/llm/medical_rag_vectorstore_2k. Creating new vector store...
Using 2000 documents for RAG knowledge base
Created 4670 text chunks for embedding
Vector store created and saved to /content/drive/MyDrive/llm/medical_rag_vectorstore_2k


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Models and vector store loaded successfully!


In [None]:
import pickle

# Load the configuration
with open('/content/drive/MyDrive/llm/med_rag_config.pkl', 'rb') as f:
    rag_config = pickle.load(f)

# Recreate the RAG system using saved configuration
med_rag = MedicalRAG(
    rag_config['base_model_id'],
    rag_config['adapter_path'],
    rag_config['vector_store_path']
)

#
query = "What causes multiple sclerosis?"
answer = med_rag.answer_question(query)
print(f"Query: {query}")
print(f"Answer: {answer}")

Loading models and vector store...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Models and vector store loaded successfully!


  docs = self.retriever.get_relevant_documents(query)


Query: What causes multiple sclerosis?
Answer: Answer: {'contexts': ['Multiple sclerosis (MS) is a progressive autoimmune disorder of the central nervous system. In MS, immune cells infiltrate the central nervous system and cause damage to myelin, the insulating covering of nerve cells. This damage impairs the transmission of electrical signals in the brain and spinal cord, and leads to neurological deficits.', 'Multiple sclerosis is thought to be an autoimmune disease. It is characterized by the presence of autoantibodies directed against proteins of the central nervous system. This indicates that the immune response in MS is directed against self-proteins. Autoantibodies against myelin basic protein (MBP) are the most important marker of MS, and they can be used to diagnose MS, to monitor disease activity, and to monitor treatment response.', 'We investigated the specificity of autoantibodies against MBP in MS patients. A total of 177 MS patients were included in the study. The autoa

In [None]:
query = "answer this in short: when can I use panadol?"
answer = med_rag.answer_question(query)
print(f"Query: {query}")
print(f"Answer: {answer}")

Query: answer this in short: when can I use panadol?
Answer: Answer: Panadol is an analgesic and antipyretic medicine. It contains paracetamol as its main ingredient. Panadol can be used for the relief of pain and fever. However, it should not be used in children under 12 years of age. It should also not be used in pregnant women. Panadol can be used to relieve pain associated with headaches, muscle aches, toothaches, backaches, menstrual cramps and arthritis. Panadol can also be used to reduce fever, although it is not as effective as aspirin. Panadol is also used to relieve pain associated with the common cold and flu. Panadol is available in various strengths. The 500 mg strength is usually recommended for pain relief and the 1000 mg strength is usually recommended for fever reduction. Panadol should be taken as directed. If you take more than the recommended dose, you may experience nausea, vomiting, diarrhea, stomach pain, and liver damage. If you experience any of these symptoms,