In [1]:
!pip install requests tqdm faiss-cpu transformers tensorflow sentence-transformers textblob gensim numba

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<3.0,>=1.25.0 (from faiss-cpu)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy (from sentence-transformers)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m41.0 MB/s[0m eta [

In [1]:
import os
import requests
import zipfile
from pathlib import Path
from tqdm import tqdm

# Directory to store downloaded and extracted data
DATA_DIR = Path("./mimic_textbooks")

# Download and extract the dataset zip file
def download_and_extract_zip(url, extract_to=DATA_DIR):
    # Ensure the directory exists
    extract_to.mkdir(parents=True, exist_ok=True)

    # Download the zip file
    zip_path = extract_to / "textbooks.zip"
    print("Downloading dataset...")
    response = requests.get(url, stream=True)
    with open(zip_path, "wb") as file:
        for chunk in tqdm(response.iter_content(chunk_size=1024), unit='KB'):
            if chunk:
                file.write(chunk)

    # Extract the zip file
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Dataset downloaded and extracted.")

# Download and extract textbooks
dataset_url = "https://www.dropbox.com/scl/fi/54p9kkx5n93bffyx08eba/textbooks.zip?rlkey=2y2c5x8y0uncnddichn9cmd7n&st=m290nmkk&dl=1"
download_and_extract_zip(dataset_url)


Downloading dataset...


88121KB [00:01, 85286.56KB/s]


Extracting dataset...
Dataset downloaded and extracted.


In [2]:
import re
from gensim.utils import simple_preprocess
from textblob import TextBlob

# Load text files
def load_text_files(directory):
    texts = []
    for file_path in Path(directory).glob("F*.txt"):
        with open(file_path, "r", encoding="utf-8") as file:
            texts.append(file.read())
    return texts

# Cleaning and preprocessing function
def clean_and_tokenize(text):
    # Basic regex cleaning
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Lowercase all text
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters

    # Tokenize with gensim
    tokens = simple_preprocess(text)
    return ' '.join(tokens)

# Spell correction
def correct_spelling(text):
    return str(TextBlob(text).correct())

# Chunk text into fixed-size chunks
def chunk_text(text, chunk_size=200):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

# Load, clean, correct, and chunk documents
documents = load_text_files(DATA_DIR / "textbooks/en")
cleaned_documents = [clean_and_tokenize(doc) for doc in documents]
# corrected_documents = [correct_spelling(doc) for doc in cleaned_documents]
chunked_documents = []
for doc in cleaned_documents:
    chunked_documents.extend(chunk_text(doc))

print(f"Total document chunks created: {len(chunked_documents)}")


Total document chunks created: 1086


** Using Torch Instead of tensor to resolve the error **

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate embeddings for all chunks in a batch
def get_embeddings_in_batch(texts, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch of texts and move to device
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        # Generate embeddings on the GPU
        with torch.no_grad():
            outputs = model(**inputs).last_hidden_state  # [batch_size, sequence_length, hidden_size]
            batch_embeddings = outputs.mean(dim=1)  # Mean pooling

        # Move to CPU and convert to numpy
        all_embeddings.extend(batch_embeddings.cpu().numpy())

    return np.array(all_embeddings)

# Generate embeddings for all document chunks in batches
embeddings = get_embeddings_in_batch(chunked_documents, batch_size=128)
print(f"Generated embeddings for {len(embeddings)} document chunks.")


Generated embeddings for 1086 document chunks.


In [7]:
# from transformers import TFAutoModel, AutoTokenizer
# import tensorflow as tf
# import numpy as np

# # Load the model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# model = TFAutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# # Function to generate embeddings for all chunks in a batch
# def get_embeddings_in_batch(texts, batch_size=16):
#     all_embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch_texts = texts[i:i + batch_size]

#         # Tokenize the batch of texts
#         inputs = tokenizer(batch_texts, return_tensors="tf", truncation=True, padding=True, max_length=512)

#         # Generate embeddings on the GPU
#         outputs = model(inputs).last_hidden_state  # [batch_size, sequence_length, hidden_size]
#         batch_embeddings = tf.reduce_mean(outputs, axis=1).numpy()  # Mean pooling

#         # Append batch embeddings to the list
#         all_embeddings.extend(batch_embeddings)

#     return np.array(all_embeddings)

# # Generate embeddings for all document chunks in batches
# embeddings = get_embeddings_in_batch(chunked_documents, batch_size=128)
# print(f"Generated embeddings for {len(embeddings)} document chunks.")



In [12]:
import faiss
import numpy as np

# Define the dimension of embeddings
dimension = 384  # Embedding size from MiniLM model
index = faiss.IndexFlatL2(dimension)

# Convert embeddings to NumPy array for FAISS
embedding_matrix = np.array([embedding.flatten() for embedding in embeddings]).astype('float32')

# Add embeddings to FAISS index
index.add(embedding_matrix)
print(f"Total embeddings indexed: {index.ntotal}")


Total embeddings indexed: 1086


**Different way of deleting GPU memory**

In [11]:
import torch
import gc

# Safely delete model if it exists
try:
    del model
except NameError:
    print("Model was not defined or already deleted.")

# Clear PyTorch GPU cache
torch.cuda.empty_cache()

# Run Python garbage collector
gc.collect()

# Optional: Reset CUDA device (only if you're sure it's needed)
try:
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
except Exception as e:
    print(f"CUDA device reset failed: {e}")

Model was not defined or already deleted.
CUDA device reset failed: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


In [9]:
import torch
import gc
from numba import cuda

del model
torch.cuda.empty_cache()  # Clear GPU memory from torch
gc.collect()
device = cuda.get_current_device() # Clear GPU memory from tf
device.reset()

NameError: name 'model' is not defined

## Retrival Method

In [13]:
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and model for retrieval on CPU
retrieval_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
retrieval_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").cpu()

# Function to generate embeddings for a new query
def get_query_embedding(query):
    with torch.no_grad():
        inputs = retrieval_tokenizer(query, return_tensors="pt", padding=True, truncation=True)
        outputs = retrieval_model(**inputs)
        embedding = torch.mean(outputs.last_hidden_state, dim=1).cpu().numpy()
    return embedding

# Load FAISS index with existing embeddings
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)

# Function to retrieve relevant documents based on the query
def retrieve_documents(query, top_k=5):
    query_embedding = get_query_embedding(query).astype("float32")
    distances, indices = index.search(query_embedding, top_k)
    results = [chunked_documents[idx] for idx in indices[0]]
    return results

# Test retrieval component
sample_query = "What are the symptoms of heart failure?"
similar_documents = retrieve_documents(sample_query)
print("Retrieved documents:", similar_documents)


Retrieved documents: ['crisis vestibuloocular ref ex scan aureus ventricular septal defect ventricular tachycardia von willebrands disease von willebrand factor varicellazoster virus white blood cell world health organization hemoglobin plasma mean corpuscular hemoglobin platelet count prothrombin time reticulocyte count sedimentation rate erythrocyte westergren proteins total mgdl moll mm pgcell fmolcell mm seconds seconds of red cells male mmh mmh female mmh mmh mg', 'crisis vestibuloocular ref ex scan aureus ventricular septal defect ventricular tachycardia von willebrands disease von willebrand factor varicellazoster virus white blood cell world health organization hemoglobin plasma mean corpuscular hemoglobin platelet count prothrombin time reticulocyte count sedimentation rate erythrocyte westergren proteins total mgdl moll mm pgcell fmolcell mm seconds seconds of red cells male mmh mmh female mmh mmh mg', 'crisis vestibuloocular ref ex scan aureus ventricular septal defect ventr

## Generation Method

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generation_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)
generation_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)
generation_model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

# Function to generate a response using retrieved context
def generate_response(query, context, max_new_tokens=100):
    input_text = f"User query: {query}\n\nContext:\n{context}\n\nAnswer:"

    # Tokenize the input and move tensors to GPU
    inputs = generation_tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to("cuda")

    # Generate response using max_new_tokens to control output length
    with torch.no_grad():
        outputs = generation_model.generate(inputs["input_ids"], max_new_tokens=max_new_tokens, num_return_sequences=1)

    # Decode the generated response
    response_text = generation_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response_text

# Testing generation with retrieved documents as context
retrieved_text = " ".join(similar_documents)  # Concatenate retrieved documents as context
response = generate_response(sample_query, retrieved_text)
print("Generated response:", response)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Generated response: User query: What are the symptoms of heart failure?

Context:
artery pip bisphosphate pip bisphosphate po partial pressure of oxygen pv plasma volume venous pressure correlation coefficient right variable group registration ranking results system rankl receptor activator of nuclear factor ligand rr relative risk respiratory rate rv residual volume right ventricle right ventricular se standard error of the mean siadh syndrome of inappropriate secretion of antidiuretic hormone sv splenic vein stroke volume tca tricarboxylic acid cycle tricyclic antidepressant vasopressin receptors vd volume of distribution vdj variable diversity joining gene segments rearranged to form ig genes vh variable region heavy chain antibody vl variable region light chain antibody vpl ventral posterior nucleus lateral vpm ventral posterior nucleus medial vpn vancomycin polymyxin nystatin media ratio xr xlinked recessive xxxy normal complement of sex chromosomes for femalemale zdv zidovudine f