In [1]:
!pip install requests tqdm faiss-cpu transformers torch sentence-transformers textblob gensim

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [2]:
import os
import requests
import zipfile
from pathlib import Path
from tqdm import tqdm

# Local folder that receives the downloaded archive and its extracted contents
DATA_DIR = Path("mimic_textbooks")

# Download and extract the dataset zip file
def download_and_extract_zip(url, extract_to=DATA_DIR):
    """Download a zip archive from ``url`` and unpack it into ``extract_to``.

    The archive is streamed to ``extract_to / "textbooks.zip"`` and then
    extracted into the same directory; the zip file is left on disk.

    Args:
        url: HTTP(S) address of the zip archive.
        extract_to: Destination directory as a ``pathlib.Path``; it is created
            (including parents) when missing.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Ensure the directory exists
    extract_to.mkdir(parents=True, exist_ok=True)

    # Download the zip file
    zip_path = extract_to / "textbooks.zip"
    print("Downloading dataset...")
    # The context manager releases the connection even if the download fails;
    # the timeout bounds each connect/read wait so a stalled server cannot
    # hang the notebook forever (it does not limit the total download time).
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail here on HTTP errors instead of saving an error page as the zip
        # and hitting a confusing BadZipFile during extraction.
        response.raise_for_status()
        with open(zip_path, "wb") as file:
            for chunk in tqdm(response.iter_content(chunk_size=1024), unit='KB'):
                if chunk:
                    file.write(chunk)

    # Extract the zip file
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Dataset downloaded and extracted.")

# Fetch the textbook archive and unpack it into the function's default directory
dataset_url = (
    "https://www.dropbox.com/scl/fi/gk1y8ll3d7wllwbb24kqe/textbooks.zip"
    "?rlkey=cdpqf8cbeu3difouvhwsc866w&st=resv96io&dl=1"
)
download_and_extract_zip(url=dataset_url)


Downloading dataset...


243708KB [00:12, 19157.55KB/s]


Extracting dataset...
Dataset downloaded and extracted.


In [3]:
import re
from gensim.utils import simple_preprocess
from textblob import TextBlob

# Load text files
def load_text_files(directory):
    """Read every ``*.txt`` file directly inside ``directory`` as UTF-8 text.

    Files are read in sorted path order. ``Path.glob`` yields entries in an
    arbitrary, filesystem-dependent order, so without sorting the position of
    each document (and of everything derived from it) could differ between
    runs or machines.

    Args:
        directory: Folder to scan (``str`` or ``pathlib.Path``). Only the top
            level is searched; sub-folders are not visited.

    Returns:
        A list with the full contents of each file, one string per file.
    """
    return [
        file_path.read_text(encoding="utf-8")
        for file_path in sorted(Path(directory).glob("*.txt"))
    ]

# Cleaning and preprocessing function
def clean_and_tokenize(text):
    """Normalize ``text`` into a single space-separated string of tokens.

    Tokenization is delegated to gensim's ``simple_preprocess`` with its
    default settings (per the gensim documentation it lowercases the text and
    ignores tokens that are too short or too long). The resulting tokens are
    joined back together with single spaces.
    """
    return ' '.join(simple_preprocess(text))

# Chunk text into fixed-size chunks
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated pieces returned by ``str.split()``;
    each chunk is those words re-joined with single spaces. The last chunk may
    be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings (empty if ``text`` contains no words).

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check a
            negative size would silently yield no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Load, clean, correct, and chunk documents
documents = load_text_files(DATA_DIR / "en")
cleaned_documents = [clean_and_tokenize(doc) for doc in documents]
chunked_documents = []
for doc in cleaned_documents:
    chunked_documents.extend(chunk_text(doc))

print(f"Total document chunks created: {len(chunked_documents)}")


Total document chunks created: 12272


In [4]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np

# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Available devices:", torch.cuda.device_count())

# Tokenizer and encoder are loaded from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Move the weights to the chosen device and switch to evaluation mode
model.to(device)
model.eval()

# Function to generate embeddings for all chunks in a batch
def get_embeddings_in_batch(texts, batch_size=16):
    """Embed ``texts`` with the module-level ``model``/``tokenizer`` in batches.

    Each text is tokenized (truncated to 512 tokens, padded to the longest
    item in its batch), passed through the encoder, and reduced to one vector
    by averaging the token embeddings of its *real* tokens. Padding positions
    are excluded via the attention mask; a plain mean over the sequence axis
    would mix in padding vectors and make a text's embedding depend on which
    other texts share its batch.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``numpy.ndarray`` with one row per input text
        (``[len(texts), hidden_size]``); an empty array if ``texts`` is empty.
    """
    all_embeddings = []

    # Wrap the loop with tqdm to display a progress bar
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch of texts
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        # Inference only: skip autograd bookkeeping to save memory and time
        with torch.no_grad():
            outputs = model(**inputs).last_hidden_state  # [batch_size, sequence_length, hidden_size]

            # Attention-mask-weighted mean: 1 for real tokens, 0 for padding
            mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.dtype)  # [batch_size, sequence_length, 1]
            token_sum = (outputs * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
            batch_embeddings = (token_sum / token_count).cpu().numpy()  # [batch_size, hidden_size]

        # Append batch embeddings to the list
        all_embeddings.extend(batch_embeddings)

    return np.array(all_embeddings)

# Embed every document chunk, 64 chunks per forward pass
embeddings = get_embeddings_in_batch(texts=chunked_documents, batch_size=64)
print(f"Generated embeddings for {len(embeddings)} document chunks.")



Available devices: 1


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 192/192 [00:24<00:00,  7.93it/s]

Generated embeddings for 12272 document chunks.





In [5]:
import faiss
import numpy as np

# Convert embeddings to a contiguous float32 matrix with one flattened row per chunk
embedding_matrix = np.ascontiguousarray(
    np.asarray(embeddings, dtype="float32").reshape(len(embeddings), -1)
)

# Take the embedding size from the data (384 for the MiniLM model) rather than
# hard-coding it, so the index stays correct if the encoder is swapped
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to FAISS index
index.add(embedding_matrix)
print(f"Total embeddings indexed: {index.ntotal}")


Total embeddings indexed: 12272


In [6]:
# Example query for testing

def get_embedding(text):
    """Embed ``text`` with the module-level ``model`` and ``tokenizer``.

    The input is tokenized (truncated to 512 tokens), encoded, and reduced to
    one vector per text by averaging token embeddings. Padding positions are
    excluded through the attention mask, so the result for a text does not
    change when several texts of different lengths are passed together.

    Args:
        text: A string (or list of strings) to embed.

    Returns:
        ``numpy.ndarray`` of shape ``[number_of_texts, hidden_size]``; a
        single string yields one row.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(**inputs).last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.dtype)
        pooled = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    return pooled.cpu().numpy()



query_text = "What are causes of heart failure?"
query_embedding = get_embedding(query_text)
# FAISS expects a float32 matrix with one query per row
query_embedding = np.array(query_embedding).reshape(1, -1).astype('float32')

# Search FAISS for the most similar documents
k = 5  # Number of closest documents to retrieve
distances, indices = index.search(query_embedding, k)

# Retrieve and print the most similar chunks
print("Top similar document chunks:")
for idx in indices[0]:
    # FAISS fills missing results with -1 when the index holds fewer than k
    # vectors; skip them, since a negative list index would silently print
    # an unrelated chunk from the end of the list.
    if idx < 0:
        continue
    print(chunked_documents[idx])


Top similar document chunks:
cardiac effects of noncardiac neoplasms the heart is truly remarkable organ beating more than million times per year and pumping over liters of blood day in typical lifespan its cumulative output would fill three supertankers the cardiovascular system is the first organ system to become functional in utero at approximately weeks of gestation without beating heart and vascular supply development cannot proceed and the embryo dies when the heart fails during postnatal life the results are equally catastrophic indeed cardiovascular disease is the leading cause of mortality worldwide and accounts for one in four of all deaths in the united states approximately death every minute or deaths each year greater mortality rate than for all forms of cancer combined the annual economic impact of cardiac disease exceeds billion with ischemic heart disease contributing well over half moreover roughly one third of these deaths are premature occurring in individuals younge

In [7]:
# Persist the chunked documents (as JSON) and the FAISS index (as a file) so they can be reloaded later

import json

# Write the chunk texts to disk as one JSON array
with open("chunked_documents.json", "w") as json_out:
    json_out.write(json.dumps(chunked_documents))

# Write the FAISS index to its own file
faiss.write_index(index, "faiss_index.idx")

for saved_message in (
    "Chunked documents saved to chunked_documents.json",
    "FAISS index saved to faiss_index.idx",
):
    print(saved_message)


Chunked documents saved to chunked_documents.json
FAISS index saved to faiss_index.idx
