In [None]:
!pip install requests tqdm faiss-cpu 'transformers==4.44.2' 'accelerate==0.33.0' torch sentence-transformers textblob gensim 'numpy<2'

In [None]:
import os
import requests
import zipfile
from pathlib import Path
from tqdm import tqdm

# Directory to store downloaded and extracted data
DATA_DIR = Path("./mimic_textbooks")

# Download and extract the dataset zip file
def download_and_extract_zip(url, extract_to=DATA_DIR):
    """Download a zip archive from ``url`` and unpack it into ``extract_to``.

    The archive is saved as ``textbooks.zip`` inside ``extract_to`` and then
    extracted into that same directory.

    Args:
        url: HTTP(S) address of the zip archive.
        extract_to: Destination directory (``str`` or ``Path``); created if it
            does not exist.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Accept plain strings as well as Path objects
    extract_to = Path(extract_to)
    # Ensure the directory exists
    extract_to.mkdir(parents=True, exist_ok=True)

    # Download the zip file
    zip_path = extract_to / "textbooks.zip"
    print("Downloading dataset...")
    # The context manager releases the connection even if the download fails;
    # the timeout is (connect seconds, seconds to wait between received bytes).
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail here on 4xx/5xx instead of saving an error page as "textbooks.zip"
        response.raise_for_status()
        with open(zip_path, "wb") as file:
            # One iteration per chunk of at most 1024 bytes, hence unit='KB'
            for chunk in tqdm(response.iter_content(chunk_size=1024), unit='KB'):
                if chunk:
                    file.write(chunk)

    # Extract the zip file
    print("Extracting dataset...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
    print("Dataset downloaded and extracted.")

# Download and extract textbooks
dataset_url = "https://www.dropbox.com/scl/fi/gk1y8ll3d7wllwbb24kqe/textbooks.zip?rlkey=cdpqf8cbeu3difouvhwsc866w&st=resv96io&dl=1"
# The loading cell reads from DATA_DIR / "en"; when that directory is already
# present, skip the download so re-running the notebook is cheap.
# Delete DATA_DIR to force a fresh download (e.g. after an interrupted extraction).
if (DATA_DIR / "en").is_dir():
    print(f"Dataset already present in {DATA_DIR / 'en'}; skipping download.")
else:
    download_and_extract_zip(dataset_url)


In [None]:
# List the contents of ./mimic_textbooks/en, the directory the loading cell reads from
!ls ./mimic_textbooks/en

In [None]:
import re
from gensim.utils import simple_preprocess
from textblob import TextBlob

# Load text files
def load_text_files(directory):
    """Read every ``*.txt`` file directly inside ``directory`` as UTF-8 text.

    Files are read in sorted path order. ``glob`` alone yields files in a
    filesystem-dependent order, which would make the position of each document
    (and of every chunk derived from it) differ between runs and machines.

    Args:
        directory: Directory to scan (``str`` or ``Path``); not searched
            recursively.

    Returns:
        A list with the full contents of each file, one string per file.
        Empty when the directory contains no ``*.txt`` files.
    """
    return [
        file_path.read_text(encoding="utf-8")
        for file_path in sorted(Path(directory).glob("*.txt"))
    ]

# Cleaning and preprocessing function
def clean_and_tokenize(text):
    """Normalise ``text`` and return it as a single space-joined token string.

    Tokenization is delegated to gensim's ``simple_preprocess``, which
    lowercases the input and keeps only tokens whose length is within
    ``[min_len, max_len]``.

    Args:
        text: Raw document text.

    Returns:
        The cleaned document as one string, tokens separated by single spaces
        (an empty string when no token survives).
    """
    # Tokenize with gensim. max_len is raised above gensim's default of 15 so
    # that long technical terms are not silently dropped.
    tokens = simple_preprocess(text, min_len=2, max_len=40)
    return " ".join(tokens)

# Chunk text into bounded-size chunks
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Whole whitespace-separated words are packed greedily, so no word is cut in
    half. A single word longer than ``chunk_size`` becomes its own (oversized)
    chunk.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings, in document order. Empty for blank input.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    current_words = []
    current_len = 0
    for word in text.split():
        # A joining space is needed only when the chunk already has a word
        added = len(word) + (1 if current_words else 0)
        if current_words and current_len + added > chunk_size:
            chunks.append(" ".join(current_words))
            current_words = []
            current_len = 0
            added = len(word)
        current_words.append(word)
        current_len += added
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks

# Load, clean, and chunk documents.
# NOTE(review): TextBlob spell-correction is not applied here; enable it only
# after verifying it does not alter domain terminology.
documents = load_text_files(DATA_DIR / "en")
if not documents:
    # Fail early with a clear message instead of indexing an empty corpus
    raise FileNotFoundError(f"No .txt files found in {DATA_DIR / 'en'}")
cleaned_documents = [clean_and_tokenize(doc) for doc in documents]
chunked_documents = []
for cleaned_doc in cleaned_documents:
    chunked_documents.extend(chunk_text(cleaned_doc))

print(f"Total document chunks created: {len(chunked_documents)}")


In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
# NOTE(review): the indexing cell expects 384-dimensional MiniLM embeddings;
# this checkpoint is assumed to be the intended one — confirm.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference only: disables dropout

# Function to generate embeddings for all chunks in a batch
def get_embeddings_in_batch(texts, batch_size=16):
    """Embed ``texts`` with the module-level ``model``, ``batch_size`` at a time.

    Each text is represented by the mean of its token vectors from the last
    hidden state. Padding positions are excluded through the attention mask so
    that a text's embedding does not depend on what else is in its batch.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), hidden_size)``; an empty
        ``(0, hidden_size)`` float32 array when ``texts`` is empty.
    """
    all_embeddings = []

    # Wrap the loop with tqdm to display a progress bar
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = list(texts[start:start + batch_size])

        # Tokenize the batch of texts (padded to the longest, truncated to 512 tokens)
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(device)

        # Generate embeddings on `device` without tracking gradients
        with torch.no_grad():
            # [batch_size, sequence_length, hidden_size], last hidden state
            outputs = model(**inputs).last_hidden_state
            # [batch_size, sequence_length, 1]; 1 for real tokens, 0 for padding
            mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.dtype)
            # Masked mean over the sequence axis -> [batch_size, hidden_size]
            batch_embeddings = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        # Collect the batch embeddings as a NumPy array on the CPU
        all_embeddings.append(batch_embeddings.cpu().numpy())

    if not all_embeddings:
        # Keep a 2-D shape for empty input so downstream code can rely on it
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.concatenate(all_embeddings, axis=0)

# Generate embeddings for all document chunks in batches
embeddings = get_embeddings_in_batch(chunked_documents, batch_size=64)
print(f"Generated embeddings for {len(embeddings)} document chunks.")



In [None]:
import faiss
import numpy as np

# Define the dimension of embeddings
dimension = 384  # Embedding size from MiniLM model
# Exact (brute-force) index using L2 distance
index = faiss.IndexFlatL2(dimension)

# Convert embeddings to a contiguous float32 NumPy array, as FAISS requires
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
if embedding_matrix.ndim != 2 or embedding_matrix.shape[1] != dimension:
    # Catch a model/index mismatch here rather than inside FAISS
    raise ValueError(
        f"Expected embeddings of shape (n, {dimension}), got {embedding_matrix.shape}"
    )

# Add embeddings to FAISS index
index.add(embedding_matrix)
print(f"Total embeddings indexed: {index.ntotal}")


In [None]:
# Example query for testing

def get_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    The embedding is the mean of the last-hidden-state token vectors, with
    padding positions excluded via the attention mask. For a single string
    there is no padding, so this equals a plain mean over the sequence.

    Args:
        text: A string, or a list of strings, to embed.

    Returns:
        ``np.ndarray`` of shape ``(number_of_texts, hidden_size)``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        outputs = model(**inputs).last_hidden_state
        # 1 for real tokens, 0 for padding; broadcast over the hidden axis
        mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.dtype)
        pooled = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy()



query_text = "What are causes of heart failure?"
query_embedding = get_embedding(query_text)
# FAISS expects a 2-D float32 array of shape (n_queries, dimension)
query_embedding = np.array(query_embedding).reshape(1, -1).astype('float32')

# Search FAISS for the most similar documents
k = 5  # Number of closest documents to retrieve
distances, indices = index.search(query_embedding, k)

# Retrieve and print the most similar chunks
print("Top similar document chunks:")
for idx in indices[0]:
    # FAISS pads the result with -1 when fewer than k vectors are indexed;
    # without this guard, -1 would print the last chunk.
    if idx < 0:
        continue
    print(chunked_documents[idx])
