In [None]:
# Install required packages
!pip install datasets torch sentence-transformers

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Load the SQuAD v2 dataset
from datasets import load_dataset
dataset = load_dataset("squad_v2")

# Pull the context and question columns out of the train split. Both come
# from the same split, so entry i of each belongs to the same example.
train_split = dataset["train"]
contexts, questions = train_split["context"], train_split["question"]

# 3. Load embedding model (e5-large)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("intfloat/e5-large")


def _encode_with_prefix(texts, prefix):
    """Embed ``texts`` with ``model`` after prepending ``prefix`` to each one.

    The E5 model card states that every input must start with a role prefix
    ("query: " for questions, "passage: " for documents) because the model
    was trained that way; encoding raw text degrades retrieval quality.

    Args:
        texts: Iterable of strings to embed.
        prefix: Role prefix to put in front of every string.

    Returns:
        The result of ``model.encode`` with ``convert_to_numpy=True``, one
        row per input text, in input order.
    """
    return model.encode(
        [f"{prefix}{text}" for text in texts],
        convert_to_numpy=True,
        batch_size=64,
        show_progress_bar=True,
    )


# 4. Vectorize contexts and questions
# Contexts play the "passage" role and questions the "query" role.
context_embeddings = _encode_with_prefix(contexts, "passage: ")
question_embeddings = _encode_with_prefix(questions, "query: ")

# 5. Save embeddings to Google Drive (inside embeddings folder)
import os

import numpy as np

save_path = "/content/drive/MyDrive/RAG Research/embeddings"

# np.save does not create missing parent directories. Make sure the target
# folder exists so the embeddings computed above are not lost to a
# FileNotFoundError at the very last step.
os.makedirs(save_path, exist_ok=True)

np.save(f"{save_path}/context_embeddings.npy", context_embeddings)
np.save(f"{save_path}/question_embeddings.npy", question_embeddings)

print("✅ Embeddings created and saved in Google Drive -> RAG Research/embeddings/")