In [None]:
!pip install faiss-gpu



In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset and output paths
# configured in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Locations of the dataset splits and of the index to write, plus the encoder checkpoint name.
# Everything lives under one Drive folder, so the prefix is spelled out once.
project_root = "/content/drive/MyDrive/RAG_Poisoning"
dataset_files = {
    split: f"{project_root}/wikiasp_dataset/{split}.jsonl"
    for split in ("train", "valid", "test")
}
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
faiss_index_file = f"{project_root}/embeddings/wikiasp_embeddings.faiss"

In [None]:
# Load the sentence encoder named by `embedding_model`, so the checkpoint name is
# defined in exactly one place (the configuration cell) instead of being repeated here.
model = SentenceTransformer(embedding_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Rebind `model` to an encoder built from `embedding_model` and placed on the CUDA device.
# `process_instance` below reads this module-level `model`.
model = SentenceTransformer(embedding_model, device="cuda")  # Ensure GPU usage

# Function to chunk text into smaller parts
def chunk_text(sentences, chunk_size=64, stride=16):
    """Join overlapping windows of ``sentences`` into space-separated strings.

    A window starts every ``stride`` items and spans up to ``chunk_size`` items,
    so windows near the end of the input hold fewer items than ``chunk_size``.
    Windows that come out empty are dropped.

    Args:
        sentences: Sliceable sequence of strings.
        chunk_size (int): Maximum number of items per window.
        stride (int): Distance between the starts of consecutive windows.

    Returns:
        list[str]: One joined string per non-empty window, in input order.
    """
    windows = (
        sentences[start:start + chunk_size]
        for start in range(0, len(sentences), stride)
    )
    return [" ".join(window) for window in windows if len(window) > 0]

# Function to process a single instance
def process_instance(instance):
    """Chunk one dataset record and embed every chunk.

    Reads the module-level ``model`` (the sentence encoder) and calls
    ``chunk_text`` with its default window settings.

    Args:
        instance (dict): Record providing ``"exid"``, ``"inputs"`` (the sequence
            handed to ``chunk_text``) and ``"targets"``.

    Returns:
        tuple: ``(embeddings, metadata)``. ``embeddings`` is the array returned
        by ``model.encode`` for the chunks; ``metadata`` is a list with one dict
        per chunk holding ``exid``, ``chunk_id``, ``targets`` and the chunk text.
        A record that yields no chunks returns an empty ``(0, dim)`` float32
        array and an empty list.
    """
    exid = instance["exid"]
    targets = instance["targets"]
    chunks = chunk_text(instance["inputs"])

    if not chunks:
        # Nothing to encode: hand back a well-shaped empty matrix so callers can
        # stack per-record results without special-casing records with no text.
        dim = model.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32), []

    embeddings = model.encode(chunks, convert_to_numpy=True, show_progress_bar=False)

    # chunk_id is the chunk's position within this record; `targets` is repeated
    # on every chunk so each metadata entry is self-contained.
    metadata = [
        {
            "exid": exid,
            "chunk_id": chunk_id,
            "targets": targets,
            "chunk": chunk,
        }
        for chunk_id, chunk in enumerate(chunks)
    ]

    return embeddings, metadata

# Function to process dataset with parallel processing
def process_dataset(
    dataset_files,
    model,
    faiss_index_file,
    max_workers=8,
    metadata_file="/content/drive/MyDrive/RAG_Poisoning/embeddings/metadata.json",
):
    """Embed every record of the given JSONL splits and save a FAISS index plus metadata.

    Each existing file is read as JSON Lines, every record is passed to
    ``process_instance`` on a thread pool, and the resulting embeddings are
    stacked into one float32 matrix that is added to an ``IndexIDMap`` wrapping
    an ``IndexFlatL2``. The ID of each vector is its row number, which is also
    its position in the metadata list written to ``metadata_file`` (this relies
    on ``process_instance`` returning one metadata dict per embedding row).

    Args:
        dataset_files (dict): Maps a split name to the path of its JSONL file.
            Missing files are reported and skipped.
        model: Not referenced by this function; ``process_instance`` is called
            with the record only. Kept so existing calls keep working.
        faiss_index_file (str): Destination path of the serialized index.
        max_workers (int): Size of the thread pool.
        metadata_file (str): Destination path of the metadata JSON.

    Raises:
        ValueError: If no embeddings were produced at all.
    """
    all_embeddings = []
    metadata = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for split, filepath in dataset_files.items():
            if not os.path.exists(filepath):
                print(f"File {filepath} not found, skipping.")
                continue

            # Parse and close the file before the slow embedding step. Blank
            # lines (e.g. a trailing newline) are not valid JSON, so skip them.
            with open(filepath, "r", encoding="utf-8") as file:
                instances = [json.loads(line) for line in file if line.strip()]

            # executor.map yields results in input order, so metadata order
            # matches embedding order.
            results = tqdm(
                executor.map(process_instance, instances),
                total=len(instances),
                desc=f"Processing {split} split",
            )
            for embeddings, meta in results:
                if len(meta) == 0:
                    # A record without chunks contributes nothing; skipping it
                    # keeps row numbers and metadata positions aligned.
                    continue
                all_embeddings.append(embeddings)
                metadata.extend(meta)

    if not all_embeddings:
        raise ValueError(
            "No embeddings were produced: every dataset file was missing or empty."
        )

    # Combine all embeddings into a single float32 matrix
    all_embeddings = np.vstack(all_embeddings).astype("float32")
    dimension = all_embeddings.shape[1]

    # L2 index wrapped in an ID map; IDs are the row numbers of the matrix
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    ids = np.arange(len(all_embeddings)).astype("int64")
    index.add_with_ids(all_embeddings, ids)

    # Make sure both output folders exist before writing
    for output_path in (faiss_index_file, metadata_file):
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    faiss.write_index(index, faiss_index_file)
    print(f"FAISS index saved to {faiss_index_file}")

    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f"Metadata saved to {metadata_file}")


# Build and persist the index for all configured splits
process_dataset(
    dataset_files=dataset_files,
    model=model,
    faiss_index_file=faiss_index_file,
    max_workers=256,  # Adjust max_workers as needed
)


Processing train split: 100%|██████████| 18177/18177 [23:29<00:00, 12.90it/s]
Processing valid split: 100%|██████████| 2218/2218 [03:02<00:00, 12.15it/s] 
Processing test split: 100%|██████████| 2333/2333 [03:05<00:00, 12.56it/s] 


FAISS index saved to /content/drive/MyDrive/RAG_Poisoning/embeddings/wikiasp_embeddings.faiss
Metadata saved to /content/drive/MyDrive/RAG_Poisoning/embeddings/metadata.json


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the query-side encoder. The checkpoint name is the same one the indexing
# cells use; `retrieve_top_k` below reads this module-level `em_model`.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
em_model = SentenceTransformer(embedding_model, device="cuda")  # Use GPU for encoding

# Function to load FAISS index
def load_faiss_index(index_file):
    """Read a serialized FAISS index from ``index_file`` and return it."""
    return faiss.read_index(index_file)

# Function to perform retrieval
def retrieve_top_k(query, index, metadata, top_k=5):
    """
    Retrieve the top-k nearest indexed chunks for a given query.

    The query is embedded with the module-level ``em_model`` and passed to
    ``index.search``.

    Args:
        query (str): The query string.
        index (faiss.Index): The FAISS index.
        metadata (list[dict]): Metadata corresponding to the embeddings in the
            index; the IDs returned by the index are used as positions in it.
        top_k (int): Number of top results to retrieve.

    Returns:
        list[dict]: Up to ``top_k`` metadata entries, in the order returned by
        the index, each extended with a ``"score"`` key. The score is the
        distance reported by ``index.search`` as a plain Python float; for the
        L2 index built in this notebook a smaller score means a closer match.
        Returns an empty list when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    # Encode the query; FAISS search expects a 2-D float32 matrix
    query_embedding = np.asarray(
        em_model.encode([query], convert_to_numpy=True), dtype="float32"
    )

    # Search the FAISS index (one query row, so only row 0 of each result is used)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx == -1:  # FAISS returns -1 for missing indices
            continue
        # Cast NumPy scalars to built-ins so results are JSON-serializable and
        # the ID is a plain list index.
        results.append({
            "score": float(distance),
            **metadata[int(idx)]
        })
    return results
