In [None]:
# Mount Google Drive at /content/drive; later cells read the input chunks from
# it and write the finished vector storages back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
!cp -r ./drive/MyDrive/sae_data/chunks4 ./chunks

In [None]:
import os

# Numeric suffix of the output folder (vector_storages{V}) on Google Drive.
V = 4

# Folder that receives the saved storages. makedirs(exist_ok=True) is safe to
# re-run and also creates missing parent folders, unlike a check-then-mkdir.
STORAGE_DIR = f"./drive/MyDrive/sae_data/vector_storages{V}"
os.makedirs(STORAGE_DIR, exist_ok=True)


In [None]:
import faiss
import torch
import numpy as np
import pickle
import pandas as pd
import spacy

from typing import Any, Dict, List, Callable, Union
from tqdm.auto import tqdm


class VectorStorage(object):
    """Document store that ranks texts by cosine similarity using a flat FAISS index.

    Every vector is L2-normalised before it is added to a ``faiss.IndexFlatIP``
    index, so the inner product returned by a search equals the cosine
    similarity. The flat index addresses vectors by their sequential position
    (offset), therefore the class keeps two maps between caller-supplied
    document ids and index offsets, plus a per-document metadata dict.
    """

    def __init__(
        self,
        dim: int,
        embedder: Callable[..., Union[torch.Tensor, np.ndarray]] = None
    ):
        """Create an empty storage.

        Args:
            dim: Dimensionality of the embedding vectors.
            embedder: Callable that turns a text into a vector. ``add_documents``
                and ``search`` call it with ``show_progress_bar=False``. May be
                ``None`` when the storage is only loaded/saved.
        """
        self.dim: int = dim
        self.embedder: Callable[..., Union[torch.Tensor, np.ndarray]] = embedder
        self.index = faiss.IndexFlatIP(self.dim)
        # document id -> metadata dict
        self._metadata: Dict[int, Dict[str, Any]] = {}
        # document id -> position of its vector inside the FAISS index
        self._id_to_offset: Dict[int, int] = {}
        # position inside the FAISS index -> document id
        self._offset_to_id: List[int] = []

    def _require_embedder(self) -> None:
        """Raise ``ValueError`` when no embedder has been configured."""
        if self.embedder is None:
            raise ValueError("Embedder function must be provided.")

    @staticmethod
    def _normalize(vectors) -> np.ndarray:
        """Return ``vectors`` as float32 with unit L2 norm along the last axis.

        All-zero vectors are returned unchanged instead of turning into NaNs.
        """
        arr = np.asarray(vectors, dtype="float32")
        norms = np.linalg.norm(arr, axis=-1, keepdims=True)
        norms[norms == 0] = 1.0
        return arr / norms

    def add_document(self, index: int, text: str, metadata: Dict[str, Any]) -> None:
        """Embed a single text and store it under the document id ``index``.

        Raises:
            ValueError: If no embedder was provided.
        """
        self._require_embedder()
        vec = self._normalize(self.embedder(text))

        # FAISS expects a 2-D (n_vectors, dim) float32 array.
        self.index.add(vec.reshape(1, -1))
        self._id_to_offset[index] = len(self._offset_to_id)
        self._offset_to_id.append(index)
        self._metadata[index] = metadata

    def add_documents(
        self,
        ids: List[int],
        texts: List[str],
        metadata: List[Dict[str, Any]]
    ) -> None:
        """Embed ``texts`` one by one and store them under the matching ``ids``.

        Args:
            ids: Document ids, one per text.
            texts: Texts to embed.
            metadata: Metadata dicts, one per text.

        Raises:
            ValueError: If no embedder was provided, or if the three lists do
                not have the same length (which would desynchronise the
                id/offset maps from the index).
        """
        self._require_embedder()
        if not (len(ids) == len(texts) == len(metadata)):
            raise ValueError("ids, texts and metadata must have the same length.")
        if len(texts) == 0:
            # Nothing to embed; an empty batch cannot be normalised row-wise.
            return

        vectors = self._normalize(
            [self.embedder(t, show_progress_bar=False) for t in tqdm(texts)]
        )
        self.index.add(vectors)

        for doc_id, md in zip(ids, metadata):
            self._id_to_offset[doc_id] = len(self._offset_to_id)
            self._offset_to_id.append(doc_id)
            self._metadata[doc_id] = md

    def search(self, text: str, *, k: int = 5, threshold: float = 1.0) -> List[Dict[str, Any]]:
        """Return up to ``k`` stored documents most similar to ``text``.

        Args:
            text: Query text.
            k: Maximum number of neighbours requested from the index.
            threshold: Hits whose score is strictly greater than this value
                are dropped.

        Returns:
            A list of dicts with the keys ``"id"``, ``"score"`` and
            ``"metadata"``, in the order returned by the index.

        Raises:
            ValueError: If no embedder was provided.
        """
        self._require_embedder()
        # Normalise into a fresh array so the embedder's output is never mutated.
        query_vec = self._normalize(
            self.embedder(text, show_progress_bar=False)
        ).reshape(1, -1)

        distances, ids = self.index.search(query_vec, k)
        results: List[Dict[str, Any]] = []

        for dist, pos in zip(distances[0], ids[0]):
            # FAISS pads the result with -1 when fewer than k vectors exist.
            # NOTE(review): scores are inner products (higher = more similar),
            # yet hits scoring ABOVE `threshold` are the ones discarded. Kept
            # as-is for backward compatibility -- confirm the intended direction.
            if pos == -1 or dist > threshold:
                continue
            doc_id = self._offset_to_id[pos]
            results.append(
                {
                    "id": int(doc_id),
                    "score": float(dist),
                    "metadata": self._metadata.get(int(doc_id))
                }
            )
        return results

    def delete_documents(self, document_ids: List[int]) -> None:
        """Remove the given documents from the index, the maps and the metadata.

        Unknown ids are ignored. A flat FAISS index identifies vectors by their
        sequential position and compacts itself after a removal, so document
        ids are first translated to positions and both id/offset maps are
        rebuilt afterwards to stay aligned with the shifted index.
        """
        doomed = set(document_ids)
        offsets = [
            pos for pos, doc_id in enumerate(self._offset_to_id) if doc_id in doomed
        ]
        if offsets:
            self.index.remove_ids(np.asarray(offsets, dtype="int64"))
            self._offset_to_id = [
                doc_id for doc_id in self._offset_to_id if doc_id not in doomed
            ]
            self._id_to_offset = {
                doc_id: pos for pos, doc_id in enumerate(self._offset_to_id)
            }
        for doc_id in document_ids:
            self._metadata.pop(doc_id, None)

    def delete_document(self, document_id: int) -> None:
        """Remove a single document; see ``delete_documents``."""
        self.delete_documents([document_id])

    def save(self, filepath: str) -> None:
        """Write the index to ``{filepath}.index`` and the maps/metadata to ``{filepath}.pkl``."""
        faiss.write_index(self.index, f"{filepath}.index")
        with open(f"{filepath}.pkl", "wb") as file:
            pickle.dump(
                {
                    "metadata": self._metadata,
                    "id_to_offset": self._id_to_offset,
                    "offset_to_id": self._offset_to_id
                }, file
            )

    def load(self, filepath: str) -> None:
        """Replace the current contents with a storage written by ``save``.

        SECURITY: ``pickle.load`` can execute arbitrary code, so only load
        files that this notebook (or another trusted source) produced.
        """
        self.index = faiss.read_index(f"{filepath}.index")
        with open(f"{filepath}.pkl", "rb") as file:
            data = pickle.load(file)
        self._metadata = data["metadata"]
        self._id_to_offset = data["id_to_offset"]
        self._offset_to_id = data["offset_to_id"]



In [None]:
def process_text(text):
    """Lower-case ``text`` and keep only alphanumeric and whitespace characters.

    Leading and trailing whitespace is stripped from the result.
    """
    kept = []
    for symbol in text.lower():
        if not (symbol.isalnum() or symbol.isspace()):
            continue
        kept.append(symbol)
    return "".join(kept).strip()

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Use the GPU when the runtime has one, otherwise fall back to CPU instead of
# failing at load time on a CPU-only runtime.
# NOTE(review): the e5 model card recommends prefixing inputs with "query: " or
# "passage: "; texts in this notebook are embedded without a prefix -- confirm
# that this is intended.
model = SentenceTransformer(
    'intfloat/e5-base-v2',
    device="cuda" if torch.cuda.is_available() else "cpu",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
def _chunk_number(name):
    """Return N for a file named ``chunk_N_processed.csv``, otherwise ``None``."""
    prefix, suffix = "chunk_", "_processed.csv"
    if not (name.startswith(prefix) and name.endswith(suffix)):
        return None
    middle = name[len(prefix):len(name) - len(suffix)]
    return int(middle) if middle.isdecimal() else None


# Pick up the processed chunks by name instead of deriving them from
# "file count // 2": stray files or gaps in the numbering would otherwise
# produce paths that do not exist. Sorted numerically so chunk_10 follows chunk_9.
files = [
    f"./chunks/{name}"
    for name in sorted(
        (
            entry for entry in os.listdir("./chunks")
            if _chunk_number(entry) is not None
            and os.path.isfile(os.path.join("./chunks", entry))
        ),
        key=_chunk_number,
    )
]
num_files = len(files)

In [None]:
# Display the chunk paths as a sanity check before the storages are built.
files

['./chunks/chunk_1_processed.csv',
 './chunks/chunk_2_processed.csv',
 './chunks/chunk_3_processed.csv',
 './chunks/chunk_4_processed.csv']

In [None]:
# Build one vector storage per chunk file and save it to Google Drive.
embedding_dim = model.get_sentence_embedding_dimension()  # same for every chunk

with tqdm(total=len(files), desc="building storages") as pbar:
    for file in files:
        data = (
            pd.read_csv(file)
            # Rows without text cannot be cleaned (NaN has no .lower()).
            .dropna(subset=["text"])
            .assign(text=lambda frame: frame["text"].map(process_text))
            .drop_duplicates(subset="text")
        )
        # Keep only texts with more than 10 whitespace-separated words. An
        # explicit boolean array also works when the chunk ended up empty.
        long_enough = np.array(
            [len(text.split()) > 10 for text in data["text"]], dtype=bool
        )
        data = data.loc[long_enough]

        storage = VectorStorage(
            dim=embedding_dim,
            embedder=model.encode,
        )
        metadata = [{"text": text} for text in data["text"]]
        # Document ids are the row labels of the CSV as read by pandas; the
        # filters above preserve them. Skip embedding when nothing is left,
        # but still save the (empty) storage so every chunk has an output.
        if not data.empty:
            storage.add_documents(
                ids=data.index.tolist(),
                texts=data["text"].tolist(),
                metadata=metadata
            )
        filename = file.split("/")[-1].split(".")[0]
        storage.save(f"./drive/MyDrive/sae_data/vector_storages{V}/storage-{filename}")
        pbar.update(1)

building storages:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/18320 [00:00<?, ?it/s]

  0%|          | 0/19401 [00:00<?, ?it/s]

  0%|          | 0/17202 [00:00<?, ?it/s]

  0%|          | 0/15180 [00:00<?, ?it/s]