In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [3]:
import faiss
import torch
import numpy as np
import pickle

from typing import Any, Dict, List, Callable, Union
from tqdm.auto import tqdm


__all__ = (
    "VectorStorage",
)


class VectorStorage(object):
    """
    A class to manage vector storage using FAISS.
    This class provides methods to add, search, and delete vectors,
    as well as save and load the index to/from disk.
    It also allows for the storage of associated metadata.
    Attributes:
        dim (int): The dimension of the vectors.
        embedder (SentenceTransformer): A function to convert text to vectors.
        index (faiss.Index): The FAISS index for vector storage.
    """

    def __init__(
        self,
        dim: int,
        embedder: Callable[..., Union[torch.Tensor, np.ndarray]] = None
    ):
        """
        Initialize the VectorStorage with the specified parameters.
        Args:
            dim (int): The dimension of the vectors.
            embedder (Callable[[str], np.ndarray]): A function to convert text to vectors.
        """
        self.dim: int = dim
        self.embedder: Callable[..., Union[torch.Tensor, np.ndarray]] = embedder
        self.index = faiss.IndexFlatIP(self.dim)
        self._metadata: Dict[int, Dict[str, Any]] = {}
        self._id_to_offset: Dict[int, int] = {}
        self._offset_to_id: List[int] = []


    def add_document(self, index: int, text: str, metadata: Dict[str, Any]) -> None:
        """
        Add a single document to the vector storage.
        Args:
            index (int): The ID of the document.
            text (str): The text content of the document.
            metadata (Dict[str, Any]): Metadata associated with the document.
        Raises:
            ValueError: If the embedder function is not provided.
        """
        vec = self.embedder(text)
        vec = vec / np.linalg.norm(vec)
        arr = np.asarray([vec], dtype="float32")

        self.index.add(arr)
        self._id_to_offset[index] = len(self._offset_to_id)
        self._offset_to_id.append(index)
        self._metadata[index] = metadata


    def add_documents(
        self,
        ids: List[int],
        texts: List[str],
        metadata: List[Dict[str, Any]]
    ) -> None:
        """
        Add multiple documents to the vector storage.
        Args:
            ids (List[int]): The IDs of the documents.
            texts (List[str]): The text content of the documents.
            metadata (List[Dict[str, Any]]): Metadata associated with the documents.
        Raises:
            ValueError: If the embedder function is not provided.
        """
        vectors = np.asarray(
            [
                self.embedder(t, show_progress_bar=False) for t in tqdm(texts)
            ], dtype="float32"
        )
        vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

        self.index.add(vectors)

        for idx in ids:
            self._id_to_offset[idx] = len(self._offset_to_id)
            self._offset_to_id.append(idx)

        for idx, md in zip(ids, metadata):
            self._metadata[idx] = md


    def search(self, text: str, *, k: int = 5, threshold: float = 1.0) -> List[Dict[str, Any]]:
        """
        Search for the nearest neighbors of the given text in the vector storage.
        Args:
            text (str): The text to search for.
            k (int): The number of nearest neighbors to return.
            threshold (float): The distance threshold for filtering results.
        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing the ID, score,
                                  and metadata of the nearest neighbors.
        Raises:
            ValueError: If the embedder function is not provided.
        """
        if self.embedder is None:
            raise ValueError("Embedder function must be provided.")
        query_vec = self.embedder(text, show_progress_bar=False)
        query_vec /= np.linalg.norm(query_vec)  # L2 normalization
        query_vec = np.asarray([query_vec], dtype="float32")

        distances, ids = self.index.search(query_vec, k)
        results: List[Dict[str, Any]] = []

        for dist, pos in zip(distances[0], ids[0]):
            if pos == -1 or dist > threshold:
                continue
            doc_id = self._offset_to_id[pos]
            results.append(
                {
                    "id": int(doc_id),
                    "score": float(dist),
                    "metadata": self._metadata.get(int(doc_id))
                }
            )
        return results

    def delete_documents(self, document_ids: List[int]) -> None:
        """
        Delete multiple documents from the vector storage.
        Args:
            document_ids (List[int]): The IDs of the documents to delete.
        """
        faiss_ids = np.array(document_ids, dtype="int64")
        self.index.remove_ids(faiss_ids)
        for doc_id in document_ids:
            self._metadata.pop(doc_id, None)

    def delete_document(self, document_id: int) -> None:
        """
        Delete a single document from the vector storage.
        Args:
            document_id (int): The ID of the document to delete.
        """
        self.delete_documents([document_id])

    def save(self, filepath: str) -> None:
        """
        Save the FAISS index and metadata to disk.
        Args:
            filepath (str): The base file path to save the index and metadata.
        """
        faiss.write_index(self.index, f"{filepath}.index")
        with open(f"{filepath}.pkl", "wb") as file:
            pickle.dump({
                "metadata": self._metadata,
                "id_to_offset": self._id_to_offset,
                "offset_to_id": self._offset_to_id
            }, file)

    def load(self, filepath: str) -> None:
        """
        Load the FAISS index and metadata from disk.
        Args:
            filepath (str): The base file path to load the index and metadata from.
        """
        self.index = faiss.read_index(f"{filepath}.index")
        with open(f"{filepath}.pkl", "rb") as file:
            data = pickle.load(file)
            self._metadata = data["metadata"]
            self._id_to_offset = data["id_to_offset"]
            self._offset_to_id = data["offset_to_id"]



In [4]:
import pandas as pd

In [5]:
data = pd.read_csv("./drive/MyDrive/raw_data2.csv")

In [6]:
def process_text(text):
    return "".join([char for char in text.lower() if char.isalnum() or char.isspace()]).strip()

In [7]:
data.text = data.text.apply(process_text)

In [8]:
from sentence_transformers import SentenceTransformer

In [9]:
model = SentenceTransformer('intfloat/e5-base-v2', device="cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [10]:
storage = VectorStorage(
    dim=model.get_sentence_embedding_dimension(),
    embedder=model.encode,
)

In [11]:
metadata = [{"text": text} for text in data["text"]]

In [12]:
storage.add_documents(
    ids=data.index.tolist(),
    texts=data["text"].tolist(),
    metadata=metadata
)

  0%|          | 0/983 [00:00<?, ?it/s]

In [13]:
storage.search(
    "Russia and America were on good terms with America in 1942.".lower()
)

[{'id': 247,
  'score': 0.8237992525100708,
  'metadata': {'text': 'territory gained by the ussr  1939 41 and in 1945'}},
 {'id': 795,
  'score': 0.7956979870796204,
  'metadata': {'text': '19 carr twilight chs 34 20 haslam soviet foreign policy pp 678'}},
 {'id': 77,
  'score': 0.7944697737693787,
  'metadata': {'text': 'russias fin de siecle 19001914  m a r k d st e i n b e rg'}},
 {'id': 818,
  'score': 0.793385922908783,
  'metadata': {'text': 'moscows foreign policy 19452000 identities institutions and interests'}},
 {'id': 9,
  'score': 0.7923578023910522,
  'metadata': {'text': 'those sentences retain their relevance at the beginning of the twentyfirst century western particularly american attitudes and understandings of russia and the soviet union unfolded in the last hundred years within a broad discourse of optimism about human progress that relied on the comfort ing thought that capitalist democracy represented the best possible solution to human society if not the end of hi

In [14]:
storage.save("./drive/MyDrive/rd2indexflat")