In [None]:
#!pip install faiss-cpu==1.7.4
#!pip install langchain_community==0.0.20
#!pip install sentence-transformers==2.3.1

# pandas data

In [67]:
import pandas as pd
import numpy as np

# Create sample data
data = {
    'file_id': ['file001', 'file002', 'file003', 'file004', 'file005', 
                'file006', 'file007', 'file008', 'file009', 'file010'],
    'chunk_id': ['chunk001', 'chunk002', 'chunk003', 'chunk004', 'chunk005', 
                 'chunk006', 'chunk007', 'chunk008', 'chunk009', 'chunk010'],
    'chunk_content': ['Introduction to Artificial Intelligence and Machine Learning', 
                      'Supervised Learning Algorithms: Linear Regression and Logistic Regression', 
                      'Unsupervised Learning: Clustering Algorithms - KMeans and DBSCAN', 
                      'Neural Networks and Deep Learning: Introduction to TensorFlow and Keras', 
                      'Natural Language Processing (NLP) Techniques: Tokenization and Word Embeddings', 
                      'Computer Vision and Image Processing: Convolutional Neural Networks (CNNs)', 
                      'Reinforcement Learning and Decision Making: Q-Learning and Markov Decision Processes', 
                      'Time Series Forecasting: ARIMA and Exponential Smoothing Methods', 
                      'Model Evaluation and Performance Metrics: ROC Curves and Confusion Matrices', 
                      'Feature Engineering and Data Preprocessing: Handling Missing Data and Outliers'],
}

# Create DataFrame
df = pd.DataFrame(data)

# Display DataFrame
df.head(2)

Unnamed: 0,file_id,chunk_id,chunk_content
0,file001,chunk001,Introduction to Artificial Intelligence and Ma...
1,file002,chunk002,Supervised Learning Algorithms: Linear Regress...


# Embeddings

In [68]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embedding_model = HuggingFaceBgeEmbeddings(model_name=model_name) #, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs


In [69]:
embedding = embedding_model.embed_query("what is ai?")
len(embedding)

768

In [70]:
df['embeddings'] = df['chunk_content'].apply(lambda x: embedding_model.embed_query(x))

In [71]:
type(df.embeddings[0])

list

In [72]:
df.head(2)

Unnamed: 0,file_id,chunk_id,chunk_content,embeddings
0,file001,chunk001,Introduction to Artificial Intelligence and Ma...,"[0.012818186543881893, 0.0036683804355561733, ..."
1,file002,chunk002,Supervised Learning Algorithms: Linear Regress...,"[0.03751228004693985, -0.006025714799761772, -..."


In [73]:
df.to_csv("embeddings.csv", index=False)

In [74]:
len(df.embeddings[9])

768

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   file_id        10 non-null     object
 1   chunk_id       10 non-null     object
 2   chunk_content  10 non-null     object
 3   embeddings     10 non-null     object
dtypes: object(4)
memory usage: 448.0+ bytes


# Actual Working and Experiment

In [78]:
# Importing necessary libraries
from typing import Any, Dict
import faiss
import ast
from typing import Any, Dict

class FaissExtractor():
    def __init__(
        self,
        table: str,
        method: str = "ivfflat",
        embeddings_column: str = "embeddings",
        chunk_id_column: str = "chunk_id",
        chunk_content_column: str = "chunk_content",
        n_list: int = 100,
        n_neighbors: int = 5,
    ) -> None:
        
        model_name = "BAAI/bge-base-en-v1.5"

        self.embedding_model = HuggingFaceBgeEmbeddings(model_name=model_name)

        self.method = method
        self.n_list = n_list

        self.table = table
        self.embeddings_column = embeddings_column
        self.chunk_id_column = chunk_id_column
        self.chunk_content_column = chunk_content_column

        self.n_neighbors = n_neighbors

        self.index = None
        self.dimension = None

    def build_index(self) -> None:

        self.df = pd.read_csv(self.table)
        self.df[self.embeddings_column] = self.df[self.embeddings_column].apply(ast.literal_eval)

        vectors = np.array(self.df[self.embeddings_column].tolist())
        vectors /= np.linalg.norm(vectors, axis=1)[:, np.newaxis]

        self.dimension = vectors.shape[1]

        if self.method == "ivfflat":
            quantizer = faiss.IndexFlatL2(self.dimension)
            self.index = faiss.IndexIVFFlat(
                quantizer,
                self.dimension,
                self.n_list,
                faiss.METRIC_INNER_PRODUCT,
            )

        elif self.method == "hnsw":
            self.index = faiss.IndexHNSWFlat(
                self.dimension,
                self.n_list,
                faiss.METRIC_INNER_PRODUCT,
            )
        else:
            raise ValueError("Invalid indexing method. Use 'ivfflat' or 'hnsw'.")

        self.index.train(vectors)
        self.index.add(vectors)

    def search(self, query: str) -> Dict[str, Any]:

        if self.index is None:
            raise ValueError("Index not built yet. Call build_index() first.")

        query_vector = self.embedding_model.embed_query(query)
        query_vector = np.array(query_vector)
        query_vector = query_vector / np.linalg.norm(query_vector)

        similarity, indices = self.index.search(
            query_vector.reshape(1, -1), self.n_neighbors
        )
        distance = 1 - similarity

        chunk_ids = self.df[self.chunk_id_column].values[indices]
        chunks = self.df[self.chunk_content_column].values[indices]

        nearest_vectors = self.df[self.embeddings_column].values[indices]

        result = dict(
            similarity=similarity.flatten().tolist(),
            distance=distance.flatten().tolist(),
            chunk_ids=chunk_ids,
            chunks=chunks,
            vectors=nearest_vectors,
        )

        return result

In [79]:
# Instantiating FaissExtractor object
faiss_extractor = FaissExtractor(
    table="embeddings.csv",  # Provide the path to your CSV file
    embeddings_column="embeddings",  # Name of the embeddings column in your CSV
    chunk_id_column="chunk_id",  # Name of the chunk ID column in your CSV
    chunk_content_column="chunk_content",  # Name of the chunk content column in your CSV
    n_list=10,  # Number of clusters for IVFFlat index
    n_neighbors=5,  # Number of nearest neighbors to retrieve
)

# Building the index
faiss_extractor.build_index()

# Performing a similarity search
query = "what is ai"  # Specify your query
search_result = faiss_extractor.search(query)

# Printing the search result
print(search_result)


{'similarity': [0.6494624614715576, -3.4028234663852886e+38, -3.4028234663852886e+38, -3.4028234663852886e+38, -3.4028234663852886e+38], 'distance': [0.3505375385284424, 3.4028234663852886e+38, 3.4028234663852886e+38, 3.4028234663852886e+38, 3.4028234663852886e+38], 'chunk_ids': array([['chunk001', 'chunk010', 'chunk010', 'chunk010', 'chunk010']],
      dtype=object), 'chunks': array([['Introduction to Artificial Intelligence and Machine Learning',
        'Feature Engineering and Data Preprocessing: Handling Missing Data and Outliers',
        'Feature Engineering and Data Preprocessing: Handling Missing Data and Outliers',
        'Feature Engineering and Data Preprocessing: Handling Missing Data and Outliers',
        'Feature Engineering and Data Preprocessing: Handling Missing Data and Outliers']],
      dtype=object), 'vectors': array([[list([0.012818186543881893, 0.0036683804355561733, -0.004391423426568508, 0.014650912955403328, 0.08693765103816986, 0.0453343465924263, 0.021985