# Quick Check(IGNORE)

In [None]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Set your Hugging Face token
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Load documents
loader = DirectoryLoader('data2/text/range/0-5000', loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Generate embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local('faiss_index')

print("Embeddings stored successfully!")

In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.10-py3-none-any.whl (332 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.8/332.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.82-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.4/127.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langs

In [None]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

# Load documents
loader = DirectoryLoader('/content/drive/MyDrive/ti_sample_emedding_files', loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Load the new embedding model
model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
model = SentenceTransformer(model_name)

# Generate embeddings
def embed_text(text):
    return model.encode(text, convert_to_tensor=True)

embeddings = [embed_text(doc.page_content) for doc in all_splits]

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local('faiss_index_new_model')

print("Embeddings stored successfully with the new model!")


len of documents are 112
Length of all_splits: 112


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

AttributeError: 'list' object has no attribute 'embed_documents'

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

# Load documents
loader = DirectoryLoader('/content/drive/MyDrive/ti_sample_emedding_files', loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Load the new embedding model
model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
model = SentenceTransformer(model_name)

# Define a custom embedding class
class CustomEmbeddings:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return [self.model.encode(text) for text in texts]

# Initialize the custom embedding class
embeddings = CustomEmbeddings(model)

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local('faiss_index_new_model')

print("Embeddings stored successfully with the new model!")


len of documents are 112
Length of all_splits: 112




Embeddings stored successfully with the new model!


In [None]:
# Load the FAISS index
vectorstore = FAISS.load_local('/content/faiss_index_new_model', embeddings, allow_dangerous_deserialization=True)

# Perform a search query
query = "Interruption while handling metal"
query_embedding = embed_text(query)

# Retrieve the top 5 most similar documents
results = vectorstore.similarity_search_by_vector(query_embedding, k=5)

for result in results:
    print(result.page_content)




TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

# START HERE

In [None]:
!pip freeze

absl-py==1.4.0
accelerate==0.31.0
aiohttp==3.9.5
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.15.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.9.0
bitsandbytes==0.43.1
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.6.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.18.1
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.4
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==42.0.8
cuda-python==

In [None]:
!pip install langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install faiss-gpu

Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl (975 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.5/975.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.10-py3-none-any.whl (332 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.8/332.8 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.82-py3-none-any.whl (127 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.4/127.4 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from lang

In [1]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import faiss
import torch
import numpy as np

# Load documents
loader = DirectoryLoader('/content/drive/MyDrive/sample_embedding_folder', loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Load the new embedding model
model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
model = SentenceTransformer(model_name)

# Define a custom embedding class
class CustomEmbeddings:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return [self.model.encode(text, convert_to_tensor=True).cpu().numpy() for text in texts]

    def embed_query(self, text):
        return self.model.encode(text, convert_to_tensor=True).cpu().numpy()

# Initialize the custom embedding class
embeddings = CustomEmbeddings(model)

# Generate embeddings
document_embeddings = [embeddings.embed_documents([doc.page_content])[0] for doc in all_splits]

# Store embeddings in the FAISS vector store on GPU
faiss_index = faiss.IndexFlatL2(document_embeddings[0].shape[0])
gpu_resource = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(gpu_resource, 0, faiss_index)

gpu_index.add(np.array(document_embeddings))
faiss.write_index(faiss.index_gpu_to_cpu(gpu_index), 'faiss_index_new_model3.index')

print("Embeddings stored successfully with the new model!")



In [3]:
cpu_index = faiss.read_index('/content/drive/MyDrive/faiss_index_new_model3.index')
gpu_index = faiss.index_cpu_to_gpu(gpu_resource, 0, cpu_index)

# Perform a search query
query = "Does the VIP modules & CSI2 module could work simultaneously?"
query_embedding = embeddings.embed_query(query)

# Retrieve the top 5 most similar documents
distances, indices = gpu_index.search(np.array([query_embedding]), k=5)

for idx in indices[0]:
    print(all_splits[idx].page_content)
    print("############################")

# Approach 1

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# import torch

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )
# device = "cuda" # the device to load the model onto

# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2",quantization_config=bnb_config,use_auth_token="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")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2",use_auth_token="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")


In [None]:
# PROMPT="Do you have mayonnaise recipes?"
# messages = [
#     {"role": "user", "content": PROMPT}
# ]

# encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

# model_inputs = encodeds.to(device)
# # model.to(device)

# generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
# decoded = tokenizer.batch_decode(generated_ids)
# print(decoded[0])

# Approach 2

In [None]:
!pip install -U bitsandbytes
!pip install -U accelerate




In [None]:
# !pip install -U bitsandbytes
# !pip install -U accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datetime import datetime

# Install and import necessary packages
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Example prompt
prompt = "Do you have any recommendations for a good thriller novel?"

# Encode and prepare inputs
messages = [{"role": "user", "content": prompt}]
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to(device)

# Perform inference and measure time
start_time = datetime.now()
generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
elapsed_time = datetime.now() - start_time

# Decode and print output
decoded = tokenizer.batch_decode(generated_ids)
print("Generated response:", decoded[0])
print("Time elapsed:", elapsed_time)

# Check if running on GPU
print("Device in use:", device)





`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated response: <s> [INST] Do you have any recommendations for a good thriller novel? [/INST] Yes, I would be happy to recommend a few thriller novels that have received high praise and great reviews from critics and readers alike. Here are some suggestions:

1. "Gone Girl" by Gillian Flynn: This psychological thriller tells the story of a married couple whose relationship is tested when the wife goes missing. The narrative is filled with twists and turns that will keep you on the edge of your seat.
2. "The Silent Patient" by Alex Michaelides: This psychological thriller revolves around a woman who shoots her husband five times and then refuses to speak. The novel explores the mystery behind her silence and the dark secrets in her past.
3. "The Girl on the Train" by Paula Hawkins: This suspenseful thriller follows a woman named Rachel who becomes involved in a missing persons case while riding the train every day. The story unfolds through a series of flashbacks, each one revealing

In [None]:
# !pip install mistral_inference

In [None]:
# from mistral_inference.model import Transformer
# from mistral_inference.generate import generate

# from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
# from mistral_common.protocol.instruct.messages import UserMessage
# from mistral_common.protocol.instruct.request import ChatCompletionRequest
# from pathlib import Path

# # mistral_models_path = Path.home().joinpath('mistral_models', '7B-Instruct-v0.3')
# # mistral_models_path.mkdir(parents=True, exist_ok=True)
# mistral_models_path = Path.home().joinpath('mistral_models', '7B-v0.3')
# mistral_models_path.mkdir(parents=True, exist_ok=True)
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2",use_auth_token="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")
# # tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
# model = Transformer.from_folder(mistral_models_path)

# completion_request = ChatCompletionRequest(messages=[UserMessage(content="Explain Machine Learning to me in a nutshell.")])

# tokens = tokenizer.encode_chat_completion(completion_request).tokens

# out_tokens, _ = generate([tokens], model, max_tokens=64, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
# result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])

# print(result)


In [None]:
import os
import pickle
import faiss
import numpy as np
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Load documents
loader = DirectoryLoader('/content/drive/MyDrive/sample_embedding_folder', loader_cls=TextLoader)
documents = loader.load()
print('Length of documents:', len(documents))

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Load the new embedding model
model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
model = SentenceTransformer(model_name)

# Define a custom embedding class
class CustomEmbeddings:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return [self.model.encode(text, convert_to_tensor=True).cpu().numpy() for text in texts]

    def embed_query(self, text):
        return self.model.encode(text, convert_to_tensor=True).cpu().numpy()

# Initialize the custom embedding class
embeddings = CustomEmbeddings(model)

# Generate embeddings
document_embeddings = [embeddings.embed_documents([doc.page_content])[0] for doc in all_splits]

# Initialize FAISS index on CPU
faiss_index = faiss.IndexFlatL2(document_embeddings[0].shape[0])

# Add embeddings to the index
faiss_index.add(np.array(document_embeddings))

# Store FAISS index in .faiss file
faiss.write_index(faiss.index_gpu_to_cpu(faiss_index), 'index.faiss')

# Store document embeddings in a .pkl file
with open('index.pkl', 'wb') as f:
    pickle.dump(document_embeddings, f)

print("Embeddings and FAISS index stored successfully with the new model!")


Length of documents: 1443
Length of all_splits: 1661
Embeddings and FAISS index stored successfully with the new model!


# TOTAL APPROACH

In [None]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import faiss
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline,BitsAndBytesConfig

class DocumentRetrievalAndGeneration:
    def __init__(self, embedding_model_name, lm_model_id, data_folder):
        self.documents = self.load_documents(data_folder)
        self.embeddings = SentenceTransformer(embedding_model_name)
        self.gpu_index = self.build_faiss_index()
        self.llm = self.initialize_llm(lm_model_id)

    def load_documents(self, folder_path):
        loader = DirectoryLoader(folder_path, loader_cls=TextLoader)
        documents = loader.load()
        print('Length of documents:', len(documents))
        return documents

    def build_faiss_index(self):
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
        all_splits = text_splitter.split_documents(self.documents)
        document_embeddings = [self.embeddings.encode(doc.page_content, convert_to_tensor=True).cpu().numpy() for doc in all_splits]
        faiss_index = faiss.IndexFlatL2(document_embeddings[0].shape[0])
        gpu_resource = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(gpu_resource, 0, faiss_index)
        gpu_index.add(np.array(document_embeddings))
        return gpu_index

    def initialize_llm(self, model_id):
        bnb_config = BitsAndBytesConfig(
          load_in_4bit=True,
          bnb_4bit_use_double_quant=True,
          bnb_4bit_quant_type="nf4",
          bnb_4bit_compute_dtype=torch.bfloat16
        )
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=bnb_config)
        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
        generate_text = pipeline(
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,
            task='text-generation',
            temperature=0.6,
            max_new_tokens=2048,
        )
        return generate_text

    def query_and_generate_response(self, query):
        query_embedding = self.embeddings.encode(query, convert_to_tensor=True).cpu().numpy()
        distances, indices = self.gpu_index.search(np.array([query_embedding]), k=5)

        content = ""
        all_splits=build_faiss_index.all_splits
        for idx in indices[0]:
          content += "-" * 50 + "\n"

          content+=all_splits[idx].page_content
          print(all_splits[idx].page_content)
          print("############################")
        # for idx in indices[0]:
        #     content += "-" * 50 + "\n"
        #     content += self.documents[idx].page_content + "\n"

        prompt = f"Query: {query}\nSolution: {content}\n"
        encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
        model_inputs = encodeds.to(device)

        # Perform inference and measure time
        start_time = datetime.now()
        generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
        elapsed_time = datetime.now() - start_time

        # Decode and print output
        decoded = tokenizer.batch_decode(generated_ids)
        print("Generated response:", decoded[0])
        print("Time elapsed:", elapsed_time)

        # Check if running on GPU
        print("Device in use:", device)
        # result = self.llm(prompt)
        # generated_text = result[0]['generated_text']
        # return generated_text , content

if __name__ == "__main__":
    # Example usage
    embedding_model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
    lm_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
    data_folder = '/content/drive/MyDrive/sample_embedding_folder'

    doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder)

    # Example query
    query = "master to slave delay"

    # Perform query and generate response
    response = doc_retrieval_gen.query_and_generate_response(query)
    print("Generated response:\n", response)


In [None]:
# !faiss-gpu

/bin/bash: line 1: faiss-gpu: command not found


In [4]:
import os
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import faiss
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from datetime import datetime

class DocumentRetrievalAndGeneration:
    def __init__(self, embedding_model_name, lm_model_id, data_folder, faiss_index_path):
        self.documents = self.load_documents(data_folder)
        self.embeddings = SentenceTransformer(embedding_model_name)
        self.gpu_index = self.load_faiss_index(faiss_index_path)
        self.llm = self.initialize_llm(lm_model_id)

    def load_documents(self, folder_path):
        loader = DirectoryLoader(folder_path, loader_cls=TextLoader)
        documents = loader.load()
        print('Length of documents:', len(documents))
        return documents

    def load_faiss_index(self, faiss_index_path):
        cpu_index = faiss.read_index(faiss_index_path)
        gpu_resource = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(gpu_resource, 0, cpu_index)
        return gpu_index

    def initialize_llm(self, model_id):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        generate_text = pipeline(
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,
            task='text-generation',
            temperature=0.6,
            max_new_tokens=2048,
        )
        return generate_text

    def query_and_generate_response(self, query):
        query_embedding = self.embeddings.encode(query, convert_to_tensor=True).cpu().numpy()
        distances, indices = self.gpu_index.search(np.array([query_embedding]), k=5)

        content = ""
        for idx in indices[0]:
            content += "-" * 50 + "\n"
            content += self.documents[idx].page_content + "\n"
            print(self.documents[idx].page_content)
            print("############################")

        prompt = f"Query: {query}\nSolution: {content}\n"

        # Encode and prepare inputs
        messages = [{"role": "user", "content": prompt}]
        encodeds = self.llm.tokenizer.apply_chat_template(messages, return_tensors="pt")
        model_inputs = encodeds.to(self.llm.device)

        # Perform inference and measure time
        start_time = datetime.now()
        generated_ids = self.llm.model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
        elapsed_time = datetime.now() - start_time

        # Decode and print output
        decoded = self.llm.tokenizer.batch_decode(generated_ids)
        generated_response = decoded[0]
        print("Generated response:", generated_response)
        print("Time elapsed:", elapsed_time)
        print("Device in use:", self.llm.device)

        return generated_response

if __name__ == "__main__":
    # Example usage
    embedding_model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
    lm_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
    data_folder = '/content/drive/MyDrive/sample_embedding_folder'
    faiss_index_path = '/content/drive/MyDrive/faiss_index_new_model3.index'

    doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder, faiss_index_path)

    # Example query
    query = "master to slave delay"

    # Perform query and generate response
    response = doc_retrieval_gen.query_and_generate_response(query)
    print("Generated response:\n", response)
