In [None]:
!pip install -q langchain faiss-cpu sentence-transformers==2.2.2 InstructorEmbedding pypdf

In [None]:
pip install google-cloud-aiplatform

In [None]:
from langchain.document_loaders import TextLoader
from pypdf import PdfReader
from langchain import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory

In [None]:
import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple
from typing import Optional, TypeVar
import math
from typing import Any

import numpy as np
from tqdm.auto import tqdm

from vertexai.preview.language_models import TextEmbeddingModel

# Load the text-embedding model once at module level; encode_texts_to_embeddings
# below reads it as the global `model`.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

# Define an embedding method that uses the model
def encode_texts_to_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    """Embed one batch of sentences with the module-level ``model``.

    Args:
        sentences: The texts to embed in a single ``model.get_embeddings`` call.

    Returns:
        One vector (the ``values`` of each returned embedding) per sentence, in
        the order the model returned them. If anything raises, the exception is
        printed and a list of ``None`` with the same length as ``sentences`` is
        returned instead, so callers can see that the whole batch failed.
    """
    print("batch size ", len(sentences))
    try:
        vectors = []
        for item in model.get_embeddings(sentences):
            vectors.append(item.values)
    except Exception as err:
        # Deliberately best-effort: a failed batch must not abort the whole run.
        print("exception", err)
        return [None] * len(sentences)
    return vectors
    

# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """Lazily yield consecutive slices of ``sentences``.

    Each slice holds at most ``batch_size`` items; the last one may be shorter.
    An empty ``sentences`` list yields nothing.
    """
    batch_starts = range(0, len(sentences), batch_size)
    yield from (sentences[start : start + batch_size] for start in batch_starts)


def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: int = 1, batch_size: int = 10
) -> Tuple[List[bool], np.ndarray]:
    """Embed ``sentences`` in batches, throttling how fast batches are submitted.

    Batches are produced by ``generate_batches`` and each one is submitted to a
    thread pool running ``encode_texts_to_embeddings``; after every submission
    the loop sleeps ``1 / api_calls_per_second`` seconds.

    Args:
        sentences: Texts to embed.
        api_calls_per_second: Maximum number of batch submissions per second.
        batch_size: Number of sentences sent in one embedding call.

    Returns:
        A tuple ``(is_successful, embeddings)``:

        * ``is_successful`` has one bool per input sentence, ``False`` where the
          embedding call returned ``None`` for it.
        * ``embeddings`` is always a 2-D array with one row per *successful*
          sentence, in input order. It has shape ``(0, 0)`` when the input is
          empty or every batch failed.

    Raises:
        ValueError: If ``batch_size`` is less than 1 or ``api_calls_per_second``
            is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if api_calls_per_second <= 0:
        raise ValueError("api_calls_per_second must be positive")

    # Nothing to embed: return early instead of indexing into an empty list.
    if not sentences:
        return [], np.empty((0, 0))

    seconds_per_job = 1 / api_calls_per_second
    total_batches = math.ceil(len(sentences) / batch_size)

    embeddings_list: List[Optional[List[float]]] = []
    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            generate_batches(sentences, batch_size), total=total_batches, position=0
        ):
            futures.append(executor.submit(encode_texts_to_embeddings, batch))
            # Throttle submissions to stay under the requested call rate.
            time.sleep(seconds_per_job)

        # Collect in submission order so results line up with `sentences`.
        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [embedding is not None for embedding in embeddings_list]
    successful = [embedding for embedding in embeddings_list if embedding is not None]
    print(f"embedded {len(successful)} of {len(sentences)} sentences")

    if not successful:
        # np.stack raises on an empty list; report "no rows" instead.
        return is_successful, np.empty((0, 0))

    # No np.squeeze here: squeezing turned a single embedding into a 1-D array,
    # so callers indexing rows got scalars instead of vectors.
    return is_successful, np.stack(successful)

In [None]:
import os
# Single File read block
# Specify the directory you want to read

# Source directory, text buffer and chunking configuration for the single-file run.
documents_directory = '/home/jupyter/rbi-bot/rbi-docs'
documents_1 = ''
splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=400)

def split_documents(splitter, documents_1):
    """Cut the raw text ``documents_1`` into chunks with ``splitter``.

    Returns the result of ``splitter.split_text`` unchanged; the chunks are not
    wrapped into document objects here.
    """
    return splitter.split_text(documents_1)

vector_db = None
filepath = '/home/jupyter/rbi-bot/rbi-docs/01MC01042024E0D6B768164C41678A616F743BF7426B 2.pdf'

# PdfReader opens the PDF from its path itself, so the previous text-mode
# open() of the same (binary) file was an unused handle and has been dropped.
reader = PdfReader(filepath)
# Build the text in one pass; `or ''` guards against a page yielding no text.
documents_1 = ''.join(page.extract_text() or '' for page in reader.pages)
print("generated documents")
splits_1 = split_documents(splitter, documents_1)
print("split documents")

# Embed every chunk of the document. The call returns a per-chunk success mask
# and an array holding only the embeddings that succeeded.
is_successful, question_embeddings = encode_text_to_embedding_batched(
   splits_1
)

# Keep only the chunks that were embedded, so texts and embedding rows line up.
embedded_chunks = [chunk for chunk, ok in zip(splits_1, is_successful) if ok]

# A single embedding can come back as a 1-D array; normalise to 2-D so the
# dimension lookup and the row preview below work for any number of chunks.
question_embeddings = np.atleast_2d(question_embeddings)
DIMENSIONS = question_embeddings.shape[1]
print(DIMENSIONS)
# Preview the first embedding (slicing never raises, even with a single row).
print(question_embeddings[:1])

In [None]:
########### Embedding creation code till here ######
#### TODO - Write embeddings to JSONL format ####
#### TODO - Push files to vector search index ####


In [None]:
import os
from vertexai.preview.language_models import TextEmbeddingModel


# Directory processing block
# Specify the directory you want to read
# Input/output locations and text buffer for the directory run.
documents_directory = '/home/jupyter/rbi-bot/rbi-docs'
vector_db_directory = '/data/vector-store/rbi-docs-v2/'
documents_1 = ''

# Disabled alternative embedding backend, kept for reference:
#instructor_embeddings = HuggingFaceInstructEmbeddings(
#    model_name='hkunlp/instructor-xl', model_kwargs={}
#)

# Text-embedding model handle (rebinds the global `model`).
model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=400)

def split_documents(splitter, documents_1):
    """Chunk the raw text ``documents_1`` and wrap the chunks as documents.

    The text is first cut with ``splitter.split_text``; the resulting chunks are
    then passed to ``splitter.create_documents``, whose result is returned.
    """
    text_chunks = splitter.split_text(documents_1)
    return splitter.create_documents(text_chunks)

# FAISS.from_documents needs a LangChain Embeddings object (embed_documents /
# embed_query). The raw TextEmbeddingModel handle in `model` does not offer that
# interface, so the same model is wrapped in LangChain's Vertex AI adapter.
from langchain.embeddings import VertexAIEmbeddings

vertex_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko")

vector_db = None
# sorted() makes the processing order (and the printed index) reproducible.
for i, filename in enumerate(sorted(os.listdir(documents_directory))):
    # Create the full file path
    filepath = os.path.join(documents_directory, filename)

    # Skip sub-directories and anything that is not a PDF.
    if not os.path.isfile(filepath) or not filename.lower().endswith('.pdf'):
        continue

    print(f"Processing file {i,filename}:")
    # PdfReader opens the file from its path; no separate open() is needed.
    reader = PdfReader(filepath)
    # Rebuild the buffer for every file. Appending to one shared string made
    # each later file re-index the text of all files processed before it.
    documents_1 = ''.join(page.extract_text() or '' for page in reader.pages)
    print("extracted pages")
    splits_1 = split_documents(splitter, documents_1)
    print("split documents")
    if not splits_1:
        # A file with no extractable text cannot be indexed.
        print(f"no text extracted, skipping {filename}")
        continue

    extension_db = FAISS.from_documents(splits_1, vertex_embeddings)
    print("embeddings generated :")
    if vector_db is None:
        vector_db = extension_db
    else:
        vector_db.merge_from(extension_db)

    # Save after every file so progress survives an interrupted run.
    print("saving to vector DD :")
    vector_db.save_local(vector_db_directory)
    print("saved")



In [None]:
!pip install ipywidgets==7.7

In [None]:
import os
from getpass import getpass

from langchain.embeddings import VertexAIEmbeddings

# Never commit API tokens. Read the Hugging Face token from the environment and
# fall back to an interactive prompt. The token that used to be hard-coded in
# this cell has been exposed and should be revoked.
token = os.environ.get('HUGGINGFACEHUB_API_TOKEN') or getpass('Hugging Face API token: ')

# Load db
# `instructor_embeddings` is only defined in commented-out code, so referencing
# it fails on a fresh kernel. Queries must be embedded with the model the index
# was built with; the build cell uses textembedding-gecko.
# NOTE(review): confirm the index on disk was built with textembedding-gecko.
query_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko")

# allow_dangerous_deserialization loads pickled data: only use it on an index
# that this project created itself, never on files from an untrusted source.
loaded_db = FAISS.load_local(
    '/data/vector-store/rbi-docs-v2/', query_embeddings, allow_dangerous_deserialization=True
)

In [None]:
question = 'what should be constitution of a UCB board?'

# Plain retrieval. Only the last expression of a cell is rendered, so this
# intermediate result is shown explicitly (display is provided by IPython).
search = loaded_db.similarity_search(question)
display(search)

# Same query with a score attached to each hit; rendered as the cell output.
search_with_similarity_scores = loaded_db.similarity_search_with_score(question)
search_with_similarity_scores

In [None]:
# Generation settings and the Hugging Face Hub model to query.
temperature = 0.5
max_length = 300
llm_model = 'tiiuae/falcon-7b-instruct'

# Load LLM
generation_kwargs = {'temperature': temperature, 'max_length': max_length}
llm = HuggingFaceHub(
    huggingfacehub_api_token=token,
    model_kwargs=generation_kwargs,
    repo_id=llm_model,
)

# Create the chatbot: a "stuff" retrieval-QA chain over the loaded vector store
# that also returns the source documents it used.
retriever = loaded_db.as_retriever()
qa = RetrievalQA.from_chain_type(
    chain_type='stuff',
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
# Run the retrieval-QA chain on the question; the result is left as the cell's
# last expression so the notebook renders it.
qa_inputs = {'query': question}
response = qa(qa_inputs)
response