In [None]:
!pip install -q langchain faiss-cpu sentence-transformers==2.2.2 InstructorEmbedding pypdf

In [None]:
pip install google-cloud-aiplatform

In [4]:
from langchain.document_loaders import TextLoader
from pypdf import PdfReader
from langchain import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory

import functools
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Generator, List, Tuple
from typing import Optional, TypeVar
import math
from typing import Any
from pathlib import Path
import os
import json
import numpy as np
from tqdm.auto import tqdm


# Vertex AI embedding model setup.
from vertexai.preview.language_models import TextEmbeddingModel
# Load the pretrained "textembedding-gecko" model once at module level; the
# embedding helper defined below calls `model.get_embeddings(...)` on it.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
def encode_texts_to_vertex_embeddings(sentences: List[str]) -> List[Optional[List[float]]]:
    """Embed one batch of texts with the module-level ``model``.

    This is a best-effort call: if anything raises while requesting or
    unpacking the embeddings, the exception is printed and a placeholder list
    containing one ``None`` per input sentence is returned instead.

    Args:
        sentences: The texts to embed in a single request.

    Returns:
        One entry per input sentence: the ``values`` of each embedding on
        success, or ``None`` for every sentence if the request failed.
    """
    batch_len = len(sentences)
    print("batch size ", batch_len)
    try:
        response = model.get_embeddings(sentences)
        vectors = []
        for item in response:
            vectors.append(item.values)
    except Exception as e:
        # Deliberately broad: a failed batch must not abort the whole run.
        print("exception", e)
        return [None] * batch_len
    return vectors
    

# Generator function to yield batches of sentences
def generate_batches(
    sentences: List[str], batch_size: int
) -> Generator[List[str], None, None]:
    """Lazily yield consecutive slices of ``sentences``.

    Each yielded batch holds at most ``batch_size`` items; the final batch is
    shorter when ``len(sentences)`` is not a multiple of ``batch_size``.
    """
    batch_starts = range(0, len(sentences), batch_size)
    yield from (sentences[start : start + batch_size] for start in batch_starts)


def encode_text_to_embedding_batched(
    sentences: List[str], api_calls_per_second: float = 1.0, batch_size: int = 100
) -> Tuple[List[bool], np.ndarray]:
    """Embed ``sentences`` in rate-limited batches on a thread pool.

    The sentences are split into batches of at most ``batch_size`` items. Each
    batch is submitted to ``encode_texts_to_vertex_embeddings`` on a worker
    thread, and the submitting loop sleeps ``1 / api_calls_per_second`` seconds
    after every submission.

    Args:
        sentences: Texts to embed.
        api_calls_per_second: Maximum rate at which batches are submitted.
        batch_size: Maximum number of sentences per request.

    Returns:
        A tuple ``(is_successful, embeddings)``:

        * ``is_successful`` has one flag per input sentence, ``True`` when an
          embedding was obtained for it.
        * ``embeddings`` is a 2-D array with one row per *successful*
          sentence, in input order. It always has one row per success, so a
          single success yields shape ``(1, dim)`` rather than a squeezed 1-D
          vector. When nothing succeeded (or ``sentences`` is empty) an empty
          array of shape ``(0, 0)`` is returned instead of raising.
    """
    embeddings_list: List[Optional[List[float]]] = []
    # Prepare the batches using a generator
    batches = generate_batches(sentences, batch_size)

    # Keep the delay as a float: truncating to int shortens the wait
    # (e.g. 12.5 s -> 12 s) and turns any rate above 1 call/s into no wait.
    seconds_per_job = 1 / api_calls_per_second

    with ThreadPoolExecutor() as executor:
        futures = []
        for batch in tqdm(
            batches, total=math.ceil(len(sentences) / batch_size), position=0
        ):
            futures.append(executor.submit(encode_texts_to_vertex_embeddings, batch))
            # Sleep after every submission, including the last one, so that
            # back-to-back calls to this function are still spaced apart.
            time.sleep(seconds_per_job)

        for future in futures:
            embeddings_list.extend(future.result())

    is_successful = [embedding is not None for embedding in embeddings_list]
    successful_embeddings = [
        embedding for embedding in embeddings_list if embedding is not None
    ]
    if not successful_embeddings:
        # np.stack raises on an empty sequence; return an empty 2-D array so
        # callers that iterate over rows simply see nothing.
        return is_successful, np.empty((0, 0))
    return is_successful, np.stack(successful_embeddings)

# block to process entire directory of pdf docs

# Directory that receives the per-document embedding JSON files.
embeddings_file_path = Path("/home/jupyter/rbi-bot/embeddings/")

# Request rate handed to the batched encoder. 0.08 calls/s is 4.8 requests per
# minute, i.e. just under a 5-requests-per-minute limit.
# NOTE(review): 5 requests/minute is stated to be the default quota - confirm
# against the quota of the project actually in use.
API_CALLS_PER_SECOND = 0.08
# Number of text chunks sent in one embedding request.
# NOTE(review): an earlier comment here cited a limit of 5 instances per
# request, which does not match the value 100 - confirm the real per-request
# limit for the model in use.
ITEMS_PER_REQUEST = 100


# Directory containing the source PDF documents.
documents_directory = '/home/jupyter/rbi-bot/rbi-docs'

# Text splitter configured with chunk_size=400 and chunk_overlap=50.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50
)

def split_documents(splitter, documents_1):
    """Return the chunks that ``splitter.split_text`` produces for ``documents_1``.

    Args:
        splitter: Any object exposing a ``split_text(text)`` method.
        documents_1: The full text to split.

    Returns:
        Whatever ``splitter.split_text`` returns for the given text.
    """
    return splitter.split_text(documents_1)

vector_db = None

# Make sure the output directory exists before any per-document file is opened.
embeddings_file_path.mkdir(parents=True, exist_ok=True)

# For every regular file in `documents_directory`: extract the PDF text, split
# it into chunks, embed the chunks, and write one JSON record per embedded
# chunk (one record per line) to a per-document file.
for i, filename in enumerate(os.listdir(documents_directory)):
    # Create the full file path
    filepath = os.path.join(documents_directory, filename)

    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(filepath):
        continue

    # One output file per source document: "<dir stem>_<filename>.json".
    chunk_path = embeddings_file_path.joinpath(
        f"{embeddings_file_path.stem}_{filename}.json"
    )

    print(f"Processing file {i,filename}:")
    # PdfReader opens the path itself, so no separate text-mode open() of the
    # binary PDF is needed.
    reader = PdfReader(filepath)
    # Join once instead of repeated string +=; `or ""` guards against a page
    # for which no text comes back.
    documents_1 = "".join(page.extract_text() or "" for page in reader.pages)
    print(f"extracted pages")

    splits_1 = split_documents(splitter, documents_1)
    print(f"split documents")
    if not splits_1:
        # Nothing to embed (e.g. no extractable text); avoid calling the
        # encoder with zero chunks.
        print("no text chunks found, skipping")
        continue

    # Implement embeddings
    is_successful, question_chunk_embeddings = encode_text_to_embedding_batched(
        sentences=splits_1,
        api_calls_per_second=API_CALLS_PER_SECOND,
        batch_size=ITEMS_PER_REQUEST,
    )
    print(f"embeddings generated :", is_successful)

    # Only successfully embedded chunks have a row in the returned array, so
    # pair each row with the chunk it belongs to via the success flags rather
    # than by position; otherwise one failed batch shifts every later chunk
    # onto the wrong embedding.
    successful_chunks = [
        (num, chunk_text)
        for num, (chunk_text, ok) in enumerate(zip(splits_1, is_successful))
        if ok
    ]
    if not successful_chunks:
        # Leave any existing output file untouched when nothing was embedded.
        print("no successful embeddings, skipping")
        continue
    # A single embedding may arrive as a 1-D vector; normalise to one row per
    # chunk so the iteration below always yields whole embeddings.
    embedding_rows = np.atleast_2d(question_chunk_embeddings)

    # Save embeddings to jsonl format
    print(f"saving to file :")
    # "w" rather than "a": each document has its own output file, so appending
    # would only duplicate every record when the cell is re-run.
    with open(chunk_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(
                {
                    "document-id": filename,
                    "chunk-seq": str(num),
                    "chunk-id": filename + "_" + str(num),
                    "chunk-text": chunk_text,
                    "chunk-page-no": "0",
                    "chunk-embedding": [str(value) for value in embedding],
                }
            )
            + "\n"
            for (num, chunk_text), embedding in zip(successful_chunks, embedding_rows)
        )
    print(f"saved")



Processing file (0, 'NT6449FC6A865BD345A2917B9386D516C3D2.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  25
embeddings generated : [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
saving to file :
saved
Processing file (1, 'NT141FE25734050D34BE7BED8C87283466104.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  16
embeddings generated : [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
saving to file :
saved
Processing file (2, 'CIRCULARCCB6DB27B9062D14007BD700245BE816F26.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  4
embeddings generated : [True, True, True, True]
saving to file :
saved
Processing file (3, 'NT78C4A9C75F684443139BB60588DA825376.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  4
embeddings generated : [True, True, True, True]
saving to file :
saved
Processing file (4, 'NT5601B310BAFFA0464F9164FDE854402564.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  4
embeddings generated : [True, True, True, True]
saving to file :
saved
Processing file (5, 'NT875A7981CA1BA942D1AA8ACF9DA1D7FDDD.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  9
embeddings generated : [True, True, True, True, True, True, True, True, True]
saving to file :
saved
Processing file (6, 'NOTI821CFF2E131AB64E17BF3F50101EDE56F0.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  8
embeddings generated : [True, True, True, True, True, True, True, True]
saving to file :
saved
Processing file (7, 'NT13E6531B10CAE642489F29EE38E10C92E7.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  13
embeddings generated : [True, True, True, True, True, True, True, True, True, True, True, True, True]
saving to file :
saved
Processing file (8, 'NOTI103NSFR2912202346EE2A0CB6BD4705850B8F8AB3AA8D75.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  15
embeddings generated : [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
saving to file :
saved
Processing file (9, 'NT19B668901332F243BEBDFD0DEB77E17F68.pdf'):
extracted pages
split documents


  0%|          | 0/1 [00:00<?, ?it/s]

batch size  38


KeyboardInterrupt: 