In [1]:
!pip install -q langchain langchain_experimental langchain_openai faiss-cpu sentence-transformers==2.2.2 pypdf ipywidgets==7.7.2 psycopg2

In [2]:
from langchain.document_loaders import TextLoader
from pypdf import PdfReader
from langchain import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
import os
from sentence_transformers import SentenceTransformer
import psycopg2
import pandas as pd
import re

In [3]:
def generate_document_splits(filepath,chunk_size, debug=False):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=30)    
    pages = ''
    with open(filepath, 'r') as file:
        # Read the contents of the file
        if(debug):
            print(f"Processing file {file}:")
        reader = PdfReader(filepath)
        #documentName = file
        for i, page in enumerate(reader.pages):
            page_text = page.extract_text()
            pages += f"pageindicator:{i + 1}:{page_text}\n"    
        splits = splitter.split_text(pages)
        if(debug):
            for i, split in enumerate(splits_1):
                print(f"split {i} {split}",i,split)
        return splits
    

In [16]:
def generate_document_embeddings(splits,embedding_model,debug=False):
    embeddings = embedding_model.encode(splits)
    if(debug):
        print(embeddings)
    return embeddings

In [17]:
def generate_vector_store_df(splits,embeddings,document_domain,document_name,chunk_size,embedding_model_name,debug=False):
    current_page = 1  # Default to page 1
    data = []
    sequence_number = 1
    #for each split - create the vector-db row
    for i,text in enumerate(splits):
        # Check for pageindicator pattern
        match = re.match(r"pageindicator:(\d+):(.*)", text)
        if match:
            # Update the current page based on the indicator
            current_page = int(match.group(1))
            text = match.group(2).strip()  # Remove the indicator and strip extra spaces
        # Append the processed data to the list
        data.append({
            "Document_Domain" : document_domain, 
            "Document_Name": document_name,
            "Page_Number": current_page,
            "Sequence": sequence_number,
            "Chunk_size": chunk_size,
            "Text": text,
            "embedding_model": embedding_model_name,
            "embedding_1024":embeddings[i].astype(float).tolist()
        })
        sequence_number += 1
    df = pd.DataFrame(data)
    if(debug):
        print(df)
    return df

## Document embeddings table in pg_vector DB 
CREATE TABLE document_embeddings (
    Document_Domain VARCHAR(200),
    Document_Name VARCHAR(200),
    Page_Number INTEGER,
    Sequence INTEGER,
    Chunk_size INTEGER,
    Text TEXT,
    embedding_model VARCHAR(200),
    embedding_1024 vector(1024),
    UNIQUE (Document_Domain, Document_Name, Sequence)
);

In [18]:
def delete_matching_rows(conn, df,debug=False):
    # Check if the DataFrame has at least one row
    if not df.empty:
        # Extract values from the first row
        document_name = df.loc[0, 'Document_Name']
        document_domain = df.loc[0, 'Document_Domain']
        
        cursor = conn.cursor()
        try:
            # Formulate and execute the DELETE statement
            cursor.execute("""
                DELETE FROM document_embeddings
                WHERE Document_Name = %s AND Document_Domain = %s
            """, (document_name, document_domain))
            # Commit the changes
            conn.commit()
            if(debug):
                print(f"Rows with Document_Name '{document_name}' and Document_Domain '{document_domain}' deleted successfully.")
        except Exception as e:
            print("An error occurred:", e)
            conn.rollback()  # Rollback in case of any error
    else:
        if(debug):
            print("DataFrame is empty, no rows to delete.")

In [19]:
def update_document_embeddings(db_params, df,debug=False):
    # Establish a connection to the database
    conn = psycopg2.connect(
        dbname=db_params['dbname'],
        user=db_params['user'],
        password=db_params['password'],
        host=db_params['host'],
        port=db_params['port']
    )
    delete_matching_rows(conn,df,debug)
    cursor = conn.cursor()
    try:
        # Iterate through the dataframe
        for index, row in df.iterrows():
            # Insert new row
            cursor.execute("""
                INSERT INTO document_embeddings (Document_Domain, Document_Name, Page_Number, Sequence, Chunk_size, Text, embedding_model, embedding_1024)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            """, (
                row['Document_Domain'],
                row['Document_Name'],
                row['Page_Number'],
                row['Sequence'],
                row['Chunk_size'],
                row['Text'],
                row['embedding_model'],
                list(row['embedding_1024'])  # Ensure embeddings are inserted as an array
            ))
        # Commit the changes
        conn.commit()
    except Exception as e:
        print("An error occurred:", e)
        conn.rollback()  # Rollback in case of any error
    finally:
        # Close the connection and cursor
        cursor.close()
        conn.close()

In [20]:
def process_document(document_domain,file_path,db_params,embedding_model,chunk_size,debug=False):
    document_name = os.path.basename(file_path)
    splits = generate_document_splits(file_path,chunk_size, debug)
    embeddings = generate_document_embeddings(splits,embedding_model,debug)
    df = generate_vector_store_df(splits,embeddings,document_domain,document_name,chunk_size,embedding_model_name,debug)
    update_document_embeddings(db_params, df,debug=debug)

In [21]:
def process_directory(directory, run_id,document_domain,db_params,embedding_model,chunk_size,debug):
    processed_dir = os.path.join(directory, f'processed_{run_id}')
    
    # Create processed directory if it does not exist
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)
        print(f"Created directory: {processed_dir}")
    
    # Process files in the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        
        # Check if the file is not a directory and not already processed
        if os.path.isfile(file_path) and not os.path.exists(os.path.join(processed_dir, filename)):
            if filename.endswith('.DS_Store'):
                continue
            # Process the file
            print(f"Processing {file_path}...")
            process_document(document_domain,file_path,db_params,embedding_model,chunk_size,debug)
            # Create a dummy file in the processed directory
            open(os.path.join(processed_dir, filename), 'a').close()
            print(f"Marked as processed: {filename}")

In [22]:
# Initialize variables
document_domain = "RBI_Guidelines"
directory = '/Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs/'
dbname = 'rbi_bot_db'
dbuser = 'rbi_bot_user'
dbpassword = 'rbi_bot_pwd'
dbhost = '127.0.0.1'
dbport = 5432
chunk_size = 400
embedding_model_name = 'thenlper/gte-large'
embedding_model = SentenceTransformer(embedding_model_name)
db_params = {
    'dbname': dbname ,
    'user': dbuser,
    'password': dbpassword,
    'host': dbhost,
    'port': dbport
}
run_id = '15May_2'
process_directory(directory, run_id,document_domain,db_params,embedding_model,chunk_size,False)

Processing /Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs/LIBOR4B79B1870D234E3FACC2EE378D8F6F84.pdf...
Marked as processed: LIBOR4B79B1870D234E3FACC2EE378D8F6F84.pdf
Processing /Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs/22MC894E2203DDDF4E0ABCA714DCE21F8F6C.pdf...
Marked as processed: 22MC894E2203DDDF4E0ABCA714DCE21F8F6C.pdf
Processing /Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs/NOTIEC406443DAEB4529A479B8C8160CE8AD.pdf...
Marked as processed: NOTIEC406443DAEB4529A479B8C8160CE8AD.pdf
Processing /Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs/164DPSSIRBA234EEFCA0445FE97DEF6A608514CD4.pdf...
Marked as processed: 164DPSSIRBA234EEFCA0445FE97DEF6A608514CD4.pdf
Processing /Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs/NT20042020_EN97FB72E765504BC4AB499A3E4799A254.pdf...
Marked as processed: NT20042020_EN97FB72E765504BC4AB499A3E4799A254.pdf
Processing /Users/sarathnathbuddhiraju/w

In [99]:
import os
def delete_specific_duplicates(directory):
    # List all files in the directory
    files = os.listdir(directory)

    # Prepare to track which files to delete
    to_delete = []

    # Iterate over each file to check for duplicates with specific suffixes
    for file in files:
        # Split the filename from its extension
        base_name, extension = os.path.splitext(file)
        
        # Continue only if the file is a PDF
        if extension.lower() != '.pdf':
            continue
        
        # Check for duplicates like "filename (1)" and "filename 2"
        if base_name.endswith(')'):
            suffix_start = base_name.rfind('(')
            if suffix_start != -1 and base_name[suffix_start:].replace('(', '').replace(')', '').strip().isdigit():
                to_delete.append(file)
        else:
            parts = base_name.rsplit(' ', 1)
            if len(parts) > 1 and parts[1].isdigit():
                to_delete.append(file)

    # Delete the identified duplicate files
    for dup_file in to_delete:
        os.remove(os.path.join(directory, dup_file))
        print(f"Deleted: {dup_file}")

# Directory path
directory = '/Users/sarathnathbuddhiraju/workspaces/sarat_notebooks/data/rbi-docs'
delete_specific_duplicates(directory)


Deleted: 93MDVARIATIONMARGIN29E1715A212F48B89160C223B91ABF74 2.pdf
Deleted: 115MDCN01042024F088EFA4F3F04DC9968AD7EC6844AFE9 2.pdf
Deleted: NT46D8F6CFB6BBF341DEBA0B6A301866FED4(1).pdf
Deleted: 102MDITSERVICES56B33FD530B1433187D75CB7C06C8F70 2.pdf
Deleted: 71MDPSBS1A00DE17143448F991236A3C505BA69D 2.pdf
Deleted: CIRCULARKFS1504242AE2500BAF494C2A82442B0B642705C1 2.pdf
Deleted: MD18KYCF6E92C82E1E1419D87323E3869BC9F13 2.pdf
Deleted: APDIR5736998BB416FA4828A5D1200573D7BC64(1).pdf
Deleted: 105MDPRUDENTIALREGULATIONSAIFISCF490815D13A4EE9BD3D48B79DD89285 2.pdf
Deleted: 101MDPPD80EBE808A91E45CE9BE6B989C188F877 2.pdf
Deleted: NOT148AB670DEFC004A5AAF203E6C730B79DE 2.pdf
Deleted: 103MDCAPITALREQUIREMENTS50C9076B7D494F259CC908D618297293 2.pdf
Deleted: 65MD603579515C5142D2B168D5FA886A2CCB 2.pdf
Deleted: 68MD667BD1A3E259475FA08F8B407617C923 2.pdf
Deleted: 108MDINTERNALOMBUDSMANCC05402F77BE4F229B59877F341386A4 2.pdf
Deleted: NOTI11E02042024A16134430F514F08A0DC7CBAA61CA15A (1).pdf
Deleted: 06MC010420244D