In [17]:
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
import json

In [18]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [29]:
dir_path = "contracts"

In [21]:
def get_pdf_files(directory):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pdf')]


In [28]:
def process_pdf(file_path):
    # Load PDF
    loader = PyPDFLoader(file_path)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    all_splits = text_splitter.split_documents(data)

    # Create vector store
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

    # Create language model
    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

    # Create RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    prompts = {
        "Agreement type": "Extract the Agreement type from the following text, : {text}. Format the response as a JSON array.",
        #"Full Legal Name": "Extract the Full Legal Name from the following text: {text}. Format the response as a JSON array.",
        #"SEC File number": "List all SEC File numbers mentioned in the document. Format the response as a JSON array.",
        #"Address": "List all complete addresses mentioned in the document. Format the response as a JSON array.",
        #"Name of investment advisor": "List all names of investment advisors mentioned in the document. Format the response as a JSON array."
    }

    # Extract entities
    results = []
    for entity, prompt in prompts.items():
        response = qa_chain.invoke({"query": prompt})
        try:
            values = json.loads(response['result'])
        except json.JSONDecodeError:
            values = [response['result'].strip()]
        
        for value in values:
            results.append({
                "Entity_Name": entity,
                "Entity_Value": value
            })

    return results

# Get all PDF files
pdf_files = get_pdf_files(dir_path)

# Process each PDF file
for pdf_file in pdf_files:
    print(f"Processing {pdf_file}...")
    try:
        extracted_entities = process_pdf(pdf_file)
        
        print(f"Results for {os.path.basename(pdf_file)}:")
        for entity in extracted_entities:
            print(f"{entity['Entity_Name']}: {entity['Entity_Value']}")
        print("\n" + "="*50 + "\n")
    except Exception as e:
        print(f"Error processing {pdf_file}: {str(e)}")
        print("\n" + "="*50 + "\n")    

Processing C:\Users\ayush\OneDrive\Desktop\EY\EYII\fine_tuning\contracts\09 - Exhibit I - ATB_-_Chinook_-_General_Security_Agreement_-_CPI_-_CPI.pdf...
Results for 09 - Exhibit I - ATB_-_Chinook_-_General_Security_Agreement_-_CPI_-_CPI.pdf:
Agreement type: AgreementType


Processing C:\Users\ayush\OneDrive\Desktop\EY\EYII\fine_tuning\contracts\FORM OF GLOBAL CUSTODY AGREEMENT WITH JP MORGAN CHASE.pdf...
Results for FORM OF GLOBAL CUSTODY AGREEMENT WITH JP MORGAN CHASE.pdf:
Agreement type: AgreementType


Processing C:\Users\ayush\OneDrive\Desktop\EY\EYII\fine_tuning\contracts\FORM OF MASTER GLOBAL CUSTODY AGREEMENT.pdf...
Results for FORM OF MASTER GLOBAL CUSTODY AGREEMENT.pdf:
Agreement type: Agreement Type


Processing C:\Users\ayush\OneDrive\Desktop\EY\EYII\fine_tuning\contracts\Global Custody Agreement For Foreign and Domestic.pdf...
Results for Global Custody Agreement For Foreign and Domestic.pdf:
Agreement type: AgreementType


Processing C:\Users\ayush\OneDrive\Desktop\EY\EYII\