In [1]:
# !pip freeze > requirements.txt

## imports

In [1]:
from azure.ai.formrecognizer import FormRecognizerClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

In [3]:
import pandas as pd
import openai
import glob
import json
import ast
from os import path 
import tiktoken
from dotenv import load_dotenv

In [4]:
from pydantic import BaseModel, validator
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import yaml
from yaml.loader import SafeLoader
import os
import tiktoken
from langchain.chat_models import AzureChatOpenAI
from openai import AzureOpenAI
import warnings
# Installs a blanket "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides deprecation notices from the libraries imported above —
# confirm that is intended before sharing the notebook.
warnings.filterwarnings("ignore")

## Setting environment variables

In [6]:
# Re-export the project's lower-case credential variables under the upper-case
# names set by the original cell (AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT).
_AZURE_ENV_SOURCES = {
    "AZURE_OPENAI_API_KEY": "openai_api_key",
    "AZURE_OPENAI_ENDPOINT": "azure_endpoint",
}

# os.environ only accepts str values. Passing the None that os.getenv returns
# for an unset variable makes os.environ.update raise
# "TypeError: str expected, not NoneType", which does not say which variable is
# missing. Copy only the values that exist and report the rest by name.
_azure_env_found = {}
_azure_env_missing = []
for _target_name, _source_name in _AZURE_ENV_SOURCES.items():
    _source_value = os.getenv(_source_name)
    if _source_value is None:
        _azure_env_missing.append(_source_name)
    else:
        _azure_env_found[_target_name] = _source_value

os.environ.update(_azure_env_found)

if _azure_env_missing:
    # print() rather than warnings.warn(): warnings are globally ignored in this notebook.
    print(
        "Missing environment variable(s): "
        + ", ".join(_azure_env_missing)
        + ". If they live in a .env file, make sure load_dotenv() ran before this cell."
    )

# Deployment names and API version used by the cells below
# (each is None when the corresponding variable is not set).
gpt_model = os.getenv("openai_model_gpt_name")
embedding_model = os.getenv("openai_model_embd_name")
azure_openai_version = os.getenv("openai_api_version")

In [21]:
# Chat model used by the rest of the notebook: an Azure OpenAI deployment
# wrapped by LangChain's AzureChatOpenAI.
api_base = os.getenv("openai_api_base")
api_version = os.getenv("openai_api_version")

llm = AzureChatOpenAI(
    deployment_name=gpt_model,
    openai_api_base=api_base,
    openai_api_version=api_version,
    temperature=0.5,
)

# Smoke test: send a trivial prompt to confirm the deployment responds.
llm.invoke("hello")

AIMessage(content='Hello! How can I assist you today?')

In [7]:
# Load the contract-term checklist from the analysis workbook into a DataFrame.
excel_file = 'High Level Contract Analysis.xlsx'
df = pd.read_excel(io=excel_file)

# Plain-text preview of the first five rows.
print(df.head(n=5))

    Contract Term Agreement Type
0       Agreement            S&D
1  Effective Date            S&D
2         Parties            S&D
3     Distributor            S&D
4     Organon Hub            S&D


In [8]:
# Pull the "Contract Term" column out as a Series and display it.
contract_terms = df.loc[:, "Contract Term"]
contract_terms

0           Agreement
1      Effective Date
2             Parties
3         Distributor
4         Organon Hub
            ...      
151              Term
152          Duration
153     Governing Law
154      Counterparts
155    Agreement Type
Name: Contract Term, Length: 156, dtype: object

In [27]:
# Load one sample contract whose text was already extracted from its PDF
# and saved as a .txt file.
from langchain_community.document_loaders.text import TextLoader

contract_txt_path = "./extracted_text/20030905_Distributorship Agreement_Maxalt.txt"
loader = TextLoader(contract_txt_path)
doc = loader.load()

In [28]:
# doc

## Reading the key field list from the xlsx file

In [29]:
# Keep only the first ten contract terms as the key fields to extract.
key_fields_list = contract_terms.iloc[:10].tolist()
key_fields_list

['Agreement',
 'Effective Date',
 'Parties',
 'Distributor',
 'Organon Hub',
 'Definitions',
 'Affiliate',
 'Agency',
 'IP Owners',
 'MA']

# Implementing with function calling

In [15]:
import json

# The key fields become the property names of the extraction schema.
input_list = key_fields_list

# Schema layout: one string-typed property per field, none of them required.
properties = {}
for field_name in input_list:
    properties[field_name] = {"type": "string"}

json_format = {"properties": properties, "required": []}

# Render the schema as pretty-printed JSON text and show it.
json_string = json.dumps(json_format, indent=4)
print(json_string)

{
    "properties": {
        "Agreement": {
            "type": "string"
        },
        "Effective Date": {
            "type": "string"
        },
        "Parties": {
            "type": "string"
        },
        "Distributor": {
            "type": "string"
        },
        "Organon Hub": {
            "type": "string"
        },
        "Definitions": {
            "type": "string"
        },
        "Affiliate": {
            "type": "string"
        },
        "Agency": {
            "type": "string"
        },
        "IP Owners": {
            "type": "string"
        },
        "MA": {
            "type": "string"
        }
    },
    "required": []
}


In [22]:
# Parse the JSON text back into a Python object and check its type.
entity_dict = json.loads(s=json_string)
type(entity_dict)

dict

In [23]:
from langchain.chains import create_extraction_chain
from openai import AzureOpenAI

In [30]:
# Input text from the document
# Input text from the document.
# `doc` is the list returned by loader.load(); str(doc) would pass the model the
# list's repr (Document(...) wrappers, metadata, escaped newlines) instead of the
# contract text. Join the text of each loaded document instead.
inp = "\n\n".join(loaded.page_content for loaded in doc)

Getting structured output from raw LLM generations is hard.

For example, suppose you need the model output formatted with a specific schema for:

- Extracting a structured row to insert into a database
- Extracting API parameters
- Extracting different parts of a user query (e.g., for semantic vs keyword search)

In [25]:
# Build an extraction chain from the schema dict and the chat model,
# then run it over the contract text.
chain = create_extraction_chain(schema=entity_dict, llm=llm)
chain.run(inp)

[{'Agreement': 'Distributorship Agreement',
  'Effective Date': '5th day of September, 2003',
  'Parties': 'Merck Sharp & Dohme Asia Pacific Services Pte. Ltd. and Kyorin Pharmaceutical Co., Ltd.',
  'Definitions': 'Capitalized terms not otherwise defined in this Agreement shall have the meanings ascribed to them in the License Agreement, the Comprehensive Agreement and the Marketing Authorization Agreement.',
  'Affiliate': 'Affiliates of MSDAPS shall mean Merck and Affiliates of Merck other than MSDAPS and Banyu.',
  'Agency': 'Any governmental or other regulatory authority responsible for granting any health or pricing approvals, reimbursement prices or labels or other Registrations necessary before Product may be imported, repackaged, marketed or sold in the Territory.'}]