In [2]:
import sys
import os
import json
import re

src_directory = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(src_directory)

from llm_handler import *

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Directory of CDM sample documents handed to the RAG initialisation below.
rag_knowledge_base_path = "../../../data/cdm_samples_json"

# Prefer a token taken from the HF_TOKEN environment variable so that a real
# credential never has to be pasted into (and saved with) the notebook. The
# original placeholder string is kept as the fallback, so behaviour is
# unchanged when the variable is not set.
llm_handler = LLMHandler(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=os.environ.get("HF_TOKEN", "huggingface access token")
)

# Initialize RAG within the LLMHandler
# NOTE(review): top_k / similarity_cutoff presumably bound the number and the
# minimum similarity of retrieved documents -- confirm against LLMHandler.
llm_handler.initialize_rag(
    documents_path=rag_knowledge_base_path,
    embed_model_name="BAAI/bge-small-en-v1.5",
    top_k=5,
    similarity_cutoff=0.6
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]


Initialized LLM with model: meta-llama/Meta-Llama-3.1-8B-Instruct
LLM is explicitly disabled. Using MockLLM.
Initialized RAG with 856 documents and embedding model: BAAI/bge-small-en-v1.5


In [5]:
def load_test_contracts(test_data_dir):
    """Load paired CDM-JSON / plain-text contract samples, grouped by sub-folder.

    The directory is expected to be laid out as::

        <test_data_dir>/json/<group>/<name>.json
        <test_data_dir>/text/<group>/<name>.txt

    A sample is loaded only when both the ``.json`` file and the ``.txt`` file
    with the same base name exist. Groups without any complete pair are
    omitted from the result.

    Args:
        test_data_dir: Path of the directory holding the ``json`` and ``text``
            sub-directories.

    Returns:
        dict mapping each group (sub-folder) name to a list of dicts with the
        keys ``"id"`` (the JSON file name), ``"cdm"`` (the parsed JSON content)
        and ``"description"`` (the text file content). Groups and samples are
        listed in sorted file-name order so repeated runs are deterministic.

    Raises:
        FileNotFoundError: if ``<test_data_dir>/json`` does not exist.
        json.JSONDecodeError: if a ``.json`` file does not contain valid JSON.
    """
    json_suffix = ".json"
    json_dir = os.path.join(test_data_dir, "json")
    text_dir = os.path.join(test_data_dir, "text")

    test_contracts = {}

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for sub_folder in sorted(os.listdir(json_dir)):
        json_sub_folder = os.path.join(json_dir, sub_folder)
        text_sub_folder = os.path.join(text_dir, sub_folder)

        if not (os.path.isdir(json_sub_folder) and os.path.isdir(text_sub_folder)):
            continue

        contracts = []
        for json_file in sorted(os.listdir(json_sub_folder)):
            if not json_file.endswith(json_suffix):
                continue

            # Swap only the trailing extension. str.replace() would also
            # rewrite any ".json" occurring earlier in the file name and make
            # the lookup of the matching text file fail.
            text_file = json_file[:-len(json_suffix)] + ".txt"
            json_path = os.path.join(json_sub_folder, json_file)
            text_path = os.path.join(text_sub_folder, text_file)

            # isfile() (rather than exists()) skips directories that happen to
            # carry a matching name instead of failing inside open().
            if not (os.path.isfile(json_path) and os.path.isfile(text_path)):
                continue

            # Explicit UTF-8 so the result does not depend on the locale.
            with open(json_path, 'r', encoding='utf-8') as jf:
                json_content = json.load(jf)

            with open(text_path, 'r', encoding='utf-8') as tf:
                text_content = tf.read()

            contracts.append({
                "id": json_file,
                "cdm": json_content,
                "description": text_content
            })

        if contracts:
            test_contracts[sub_folder] = contracts

    return test_contracts


# Load every (CDM JSON, text description) pair found under the test-data
# directory; the result is a dict keyed by the sub-folder names.
test_data_dir = "../../../data/type_wise_cdm_samples/test_data"
test_contracts = load_test_contracts(test_data_dir)


# contract_descriptions_path = 'test_data'

# contracts = []

# for filename in os.listdir(contract_descriptions_path):
#     if filename.endswith('.txt'):
#         file_path = os.path.join(contract_descriptions_path, filename)
#         with open(file_path, 'r', encoding='utf-8') as file:
#             contracts.append({'name': os.path.splitext(filename)[0], 'description': file.read().strip()})

In [28]:
# def extract_json(string):
#     match = re.search(r'(\{.*\})', string, re.DOTALL)
#     if match:
#         json_string = match.group(1)
        
#         try:
#             json_obj = json.loads(json_string)
#             return json_obj
#         except Exception:
#             return None
#     else:
#         return None

def extract_json(string):
    """Extract the first JSON object or array embedded in free-form text.

    The text is searched for the left-most ``[`` or ``{`` that has a matching
    closing bracket somewhere after it. Parsing is then attempted in three
    stages, from strictest to most lenient:

    1. Strict JSON, decoded from that opening bracket. Anything following the
       complete value (e.g. a closing remark that itself contains brackets) is
       ignored.
    2. A Python literal (single quotes, ``True``/``False``/``None``), parsed
       with ``ast.literal_eval`` so that string *contents* are left intact,
       then normalised to plain JSON types.
    3. The original text-substitution heuristic (single quotes to double
       quotes, ``True``/``False``/``None`` to their JSON spelling) for replies
       that mix both notations.

    Args:
        string: Text that may contain a JSON value, such as a model reply.

    Returns:
        The decoded object (dict or list), or ``None`` when ``string`` is not
        a ``str``, contains no bracketed span, or nothing could be parsed.
    """
    import ast  # local import keeps this block self-contained

    if not isinstance(string, str):
        return None

    match = re.search(r'(\[.*\]|\{.*\})', string, re.DOTALL)
    if not match:
        return None

    # Stage 1: strict JSON. raw_decode() stops at the end of the first complete
    # value, so trailing prose no longer invalidates an otherwise valid answer,
    # while a truncated value still fails here.
    try:
        parsed, _end = json.JSONDecoder().raw_decode(string, match.start())
        return parsed
    except json.JSONDecodeError:
        pass

    json_string = match.group(1)

    # Stage 2: Python-literal notation. literal_eval never executes code; the
    # dumps/loads round trip rejects values JSON cannot represent (sets,
    # bytes, ...) and normalises tuples and non-string keys.
    try:
        return json.loads(json.dumps(ast.literal_eval(json_string)))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # Stage 3: legacy best-effort textual repair. This can alter string
    # contents (apostrophes, the word "True"), hence it is tried last.
    corrected_json_string = re.sub(r"(?<!\\)'", '"', json_string)
    corrected_json_string = corrected_json_string.replace("True", "true").replace("False", "false").replace("None", "null")
    try:
        return json.loads(corrected_json_string)
    except json.JSONDecodeError:
        return None
    

def is_json(string):
    """Tell whether ``string`` can be decoded by ``json.loads``.

    Any exception raised while decoding (malformed JSON, unsupported input
    type, ...) is treated as "not JSON".

    Returns:
        ``True`` when decoding succeeds, ``False`` otherwise.
    """
    try:
        json.loads(string)
    except Exception:
        return False
    else:
        return True

In [27]:
cdms = []

output_dir = "test_results_cdm"

# Results are grouped under output_dir by the name of the generating model.
MODEL_DIR_NAME = "Meta-Llama-3.1-8B-Instruct"
# Total number of generation attempts per CDM before giving up.
MAX_JSON_ATTEMPTS = 6
# Upper bound on summary attempts, so the summary loop cannot spin forever.
MAX_SUMMARY_ATTEMPTS = 6
# A summary is accepted only when it is shorter than this many characters.
MAX_SUMMARY_CHARS = 5000
# Appended to the prompt after a failed attempt.
RETRY_NOTE = "\n\nNote that your previous response didn't generate a COMPLETE JSON probably because of the token limit. Make sure to generate a COMPLETE JSON this time even if you need to ignore some less important data."


def _generate_and_save_cdm(prompt, output_path, log_prefix="", artifact_label="CDM"):
    """Ask the LLM for a CDM JSON, retrying until one parses, and save it.

    Args:
        prompt: Full prompt sent to ``llm_handler.generate``.
        output_path: File the parsed JSON is written to on success.
        log_prefix: Text put in front of the progress messages.
        artifact_label: Name used in the "Saved in" message.

    Returns:
        The parsed JSON value that was written, or ``None`` when no attempt
        out of ``MAX_JSON_ATTEMPTS`` produced a usable result (nothing is
        written in that case).
    """
    retry_note = ""
    for attempt in range(1, MAX_JSON_ATTEMPTS + 1):
        cdm = extract_json(llm_handler.generate(prompt + retry_note))
        # Truthiness on purpose: an empty object/array is as useless as an
        # unparsable reply and triggers a retry as well.
        if cdm:
            print(log_prefix + "Generated a correct JSON!!")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(cdm, f, indent=4)
            print(artifact_label + " Saved in " + output_path)
            return cdm

        print(cdm, "\n")
        retry_note = RETRY_NOTE
        if attempt < MAX_JSON_ATTEMPTS:
            print(log_prefix + "Generated CDM is not a correct JSON. Generating again...")

    print(f"{log_prefix}Giving up on {output_path}: no valid JSON after {MAX_JSON_ATTEMPTS} attempts.")
    return None


def _summarize_for_retrieval(description):
    """Return a summary of ``description`` shorter than MAX_SUMMARY_CHARS.

    The LLM is asked up to ``MAX_SUMMARY_ATTEMPTS`` times; if every summary is
    too long, the last one is truncated instead of looping indefinitely.
    """
    summary = ""
    for _ in range(MAX_SUMMARY_ATTEMPTS):
        summary = llm_handler.generate("Summarize the following contract in AT MOST 500 words : " + description)
        if len(summary) < MAX_SUMMARY_CHARS:
            print("Contract Summary for RAG context retrieval is generated!")
            return summary

    print(f"Contract summary still too long after {MAX_SUMMARY_ATTEMPTS} attempts; truncating it.")
    return summary[:MAX_SUMMARY_CHARS - 1]


for contract_type, contracts in test_contracts.items():

    print(f"\n\nProcessing contracts of type: {contract_type}\n")

    for contract in contracts:
        print("\nProcessing "+contract['id']+"...")
        print("-"*60)

        without_rag_path = os.path.join(output_dir, MODEL_DIR_NAME, "without_rag", contract_type, contract['id'])
        with_rag_path = os.path.join(output_dir, MODEL_DIR_NAME, "with_rag", contract_type, contract['id'])

        os.makedirs(os.path.dirname(without_rag_path), exist_ok=True)
        os.makedirs(os.path.dirname(with_rag_path), exist_ok=True)

        # Existing result files act as a cache: a contract is generated only
        # when its output file is missing, so the cell can be re-run safely.
        if not os.path.exists(without_rag_path):
            basic_prompt = f"""
            Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

            Derivatives Contract:
            {contract['description']}

            Please ensure:
            1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
            2. **Only** provide the JSON representation, with no additional text or explanations.
            3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
            
            Output the **final JSON** below.
            """
            cdm_response = _generate_and_save_cdm(basic_prompt, without_rag_path)
        else:
            print("!!! CDM already exists: " + without_rag_path)

        if not os.path.exists(with_rag_path):
            # The retrieval query is built from a summary of the contract
            # rather than from the full description.
            contract_summary = _summarize_for_retrieval(contract["description"])

            rag_context = llm_handler.get_context_using_rag(f"Find CDM representations that best capture the information from a derivatives contract like the following:\n\n{contract_summary}")
            rag_prompt = f"""
            Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

            Derivatives Contract:
            {contract['description']}

            Relevant CDM Examples for Reference:
            {rag_context}

            Please ensure:
            1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
            2. **Only** provide the JSON representation, with no additional text or explanations.
            3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
            
            Output the **final JSON** below.
            """
            cdm_response_rag = _generate_and_save_cdm(
                rag_prompt,
                with_rag_path,
                log_prefix="(RAG) ",
                artifact_label="CDM_RAG",
            )
        else:
            print("!!! CDM_RAG already exists: " + with_rag_path)





Processing contracts of type: EquitySwap


Processing eqs-ex10-short-form-interestLeg-driving-schedule-dates_2.json...
------------------------------------------------------------
!!! CDM already exists: test_results_cdm/Meta-Llama-3.1-8B-Instruct/without_rag/EquitySwap/eqs-ex10-short-form-interestLeg-driving-schedule-dates_2.json
!!! CDM_RAG already exists: test_results_cdm/Meta-Llama-3.1-8B-Instruct/with_rag/EquitySwap/eqs-ex10-short-form-interestLeg-driving-schedule-dates_2.json

Processing eqs-ex11-on-european-single-stock-underlyer-short-form_1.json...
------------------------------------------------------------
!!! CDM already exists: test_results_cdm/Meta-Llama-3.1-8B-Instruct/without_rag/EquitySwap/eqs-ex11-on-european-single-stock-underlyer-short-form_1.json
!!! CDM_RAG already exists: test_results_cdm/Meta-Llama-3.1-8B-Instruct/with_rag/EquitySwap/eqs-ex11-on-european-single-stock-underlyer-short-form_1.json

Processing eqs-ex18-pan-asia-interdealer-index-swap-short-form_1.

In [11]:

# for contract in contracts:
    
#     while(True):
#         contract_summary = llm_handler.generate("Summarize the following contract in AT MOST 500 words : " + contract["description"])
#         if len(contract_summary) < 5000:
#             print("Contract Summary for RAG context retrieval is generated!")
#             break

#     # RAG context retrieval
#     rag_context = llm_handler.get_context_using_rag(f"Find CDM representations that best capture the information from a derivatives contract like the following:\n\n{contract_summary}")

#     # Prepare prompts
#     basic_prompt = f"""
#     Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

#     Derivatives Contract:
#     {contract['description']}

#     Please ensure:
#     1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
#     2. **Only** provide the JSON representation, with no additional text or explanations.
#     3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
    
#     Output the **final JSON** below.
#     """

#     rag_prompt = f"""
#     Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

#     Derivatives Contract:
#     {contract['description']}

#     Relevant CDM Examples for Reference:
#     {rag_context}

#     Please ensure:
#     1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
#     2. **Only** provide the JSON representation, with no additional text or explanations.
#     3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
    
#     Output the **final JSON** below.
#     """


#     # Saving the responses to JSON files if they don't already exist
#     cdm_file_path = f"test_data/generated_cdms/Llama3.1-8B-Instruct/{contract['name']}_cdm.json"
#     cdm_rag_file_path = f"test_data/generated_cdms/Llama3.1-8B-Instruct/{contract['name']}_cdm_rag.json"

#     if not os.path.exists(cdm_file_path):
#         conf = ""
#         while(True):
#             cdm_response = extract_json(llm_handler.generate(basic_prompt+conf))
#             if cdm_response:
#                 print("Generated a correct JSON!!")
#                 break
#             else:
#                 print(cdm_response, "\n")
#                 conf = "\n\nNote that your previous response didn't generate a COMLETE JSON probably because of the token limit. Make sure to generate a COMPLETE JSON this time even if you need to ignore some less important data."
#                 print("Generated CDM is not a correct JSON. Generating again...")

#         with open(cdm_file_path, 'w') as file:
#             json.dump(cdm_response, file, indent=4)
#         print(f"Saved CDM response to {cdm_file_path}")

#     if not os.path.exists(cdm_rag_file_path):
#         conf = ""
#         while(True):
#             cdm_with_rag_response = extract_json(llm_handler.generate(rag_prompt+conf))
#             if cdm_with_rag_response:
#                 print("(RAG) Generated a correct JSON!!")
#                 break
#             else:
#                 print(cdm_with_rag_response, "\n")
#                 conf = "\n\nNote that your previous response didn't generate a COMLETE JSON probably because of the token limit. Make sure to generate a COMPLETE JSON this time even if you need to ignore some less important data."
#                 print("(RAG) Generated CDM is not a correct JSON. Generating again...")

#         with open(cdm_rag_file_path, 'w') as file:
#             json.dump(cdm_with_rag_response, file, indent=4)
#         print(f"Saved CDM_RAG response to {cdm_rag_file_path}")


#     # Print or process the responses
#     print("CDM Response without RAG:")
#     print(cdm_response)
#     print("\nCDM Response with RAG:")
#     print(cdm_with_rag_response)

#     cdms.append(
#         {
#             "name": contract['name'],
#             "description": contract['description'],
#             "cdm": cdm_response,
#             "cdm_rag": cdm_with_rag_response
#         }
#     )

In [10]:
# from evaluate import *

# cdm_schema_path = "../cdm_schema/"
# cdm_repo_path = "../cdm_schema/cdm_schema_json"

# cdms = []

# for contract in contracts:

#     # exclude = ['bofa_interest_rate_swap', 'bofa_equity_option', 'jpmorgan_interest_rate_swap']
    
#     # if contract['name'] in exclude:
#     #     continue

#     cdm_file_path = f"test_data/generated_cdms/Llama3.1-8B-Instruct/{contract['name']}_cdm.json"
#     cdm_rag_file_path = f"test_data/generated_cdms/Llama3.1-8B-Instruct/{contract['name']}_cdm_rag.json"

#     if os.path.exists(cdm_file_path):
#         with open(cdm_file_path, 'r') as cdm_file:
#             cdm = json.load(cdm_file)

#     if os.path.exists(cdm_rag_file_path):
#         with open(cdm_rag_file_path, 'r') as cdm_rag_file:
#             cdm_rag = json.load(cdm_rag_file)

#     syntactic_correctness, schema_adherence = evaluate_cdm(cdm, cdm_schema_path, cdm_repo_path)
#     rag_syntactic_correctness, rag_schema_adherence = evaluate_cdm(cdm_rag, cdm_schema_path, cdm_repo_path)

#     semantic_coverage = get_coverage_score(get_coverage(llm_handler, contract['description'], cdm))
#     rag_semantic_coverage = get_coverage_score(get_coverage(llm_handler, contract['description'], cdm_rag))

#     print("\n", contract['name'])
#     print("-----------------------\n")
#     print(syntactic_correctness)
#     print(schema_adherence)
#     print("\n")
#     print(rag_syntactic_correctness)
#     print(rag_schema_adherence)
#     print("\n")
#     print(semantic_coverage)
#     print(rag_semantic_coverage)
#     print("\n")

#     print("======================\n")
#     # break