In [1]:
import sys
import os
import json
import re

# Put the parent of the current working directory on sys.path so the
# project-local modules imported below can be resolved (presumably that is
# where llm_handler.py and fine_tuner.py live -- confirm against the repo layout).
src_directory = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(src_directory)

from llm_handler import *
from fine_tuner import *

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Folder of CDM sample documents indexed for RAG retrieval.
rag_knowledge_base_path = "../../data/cdm_samples_json"

# Take the Hugging Face access token from the HF_TOKEN environment variable so
# a real credential never has to be typed into (and saved with) the notebook.
# The original placeholder string is kept as the fallback, so behaviour is
# unchanged when the variable is not set.
hf_access_token = os.environ.get("HF_TOKEN", "huggingface access token")

# Model wrapper used for fine-tuning, generation and RAG in the cells below.
llm_handler = FineTuner(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=hf_access_token,
    json_folder="../../data/old_fine_tuning_data/cdm_examples", 
    txt_folder="../../data/old_fine_tuning_data/contract_descriptions"
)

# Build the retrieval index over the knowledge base with the given embedding
# model, result count and similarity cutoff.
llm_handler.initialize_rag(
    documents_path=rag_knowledge_base_path,
    embed_model_name="BAAI/bge-small-en-v1.5",
    top_k=5,
    similarity_cutoff=0.6
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.016 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 9.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
meta-llama/Meta-Llama-3.1-8B-Instruct initialized for fine-tuning!
LLM is explicitly disabled. Using MockLLM.
Initialized RAG with 856 documents and embedding model: BAAI/bge-small-en-v1.5


In [3]:
# Fine-tune the model initialized above, capping the run at 60 training steps.
llm_handler.fine_tune(max_steps=60)

Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Map: 100%|██████████| 542/542 [00:00<00:00, 16991.65 examples/s]


Dataset Preparation Completed!
Training...


Map (num_proc=2): 100%|██████████| 542/542 [00:02<00:00, 220.35 examples/s]
max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 542 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.179
2,1.2221
3,1.2652
4,1.1716
5,1.2291
6,1.1186
7,1.0635
8,1.0193
9,1.0135
10,0.9534


Training Completed!


In [4]:

# Folder holding one plain-text contract description per .txt file.
contract_descriptions_path = 'test_data'

# Each entry: {'name': <file name without extension>, 'description': <stripped file text>}.
contracts = []

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the notebook output) the same on every run and machine.
for filename in sorted(os.listdir(contract_descriptions_path)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(contract_descriptions_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        contracts.append({'name': os.path.splitext(filename)[0], 'description': file.read().strip()})

In [5]:
def extract_json(string):
    """Extract the first JSON object embedded in ``string``.

    The text is echoed to stdout for debugging, then decoding starts at the
    first ``{``. Anything before that brace (prose, a Markdown code fence) and
    anything after the object's closing brace is ignored, so trailing
    commentary that itself contains braces no longer makes extraction fail.

    Args:
        string: Raw text, typically a model response.

    Returns:
        The decoded ``dict``, or ``None`` when the text contains no ``{`` or
        the text starting at the first ``{`` is not a complete, valid JSON
        object (for example a response cut off mid-object).
    """
    print("Curr Json String ---> ")
    print(string)

    start = string.find('{')
    if start == -1:
        return None

    try:
        # raw_decode stops at the end of the first complete JSON value instead
        # of requiring the whole remaining text to be valid JSON. Only the
        # first '{' is tried on purpose: scanning later braces could return an
        # inner fragment of a truncated object as if it were the full answer.
        json_obj, _ = json.JSONDecoder().raw_decode(string, start)
    except (ValueError, RecursionError):
        return None
    return json_obj

def is_json(string):
    """Report whether ``string`` can be decoded by ``json.loads``.

    Any exception raised while decoding (malformed JSON, unsupported input
    type, ...) is treated as "not JSON".

    Returns:
        ``True`` if decoding succeeds, ``False`` otherwise.
    """
    try:
        json.loads(string)
    except Exception:
        return False
    else:
        return True

In [None]:
# Folder where generated CDM files are written; existing files act as a cache.
GENERATED_CDM_DIR = "test_data/generated_cdms/Fine-tuned-Llama3.1-8B-Instruct"

# Contracts that are skipped by this cell.
EXCLUDED_CONTRACTS = ['bofa_equity_option', 'bofa_interest_rate_swap', 'bofa_equity_swap_confirmation']

# Maximum number of characters of the generated summary used as the RAG query.
SUMMARY_CHAR_LIMIT = 2500

# Appended to the prompt after a response that did not contain a usable JSON object.
RETRY_NOTE = "\n\nNote that your previous response didn't generate a COMLETE JSON probably because of the token limit. Make sure to generate a COMPLETE JSON this time even if you need to ignore some less important data."


def _basic_prompt(description):
    """Build the CDM generation prompt that uses only the contract text."""
    return f"""
    Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

    Derivatives Contract:
    {description}

    Please ensure:
    1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
    2. **Only** provide the JSON representation, with no additional text or explanations.
    3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
    
    Output the **final JSON** below.
    """


def _rag_prompt(description, rag_context):
    """Build the CDM generation prompt that also includes retrieved CDM examples."""
    return f"""
    Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

    Derivatives Contract:
    {description}

    Relevant CDM Examples for Reference:
    {rag_context}

    Please ensure:
    1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
    2. **Only** provide the JSON representation, with no additional text or explanations.
    3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
    
    Output the **final JSON** below.
    """


def _retrieve_rag_context(description):
    """Summarize the contract with the model and use the summary as the RAG query."""
    contract_summary = llm_handler.generate("Summarize the following contract in AT MOST 500 words : " + description)
    # The original cell wrapped this in a retry loop whose exit test
    # (length < 5000) was always true after truncating to 2500 characters,
    # so exactly one generation ever happened; a single call is equivalent.
    contract_summary = contract_summary[:SUMMARY_CHAR_LIMIT]
    print("Contract Summary for RAG context retrieval is generated!")
    return llm_handler.get_context_using_rag(f"Find CDM representations that best capture the information from a derivatives contract like the following:\n\n{contract_summary}")


def _generate_until_valid(prompt, log_prefix=""):
    """Query the model until extract_json returns a non-empty JSON object.

    After a failed attempt, RETRY_NOTE is appended to the prompt. As in the
    original cell there is no attempt limit: the loop ends only on success.
    """
    retry_note = ""
    while True:
        response = extract_json(llm_handler.generate(prompt + retry_note))
        if response:
            print(f"{log_prefix}Generated a correct JSON!!")
            return response
        print(response, "\n")
        retry_note = RETRY_NOTE
        print(f"{log_prefix}Generated CDM is not a correct JSON. Generating again...")


def _load_or_generate(file_path, prompt, saved_label, log_prefix=""):
    """Return the CDM stored at file_path, generating and saving it first if absent.

    Loading the cached file fixes the original behaviour where an existing
    file left the response variable unassigned (NameError on the first
    contract) or still holding the previous contract's response.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            response = json.load(file)
        print(f"Loaded existing {saved_label} response from {file_path}")
        return response

    response = _generate_until_valid(prompt, log_prefix)
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(response, file, indent=4)
    print(f"Saved {saved_label} response to {file_path}")
    return response


# One record per processed contract: name, description, cdm, cdm_rag.
cdms = []

# The output folder is not created anywhere else; without this the first save fails.
os.makedirs(GENERATED_CDM_DIR, exist_ok=True)

for contract in contracts:

    if contract['name'] in EXCLUDED_CONTRACTS:
        continue

    print("Working with ---> ", contract['name'])

    cdm_file_path = f"{GENERATED_CDM_DIR}/{contract['name']}_cdm.json"
    cdm_rag_file_path = f"{GENERATED_CDM_DIR}/{contract['name']}_cdm_rag.json"

    # The summary generation and RAG lookup only feed the RAG prompt, so they
    # are skipped when the RAG output for this contract is already on disk.
    basic_prompt = _basic_prompt(contract['description'])
    rag_prompt = None
    if not os.path.exists(cdm_rag_file_path):
        rag_context = _retrieve_rag_context(contract['description'])
        rag_prompt = _rag_prompt(contract['description'], rag_context)

    cdm_response = _load_or_generate(cdm_file_path, basic_prompt, "CDM")
    cdm_with_rag_response = _load_or_generate(cdm_rag_file_path, rag_prompt, "CDM_RAG", log_prefix="(RAG) ")

    # Print or process the responses
    print("CDM Response without RAG:")
    print(cdm_response)
    print("\nCDM Response with RAG:")
    print(cdm_with_rag_response)

    cdms.append(
        {
            "name": contract['name'],
            "description": contract['description'],
            "cdm": cdm_response,
            "cdm_rag": cdm_with_rag_response
        }
    )

Working with --->  bofa_equity_option
Contract Summary for RAG context retrieval is generated!
Curr Json String ---> 
{"trade": {"tradeIdentifier": [{"issuerReference": {"globalReference": "2492844434-ND1-1", "externalReference": "party1"}, "assignedIdentifier": [{"identifier": {"value": "12345", "meta": {"scheme": "http://www.partyA.com/trade-id"}}}], "meta": {"globalKey": "f6a1e4d9"}}, {"issuerReference": {"globalReference": "2492844434-ND1-1", "externalReference": "party1"}, "assignedIdentifier": [{"identifier": {"value": "12345", "meta": {"scheme": "http://www.partyA.com/trade-id"}}}], "meta": {"globalKey": "f6a1e4d9"}}], "tradeDate": {"value": "2024-07-29", "meta": {"globalKey": "3e8e0d"}}, "tradableProduct": {"product": {"contractualProduct": {"productTaxonomy": [{"source": "ISDA", "productQualifier": "Option_SingleName_Put"}], "economicTerms": {"payout": {"optionPayout": [{"payerReceiver": {"payer": "Party1", "receiver": "Party2"}, "priceQuantity": {"quantitySchedule": {"address

In [None]:
# Star import kept from the original cell; presumably it supplies
# evaluate_cdm, get_coverage and get_coverage_score used below -- confirm.
from evaluate import *

# Locations passed to evaluate_cdm for the schema checks.
cdm_schema_path = "../cdm_schema/"
cdm_repo_path = "../cdm_schema/cdm_schema_json"

# Reset as in the original cell; nothing in this cell appends to it.
cdms = []

for contract in contracts:

    cdm_file_path = f"test_data/generated_cdms/Fine-tuned-Llama3.1-8B-Instruct/{contract['name']}_cdm.json"
    cdm_rag_file_path = f"test_data/generated_cdms/Fine-tuned-Llama3.1-8B-Instruct/{contract['name']}_cdm_rag.json"

    # Both generated files are required. Previously a missing file left `cdm`
    # or `cdm_rag` undefined, or still holding the previous contract's data,
    # so scores could be reported under the wrong contract name.
    missing_files = [path for path in (cdm_file_path, cdm_rag_file_path) if not os.path.exists(path)]
    if missing_files:
        print(f"Skipping {contract['name']}: missing generated CDM file(s): {', '.join(missing_files)}")
        continue

    with open(cdm_file_path, 'r') as cdm_file:
        cdm = json.load(cdm_file)

    with open(cdm_rag_file_path, 'r') as cdm_rag_file:
        cdm_rag = json.load(cdm_rag_file)

    # Structural checks for the plain and the RAG-assisted CDM.
    syntactic_correctness, schema_adherence = evaluate_cdm(cdm, cdm_schema_path, cdm_repo_path)
    rag_syntactic_correctness, rag_schema_adherence = evaluate_cdm(cdm_rag, cdm_schema_path, cdm_repo_path)

    # Coverage of the contract description by each CDM, computed via the model handler.
    semantic_coverage = get_coverage_score(get_coverage(llm_handler, contract['description'], cdm))
    rag_semantic_coverage = get_coverage_score(get_coverage(llm_handler, contract['description'], cdm_rag))

    print("\n", contract['name'])
    print("-----------------------\n")
    print(syntactic_correctness)
    print(schema_adherence)
    print("\n")
    print(rag_syntactic_correctness)
    print(rag_schema_adherence)
    print("\n")
    print(semantic_coverage)
    print(rag_semantic_coverage)
    print("\n")

    print("======================\n")