In [1]:
import ast
import json
import os
import re
import sys

src_directory = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(src_directory)

from llm_handler import *
from fine_tuner import *

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
# Folder of CDM JSON samples that is indexed as the RAG knowledge base.
rag_knowledge_base_path = "../../../data/cdm_samples_json"

# Credentials must not live in the notebook: read the Hugging Face access
# token from the environment and fail early with a clear message when it is
# missing, instead of passing a placeholder string to the model loader.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export your Hugging Face access token "
        "(e.g. `export HF_TOKEN=...`) before running this cell."
    )

# Fine-tuning wrapper around the base model; json_folder/txt_folder point at
# the non-test split of the type-wise CDM samples.
llm_handler = FineTuner(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    token=hf_token,
    json_folder="../../../data/type_wise_cdm_samples/non_test_data/json", 
    txt_folder="../../../data/type_wise_cdm_samples/non_test_data/text"
)

# Build the retrieval index over the knowledge base with the given embedding
# model, returning at most `top_k` hits at or above `similarity_cutoff`
# (NOTE(review): exact cutoff semantics live in FineTuner.initialize_rag — confirm).
llm_handler.initialize_rag(
    documents_path=rag_knowledge_base_path,
    embed_model_name="BAAI/bge-small-en-v1.5",
    top_k=5,
    similarity_cutoff=0.6
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.016 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 9.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
meta-llama/Meta-Llama-3.1-8B-Instruct initialized for fine-tuning!
LLM is explicitly disabled. Using MockLLM.
Initialized RAG with 856 documents and embedding model: BAAI/bge-small-en-v1.5


In [3]:
# Fine-tune the model initialized above, capping the run at 60 steps via max_steps.
llm_handler.fine_tune(max_steps=60)

Unsloth 2024.11.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Map: 100%|██████████| 579/579 [00:00<00:00, 20805.33 examples/s]


Dataset Preparation Completed!
Training...


Map (num_proc=2): 100%|██████████| 579/579 [00:02<00:00, 234.51 examples/s]
max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 579 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.1735
2,1.3053
3,1.1566
4,1.1609
5,1.1949
6,1.1097
7,0.8885
8,0.8791
9,0.9645
10,0.9525


Training Completed!


In [4]:
def load_test_contracts(test_data_dir):
    """Load paired CDM JSON / contract text files grouped by contract type.

    The directory is expected to contain two parallel trees,
    ``<test_data_dir>/json/<type>/<name>.json`` and
    ``<test_data_dir>/text/<type>/<name>.txt``. Only contract types that exist
    as directories in both trees are considered, and only JSON files that have
    a matching ``.txt`` file are loaded.

    Args:
        test_data_dir: Path to the folder holding the ``json`` and ``text``
            sub-directories.

    Returns:
        dict mapping each contract-type folder name to a list of dicts with
        keys ``"id"`` (the JSON file name), ``"cdm"`` (the parsed JSON) and
        ``"description"`` (the text file content). Types without any complete
        pair are omitted. Types and files are visited in sorted order so the
        result is reproducible across runs and platforms.

    Raises:
        FileNotFoundError: if ``<test_data_dir>/json`` does not exist.
        json.JSONDecodeError: if a JSON file cannot be parsed.
    """
    json_suffix = ".json"
    json_dir = os.path.join(test_data_dir, "json")
    text_dir = os.path.join(test_data_dir, "text")

    test_contracts = {}

    # os.listdir order is arbitrary; sort for deterministic processing order.
    for sub_folder in sorted(os.listdir(json_dir)):
        json_sub_folder = os.path.join(json_dir, sub_folder)
        text_sub_folder = os.path.join(text_dir, sub_folder)

        if not (os.path.isdir(json_sub_folder) and os.path.isdir(text_sub_folder)):
            continue

        contracts = []
        for json_file in sorted(os.listdir(json_sub_folder)):
            if not json_file.endswith(json_suffix):
                continue

            # Swap only the trailing extension; str.replace would also rewrite
            # any ".json" occurring earlier in the file name.
            text_file = json_file[:-len(json_suffix)] + ".txt"
            json_path = os.path.join(json_sub_folder, json_file)
            text_path = os.path.join(text_sub_folder, text_file)

            # isfile (not exists) so that a directory with a matching name is skipped.
            if not (os.path.isfile(json_path) and os.path.isfile(text_path)):
                continue

            # Explicit UTF-8 keeps decoding independent of the platform locale.
            with open(json_path, 'r', encoding='utf-8') as jf:
                json_content = json.load(jf)

            with open(text_path, 'r', encoding='utf-8') as tf:
                text_content = tf.read()

            contracts.append({
                "id": json_file,
                "cdm": json_content,
                "description": text_content
            })

        if contracts:
            test_contracts[sub_folder] = contracts

    return test_contracts


# Root of the test split; load_test_contracts reads its "json" and "text" subfolders.
test_data_dir = "../../../data/type_wise_cdm_samples/test_data"
# Maps each contract-type folder name to a list of {"id", "cdm", "description"} records.
test_contracts = load_test_contracts(test_data_dir)

In [5]:
# def extract_json(string):
#     match = re.search(r'(\{.*\})', string, re.DOTALL)
#     if match:
#         json_string = match.group(1)
        
#         try:
#             json_obj = json.loads(json_string)
#             return json_obj
#         except Exception:
#             return None
#     else:
#         return None

def extract_json(string):
    """Extract a JSON object or array embedded in free-form model output.

    Parsing is attempted in order, stopping at the first success:

    1. Strictly decode one JSON value starting at the first ``{`` or ``[``.
       This tolerates explanatory text (even text containing brackets) after
       the JSON, which a greedy bracket-to-bracket match cannot handle.
    2. Strictly parse the greedy span from the first opening bracket to the
       last matching closing bracket.
    3. Treat that span as a Python literal (single-quoted strings, ``True`` /
       ``False`` / ``None``) and convert it to JSON-compatible data. Unlike a
       textual substitution this leaves string *values* such as
       ``'True Grit'`` or ``"it's"`` untouched.
    4. Last resort: textual repair that swaps single quotes for double quotes
       and Python constants for their JSON spellings.

    Only the first opening bracket is tried in step 1, so a truncated response
    is never "rescued" by returning one of its nested fragments.

    Args:
        string: Raw text to search. Non-string input yields ``None``.

    Returns:
        The decoded ``dict`` / ``list``, or ``None`` when nothing parseable is
        found.
    """
    if not isinstance(string, str):
        return None

    # Step 1: strict decode from the first opening bracket, ignoring any tail.
    first_open = re.search(r'[\[{]', string)
    if first_open is None:
        return None
    try:
        parsed, _ = json.JSONDecoder().raw_decode(string, first_open.start())
        return parsed
    except json.JSONDecodeError:
        pass

    # Step 2: greedy outermost span.
    match = re.search(r'(\[.*\]|\{.*\})', string, re.DOTALL)
    if not match:
        return None
    json_string = match.group(1)
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass

    # Step 3: Python-literal syntax. literal_eval never executes code; the
    # dumps/loads round trip rejects values JSON cannot represent (sets,
    # bytes, tuple keys) and normalizes tuples to lists.
    try:
        return json.loads(json.dumps(ast.literal_eval(json_string)))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass

    # Step 4: textual repair for mixed input such as {'a': true}.
    corrected_json_string = re.sub(r"(?<!\\)'", '"', json_string)
    corrected_json_string = (
        corrected_json_string
        .replace("True", "true")
        .replace("False", "false")
        .replace("None", "null")
    )
    try:
        return json.loads(corrected_json_string)
    except json.JSONDecodeError:
        return None
    

def is_json(string):
    """Return True when ``json.loads`` accepts ``string``, otherwise False.

    Any exception raised while parsing (malformed text, wrong argument type,
    ...) is treated as "not JSON".
    """
    try:
        json.loads(string)
    except Exception:
        return False
    else:
        return True

In [None]:
cdms = []  # kept from the original cell; nothing in this cell appends to it

output_dir = "test_results_cdm"

# Sub-folder of output_dir that identifies which model produced the results.
MODEL_TAG = "Fine-tuned-Meta-Llama-3.1-8B-Instruct"
# Generations attempted per prompt before giving up (one initial try + one retry).
MAX_JSON_ATTEMPTS = 2
# Summaries requested before falling back to truncation; bounds a loop that
# previously could spin forever if the model kept returning long summaries.
MAX_SUMMARY_ATTEMPTS = 3
# A summary is accepted only when it is shorter than this many characters.
MAX_SUMMARY_CHARS = 5000
# Appended to the prompt after a failed attempt.
RETRY_NOTE = "\n\nNote that your previous response didn't generate a COMPLETE JSON probably because of the token limit. Make sure to generate a COMPLETE JSON this time even if you need to ignore some less important data."


def _generate_and_save_cdm(prompt, output_path, log_prefix="", artifact_label="CDM"):
    """Generate a CDM JSON for ``prompt`` and write it to ``output_path``.

    Calls the model up to MAX_JSON_ATTEMPTS times; after a failed attempt the
    prompt is extended with RETRY_NOTE. A response counts as a success when
    extract_json returns a non-empty value.

    Returns:
        True if a JSON document was saved, False if every attempt failed.
    """
    retry_note = ""
    for attempt in range(1, MAX_JSON_ATTEMPTS + 1):
        parsed = extract_json(llm_handler.generate(prompt + retry_note))
        if parsed:
            print(log_prefix + "Generated a correct JSON!!")
            with open(output_path, "w") as f:
                json.dump(parsed, f, indent=4)
            print(artifact_label + " Saved in " + output_path)
            return True

        print(parsed, "\n")
        retry_note = RETRY_NOTE
        if attempt < MAX_JSON_ATTEMPTS:
            print(log_prefix + "Generated CDM is not a correct JSON. Generating again...")
        else:
            # Only announce a retry when one actually follows.
            print(log_prefix + "Generated CDM is not a correct JSON. Giving up after "
                  + str(MAX_JSON_ATTEMPTS) + " attempts.")
    return False


def _summarize_for_rag(description):
    """Return a contract summary shorter than MAX_SUMMARY_CHARS for RAG retrieval.

    Asks the model for a summary up to MAX_SUMMARY_ATTEMPTS times and returns
    the first one under the length limit. If none qualifies, the last summary
    is truncated so processing can continue instead of looping indefinitely.
    """
    summary = ""
    for _ in range(MAX_SUMMARY_ATTEMPTS):
        summary = llm_handler.generate("Summarize the following contract in AT MOST 500 words : " + description)
        if len(summary) < MAX_SUMMARY_CHARS:
            print("Contract Summary for RAG context retrieval is generated!")
            return summary

    print("Contract Summary is still too long after " + str(MAX_SUMMARY_ATTEMPTS)
          + " attempts; truncating it for RAG context retrieval.")
    return summary[:MAX_SUMMARY_CHARS - 1]


for contract_type, contracts in test_contracts.items():

    print(f"\n\nProcessing contracts of type: {contract_type}\n")
    
    for contract in contracts:
        print("\nProcessing "+contract['id']+"...")
        print("-"*60)

        without_rag_path = os.path.join(output_dir, MODEL_TAG, "without_rag", contract_type, contract['id'])
        with_rag_path = os.path.join(output_dir, MODEL_TAG, "with_rag", contract_type, contract['id'])

        os.makedirs(os.path.dirname(without_rag_path), exist_ok=True)
        os.makedirs(os.path.dirname(with_rag_path), exist_ok=True)

        # Existing output files act as a cache: a contract is only regenerated
        # when its result file is missing.
        if not os.path.exists(without_rag_path):
            basic_prompt = f"""
            Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

            Derivatives Contract:
            {contract['description']}

            Please ensure:
            1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
            2. **Only** provide the JSON representation, with no additional text or explanations.
            3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
            
            Output the **final JSON** below.
            """
            _generate_and_save_cdm(basic_prompt, without_rag_path)
        else:
            print("!!! CDM already exists: " + without_rag_path)
        
        
        if not os.path.exists(with_rag_path):
            # The (bounded-length) summary is used as the retrieval query.
            contract_summary = _summarize_for_rag(contract["description"])
            
            rag_context = llm_handler.get_context_using_rag(f"Find CDM representations that best capture the information from a derivatives contract like the following:\n\n{contract_summary}")
            rag_prompt = f"""
            Represent the following derivatives contract in a complete ISDA CDM JSON format, adhering strictly to the CDM schema. Ensure all key details are covered and follow the schema definitions exactly:

            Derivatives Contract:
            {contract['description']}

            Relevant CDM Examples for Reference:
            {rag_context}

            Please ensure:
            1. The output is **concise** but **complete**, following the ISDA CDM dictionary.
            2. **Only** provide the JSON representation, with no additional text or explanations.
            3. If the content exceeds the token limit, **truncate** appropriately while still providing a **valid** and complete JSON structure.
            
            Output the **final JSON** below.
            """
            _generate_and_save_cdm(rag_prompt, with_rag_path, log_prefix="(RAG) ", artifact_label="CDM_RAG")
        else:
            print("!!! CDM_RAG already exists: " + with_rag_path)





Processing contracts of type: EquitySwap


Processing eqs-ex10-short-form-interestLeg-driving-schedule-dates_2.json...
------------------------------------------------------------
None 

Generated CDM is not a correct JSON. Generating again...
None 

Generated CDM is not a correct JSON. Generating again...
None 

Generated CDM is not a correct JSON. Generating again...
None 

Generated CDM is not a correct JSON. Generating again...
Contract Summary for RAG context retrieval is generated!
None 

(RAG) Generated CDM is not a correct JSON. Generating again...
None 

(RAG) Generated CDM is not a correct JSON. Generating again...


In [None]:
# from evaluate import *

# cdm_schema_path = "../cdm_schema/"
# cdm_repo_path = "../cdm_schema/cdm_schema_json"

# cdms = []

# for contract in contracts:

#     # exclude = ['bofa_interest_rate_swap', 'bofa_equity_option', 'jpmorgan_interest_rate_swap']
    
#     # if contract['name'] in exclude:
#     #     continue

#     cdm_file_path = f"test_data/generated_cdms/Fine-tuned-Llama3.1-8B-Instruct/{contract['name']}_cdm.json"
#     cdm_rag_file_path = f"test_data/generated_cdms/Fine-tuned-Llama3.1-8B-Instruct/{contract['name']}_cdm_rag.json"

#     if os.path.exists(cdm_file_path):
#         with open(cdm_file_path, 'r') as cdm_file:
#             cdm = json.load(cdm_file)

#     if os.path.exists(cdm_rag_file_path):
#         with open(cdm_rag_file_path, 'r') as cdm_rag_file:
#             cdm_rag = json.load(cdm_rag_file)

#     syntactic_correctness, schema_adherence = evaluate_cdm(cdm, cdm_schema_path, cdm_repo_path)
#     rag_syntactic_correctness, rag_schema_adherence = evaluate_cdm(cdm_rag, cdm_schema_path, cdm_repo_path)

#     semantic_coverage = get_coverage_score(get_coverage(llm_handler, contract['description'], cdm))
#     rag_semantic_coverage = get_coverage_score(get_coverage(llm_handler, contract['description'], cdm_rag))

#     print("\n", contract['name'])
#     print("-----------------------\n")
#     print(syntactic_correctness)
#     print(schema_adherence)
#     print("\n")
#     print(rag_syntactic_correctness)
#     print(rag_schema_adherence)
#     print("\n")
#     print(semantic_coverage)
#     print(rag_semantic_coverage)
#     print("\n")

#     print("======================\n")
#     # break