# Prompt Engineering with Open Source Large Language Models


## Table of Contents
1. Setup and Installation
2. Mistral-7B-Instruct Response Function
3. Normalization Functions
4. Evaluation Functions
5. Structured Prompts
6. Zero-shot Prompting
7. Few-shot Prompting
8. Chain-of-Thought Prompting
9. Evaluation Using Entire Dataset (Few-shot Prompting)


In [None]:
!pip install transformers==4.49.0 datasets accelerate==1.4.0 vllm==0.8.2 -q

In [None]:
import pandas as pd
import os
import json
from pprint import pprint
import re
from difflib import SequenceMatcher

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Log in to the Hugging Face Hub when an HF_TOKEN secret is stored in Colab.
# userdata.get() raises (instead of returning None) when the secret does not
# exist or this notebook has not been granted access to it, so both cases are
# mapped to "no token" to keep the skip message reachable.
try:
    hf_token = userdata.get("HF_TOKEN")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None

if hf_token:
    login(token=hf_token)
else:
    print("🔐 No HF token found – skipping login (only needed for gated/private models)")

In [None]:
# Set the working directory to the Drive folder holding the dataset and read the CSV.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
os.chdir(r'/content/drive/MyDrive')  # closing quote was missing (syntax error)
df = pd.read_csv(r'Extraction.csv')

In [None]:
from vllm import LLM, SamplingParams

# Load Mistral-7B-Instruct-v0.2 through vLLM in float16 without tensor parallelism.
# max_model_len caps the per-request sequence length at 10,500 tokens.
llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.2",
    tensor_parallel_size=1,
    dtype="float16",
    gpu_memory_utilization=0.7,
    max_model_len=10500,
    trust_remote_code=True
)


INFO 04-04 01:53:32 [__init__.py:239] Automatically detected platform cuda.
INFO 04-04 01:53:46 [config.py:585] This model supports multiple tasks: {'embed', 'classify', 'reward', 'score', 'generate'}. Defaulting to 'generate'.
INFO 04-04 01:53:46 [config.py:1697] Chunked prefill is enabled with max_num_batched_tokens=8192.


  self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)


INFO 04-04 01:53:48 [core.py:54] Initializing a V1 LLM engine (v0.8.2) with config: model='mistralai/Mistral-7B-Instruct-v0.2', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=10500, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=mistralai/Mistral-7B-Instruct-v0.2, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, c

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 04-04 01:53:57 [loader.py:447] Loading weights took 7.60 seconds
INFO 04-04 01:53:58 [gpu_model_runner.py:1186] Model loading took 13.4967 GB and 8.186906 seconds
INFO 04-04 01:54:09 [backends.py:415] Using cache directory: /root/.cache/vllm/torch_compile_cache/e49581de1a/rank_0_0 for vLLM's torch.compile
INFO 04-04 01:54:09 [backends.py:425] Dynamo bytecode transform time: 11.69 s
INFO 04-04 01:54:10 [backends.py:115] Directly load the compiled graph for shape None from the cache
INFO 04-04 01:54:18 [monitor.py:33] torch.compile takes 11.69 s in total
INFO 04-04 01:54:19 [kv_cache_utils.py:566] GPU KV cache size: 101,936 tokens
INFO 04-04 01:54:19 [kv_cache_utils.py:569] Maximum concurrency for 10,500 tokens per request: 9.71x
INFO 04-04 01:54:53 [gpu_model_runner.py:1534] Graph capturing finished in 34 secs, took 0.51 GiB
INFO 04-04 01:54:53 [core.py:151] init engine (profile, create kv cache, warmup model) took 55.45 seconds


In [None]:
# Helper that normalizes raw model output before it is parsed or printed
def safe_str(x):
    """
    Return a clean string representation of `x` (typically raw LLM output).

    Conversion rules:
    - str       → the string with surrounding whitespace removed
    - dict/list → its JSON serialization
    - None      → empty string
    - anything else → str(x) with surrounding whitespace removed
    """
    if x is None:
        return ""
    if isinstance(x, (dict, list)):
        return json.dumps(x)
    text = x if isinstance(x, str) else str(x)
    return text.strip()

def generate_response(prompt):
    """
    Generate a completion for `prompt` with the module-level vLLM engine.

    Relies on the globals `llm` and `sampling_params` being defined before the
    call. Returns the text of the first completion of the first request,
    cleaned up by `safe_str`.
    """
    request_outputs = llm.generate(prompt, sampling_params)
    first_completion = request_outputs[0].outputs[0]
    return safe_str(first_completion.text)

In [None]:
import gc
import torch

# Run Python garbage collection first, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

## 3. Normalization Functions

In [None]:
# Normalize the model output and the ground truth, then pretty-print both for comparison
def format_and_print_json(gpt_output, ground_truth, title_1="🔹 Mistralai Output", title_2="🔸 Ground Truth"):
    """
    Normalize and pretty-print model output and ground truth JSON for comparison.

    Steps:
    - Converts inputs to Python objects if they are JSON strings.
    - Sorts top-level list values (e.g., 'party') for consistent display.
    - Prints both objects as indented JSON inside Markdown code fences.

    The caller's objects are never modified: sorting is applied to copies.

    Args:
        gpt_output (str or dict): Model output
        ground_truth (str or dict): Ground truth metadata
        title_1 (str): Optional title for the model output
        title_2 (str): Optional title for the ground truth

    Returns:
        None. If `gpt_output` is a string that is not valid JSON, a warning is
        printed and nothing else is shown.

    Raises:
        json.JSONDecodeError: If `ground_truth` is a string that is not valid JSON.
    """
    # Convert strings to Python objects if needed
    if isinstance(gpt_output, str):
        try:
            gpt_output = json.loads(gpt_output)
        except json.JSONDecodeError:
            print("⚠️ Mistralai Output is not valid JSON")
            return
    if isinstance(ground_truth, str):
        ground_truth = json.loads(ground_truth)

    def canonical(item):
        # Stable, type-agnostic sort key: the item's JSON text with sorted keys
        return json.dumps(item, sort_keys=True)

    def sorted_values(values):
        if all(isinstance(item, dict) for item in values):
            return sorted(values, key=canonical)
        try:
            # Plain lists (e.g. list of strings) keep their natural ordering
            return sorted(values)
        except TypeError:
            # Mixed element types (e.g. strings and dicts) are not mutually
            # comparable; fall back to ordering by JSON text instead of crashing
            return sorted(values, key=canonical)

    def with_sorted_lists(obj):
        # Anything that is not a JSON object has no keyed lists to sort
        if not isinstance(obj, dict):
            return obj
        # Build a new dict so the caller's object is left untouched
        return {
            key: sorted_values(val) if isinstance(val, list) else val
            for key, val in obj.items()
        }

    gpt_output = with_sorted_lists(gpt_output)
    ground_truth = with_sorted_lists(ground_truth)

    # Pretty print JSON
    print(f"{title_1}:\n```json\n{json.dumps(gpt_output, indent=4)}\n```")
    print(f"\n{title_2}:\n```json\n{json.dumps(ground_truth, indent=4)}\n```")


# Extract the first valid JSON object from the model response
def extract_json_from_text(text):
    """
    Extracts the first valid JSON object from a given text string.

    Handles:
    - Extra explanations around the JSON
    - Markdown code blocks (e.g., ```json)
    - Braces that appear inside JSON string values
    - Brace-delimited fragments that are not valid JSON (they are skipped)
    - Only returns the first complete JSON object

    Returns:
        str: JSON string
    Raises:
        ValueError: If no valid JSON is found
    """
    if not isinstance(text, str):
        text = str(text)

    # Clean markdown code fences
    text = re.sub(r"```json|```", "", text).strip()

    # Try to decode a JSON value at every '{'. The decoder understands string
    # literals and escapes, so a '}' inside a value cannot end the object early,
    # and whatever is returned is guaranteed to parse.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find('{', start + 1)
        else:
            return text[start:end]

    raise ValueError("No valid JSON object found in the text.")

## 4. Evaluation Functions

In [None]:
# Field-by-field value Accuracy
def compare_parties(pred, truth):
    """
    Compare two collections of 'party' entities using Jaccard similarity.

    Works for both:
    - List of strings (["Company_A", "Company_B"])
    - List of dictionaries ([{"name": ..., "type": ...}, ...])

    A bare string or a single dict is treated as a one-element list, so a
    model that returns "party": "Company_A" is compared as one party rather
    than as a sequence of characters or dict keys.

    Returns:
        float: Jaccard similarity between predicted and ground truth sets,
        rounded to 2 decimals. 1.0 when both are empty, 0.0 when exactly one
        is empty.
    """
    def normalize_party_list(party_list):
        # Iterating a str yields characters and iterating a dict yields keys;
        # neither is a list of parties, so wrap them first
        if isinstance(party_list, (str, dict)):
            party_list = [party_list]
        # Dicts become key-sorted JSON text so equal dicts compare equal
        # regardless of key order; everything else is compared as a string
        return {
            json.dumps(item, sort_keys=True) if isinstance(item, dict) else str(item)
            for item in party_list
        }

    pred_set = normalize_party_list(pred)
    truth_set = normalize_party_list(truth)

    if not pred_set and not truth_set:
        return 1.0
    if not pred_set or not truth_set:
        return 0.0

    intersection = pred_set & truth_set
    union = pred_set | truth_set
    return round(len(intersection) / len(union), 2)


def evaluate_metadata_fields(gpt_output, ground_truth):
    """
    Score extracted metadata against the ground truth, one field at a time.

    Both arguments may be dicts or JSON strings (strings are parsed first).
    The fields effective_date, jurisdiction, party and term are scored in
    that order:
    - None  when the field is missing (or null) on both sides
    - 0.0   when it is missing on exactly one side
    - 1.0 / 0.0 for an exact match / mismatch of non-party fields
    - the `compare_parties` similarity for 'party'

    Returns:
        dict: field name → score as described above.
    """
    predicted = json.loads(gpt_output) if isinstance(gpt_output, str) else gpt_output
    expected = json.loads(ground_truth) if isinstance(ground_truth, str) else ground_truth

    scores = {}
    for name in ('effective_date', 'jurisdiction', 'party', 'term'):
        guess = predicted.get(name)
        answer = expected.get(name)

        # Absent on both sides: nothing to compare, so no penalty
        if guess is None and answer is None:
            scores[name] = None
            continue

        # Present on only one side counts as a miss
        if guess is None or answer is None:
            scores[name] = 0.0
            continue

        if name == "party":
            scores[name] = compare_parties(guess, answer)
        else:
            scores[name] = 1.0 if guess == answer else 0.0

    return scores

In [None]:
# Evaluate key_match and party_count
def evaluate_key_match_and_party_count(gpt_output, true_keys_str, true_party_count):
    """
    Evaluate the structural correctness of the model output based on:
    - Whether all expected metadata fields (keys) are present
    - Whether the number of 'party' entities matches the ground truth

    Parameters:
        gpt_output: The extracted metadata (dict, or a JSON string that is parsed)
        true_keys_str: A comma-separated string of expected keys (from df['keys']);
            whitespace around each key is ignored
        true_party_count: The number of expected parties (from df['party_count'])

    Returns:
        A dictionary with:
        - key_match_score: Fraction of required keys that the output included
        - party_count_score: 1.0 if correct number of parties, 0.0 otherwise
        - supporting details like predicted/expected keys and party counts
    """
    if isinstance(gpt_output, str):
        gpt_output = json.loads(gpt_output)

    # Strip each key so "a, b" and "a,b" describe the same expected set
    expected_keys = {key.strip() for key in true_keys_str.split(',')}
    gpt_keys = set(gpt_output.keys())

    matched_keys = expected_keys & gpt_keys
    key_match_score = round(len(matched_keys) / len(expected_keys), 2)

    parties = gpt_output.get('party')
    if parties is None:
        # Missing key or explicit null: no parties were predicted
        predicted_party_count = 0
    elif isinstance(parties, (str, dict)):
        # A single party given without a list wrapper; len() would count
        # characters or dict keys instead of parties
        predicted_party_count = 1
    else:
        predicted_party_count = len(parties)
    party_count_score = 1.0 if predicted_party_count == true_party_count else 0.0

    return {
        "key_match_score": key_match_score,
        "party_count_score": party_count_score,
        "expected_keys": expected_keys,
        "returned_keys": gpt_keys,
        "expected_party_count": true_party_count,
        "predicted_party_count": predicted_party_count
    }

In [None]:
def summarize_evaluation_across_samples(all_field_scores, all_structural_scores):
    """
    Print a per-sample report followed by averages over all samples.

    The two lists are walked in lockstep (one entry per sample). Field scores
    of None ("missing on both sides") are reported but excluded from every
    average; averages with nothing to average are shown as "N/A" or
    "(No available comparisons)" instead of dividing by zero.
    """
    per_field = {name: [] for name in ('effective_date', 'jurisdiction', 'party', 'term')}
    per_structural = {'key_match': [], 'party_count': []}

    print("\n\n================== 🧾 Evaluation Summary Across Samples ==================\n")

    paired = zip(all_field_scores, all_structural_scores)
    for number, (sample_fields, sample_structure) in enumerate(paired, start=1):
        print(f"\n📄 Sample {number}")

        # --- Field-Level Scores ---
        running_total = 0
        scored_fields = 0
        for name, value in sample_fields.items():
            if value is not None:
                print(f"- {name}: {value}")
                per_field[name].append(value)
                running_total += value
                scored_fields += 1
            else:
                print(f"- {name}: (Not present in either) ✅")

        if scored_fields > 0:
            avg_field_score = round(running_total / scored_fields, 3)
        else:
            avg_field_score = "N/A"

        # --- Structural Scores ---
        key_score = sample_structure.get('key_match_score')
        party_score = sample_structure.get('party_count_score')

        available = []
        for bucket, value in (('key_match', key_score), ('party_count', party_score)):
            if value is not None:
                per_structural[bucket].append(value)
                available.append(value)

        print(f"- Key Match Score: {key_score}")
        print(f"- Party Count Score: {party_score}")

        if available:
            avg_structural = round(sum(available) / len(available), 3)
        else:
            avg_structural = "N/A"

        print(f"🔹 Avg Field Score: {avg_field_score}")
        print(f"🔸 Combined Structural Score: {avg_structural}")

    # --- Aggregate Averages ---
    print("\n================== 📊 AVERAGES ==================\n")

    print("🔹 Field-Level Averages:")
    for name, collected in per_field.items():
        if not collected:
            print(f"- {name}: (No available comparisons)")
            continue
        print(f"- {name}: {round(sum(collected) / len(collected), 3)}")

    print("\n🔸 Structural Averages:")
    for bucket, collected in per_structural.items():
        label = "Key Match" if bucket == "key_match" else "Party Count"
        if not collected:
            print(f"- {label} Score: (No available comparisons)")
            continue
        print(f"- {label} Score: {round(sum(collected) / len(collected), 3)}")


In [None]:
def print_average_scores_only(all_field_scores, all_structural_scores):
    """
    Print only the average field-level and structural scores across all samples.
    Use this when you want a compact summary view.

    Args:
        all_field_scores: list of per-sample dicts mapping field name → score
            (None scores are skipped).
        all_structural_scores: list of per-sample dicts that may contain
            'key_match_score' and 'party_count_score' (missing or None values
            are skipped).
    """
    print("\n================== 📊 AVERAGE METRICS ==================\n")

    # --- Field-Level Averages ---
    field_totals = {}
    field_counts = {}

    for sample in all_field_scores:
        for field, score in sample.items():
            if score is not None:
                field_totals[field] = field_totals.get(field, 0) + score
                field_counts[field] = field_counts.get(field, 0) + 1

    print("🔹 Field-Level Averages:")
    for field in ['effective_date', 'jurisdiction', 'party', 'term']:
        if field_counts.get(field, 0) > 0:
            avg = round(field_totals[field] / field_counts[field], 3)
            print(f"- {field}: {avg}")
        else:
            print(f"- {field}: (No available comparisons)")

    # --- Structural Averages ---
    labels = {"key_match": "Key Match", "party_count": "Party Count"}
    total_structural_scores = {metric_key: [] for metric_key in labels}

    for result in all_structural_scores:
        for metric_key in labels:
            # Skip missing/None scores instead of raising KeyError or breaking
            # sum() below
            score = result.get(f"{metric_key}_score")
            if score is not None:
                total_structural_scores[metric_key].append(score)

    print("\n🔸 Structural Averages:")
    for metric_key, values in total_structural_scores.items():
        label = labels[metric_key]
        if values:
            avg = round(sum(values) / len(values), 3)
            print(f"- {label} Score: {avg}")
        else:
            # Use the human-readable label here too (previously printed the raw key)
            print(f"- {label} Score: (No data)")


## 5. Structured Prompts

In [None]:
# Reusable prompt building blocks. The prompting cells concatenate these three
# strings (persona, then instruction, then data_format) ahead of the NDA text.

# Role the model is asked to adopt.
persona = """
You are an expert in identifying and extracting metadata from NDA (Non-Disclosure Agreement) documents.
"""

# Which fields to extract; asks the model to omit fields that are absent
# rather than guess or return null.
instruction = """
Extract the following metadata:
- effective_date: When the agreement becomes legally binding
- jurisdiction: The governing legal territory
- party: Entities bound by the agreement (May have multiple values)
- term: Duration of the agreement
Return only the metadata fields that are explicitly present in the text. If a field like effective_date or term is missing, exclude it from the output JSON and do not guess or explain or return null.
"""

# Output formatting rules for the JSON values (underscores, date format,
# party-name casing, term units).
data_format = """
Output must be a JSON object following these Rules:
- Replace all spaces and colons (:) in attribute values with underscores (_)
- Dates must be in 'YYYY-MM-DD' format
- Jurisdiction: Only the state or country name is included, without prefixes like "State of"
- Party names must follow Title_Case with underscores replacing spaces and corporate suffixes standardized (e.g., "Inc.", "Corp.", "LLC", "Ltd.")
- Duration (term) must be normalized to 'number_units' format (e.g., 2_years, 12_months)
"""

## 6. Zero-shot Prompting

In [None]:
# Zero-shot evaluation: no examples included in the prompt.
# Runs the rows listed in example_indices through the model, prints model
# output next to the ground truth, and collects per-sample scores.
example_indices = [ 8, 20, 55, 70 ]
all_field_scores = []
all_structural_scores = []

# Sampling settings read by generate_response() through the global
# `sampling_params`: temperature 0.0 and max_tokens 512.
from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=512,
)


for x in example_indices:
    input_text = df['text'][x]

    # Zero-shot prompt: persona + instruction + formatting rules + the NDA text
    prompt = f"""
{persona}

{instruction}

{data_format}

Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    print(f"\n\n======================= Text {x} =======================\n")
    model_output_raw = generate_response(prompt)

    try:
        # Pull the JSON object out of the raw response (raises ValueError if none is found)
        extracted_json = extract_json_from_text(model_output_raw)

        # Display model output vs ground truth
        format_and_print_json(extracted_json, df['extracted'][x])

        # Evaluation Method 1: Field-Level Accuracy
        print("\n📊 Field-Level Evaluation:")
        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])
        all_field_scores.append(field_scores)

        for field, score in field_scores.items():
            if score is None:
                print(f"{field}: (Not present in either) ✅")
            else:
                print(f"{field}: {score}")

        # Evaluation Method 2: Structural Correctness
        print("\n🔍 Structural Evaluation:")
        # extract_json_from_text returns a string, so parse it into a dict here
        if isinstance(extracted_json, str):
            extracted_json = json.loads(extracted_json)
        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )
        all_structural_scores.append(key_eval)

        print(f"Key Match Score: {key_eval['key_match_score']}")
        print(f"Party Count Score: {key_eval['party_count_score']}")
        print(f"Expected Keys: {key_eval['expected_keys']}")
        print(f"Returned Keys: {key_eval['returned_keys']}")
        print(f"Expected Party Count: {key_eval['expected_party_count']}")
        print(f"Predicted Party Count: {key_eval['predicted_party_count']}")

    # json.JSONDecodeError is a subclass of ValueError, so this handles both a
    # missing JSON object and JSON that fails to parse; the sample is skipped.
    except ValueError as e:
        print("⚠️ Model output is not valid JSON\nRaw output:\n", model_output_raw)

# Final summary for the zero-shot run
summarize_evaluation_across_samples(all_field_scores, all_structural_scores)






Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it, est. speed input: 1063.59 toks/s, output: 70.65 toks/s]


🔹 Mistralai Output:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        {
            "title": "Heidrick & Struggles, Inc",
            "type": "Delaware_Corporation"
        },
        {
            "title": "Second_Party",
            "type": ""
        }
    ],
    "term": "5_years"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_and_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: 1.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it, est. speed input: 1124.12 toks/s, output: 66.87 toks/s]


🔹 Mistralai Output:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        {
            "title": "Requesting Stockholder",
            "name": "Elaine P. Wynn",
            "type": "individual"
        },
        {
            "title": "",
            "name": "Wynn Resorts, Limited",
            "type": "corporation"
        }
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Ltd."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: (Not present in either) ✅

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it, est. speed input: 1031.09 toks/s, output: 67.12 toks/s]


🔹 Mistralai Output:
```json
{
    "effective_date": "2014-04-06",
    "jurisdiction": "Delaware",
    "party": [
        {
            "title": "GTCR LLC",
            "name": "GTCR LLC",
            "type": "LLC"
        },
        {
            "title": "Vocus, Inc.",
            "name": "Vocus, Inc.",
            "type": "Inc."
        }
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2014-04-06",
    "jurisdiction": "Delaware",
    "party": [
        "Gtcr_LLC",
        "Vocus_Inc."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: (Not present in either) ✅

🔍 Structural Evaluation:
Key Match Score: 0.75
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.11s/it, est. speed input: 1238.42 toks/s, output: 65.88 toks/s]

🔹 Mistralai Output:
```json
{
    "effective_date": "2005-11-01",
    "jurisdiction": "New Jersey",
    "party": [
        {
            "title": "RENAISSANCE BRANDS LTD.",
            "name": "RENAISSANCE BRANDS LTD."
        },
        {
            "title": "VITAMIN SHOPPE INDUSTRIES INC.",
            "name": "VITAMIN SHOPPE INDUSTRIES INC."
        }
    ],
    "term": "2_years"
}
```

🔸 Ground Truth:
```json
{
    "jurisdiction": "New_Jersey",
    "party": [
        "Renaissance_Brands_Ltd.",
        "Vitamin_Shoppe_Industuries_Inc."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 0.0
jurisdiction: 0.0
party: 0.0
term: 0.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2




📄 Sample 1
- effective_date: 1.0
- jurisdiction: 1.0
- party: 0.0
- term: 1.0
- Key Match Score: 1.




## 7. Few-shot Prompting

In [None]:
# Few-shot evaluation: build the in-context examples from the first rows of df
# (text plus its ground-truth metadata) and prepend them to every prompt.
examples = ""
for i in range(2):  # Adjust number of few-shot examples here
    text_sample = df['text'][i]
    metadata_sample = df['extracted'][i]
    examples += f"\n\nExample {i+1}\ntext = {text_sample}\noutput = {metadata_sample}"

# Main prompt test on selected samples
example_indices = [ 8, 20, 55, 70 ]
all_field_scores = []
all_structural_scores = []

# Sampling settings read by generate_response() through the global
# `sampling_params`: temperature 0.0 and max_tokens 512.
from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=512,
)


for x in example_indices:
    input_text = df['text'][x]

    # Few-shot prompt: persona + instruction + formatting rules + worked examples + the NDA text
    prompt = f"""
{persona}

{instruction}

{data_format}

Here are some examples of NDA metadata extraction:{examples}

Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    print(f"\n\n======================= Text {x} =======================\n")
    model_output_raw = generate_response(prompt)

    try:
        # Pull the JSON object out of the raw response (raises ValueError if none is found)
        extracted_json = extract_json_from_text(model_output_raw)

        # Display model output vs ground truth
        format_and_print_json(extracted_json, df['extracted'][x])

        # Evaluation Method 1: Field-Level Accuracy
        print("\n📊 Field-Level Evaluation:")
        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])
        all_field_scores.append(field_scores)

        for field, score in field_scores.items():
            if score is None:
                print(f"{field}: (Not present in either) ✅")
            else:
                print(f"{field}: {score}")

        # Evaluation Method 2: Structural Correctness
        print("\n🔍 Structural Evaluation:")
        # extract_json_from_text returns a string, so parse it into a dict here
        if isinstance(extracted_json, str):
            extracted_json = json.loads(extracted_json)
        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )
        all_structural_scores.append(key_eval)

        print(f"Key Match Score: {key_eval['key_match_score']}")
        print(f"Party Count Score: {key_eval['party_count_score']}")
        print(f"Expected Keys: {key_eval['expected_keys']}")
        print(f"Returned Keys: {key_eval['returned_keys']}")
        print(f"Expected Party Count: {key_eval['expected_party_count']}")
        print(f"Predicted Party Count: {key_eval['predicted_party_count']}")

    # json.JSONDecodeError is a subclass of ValueError, so this handles both a
    # missing JSON object and JSON that fails to parse; the sample is skipped.
    except ValueError as e:
        print("⚠️ Model output is not valid JSON\nRaw output:\n", model_output_raw)

# Final summary for the few-shot run
summarize_evaluation_across_samples(all_field_scores, all_structural_scores)






Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it, est. speed input: 7356.43 toks/s, output: 47.10 toks/s]


🔹 GPT Output:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_&_Struggles_Inc.",
        "Second_Party"
    ],
    "term": "5_years"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_and_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: 1.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s, est. speed input: 10126.66 toks/s, output: 56.25 toks/s]


🔹 GPT Output:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Limited"
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Ltd."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.33
term: (Not present in either) ✅

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s, est. speed input: 13212.12 toks/s, output: 51.90 toks/s]


⚠️ GPT Output is not valid JSON

📊 Field-Level Evaluation:
⚠️ Model output is not valid JSON
Raw output:
 {
"jurisdiction": "Delaware"
}

The effective_date is not explicitly stated in the text, so it should not be included in the output.





Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it, est. speed input: 7661.62 toks/s, output: 56.62 toks/s]

🔹 GPT Output:
```json
{
    "effective_date": "2005-11-01",
    "jurisdiction": "New_Jersey",
    "party": [
        "RENAISSANCE_BRANDS_LTD.",
        "VITAMIN_SHOPPE_INDUSTRIES_INC."
    ],
    "term": "2_years"
}
```

🔸 Ground Truth:
```json
{
    "jurisdiction": "New_Jersey",
    "party": [
        "Renaissance_Brands_Ltd.",
        "Vitamin_Shoppe_Industuries_Inc."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 0.0
jurisdiction: 1.0
party: 0.0
term: 0.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2




📄 Sample 1
- effective_date: 1.0
- jurisdiction: 1.0
- party: 0.0
- term: 1.0
- Key Match Score: 1.0
- Party Count Score: 1.0
🔹 Avg Field Score: 0.75
🔸 Combined Structural Score: 1.0

📄 Sample 2
- effective_date: 1.0
- jurisdiction: 1.0
- party: 0.33
- term: (Not present 




## 8. Chain-of-Thought Prompting

In [None]:
# Chain-of-thought evaluation on a small, hand-picked set of rows of `df`.
example_indices = [8, 20, 55, 70]

# Per-sample results. Both lists must stay the same length so that entry i in
# each refers to the same sample when they are summarized together below.
all_field_scores = []
all_structural_scores = []

# Set up Mistral sampling
from vllm import SamplingParams

# NOTE(review): `sampling_params` is not passed to generate_response() below;
# presumably that helper reads it as a global -- confirm.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=512,
)

for x in example_indices:
    input_text = df['text'][x]

    # Chain-of-thought prompt: the shared persona / instruction / data-format
    # sections followed by explicit step-by-step guidance for each field.
    prompt = f"""
{persona}

{instruction}

{data_format}

Let’s extract the required metadata by thinking step-by-step.

1. First, read the NDA and look for the effective_date. This is typically found where the agreement says something like "effective as of", "made on", or "entered into on".
2. Next, identify the jurisdiction. This is usually mentioned in a clause like "governed by the laws of..." or "jurisdiction of".
3. Then, extract all the parties. These are the legal entities entering into the agreement. They often appear at the beginning or near the signature lines.
4. Finally, determine the term. Look for duration-related phrases like "for two years", "valid until", or "will terminate after...".


Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    print(f"\n\n======================= Text {x} =======================\n")
    model_output_raw = generate_response(prompt)

    try:
        # Extract clean JSON from model response
        extracted_json = extract_json_from_text(model_output_raw)

        # Display Model Output vs Ground Truth
        format_and_print_json(extracted_json, df['extracted'][x])

        # Evaluation Method 1: Field-Level Accuracy
        print("\n📊 Field-Level Evaluation:")
        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])

        for field, score in field_scores.items():
            # A score of None is reported as "field absent on both sides".
            if score is None:
                print(f"{field}: (Not present in either) ✅")
            else:
                print(f"{field}: {score}")

        # Evaluation Method 2: Structural Correctness
        print("\n🔍 Structural Evaluation:")
        # The structural evaluator is given a dict, so parse if still a string.
        if isinstance(extracted_json, str):
            extracted_json = json.loads(extracted_json)
        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )

        print(f"Key Match Score: {key_eval['key_match_score']}")
        print(f"Party Count Score: {key_eval['party_count_score']}")
        print(f"Expected Keys: {key_eval['expected_keys']}")
        print(f"Returned Keys: {key_eval['returned_keys']}")
        print(f"Expected Party Count: {key_eval['expected_party_count']}")
        print(f"Predicted Party Count: {key_eval['predicted_party_count']}")

    except ValueError:
        # Any ValueError raised while extracting, parsing or scoring is
        # reported as invalid JSON; the sample is left out of the summary.
        print("⚠️ Model output is not valid JSON\nRaw output:\n", model_output_raw)
    else:
        # Record both results only once the whole sample succeeded, so a
        # failure between the two evaluations cannot leave the lists with
        # different lengths and mis-pair samples in the summary.
        all_field_scores.append(field_scores)
        all_structural_scores.append(key_eval)

# Final summary for chain-of-thought prompting
summarize_evaluation_across_samples(all_field_scores, all_structural_scores)






Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it, est. speed input: 1094.74 toks/s, output: 70.55 toks/s]


🔹 Mistralai Output:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        {
            "title": "Heidrick & Struggles, Inc.",
            "name": "Heidrick_Struggles_Inc"
        },
        {
            "title": "Second Party",
            "name": "Second_Party"
        }
    ],
    "term": "5_years"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_and_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: 1.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.20s/it, est. speed input: 1004.98 toks/s, output: 70.88 toks/s]


🔹 Mistralai Output:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        {
            "title": "Requesting Stockholder",
            "name": "Elaine P. Wynn",
            "type": "individual"
        },
        {
            "title": "",
            "name": "Wynn Resorts, Limited",
            "type": "corporation"
        }
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Ltd."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: (Not present in either) ✅

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it, est. speed input: 1382.26 toks/s, output: 70.86 toks/s]


🔹 Mistralai Output:
```json
{
    "effective_date": "2014-04-06",
    "jurisdiction": "Delaware",
    "party": [
        {
            "title": "GTCR LLC",
            "name": "GTCR LLC",
            "type": "LLC"
        },
        {
            "title": "Vocus, Inc.",
            "name": "Vocus, Inc.",
            "type": "Company"
        }
    ],
    "term": "Not specified"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2014-04-06",
    "jurisdiction": "Delaware",
    "party": [
        "Gtcr_LLC",
        "Vocus_Inc."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: 0.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2





Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it, est. speed input: 1396.83 toks/s, output: 69.99 toks/s]

🔹 Mistralai Output:
```json
{
    "effective_date": "2005-11-01",
    "jurisdiction": "New Jersey",
    "party": [
        {
            "title": "RENAISSANCE BRANDS LTD.",
            "name": "RENAISSANCE BRANDS LTD."
        },
        {
            "title": "VITAMIN SHOPPE INDUSTRIES INC.",
            "name": "VITAMIN SHOPPE INDUSTRIES INC."
        }
    ],
    "term": "2_years"
}
```

🔸 Ground Truth:
```json
{
    "jurisdiction": "New_Jersey",
    "party": [
        "Renaissance_Brands_Ltd.",
        "Vitamin_Shoppe_Industuries_Inc."
    ]
}
```

📊 Field-Level Evaluation:
effective_date: 0.0
jurisdiction: 0.0
party: 0.0
term: 0.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Returned Keys: {'effective_date', 'term', 'jurisdiction', 'party'}
Expected Party Count: 2
Predicted Party Count: 2




📄 Sample 1
- effective_date: 1.0
- jurisdiction: 1.0
- party: 0.0
- term: 1.0
- Key Match Score: 1.




## 9. Evaluation Using Entire Dataset (Few-shot Prompting)

In [None]:
# Build the few-shot section of the prompt from the rows labelled 0 and 1.
examples = "".join(
    f"\n\nExample {i+1}\ntext = {df['text'][i]}\noutput = {df['extracted'][i]}"
    for i in range(2)
)

# Prepare indices and tracking
# NOTE(review): every index of `df` is evaluated, which presumably includes
# the two rows used as few-shot examples above (their answers are in the
# prompt) -- confirm whether they should be excluded from the averages.
example_indices = df.index.tolist()
all_field_scores = []
all_structural_scores = []
results = []  # one dict per row, exported to CSV at the end

# Set up Mistral sampling
# NOTE(review): not passed to generate_response() below; presumably read as a
# global by that helper -- confirm.
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=512,
)

# Loop through all examples
for x in example_indices:
    input_text = df['text'][x]

    prompt = f"""
{persona}

{instruction}

{data_format}

Here are some examples of NDA metadata extraction:{examples}

Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    try:
        model_output_raw = generate_response(prompt)
        extracted_json = extract_json_from_text(model_output_raw)
        normalized_output = normalize_output(extracted_json)

        # The code below calls dict methods, so parse if still a string.
        if isinstance(normalized_output, str):
            normalized_output = json.loads(normalized_output)

        field_scores = evaluate_metadata_fields(normalized_output, df['extracted'][x])

        key_eval = evaluate_key_match_and_party_count(
            normalized_output,
            df['keys'][x],
            df['party_count'][x]
        )

        ground_truth = df['extracted'][x]
        if isinstance(ground_truth, str):
            ground_truth = json.loads(ground_truth)

        row = {
            "index": x,
            "model_output": json.dumps(normalized_output),
            "ground_truth": json.dumps(ground_truth),
            "model_keys": ",".join(normalized_output.keys()),
            "true_keys": df['keys'][x],
            # A missing "party" key counts as zero parties.
            "model_party_count": len(normalized_output.get("party", [])),
            "true_party_count": df['party_count'][x]
        }

        # Record scores and the result row together, only after every step
        # succeeded. A sample is then either fully counted or logged as an
        # ERROR row, never half-counted in one of the averages.
        all_field_scores.append(field_scores)
        all_structural_scores.append(key_eval)
        results.append(row)

    except Exception as e:
        # Deliberate best-effort: one bad sample must not abort the whole run.
        # The failure is kept in the CSV; the sample contributes to neither
        # score list, so the averages below cover successful samples only.
        results.append({
            "index": x,
            "model_output": f"ERROR: {str(e)}",
            "ground_truth": df['extracted'][x],
            "model_keys": "",
            "true_keys": df['keys'][x],
            "model_party_count": "",
            "true_party_count": df['party_count'][x]
        })

# Create DataFrame and export results
results_df = pd.DataFrame(results)
results_df.to_csv("mistral.csv", index=False)

# Show evaluation summary
print_average_scores_only(all_field_scores, all_structural_scores)


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it, est. speed input: 5871.96 toks/s, output: 55.43 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it, est. speed input: 5191.05 toks/s, output: 60.97 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s, est. speed input: 7989.75 toks/s, output: 53.49 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it, est. speed input: 6928.19 toks/s, output: 59.03 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it, est. speed input: 6804.56 toks/s, output: 53.37 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it, est. speed input: 7684.90 toks/s, output: 52.30 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s, est. speed input: 6993.10 toks/s, output: 62.09 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s, est. speed input: 7693.12 toks/s, output: 56.10 toks/s]
Processed prompts: 100%|████████



🔹 Field-Level Averages:
- effective_date: 0.693
- jurisdiction: 0.989
- party: 0.385
- term: 0.256

🔸 Structural Averages:
- Key Match Score: 0.98
- Party Count Score: 0.795



