# Prompt Engineering with closed-source (OpenAI) Large Language Models


## Table of Contents
1. Setup and Installation
2. Set OpenAI API key and Response Function
3. Normalization Functions
4. Evaluation Functions
5. Structured Prompts
6. Zero-shot Prompting
7. Few-shot Prompting
8. Chain-of-Thought Prompting
9. Evaluation Using Entire Dataset (Few-shot Prompting)



## 1. Setup and Installation

In [None]:
!pip install openai -q


In [None]:
from openai import OpenAI
import pandas as pd
import os
import json
from pprint import pprint
import re
from difflib import SequenceMatcher

In [None]:
# Set Working Directory and read csv file
# NOTE(review): assumes Google Drive is already mounted at /content/drive
# (no mount call is visible in this notebook) — TODO confirm.
os.chdir(r'/content/drive/MyDrive')
# `df` is the dataset used by every later cell, which read its
# 'text', 'extracted', 'keys' and 'party_count' columns.
df = pd.read_csv(r'Extraction.csv')

## 2. Set OpenAI API key and Response Function

In [None]:
# Read the API key from the Colab secret named 'OPEN_AI_KEY'.
from google.colab import userdata
OPENAI_AI_KEY = userdata.get('OPEN_AI_KEY')
# Export the key before the client is constructed.
# NOTE(review): presumably OpenAI() reads OPENAI_API_KEY from the environment
# when no key is passed explicitly — confirm against the openai library docs.
os.environ["OPENAI_API_KEY"] = OPENAI_AI_KEY
# Module-level client used by generate_response().
client = OpenAI()

In [None]:
# Define your response function using gpt-4o
def generate_response(prompt, model="gpt-4o", temperature=0, max_tokens=300, top_p=1.0, **kwargs):
    """
    Send a single-turn prompt to the chat completions endpoint and return the reply text.

    Args:
        prompt (str): Text sent as the only message, with role "user".
        model (str): Model name passed to the API.
        temperature (float): Sampling temperature passed to the API.
        max_tokens (int): Completion token limit passed to the API.
        top_p (float): Nucleus-sampling value passed to the API.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.chat.completions.create``.

    Returns:
        The ``content`` attribute of the first returned choice's message.
    """
    request = dict(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )
    completion = client.chat.completions.create(**request, **kwargs)
    first_choice = completion.choices[0]
    return first_choice.message.content

## 3. Normalization Functions

In [None]:
# Function to correct JSON format output and print outputs from model and ground truth
def format_and_print_json(gpt_output, ground_truth, title_1="🔹 GPT Output", title_2="🔸 Ground Truth"):
    """
    Normalize and pretty-print GPT and ground truth JSON for side-by-side comparison.

    Steps:
    - Converts inputs to Python objects if they're JSON strings.
    - Sorts top-level list values (e.g., 'party') for consistent display.
    - Pretty-prints both outputs inside Markdown ``json`` fences.

    The inputs are never modified: sorting is applied to copies, so a dict
    passed in by the caller keeps its original list order.

    Args:
        gpt_output (str or dict): GPT model's output
        ground_truth (str or dict): Ground truth metadata
        title_1 (str): Optional title for GPT output
        title_2 (str): Optional title for ground truth

    Returns:
        None. If ``gpt_output`` is a string that is not valid JSON, a warning
        is printed and nothing else is shown.

    Raises:
        json.JSONDecodeError: If ``ground_truth`` is a string that is not valid JSON.
    """
    # Convert strings to Python objects if needed
    if isinstance(gpt_output, str):
        try:
            gpt_output = json.loads(gpt_output)
        except json.JSONDecodeError:
            print("⚠️ GPT Output is not valid JSON")
            return
    if isinstance(ground_truth, str):
        ground_truth = json.loads(ground_truth)

    def with_sorted_lists(obj):
        """Return a shallow copy of obj whose top-level list values are sorted."""
        if not isinstance(obj, dict):
            # Valid JSON that is not an object (e.g. a list): display it untouched.
            return obj
        normalized = {}
        for key, val in obj.items():
            if isinstance(val, list):
                try:
                    val = sorted(val)
                except TypeError:
                    # Elements are not mutually orderable (e.g. dicts mixed with
                    # strings); keep the original order rather than crash a
                    # display-only helper.
                    val = list(val)
            normalized[key] = val
        return normalized

    gpt_display = with_sorted_lists(gpt_output)
    truth_display = with_sorted_lists(ground_truth)

    # Pretty print JSON
    print(f"{title_1}:\n```json\n{json.dumps(gpt_display, indent=4)}\n```")
    print(f"\n{title_2}:\n```json\n{json.dumps(truth_display, indent=4)}\n```")

# Extract the first valid JSON object from GPT response
def extract_json_from_text(text):
    """
    Return the first substring of ``text`` that parses as a JSON object.

    Every ``{`` is tried as a candidate start and parsed with the standard
    JSON decoder, so braces in surrounding prose, Markdown fences, or trailing
    commentary do not end up in the result. When a candidate fails, scanning
    resumes from the position where parsing failed; this avoids returning an
    inner object of a truncated or malformed outer object.

    Args:
        text (str): Raw model response.

    Returns:
        str: The exact slice of ``text`` holding the JSON object.

    Raises:
        ValueError: If ``text`` is not a string or contains no valid JSON object.
    """
    if not isinstance(text, str):
        print("⚠️ JSON extraction error:", f"expected a string, got {type(text).__name__}")
        raise ValueError("No valid JSON found in the text.")

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError as err:
            # Skip past whatever was consumed before the failure so nested
            # objects of a broken outer object are not picked up on their own.
            start = text.find("{", max(err.pos, start + 1))
            continue
        return text[start:end]
    raise ValueError("No valid JSON found in the text.")

## 4. Evaluation Functions

In [None]:
# Field-by-field value Accuracy
def compare_parties(pred, truth):
    """
    Compare two collections of 'party' entities using Jaccard similarity (set overlap).

    Parameters:
        pred (list, str or None): Parties predicted by the model.
        truth (list, str or None): Ground truth parties.
        A bare string is treated as a single party (not as a sequence of
        characters) and None is treated as no parties.

    Returns:
        float: A score between 0 and 1 representing similarity.
            - 1.0 if both sets match exactly (including both empty)
            - 0.0 if no overlap or exactly one side is empty
            - Partial score if there's partial overlap (e.g., 1 common out of 3 total = 0.33)
    """
    def as_party_set(parties):
        """Coerce a party value into a set of party names."""
        if parties is None:
            return set()
        if isinstance(parties, str):
            # set("Acme") would yield {'A', 'c', 'm', 'e'}; wrap the name instead.
            return {parties}
        return set(parties)

    pred_set = as_party_set(pred)
    truth_set = as_party_set(truth)
    if not pred_set and not truth_set:
        return 1.0
    if not pred_set or not truth_set:
        return 0.0
    intersection = pred_set & truth_set
    union = pred_set | truth_set
    return round(len(intersection) / len(union), 2)


def evaluate_metadata_fields(gpt_output, ground_truth):
    """
    Score the GPT output against the ground truth, one metadata field at a time.

    The four fields effective_date, jurisdiction, party and term are always
    reported, in that order. Either argument may be a dict or a JSON string.

    Returns:
        dict mapping each field name to:
        - None when the field is absent (or null) on both sides, i.e. no penalty
        - 0.0 when it is present on only one side
        - compare_parties(...) for 'party' when present on both sides
        - 1.0 / 0.0 for an exact match / mismatch on every other field
    """
    predicted = json.loads(gpt_output) if isinstance(gpt_output, str) else gpt_output
    expected = json.loads(ground_truth) if isinstance(ground_truth, str) else ground_truth

    scores = {}
    for name in ('effective_date', 'jurisdiction', 'party', 'term'):
        pred_value = predicted.get(name)
        true_value = expected.get(name)

        # Missing on both sides: nothing to compare.
        if pred_value is None and true_value is None:
            scores[name] = None
            continue

        # Missing on exactly one side counts as a miss.
        if pred_value is None or true_value is None:
            scores[name] = 0.0
            continue

        if name == "party":
            scores[name] = compare_parties(pred_value, true_value)
        else:
            scores[name] = 1.0 if pred_value == true_value else 0.0

    return scores


In [None]:
# Evaluate key_match and party_count
def evaluate_key_match_and_party_count(gpt_output, true_keys_str, true_party_count):
    """
    Evaluate the structural correctness of GPT output based on:
    - Whether all expected metadata fields (keys) are present
    - Whether the number of 'party' entities matches the ground truth

    Parameters:
        gpt_output: The extracted metadata from GPT (dict or JSON string)
        true_keys_str: A comma-separated string of expected keys (from df['keys']).
            Whitespace around each key is ignored and empty entries are dropped,
            so "party, term" and "party,term" are equivalent.
        true_party_count: The number of expected parties (from df['party_count'])

    Returns:
        A dictionary with:
        - key_match_score: Fraction of required keys that GPT included
          (0.0 when no keys are expected, which also avoids a division by zero)
        - party_count_score: 1 if correct number of parties, 0 otherwise
        - supporting details like predicted/expected keys and party counts
    """
    if isinstance(gpt_output, str):
        gpt_output = json.loads(gpt_output)

    # Strip each key so a "key, key" style list still matches the JSON keys.
    expected_keys = {key.strip() for key in true_keys_str.split(',') if key.strip()}
    gpt_keys = set(gpt_output.keys())

    matched_keys = expected_keys & gpt_keys
    if expected_keys:
        key_match_score = round(len(matched_keys) / len(expected_keys), 2)
    else:
        key_match_score = 0.0

    party_value = gpt_output.get('party')
    if party_value is None:
        # Key absent, or present with a JSON null.
        predicted_party_count = 0
    elif isinstance(party_value, str):
        # A bare string is one party; len() would count its characters.
        predicted_party_count = 1 if party_value.strip() else 0
    else:
        predicted_party_count = len(party_value)
    party_count_score = 1.0 if predicted_party_count == true_party_count else 0.0

    return {
        "key_match_score": key_match_score,
        "party_count_score": party_count_score,
        "expected_keys": expected_keys,
        "returned_keys": gpt_keys,
        "expected_party_count": true_party_count,
        "predicted_party_count": predicted_party_count
    }

In [None]:
def summarize_evaluation_across_samples(all_field_scores, all_structural_scores):
    """
    Print a per-sample report followed by averages over all samples.

    The two lists are walked in lockstep (extra entries in the longer one are
    ignored). Field scores of None are reported as "not present in either"
    and left out of every average; an average with no contributing values is
    shown as a placeholder instead of dividing by zero.
    """
    total_field_scores = {name: [] for name in ('effective_date', 'jurisdiction', 'party', 'term')}
    total_structural_scores = {'key_match': [], 'party_count': []}
    structural_labels = {'key_match': "Key Match", 'party_count': "Party Count"}

    def mean_or(values, placeholder):
        """Mean rounded to 3 decimals, or the placeholder for an empty list."""
        return round(sum(values) / len(values), 3) if values else placeholder

    print("\n\n================== 🧾 Evaluation Summary Across Samples ==================\n")

    paired = zip(all_field_scores, all_structural_scores)
    for sample_no, (field_scores, structural) in enumerate(paired, start=1):
        print(f"\n📄 Sample {sample_no}")

        # --- Field-Level Scores ---
        present_scores = []
        for field, score in field_scores.items():
            if score is not None:
                print(f"- {field}: {score}")
                total_field_scores[field].append(score)
                present_scores.append(score)
            else:
                print(f"- {field}: (Not present in either) ✅")

        avg_field_score = mean_or(present_scores, "N/A")

        # --- Structural Scores ---
        key_score = structural.get('key_match_score')
        party_score = structural.get('party_count_score')

        sample_structural = []
        for bucket, value in (('key_match', key_score), ('party_count', party_score)):
            if value is not None:
                total_structural_scores[bucket].append(value)
                sample_structural.append(value)

        print(f"- Key Match Score: {key_score}")
        print(f"- Party Count Score: {party_score}")

        avg_structural = mean_or(sample_structural, "N/A")

        print(f"🔹 Avg Field Score: {avg_field_score}")
        print(f"🔸 Combined Structural Score: {avg_structural}")

    # --- Aggregate Averages ---
    print("\n================== 📊 AVERAGES ==================\n")

    print("🔹 Field-Level Averages:")
    for field, scores in total_field_scores.items():
        print(f"- {field}: {mean_or(scores, '(No available comparisons)')}")

    print("\n🔸 Structural Averages:")
    for metric, scores in total_structural_scores.items():
        print(f"- {structural_labels[metric]} Score: {mean_or(scores, '(No available comparisons)')}")


In [None]:
def print_average_scores_only(all_field_scores, all_structural_scores):
    """
    Print only the average field-level and structural scores across all samples.
    Use this when you want a compact summary view.

    Args:
        all_field_scores (list of dict): Per-sample field scores; None values
            are excluded from the averages.
        all_structural_scores (list of dict): Per-sample structural results
            carrying 'key_match_score' and 'party_count_score'; missing or
            None values are excluded from the averages.

    Returns:
        None. The summary is written to stdout.
    """
    print("\n================== 📊 AVERAGE METRICS ==================\n")

    # --- Field-Level Averages ---
    field_values = {}
    for sample in all_field_scores:
        for field, score in sample.items():
            if score is not None:
                field_values.setdefault(field, []).append(score)

    print("🔹 Field-Level Averages:")
    for field in ['effective_date', 'jurisdiction', 'party', 'term']:
        values = field_values.get(field)
        if values:
            avg = round(sum(values) / len(values), 3)
            print(f"- {field}: {avg}")
        else:
            print(f"- {field}: (No available comparisons)")

    # --- Structural Averages ---
    # label -> key under which each per-sample result stores that score
    structural_metrics = {
        "Key Match": "key_match_score",
        "Party Count": "party_count_score",
    }

    print("\n🔸 Structural Averages:")
    for label, result_key in structural_metrics.items():
        # Skip None so one unscored sample cannot break sum().
        values = [
            result[result_key]
            for result in all_structural_scores
            if result.get(result_key) is not None
        ]
        if values:
            avg = round(sum(values) / len(values), 3)
            print(f"- {label} Score: {avg}")
        else:
            # Use the human-readable label here too, not the raw dict key.
            print(f"- {label} Score: (No data)")


## 5. Structured Prompts

In [None]:
# Define persona, instructions, and formatting rules
# These three strings are interpolated, in this order, at the top of every
# prompt built by the evaluation cells that follow.

# Role statement for the model.
persona = """
You are an expert in identifying and extracting metadata from NDA (Non-Disclosure Agreement) documents.
"""

# The four fields to extract, plus the rule that fields absent from the text
# must be left out of the output (the evaluation treats "absent on both
# sides" as no penalty).
instruction = """
Extract the following metadata:
- effective_date: When the agreement becomes legally binding
- jurisdiction: The governing legal territory
- party: Entities bound by the agreement (May have multiple values)
- term: Duration of the agreement
Return only the metadata fields that are explicitly present in the text. If a field like effective_date or term is missing, exclude it from the output JSON and do not guess or explain.
"""

# Value normalization rules. Scoring compares values by exact equality, so
# the model's output has to follow these conventions to earn credit.
data_format = """
Output must be a JSON object following these Rules:
- Replace all spaces and colons (:) in attribute values with underscores (_)
- Dates must be in 'YYYY-MM-DD' format
- Jurisdiction: Only the state or country name is included, without prefixes like "State of"
- Party names must follow Title_Case with underscores replacing spaces and corporate suffixes standardized (e.g., "Inc.", "Corp.", "LLC", "Ltd.")
- Duration (term) must be normalized to 'number_units' format (e.g., 2_years, 12_months)
"""




## 6. Zero-shot Prompting

In [None]:
# Zero-shot evaluation: no examples included in the prompt
# Row labels of `df` to evaluate; the two lists collect one entry per row
# whose response could be parsed.
example_indices = [ 8, 20, 55, 70 ]
all_field_scores = []
all_structural_scores = []

for x in example_indices:
    input_text = df['text'][x]

    # Zero-shot prompt
    prompt = f"""
{persona}

{instruction}

{data_format}

Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    print(f"\n\n======================= Text {x} =======================\n")
    model_output_raw = generate_response(prompt)

    try:
        # Extract clean JSON from GPT response
        extracted_json = extract_json_from_text(model_output_raw)

        # Display GPT vs Ground Truth
        format_and_print_json(extracted_json, df['extracted'][x])

        # Evaluation Method 1: Field-Level Accuracy
        print("\n📊 Field-Level Evaluation:")
        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])
        all_field_scores.append(field_scores)

        for field, score in field_scores.items():
            if score is None:
                print(f"{field}: (Not present in either) ✅")
            else:
                print(f"{field}: {score}")

        # Evaluation Method 2: Structural Correctness
        print("\n🔍 Structural Evaluation:")
        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )
        all_structural_scores.append(key_eval)

        print(f"Key Match Score: {key_eval['key_match_score']}")
        print(f"Party Count Score: {key_eval['party_count_score']}")
        print(f"Expected Keys: {key_eval['expected_keys']}")
        print(f"Returned Keys: {key_eval['returned_keys']}")
        print(f"Expected Party Count: {key_eval['expected_party_count']}")
        print(f"Predicted Party Count: {key_eval['predicted_party_count']}")

    # json.JSONDecodeError is a ValueError subclass, so this handles both
    # "no JSON found" from extraction and a parse failure inside the
    # evaluators. A row that fails here contributes nothing to the score
    # lists, so the summary averages cover parsed rows only.
    except ValueError as e:
        print("⚠️ GPT Output is not valid JSON\nRaw output:\n", model_output_raw)

# Final Summary for zero-shot
summarize_evaluation_across_samples(all_field_scores, all_structural_scores)




🔹 GPT Output:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_&_Struggles_Inc",
        "Second_Party"
    ],
    "term": "5_years"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_and_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: 1.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'jurisdiction', 'effective_date', 'term', 'party'}
Returned Keys: {'jurisdiction', 'effective_date', 'term', 'party'}
Expected Party Count: 2
Predicted Party Count: 2



🔹 GPT Output:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Limited"
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2018-04-20",
    "

## 7. Few-shot Prompting

In [None]:
# Few-shot examples
# Builds one text block from rows 0..2 of `df`: each example pairs the NDA
# text with its ground-truth metadata in the same "text = / output =" layout
# the final query uses.
examples = ""
for i in range(3):  # Adjust number of few-shot examples here
    text_sample = df['text'][i]
    metadata_sample = df['extracted'][i]
    examples += f"\n\nExample {i+1}\ntext = {text_sample}\noutput = {metadata_sample}"

# Main prompt test on selected samples
# The evaluated rows (8, 20, 55, 70) do not overlap the example rows (0-2).
example_indices = [ 8, 20, 55, 70 ]
all_field_scores = []
all_structural_scores = []

for x in example_indices:
    input_text = df['text'][x]

    prompt1 = f"""
{persona}

{instruction}

{data_format}

Here are some examples of NDA metadata extraction:{examples}

Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    print(f"\n\n======================= Text {x} =======================\n")
    model_output_raw = generate_response(prompt1)

    try:
        # Pull the JSON object out of the raw reply, then show it next to the
        # ground truth.
        extracted_json = extract_json_from_text(model_output_raw)

        format_and_print_json(extracted_json, df['extracted'][x])

        # Per-field accuracy
        print("\n📊 Field-Level Evaluation:")
        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])
        all_field_scores.append(field_scores)
        for field, score in field_scores.items():
            if score is None:
                print(f"{field}: (Not present in either) ✅")
            else:
                print(f"{field}: {score}")

        # Key coverage and party count
        print("\n🔍 Structural Evaluation:")
        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )
        all_structural_scores.append(key_eval)
        print(f"Key Match Score: {key_eval['key_match_score']}")
        print(f"Party Count Score: {key_eval['party_count_score']}")
        print(f"Expected Keys: {key_eval['expected_keys']}")
        print(f"Returned Keys: {key_eval['returned_keys']}")
        print(f"Expected Party Count: {key_eval['expected_party_count']}")
        print(f"Predicted Party Count: {key_eval['predicted_party_count']}")

    # Covers extraction failure and JSON parse failure (JSONDecodeError is a
    # ValueError subclass); failed rows are left out of the score lists.
    except ValueError as e:
        print("⚠️ GPT Output is not valid JSON\nRaw output:\n", model_output_raw)

# Final Summary
summarize_evaluation_across_samples(all_field_scores, all_structural_scores)





🔹 GPT Output:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_&_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_and_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.33
term: 1.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'jurisdiction', 'effective_date', 'term', 'party'}
Returned Keys: {'jurisdiction', 'effective_date', 'term', 'party'}
Expected Party Count: 2
Predicted Party Count: 2



🔹 GPT Output:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Limited"
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2018-04-20"

## 8. Chain-of-Thought Prompting

In [None]:
# Chain-of-thought evaluation: the prompt adds step-by-step extraction
# guidance; no worked examples are included
example_indices = [ 8, 20, 55, 70 ]
all_field_scores = []
all_structural_scores = []

for x in example_indices:
    input_text = df['text'][x]

    # Chain-of-thought prompt
    # NOTE(review): generate_response() defaults to max_tokens=300; if the
    # model writes its reasoning out before the JSON, the object could be cut
    # off — confirm whether a larger limit is needed for this cell.
    prompt = f"""
{persona}

{instruction}

{data_format}

Let’s extract the required metadata by thinking step-by-step.

1. First, read the NDA and look for the effective_date. This is typically found where the agreement says something like "effective as of", "made on", or "entered into on".
2. Next, identify the jurisdiction. This is usually mentioned in a clause like "governed by the laws of..." or "jurisdiction of".
3. Then, extract all the parties. These are the legal entities entering into the agreement. They often appear at the beginning or near the signature lines.
4. Finally, determine the term. Look for duration-related phrases like "for two years", "valid until", or "will terminate after...".


Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    print(f"\n\n======================= Text {x} =======================\n")
    model_output_raw = generate_response(prompt)

    try:
        # Extract clean JSON from GPT response
        extracted_json = extract_json_from_text(model_output_raw)

        # Display GPT vs Ground Truth
        format_and_print_json(extracted_json, df['extracted'][x])

        # Evaluation Method 1: Field-Level Accuracy
        print("\n📊 Field-Level Evaluation:")
        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])
        all_field_scores.append(field_scores)

        for field, score in field_scores.items():
            if score is None:
                print(f"{field}: (Not present in either) ✅")
            else:
                print(f"{field}: {score}")

        # Evaluation Method 2: Structural Correctness
        print("\n🔍 Structural Evaluation:")
        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )
        all_structural_scores.append(key_eval)

        print(f"Key Match Score: {key_eval['key_match_score']}")
        print(f"Party Count Score: {key_eval['party_count_score']}")
        print(f"Expected Keys: {key_eval['expected_keys']}")
        print(f"Returned Keys: {key_eval['returned_keys']}")
        print(f"Expected Party Count: {key_eval['expected_party_count']}")
        print(f"Predicted Party Count: {key_eval['predicted_party_count']}")

    # Covers extraction failure and JSON parse failure (JSONDecodeError is a
    # ValueError subclass); failed rows are left out of the score lists.
    except ValueError as e:
        print("⚠️ GPT Output is not valid JSON\nRaw output:\n", model_output_raw)

# Final Summary for chain-of-thought
summarize_evaluation_across_samples(all_field_scores, all_structural_scores)




🔹 GPT Output:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_&_Struggles_Inc",
        "Second_Party"
    ],
    "term": "5_years"
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2011-05-16",
    "jurisdiction": "Illinois",
    "party": [
        "Heidrick_and_Struggles_Inc.",
        "Richard_W._Pehlke"
    ],
    "term": "5_years"
}
```

📊 Field-Level Evaluation:
effective_date: 1.0
jurisdiction: 1.0
party: 0.0
term: 1.0

🔍 Structural Evaluation:
Key Match Score: 1.0
Party Count Score: 1.0
Expected Keys: {'jurisdiction', 'effective_date', 'term', 'party'}
Returned Keys: {'jurisdiction', 'effective_date', 'term', 'party'}
Expected Party Count: 2
Predicted Party Count: 2



🔹 GPT Output:
```json
{
    "effective_date": "2018-04-20",
    "jurisdiction": "Nevada",
    "party": [
        "Elaine_P._Wynn",
        "Wynn_Resorts_Limited"
    ]
}
```

🔸 Ground Truth:
```json
{
    "effective_date": "2018-04-20",
    "

## 9. Evaluation Using Entire Dataset (Few-shot Prompting)

In [None]:
# Prepare few-shot examples
# Rows 0..2 of `df` are embedded in every prompt together with their
# ground-truth metadata.
examples = ""
for i in range(3):  # Adjust number of few-shot examples here
    text_sample = df['text'][i]
    metadata_sample = df['extracted'][i]
    examples += f"\n\nExample {i+1}\ntext = {text_sample}\noutput = {metadata_sample}"

# Run across all rows in the DataFrame
# NOTE(review): df.index.tolist() includes rows 0-2, whose answers are shown
# to the model in `examples` above, so those three rows are scored on data
# the prompt already contains. Consider excluding them — confirm intent.
example_indices = df.index.tolist()
all_field_scores = []
all_structural_scores = []
# One record per row (parsed or not), written to CSV at the end.
results = []

for x in example_indices:
    input_text = df['text'][x]

    prompt1 = f"""
{persona}

{instruction}

{data_format}

Here are some examples of NDA metadata extraction:{examples}

Now extract metadata from the following NDA:
text = {input_text}
output =
"""

    # NOTE(review): an exception raised by the API call here is not caught,
    # so it would stop the loop before `results` is saved — confirm whether
    # that is acceptable for a full-dataset run.
    model_output_raw = generate_response(prompt1)

    try:
        extracted_json = extract_json_from_text(model_output_raw)

        field_scores = evaluate_metadata_fields(extracted_json, df['extracted'][x])
        all_field_scores.append(field_scores)

        key_eval = evaluate_key_match_and_party_count(
            extracted_json,
            df['keys'][x],
            df['party_count'][x]
        )
        all_structural_scores.append(key_eval)

        results.append({
            "index": x,
            "model_output": extracted_json,
            "ground_truth": df['extracted'][x],
            "model_keys": key_eval['returned_keys'],
            "true_keys": key_eval['expected_keys'],
            "model_party_count": key_eval['predicted_party_count'],
            "true_party_count": key_eval['expected_party_count']
        })

    # Unparseable responses are still recorded (with the raw model text and
    # "INVALID_JSON" markers) but add nothing to the score lists, so the
    # printed averages are computed over parsed rows only.
    except ValueError:
        results.append({
            "index": x,
            "model_output": model_output_raw,
            "ground_truth": df['extracted'][x],
            "model_keys": "INVALID_JSON",
            "true_keys": df['keys'][x],
            "model_party_count": "INVALID_JSON",
            "true_party_count": df['party_count'][x]
        })

# Final Summary
print_average_scores_only(all_field_scores, all_structural_scores)

# Save results to CSV
# Written relative to the current working directory.
results_df = pd.DataFrame(results)
results_df.to_csv("openai.csv", index=False)



🔹 Field-Level Averages:
- effective_date: 0.936
- jurisdiction: 0.99
- party: 0.497
- term: 0.387

🔸 Structural Averages:
- Key Match Score: 0.858
- Party Count Score: 0.875
