# In [None]:
import os
import json
import re
import time
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from openai import AzureOpenAI  # Azure OpenAI client
from tqdm import tqdm  # For progress bars
import argparse  # For command-line arguments


# Download NLTK tokenizer data if not present. Newer NLTK releases load the
# 'punkt_tab' resource inside word_tokenize while older ones load 'punkt';
# fetching both keeps tokenization working on either version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Initialize Azure OpenAI client
def initialize_azure_openai_client():
    """Create and return an Azure OpenAI client.

    The endpoint and API key are taken from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_API_KEY`` environment variables when they are set and
    non-empty, so real credentials do not have to be written into this file.
    When a variable is missing, the previous hard-coded value is used.

    Returns:
        The constructed ``AzureOpenAI`` client (API version ``2024-02-01``).
    """
    # `or` (not a .get default) so that an empty variable also falls back.
    endpoint = (
        os.environ.get("AZURE_OPENAI_ENDPOINT")
        or "https://nora-south-india.openai.azure.com/"
    )
    api_key = os.environ.get("AZURE_OPENAI_API_KEY") or "API_KEY"

    client = AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=api_key,
        api_version="2024-02-01",
    )
    print("Azure OpenAI Client Initialized")
    return client

# Normalize text for fair comparison
def normalize_text(text):
    """Return ``text`` as a lowercase string without surrounding whitespace.

    Any falsy input (``None``, ``""``, ``0``, an empty container) yields the
    empty string; everything else is converted with ``str`` first.
    """
    return str(text).lower().strip() if text else ""

# Extract JSON content from generated text
def extract_json_from_text(text):
    """Return the last JSON object embedded in ``text``.

    The text is scanned left to right; at every ``{`` a real JSON parse is
    attempted with ``json.JSONDecoder.raw_decode``. This handles nested
    objects, several objects in one text and braces inside string values,
    none of which a brace-matching regular expression can do reliably.

    Args:
        text: Arbitrary text that may contain JSON objects (for example a
            model response). Non-string or empty input is accepted.

    Returns:
        dict: The last top-level JSON object that parses successfully, or
        ``{}`` when there is none.
    """
    if not isinstance(text, str) or not text:
        return {}

    decoder = json.JSONDecoder()
    last_object = {}
    position = text.find('{')
    while position != -1:
        try:
            parsed, end = decoder.raw_decode(text, position)
        except json.JSONDecodeError:
            # Not valid JSON starting here; try the next opening brace.
            position = text.find('{', position + 1)
            continue
        last_object = parsed
        # Resume after the parsed span so nested objects are not re-reported.
        position = text.find('{', end)
    return last_object

# Calculate BLEU and ROUGE scores
def calculate_metrics(generated_text, reference_text):
    """Compute BLEU-1, BLEU-4 and ROUGE-1/2/L F-measures for one text pair.

    Both texts are passed through ``normalize_text`` before scoring. BLEU is
    computed on NLTK word tokens with ``SmoothingFunction().method1``; ROUGE
    uses a stemming ``RougeScorer`` with the reference as the target.

    Returns:
        dict: keys ``bleu1``, ``bleu4``, ``rouge1``, ``rouge2``, ``rougeL``.
        If anything raises, the error is printed and every score is ``0.0``.
    """
    rouge_names = ['rouge1', 'rouge2', 'rougeL']
    bleu_weights = (
        ('bleu1', (1, 0, 0, 0)),
        ('bleu4', (0.25, 0.25, 0.25, 0.25)),
    )
    try:
        candidate = normalize_text(generated_text)
        target = normalize_text(reference_text)

        # ROUGE: score(target, prediction)
        rouge = rouge_scorer.RougeScorer(rouge_names, use_stemmer=True).score(target, candidate)

        # BLEU expects a list of tokenized references and one tokenized hypothesis
        target_tokens = [nltk.word_tokenize(target)]
        candidate_tokens = nltk.word_tokenize(candidate)
        smoothing = SmoothingFunction().method1

        scores = {}
        for label, weights in bleu_weights:
            scores[label] = sentence_bleu(
                target_tokens,
                candidate_tokens,
                weights=weights,
                smoothing_function=smoothing,
            )
        for label in rouge_names:
            scores[label] = rouge[label].fmeasure
        return scores
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return dict.fromkeys(('bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL'), 0.0)


# Calculate exact match rate for JSON objects
def calculate_exact_match_rate(generated_json, reference_json):
    """Score the generated ``pointsMissed`` list against the reference list.

    Points are compared after ``normalize_text``. Matching is done on
    multisets, so a point repeated in the generated list is only credited as
    often as it occurs in the reference; this keeps precision and recall
    within ``[0, 1]``.

    Args:
        generated_json: Parsed model output, expected to be a dict with a
            ``pointsMissed`` list.
        reference_json: Parsed reference output of the same shape.

    Returns:
        dict: ``{'precision': float, 'recall': float, 'f1': float}``. The
        same dict shape (all zeros) is returned when either input is empty,
        when either ``pointsMissed`` list is missing or empty, or when an
        error occurs, so callers can always iterate over the keys.
    """
    from collections import Counter

    zero_scores = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    try:
        if not generated_json or not reference_json:
            return zero_scores

        gen_points = generated_json.get('pointsMissed', [])
        ref_points = reference_json.get('pointsMissed', [])
        if not gen_points or not ref_points:
            return zero_scores

        gen_counts = Counter(normalize_text(p) for p in gen_points)
        ref_counts = Counter(normalize_text(p) for p in ref_points)

        # Multiset intersection: each reference point can be matched once.
        matches = sum((gen_counts & ref_counts).values())

        precision = matches / len(gen_points)
        recall = matches / len(ref_points)
        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    except Exception as e:
        # Best-effort scoring: a malformed example must not abort the run.
        print(f"Error calculating exact match: {e}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

# Per-example score accumulators: one independent list per metric name
total_metrics = {
    name: []
    for name in (
        'bleu1', 'bleu4',
        'rouge1', 'rouge2', 'rougeL',
        'precision', 'recall', 'f1',
    )
}

# Generate a response using Azure OpenAI GPT-4
def generate_with_azure_openai(client, prompt, max_retries=3, *,
                               model="gpt-4o-mini", retry_delay=2):
    """Request a chat completion for ``prompt``, retrying on failure.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (an
            ``AzureOpenAI`` client).
        prompt: Text sent as the single user message.
        max_retries: Maximum number of attempts.
        model: Azure deployment name to call.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        str: The stripped content of the first choice, or ``""`` when every
        attempt raised or returned an empty response.
    """
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=300,
                top_p=0,
                frequency_penalty=0,
                presence_penalty=0,
            )
            if response.choices and response.choices[0].message.content:
                return response.choices[0].message.content.strip()
            print(f"Attempt {attempt + 1}: Empty response from Azure OpenAI.")
        except Exception as e:
            # Service boundary: any client/network error counts as a failed
            # attempt and is retried rather than propagated.
            print(f"Attempt {attempt + 1}: Error generating with Azure OpenAI: {e}")
        # Wait only when another attempt follows; no pointless sleep at the end.
        if attempt + 1 < max_retries:
            time.sleep(retry_delay)
    return ""

# Load the evaluation examples from the JSON file
with open('new_updated_examples.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Initialize the Azure OpenAI client. API_KEY is not defined in this cell: use
# it when an earlier cell defined it, otherwise fall back to the
# AZURE_OPENAI_API_KEY environment variable instead of failing with NameError.
client = AzureOpenAI(
    azure_endpoint="https://nora-south-india.openai.azure.com/",
    api_key=globals().get("API_KEY") or os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01"
)
print("Azure OpenAI Client Initialized")

# Instruction preamble prepended to every example. It is identical for all
# examples, so it is built once instead of on every loop iteration.
mes = '''
Ultra-Strict Expert Evaluation Rules:youare gove a answer to be covered json list blocks you have tand the answrr cointains the points related to points to be covered you are suppoed to select which points list blocks is missing you have to select only one list block ss the aswer and nothing feom other list blocks and by any case do not mix the asnwers 
don't do like this -
reference_output -->{"pointsMissed": ["All is well", "Ending the conversation"]}
Generated outut --> Points missed: [['All is well', 'Informal response', 'Ending the conversation']]
this is wrong the generated output should be  ["All is well", "Ending the conversation"] nothing else even if u get the answers wrong its fine but don't mixup the solution  
'''

for example in data:
    try:
        # conversations[1] holds the prompt content, conversations[2] the
        # reference answer.
        turns = example['conversations']
        ans = mes + turns[1]['content']
        reference_output = turns[2]['content']
        print('\nEvaluating example:')
        generated_text = generate_with_azure_openai(client, ans)
        reference_json = extract_json_from_text(reference_output)
        generated_json = extract_json_from_text(generated_text)
        print(f'reference_json: {reference_json}')
        print(f'Generated_json: {generated_json}')

        # Calculate text-level metrics
        text_metrics = calculate_metrics(generated_text, reference_output)
        print("Text Metrics:", text_metrics)

        # Calculate JSON-level metrics; tolerate a non-dict result so the
        # example is still counted with zero JSON scores.
        json_metrics = calculate_exact_match_rate(generated_json, reference_json)
        if not isinstance(json_metrics, dict):
            json_metrics = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        print("JSON Metrics:", json_metrics)

        # Accumulate only after both metric sets exist, so every list in
        # total_metrics keeps the same length.
        for key, value in {**text_metrics, **json_metrics}.items():
            total_metrics[key].append(value)

    except Exception as e:
        # Best-effort evaluation: report the bad example and move on.
        print(f"Error processing example: {e}")
        continue

# Calculate and display average metrics
print("\n===== EVALUATION RESULTS =====")
print("\nAverage Metrics:")
for key, values in total_metrics.items():
    if values:
        print(f"{key}: {np.mean(values):.4f}")

print("\nNumber of examples evaluated:", len(total_metrics['bleu1']))