
## Evaluation

In [None]:
%%capture
!pip install unsloth
!pip install rouge-score nltk
from unsloth.chat_templates import get_chat_template


### Test data

In [None]:

# `json` is imported here because this cell runs before the cell that holds
# the notebook's main import block.
import json

# NOTE(review): the file name says "training_data" although this section is
# titled "Test data" — confirm this is the intended evaluation split.
path = '/kaggle/input/train-and-test-dataset/formatted_training_data_v2.json'

# Load the evaluation examples; the context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open(path, encoding='utf-8') as data_file:
    data = json.load(data_file)

# Dump every example, followed by blank lines, for a quick visual check.
for i in data:
    print(i)
    print('\n')

In [None]:
# Load the fine-tuned checkpoint that the evaluation loop below generates with.
# NOTE(review): unsloth is imported here, before the cell that imports
# transformers; unsloth's documentation asks for that order — confirm before
# reordering cells.
from unsloth import FastLanguageModel
import torch 
max_seq_length = 2048  # forwarded to from_pretrained as the sequence-length setting
dtype = None  # presumably lets unsloth choose the dtype automatically — TODO confirm
load_in_4bit = True  # presumably loads the weights 4-bit quantized — TODO confirm

# from_pretrained returns the model together with its tokenizer; both are
# used by the evaluation loop further down.
model,tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/kaggle/input/qwen_2.5_v2/pytorch/default/1",  # checkpoint directory under /kaggle/input
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)



In [None]:
### FastLanguageModel.for_inference(model) # Enable native 2x faster inference
from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer
import json
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
import re

# Make sure we have NLTK data.
# Recent NLTK releases load the tokenizer used by nltk.word_tokenize from the
# 'punkt_tab' resource rather than the older 'punkt' package, so fetch both.
# Without this, word_tokenize raises LookupError on newer versions, and the
# metric helpers below swallow that error and report all-zero scores.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

# Better normalization function
def normalize_text(text):
    """Return a lowercased, whitespace-collapsed, Porter-stemmed form of ``text``.

    Any falsy input (``None``, ``""``, ``0``, an empty container, ...) yields
    an empty string. Everything else is converted with ``str``, lowercased,
    stripped, has runs of whitespace collapsed to one space, is split with
    ``nltk.word_tokenize`` and has every token stemmed; the stemmed tokens
    are joined back together with single spaces.
    """
    if not text:
        return ""

    # Lowercase, trim, then squeeze whitespace runs down to a single space.
    collapsed = re.sub(r'\s+', ' ', str(text).lower().strip())

    # Tokenize first, then stem each token so inflected forms compare equal.
    tokens = nltk.word_tokenize(collapsed)
    porter = nltk.stem.PorterStemmer()
    return ' '.join(porter.stem(token) for token in tokens)

# Sentence-embedding models already loaded by calculate_json_match, keyed by
# model name, so the weights are loaded once instead of on every call.
_SENTENCE_ENCODER_CACHE = {}


def _get_sentence_encoder(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    encoder = _SENTENCE_ENCODER_CACHE.get(model_name)
    if encoder is None:
        # Imported lazily so a missing package only affects semantic matching.
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer(model_name)
        _SENTENCE_ENCODER_CACHE[model_name] = encoder
    return encoder


# Improved JSON matching with semantic similarity
def calculate_json_match(generated_json, reference_json, similarity_threshold=0.85):
    """Score generated ``pointsMissed`` against the reference by embedding similarity.

    Args:
        generated_json: Mapping parsed from the model output; its
            ``'pointsMissed'`` entry is read (missing key counts as empty).
        reference_json: Mapping parsed from the reference output, read the
            same way.
        similarity_threshold: Minimum cosine similarity for a generated point
            and a reference point to count as matching.

    Returns:
        A dict with ``'precision'``, ``'recall'`` and ``'f1'``. Precision is
        the share of generated points that match at least one reference
        point; recall is the share of reference points matched by at least
        one generated point, so both stay within [0, 1]. All three are 0.0
        when either list is empty or when anything goes wrong (the error is
        printed, not raised).
    """
    try:
        gen_points = generated_json.get('pointsMissed', [])
        ref_points = reference_json.get('pointsMissed', [])

        if not gen_points or not ref_points:
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        from sklearn.metrics.pairwise import cosine_similarity

        # Rows index generated points, columns index reference points.
        encoder = _get_sentence_encoder()
        similarity_matrix = cosine_similarity(
            encoder.encode(gen_points),
            encoder.encode(ref_points),
        )
        is_match = similarity_matrix >= similarity_threshold

        # Count each side separately: reusing the generated-side count for
        # recall would let several generated points that hit the same
        # reference push recall above 1.0.
        matched_generated = int(is_match.any(axis=1).sum())
        matched_reference = int(is_match.any(axis=0).sum())

        precision = matched_generated / len(gen_points)
        recall = matched_reference / len(ref_points)
        total = precision + recall
        f1 = 2 * (precision * recall) / total if total > 0 else 0.0

        return {'precision': precision, 'recall': recall, 'f1': f1}
    except Exception as e:
        # Best-effort metric: report the problem and score the example as zero.
        print(f"Error in semantic matching: {e}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        
def extract_json_from_text(text):
    """Return the last top-level JSON object embedded in ``text``.

    The text is scanned left to right. At every ``{`` a real JSON parse is
    attempted with ``json.JSONDecoder.raw_decode``, which copes with any
    nesting depth and with braces inside string values (a brace-counting
    regex does neither). When a parse succeeds the scan resumes after that
    object, so nested objects are never reported on their own; when it fails
    the scan moves on to the next ``{``. Surrounding text such as Markdown
    code fences is simply skipped.

    Args:
        text: Text that may contain one or more JSON objects.

    Returns:
        The last object that parsed, as a dict, or ``{}`` when ``text`` is
        not a string or contains no parseable object.
    """
    print("TEXT", text)
    if not isinstance(text, str):
        return {}

    decoder = json.JSONDecoder()
    last_object = {}
    position = text.find('{')
    while position != -1:
        try:
            candidate, end = decoder.raw_decode(text, position)
        except (json.JSONDecodeError, RecursionError):
            # Not valid JSON starting at this brace; try the next one.
            position = text.find('{', position + 1)
            continue
        last_object = candidate
        # Resume after the parsed object so its inner objects are skipped.
        position = text.find('{', end)
    return last_object

def calculate_metrics(generated_text, reference_text):
    """Score ``generated_text`` against ``reference_text`` with BLEU and ROUGE.

    Both texts are passed through ``normalize_text`` first. ROUGE-1, ROUGE-2
    and ROUGE-L F-measures come from ``rouge_scorer``; BLEU-1 and BLEU-4 come
    from ``sentence_bleu`` with ``method1`` smoothing on the NLTK tokens of
    the normalized texts.

    Returns:
        A dict with keys ``'bleu1'``, ``'bleu4'``, ``'rouge1'``, ``'rouge2'``
        and ``'rougeL'``. If anything raises, the error is printed and every
        score is 0.0.
    """
    rouge_names = ('rouge1', 'rouge2', 'rougeL')
    try:
        candidate = normalize_text(generated_text)
        reference = normalize_text(reference_text)

        # ROUGE: the scorer takes (target, prediction) in that order.
        rouge = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
        ).score(reference, candidate)

        # BLEU: sentence_bleu expects a list of reference token lists.
        reference_token_lists = [nltk.word_tokenize(reference)]
        candidate_tokens = nltk.word_tokenize(candidate)
        smoothing = SmoothingFunction().method1

        scores = {}
        for bleu_name, weights in (
            ('bleu1', (1, 0, 0, 0)),
            ('bleu4', (0.25, 0.25, 0.25, 0.25)),
        ):
            scores[bleu_name] = sentence_bleu(
                reference_token_lists,
                candidate_tokens,
                weights=weights,
                smoothing_function=smoothing,
            )
        for rouge_name in rouge_names:
            scores[rouge_name] = rouge[rouge_name].fmeasure
        return scores
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return dict.fromkeys(('bleu1', 'bleu4') + rouge_names, 0.0)

def calculate_exact_match_rate(generated_json, reference_json):
    """Compare ``pointsMissed`` lists by exact match after normalization.

    Args:
        generated_json: Mapping parsed from the model output; its
            ``'pointsMissed'`` entry is read (missing key counts as empty).
        reference_json: Mapping parsed from the reference output, read the
            same way.

    Returns:
        A dict with ``'precision'``, ``'recall'`` and ``'f1'``. Precision is
        the share of generated points whose normalized text also appears in
        the reference; recall is the share of reference points whose
        normalized text also appears in the generated list, so both stay
        within [0, 1] even when a point is repeated. All three are 0.0 when
        either input or either list is empty, or when an error occurs (the
        error is printed, not raised).
    """
    try:
        if not generated_json or not reference_json:
            print('Not generated')
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        # Extract missing points arrays
        gen_points = generated_json.get('pointsMissed', [])
        ref_points = reference_json.get('pointsMissed', [])

        if not gen_points or not ref_points:
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        # Normalize points so casing, spacing and inflection do not matter.
        gen_points = [normalize_text(p) for p in gen_points]
        ref_points = [normalize_text(p) for p in ref_points]
        print(gen_points)
        print(ref_points)

        # Sets give O(1) membership tests.
        gen_lookup = set(gen_points)
        ref_lookup = set(ref_points)

        # Count each side separately: using the generated-side count for
        # recall would let repeated generated points push recall above 1.0.
        matched_generated = sum(1 for p in gen_points if p in ref_lookup)
        matched_reference = sum(1 for p in ref_points if p in gen_lookup)

        precision = matched_generated / len(gen_points)
        recall = matched_reference / len(ref_points)
        total = precision + recall
        f1 = 2 * (precision * recall) / total if total > 0 else 0.0

        return {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    except Exception as e:
        # Best-effort metric: report the problem and score the example as zero.
        print(f"Error calculating exact match: {e}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

# Per-example scores gathered during evaluation: one independent list per
# metric name, in the order the averages are reported.
total_metrics = {
    metric_name: []
    for metric_name in (
        'bleu1', 'bleu4',
        'rouge1', 'rouge2', 'rougeL',
        'precision', 'recall', 'f1',
    )
}

# Run evaluation for each example.
# The instruction prefix is identical for every example, so build it once.
mes = 'Act like an expert evaluator, where given an answer and a list of points to be covered in the answer you give a json of missing points'

for example in data:
    try:
        # Prepare the prompt: conversations[1] supplies the text appended to
        # the instruction, conversations[2] the reference answer.
        message = mes + example['conversations'][1]['content']
        reference_output = example['conversations'][2]['content']

        print('\nEvaluating example:')
        print('Reference output:', reference_output)

        # Tokenize input and move to GPU
        inputs = tokenizer(message, return_tensors="pt").to("cuda")

        # A fresh streamer per example keeps its internal print state clean
        # even if the previous generation was interrupted by an error.
        text_streamer = TextStreamer(tokenizer, skip_prompt=True)

        # Generate response.
        # NOTE(review): temperature / min_p / top_k only take effect when
        # sampling is enabled (do_sample, here or in the model's generation
        # config) — confirm the intended decoding mode.
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            streamer=text_streamer,  # This streams output while generating
            max_new_tokens=300,
            use_cache=True,
            temperature=0.2,
            min_p=0.2,
            top_k=0
        )

        # For a decoder-only model the returned sequence starts with the
        # prompt tokens. Drop them so the metrics and the JSON extraction see
        # only what the model generated, not the prompt itself.
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = tokenizer.decode(
            output_ids[0][prompt_length:], skip_special_tokens=True
        )

        # Extract JSON from both texts
        reference_json = extract_json_from_text(reference_output)
        generated_json = extract_json_from_text(generated_text)
        print("Reference JSON:", reference_json)
        print("Generated JSON:", generated_json)

        # Calculate text-level metrics
        text_metrics = calculate_metrics(generated_text, reference_output)
        print("Text Metrics:", text_metrics)

        # Calculate JSON-level metrics
        json_metrics = calculate_exact_match_rate(generated_json, reference_json)
        print("JSON Metrics:", json_metrics)

        # Accumulate only after both metric sets exist, so every list in
        # total_metrics keeps the same length.
        for key, value in text_metrics.items():
            total_metrics[key].append(value)
        for key, value in json_metrics.items():
            total_metrics[key].append(value)

    except Exception as e:
        # Best-effort evaluation: report the failure and move on to the
        # next example instead of aborting the whole run.
        print(f"Error processing example: {e}")
        continue

# Report the mean of every metric that collected at least one score.
print("\n===== EVALUATION RESULTS =====")
print("\nAverage Metrics:")
for metric_name, scores in total_metrics.items():
    if not scores:
        # Nothing was recorded for this metric; skip it rather than average
        # an empty list.
        continue
    print(f"{metric_name}: {np.mean(scores):.4f}")

# One BLEU-1 score is appended per successfully evaluated example, so its
# list length is the number of evaluated examples.
print("\nNumber of examples evaluated:", len(total_metrics['bleu1']))