In [None]:
import json
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import re
from datetime import datetime

# Make sure the NLTK 'punkt' tokenizer data is present locally; when the
# lookup fails with LookupError, download it.
# NOTE(review): the code visible in this file tokenizes with str.split()
# rather than punkt -- confirm whether this download is still required.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

class MetricsCalculator:
    """Score a generated response against a reference answer.

    Provides four metrics:

    * BLEU (``calculate_bleu``) on lower-cased, whitespace-split tokens.
    * ROUGE-L F-measure (``calculate_rouge_l``).
    * Command accuracy (``calculate_command_accuracy``): whether the first
      shell command found in the candidate matches the first one found in
      the reference.
    * Plan quality (``score_plan_quality``): a 0-2 heuristic score.
    """

    # Tool names that count as a "command"; shared by command extraction and
    # plan-quality scoring so both always agree on the same list.
    _COMMAND_NAMES = (
        r'(?:git|tar|find|python|pip|head|top|ls|cd|cp|mv|rm|mkdir|chmod|grep|sed|awk)'
    )
    # A command name anywhere in the text.
    _COMMAND_RE = re.compile(r'\b' + _COMMAND_NAMES + r'\b')
    # A command name plus everything up to the end of its line.
    _COMMAND_LINE_RE = re.compile(r'\b' + _COMMAND_NAMES + r'\b[^\n]*')
    # Sequencing words, or a list number such as "1.". The number alternative
    # deliberately has no trailing \b: a word boundary after the dot would
    # require a word character to follow it, which rejects "1. do this".
    _STEP_RE = re.compile(r'\b(?:step|first|then|next|finally)\b|\b\d+\.')

    def __init__(self):
        """Create the ROUGE-L scorer (with stemming) and the BLEU smoothing function."""
        self.rouge_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        self.smoothing_function = SmoothingFunction().method1

    def calculate_bleu(self, reference, candidate):
        """Return the smoothed sentence-level BLEU score of ``candidate``.

        Both strings are lower-cased and split on whitespace before scoring.
        Returns 0.0 when either string is empty/None or when scoring fails.
        """
        if not reference or not candidate:
            return 0.0

        reference_tokens = reference.lower().split()
        candidate_tokens = candidate.lower().split()

        try:
            return sentence_bleu(
                [reference_tokens],
                candidate_tokens,
                smoothing_function=self.smoothing_function,
            )
        except Exception:
            # Best-effort metric: a scoring failure counts as "no overlap".
            # Exception (not a bare except) so Ctrl-C / SystemExit still work.
            return 0.0

    def calculate_rouge_l(self, reference, candidate):
        """Return the ROUGE-L F-measure of ``candidate`` against ``reference``.

        Returns 0.0 when either string is empty/None or when scoring fails.
        """
        if not reference or not candidate:
            return 0.0

        try:
            scores = self.rouge_scorer.score(reference, candidate)
            return scores['rougeL'].fmeasure
        except Exception:
            # Best-effort metric, same policy as calculate_bleu.
            return 0.0

    def calculate_command_accuracy(self, reference, candidate):
        """Return 1.0 if the first command in each text matches, else 0.0.

        Only the first extracted command of each text is compared, and a
        match means one command line is a substring of the other.

        Returns 0.0 when either text is empty/None, 1.0 when neither text
        contains a command, and 0.0 when only one of them does.
        """
        if not reference or not candidate:
            return 0.0

        ref_commands = self.extract_commands(reference)
        cand_commands = self.extract_commands(candidate)

        if not ref_commands and not cand_commands:
            return 1.0
        if not ref_commands or not cand_commands:
            return 0.0

        # extract_commands already lower-cases its output.
        ref_main = ref_commands[0]
        cand_main = cand_commands[0]

        if ref_main in cand_main or cand_main in ref_main:
            return 1.0
        return 0.0

    def extract_commands(self, text):
        """Return every command line found in ``text``, lower-cased.

        Each item starts at a recognised command name and runs to the end of
        that line.
        """
        return self._COMMAND_LINE_RE.findall(text.lower())

    @staticmethod
    def _relevant_keywords(prompt_lower):
        """Return the keywords expected in a response to the given prompt.

        The first matching rule wins; an unrecognised prompt yields ``[]``.
        """
        if "git" in prompt_lower:
            return ["git", "branch", "checkout"]
        if "compress" in prompt_lower or "tar" in prompt_lower:
            return ["tar", "compress", "gz"]
        if "python" in prompt_lower and "files" in prompt_lower:
            return ["find", "python", "*.py"]
        if "virtual environment" in prompt_lower:
            return ["venv", "pip", "install"]
        if "lines" in prompt_lower and "file" in prompt_lower:
            return ["head", "lines"]
        if "find" in prompt_lower and "replace" in prompt_lower:
            return ["sed", "find", "replace"]
        if "monitor" in prompt_lower and "process" in prompt_lower:
            return ["top", "ps", "monitor"]
        return []

    def score_plan_quality(self, prompt, response):
        """Return a heuristic 0-2 quality score for ``response``.

        * 2 -- contains a command, a prompt-relevant keyword and a step
          marker (a sequencing word or a list number such as ``1.``).
        * 1 -- contains a command or a prompt-relevant keyword.
        * 0 -- contains neither, or ``response`` is empty/None.
        """
        if not response:
            return 0

        response_lower = response.lower()

        has_command = bool(self._COMMAND_RE.search(response_lower))
        has_steps = bool(self._STEP_RE.search(response_lower))

        keywords = self._relevant_keywords(prompt.lower())
        has_relevant = any(keyword in response_lower for keyword in keywords)

        if has_command and has_relevant:
            return 2 if has_steps else 1
        if has_command or has_relevant:
            return 1
        return 0

# Metric names, in the order they appear in every metrics dict.
_METRIC_KEYS = ('bleu_score', 'rouge_l_score', 'command_accuracy', 'plan_quality')


def _score_response(metrics_calc, prompt, reference, response):
    """Compute all four metrics for a single response."""
    return {
        'bleu_score': metrics_calc.calculate_bleu(reference, response),
        'rouge_l_score': metrics_calc.calculate_rouge_l(reference, response),
        'command_accuracy': metrics_calc.calculate_command_accuracy(reference, response),
        'plan_quality': metrics_calc.score_plan_quality(prompt, response),
    }


def _average_metrics(results, metrics_field):
    """Average each metric stored under ``metrics_field``; all zeros if ``results`` is empty."""
    count = len(results)
    if not count:
        return dict.fromkeys(_METRIC_KEYS, 0.0)
    return {
        key: sum(entry[metrics_field][key] for entry in results) / count
        for key in _METRIC_KEYS
    }


def compare_models(base_data, finetuned_data):
    """Score base and fine-tuned responses for every prompt both runs share.

    Args:
        base_data: dict with a ``'detailed_results'`` list; each item is read
            for ``'prompt'``, ``'prompt_id'``, ``'reference_answer'`` and
            ``'generated_response'``.
        finetuned_data: dict with a ``'detailed_results'`` list; each item is
            read for ``'prompt'`` and ``'generated_response'``.

    Returns:
        A tuple ``(comparison_results, base_avg_metrics, finetuned_avg_metrics)``.
        ``comparison_results`` holds one dict per shared prompt, in the order
        the prompts appear in ``base_data``. The two averages map each metric
        name to its mean over those results. When the runs share no prompt,
        the list is empty and every average is 0.0 (previously this case
        raised ZeroDivisionError).

    Results are keyed by prompt text, so if a prompt occurs more than once in
    a run only its last occurrence is used. The reference answer and prompt
    id are always taken from the base run.
    """
    base_results = {r['prompt']: r for r in base_data['detailed_results']}
    finetuned_results = {r['prompt']: r for r in finetuned_data['detailed_results']}

    shared_prompts = [prompt for prompt in base_results if prompt in finetuned_results]
    if not shared_prompts:
        # Nothing to score, so skip building the scorers altogether.
        return (
            [],
            _average_metrics([], 'base_metrics'),
            _average_metrics([], 'finetuned_metrics'),
        )

    metrics_calc = MetricsCalculator()
    comparison_results = []

    for prompt in shared_prompts:
        base_result = base_results[prompt]
        finetuned_result = finetuned_results[prompt]
        reference = base_result['reference_answer']

        base_response = base_result['generated_response']
        finetuned_response = finetuned_result['generated_response']

        base_metrics = _score_response(metrics_calc, prompt, reference, base_response)
        finetuned_metrics = _score_response(
            metrics_calc, prompt, reference, finetuned_response
        )

        comparison_results.append({
            'prompt_id': base_result['prompt_id'],
            'prompt': prompt,
            'reference_answer': reference,
            'base_response': base_response,
            'finetuned_response': finetuned_response,
            'base_metrics': base_metrics,
            'finetuned_metrics': finetuned_metrics,
            'timestamp': datetime.now().isoformat(),
        })

    base_avg_metrics = _average_metrics(comparison_results, 'base_metrics')
    finetuned_avg_metrics = _average_metrics(comparison_results, 'finetuned_metrics')

    return comparison_results, base_avg_metrics, finetuned_avg_metrics

def generate_markdown_report(comparison_results, base_avg_metrics, finetuned_avg_metrics):
    """Render the comparison as a Markdown document and return it as a string.

    The document has a title, the current local date/time, a fixed overview
    paragraph, a table of average metrics, and one section per entry of
    ``comparison_results`` listing each model's response and scores.

    Args:
        comparison_results: list of per-prompt dicts providing ``prompt_id``,
            ``prompt``, ``reference_answer``, ``base_response``,
            ``finetuned_response``, ``base_metrics`` and ``finetuned_metrics``.
        base_avg_metrics: metric name -> average value for the base model.
        finetuned_avg_metrics: metric name -> average value for the
            fine-tuned model.
    """
    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    parts = [
        "# Model Comparison Report\n\n",
        f"**Comparison Date**: {generated_at}\n\n",
        "## Overview\n\n",
        "This report compares the performance of the base `microsoft/phi-2` model and the fine-tuned `microsoft/phi-2 (fine-tuned with LoRA)` model on seven test prompts, including five standard prompts and two edge cases. Metrics include BLEU, ROUGE-L, command accuracy, and plan quality (scored 0-2).\n\n",
        "## Average Metrics\n\n",
        "| Metric | Base Model | Fine-Tuned Model |\n",
        "|--------|------------|------------------|\n",
    ]

    # (table label, metric key) for each row of the averages table.
    summary_rows = (
        ("BLEU Score", 'bleu_score'),
        ("ROUGE-L Score", 'rouge_l_score'),
        ("Command Accuracy", 'command_accuracy'),
        ("Plan Quality", 'plan_quality'),
    )
    for label, key in summary_rows:
        parts.append(
            f"| {label} | {base_avg_metrics[key]:.3f} | {finetuned_avg_metrics[key]:.3f} |\n"
        )
    # Blank line that closes the table.
    parts.append("\n")

    parts.append("## Detailed Comparison\n\n")

    # (section heading, response key, metrics key) for each model.
    model_sections = (
        ("Base Model", 'base_response', 'base_metrics'),
        ("Fine-Tuned Model", 'finetuned_response', 'finetuned_metrics'),
    )
    for entry in comparison_results:
        parts.append(f"### Prompt {entry['prompt_id']}: {entry['prompt']}\n\n")
        parts.append(f"**Reference Answer**: `{entry['reference_answer']}`\n\n")
        for heading, response_key, metrics_key in model_sections:
            response = entry[response_key]
            scores = entry[metrics_key]
            parts.extend((
                f"#### {heading}\n",
                f"- **Response**: `{response}`\n",
                f"- **BLEU Score**: {scores['bleu_score']:.3f}\n",
                f"- **ROUGE-L Score**: {scores['rouge_l_score']:.3f}\n",
                f"- **Command Accuracy**: {scores['command_accuracy']:.3f}\n",
                f"- **Plan Quality**: {scores['plan_quality']}/2\n\n",
            ))

    return "".join(parts)

def main():
    """Load both evaluation result files, compare them, and write the report.

    Reads the base and fine-tuned evaluation results as JSON, scores them
    with ``compare_models``, renders the result with
    ``generate_markdown_report`` and writes it to ``model_comparison.md`` in
    the current working directory.

    All files are opened as UTF-8 so that non-ASCII characters in model
    responses are read and written the same way on every platform instead of
    depending on the locale's default encoding.
    """
    # NOTE(review): this path ends in .md but its content is parsed as JSON --
    # confirm the file name is intentional.
    with open("./phi2Base/eval_static.md", "r", encoding="utf-8") as f:
        base_data = json.load(f)
    with open(
        "./phi2fineTuned/logs/phi2_finetuned_evaluation.json", "r", encoding="utf-8"
    ) as f:
        finetuned_data = json.load(f)

    comparison_results, base_avg_metrics, finetuned_avg_metrics = compare_models(
        base_data, finetuned_data
    )
    markdown_content = generate_markdown_report(
        comparison_results, base_avg_metrics, finetuned_avg_metrics
    )
    with open("model_comparison.md", "w", encoding="utf-8") as f:
        f.write(markdown_content)

    print("Comparison report saved to model_comparison.md")

# Run the comparison only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Comparison report saved to model_comparison.md
