In [None]:
import os
import random
import time
import pandas as pd
from ollama import Client
import json
import logging
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from pathlib import Path
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Configure root logging once at import time: INFO level, records formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the classes and functions in this file.
logger = logging.getLogger(__name__)

# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------

@dataclass
class Config:
    """Run-time settings for the requirements-classification experiments.

    Attributes:
        csv_path: Path of the input CSV file.
        model_name: Ollama model tag passed to the generation call.
        valid_categories: Allowed category labels. ``None`` (the default)
            selects the five built-in labels in ``__post_init__``.
        sample_sizes: Sample sizes to evaluate. ``None`` (the default)
            selects ``[10]`` in ``__post_init__``.
        num_repeats: Number of repeats per technique and sample size.
        delay_between_requests: Seconds to sleep after each classification.
        max_retries: Maximum number of generation attempts per requirement.
        output_dir: Directory that receives the report and raw data files.
    """
    csv_path: str = ''  # Replace with your CSV file path
    model_name: str = "phi4-mini-reasoning:3.8b"  # Change to your desired model
    # ``None`` is a sentinel meaning "use the default list"; a list literal
    # cannot be used directly as a dataclass default.
    valid_categories: Optional[List[str]] = None
    sample_sizes: Optional[List[int]] = None
    num_repeats: int = 1
    delay_between_requests: float = 0.1
    max_retries: int = 1
    output_dir: str = "classification_results"

    def __post_init__(self):
        """Replace the ``None`` sentinels with fresh per-instance default lists."""
        if self.valid_categories is None:
            self.valid_categories = ['Energy', 'Entertainment', 'Health', 'Safety', 'Other']
        if self.sample_sizes is None:
            # Default: a single run of ten requirements per technique.
            self.sample_sizes = [10]

# Module-level configuration instance built from the defaults above.
# main() logs it and passes it to ComprehensiveExperimentRunner.
config = Config()

# -----------------------------------------------------------------------------
# IMPROVED PROMPT PATTERNS
# -----------------------------------------------------------------------------

class PromptTemplates:
    """Container for the prompt templates compared in the experiments.

    Every template is a plain string containing a single ``{req}``
    placeholder for the requirement text.
    """

    @staticmethod
    def get_templates() -> List[Tuple[str, str]]:
        """Return the prompt templates as a list of ``(name, template)`` tuples.

        A new list is built on every call. The five entries are, in order:
        "Context Manager", "Domain Expert Persona",
        "Structured Chain of Thought", "Enhanced Zero-Shot" and
        "Few-Shot with Reasoning".
        """
        return [
           ("Context Manager", """
You are an expert requirements analyst with deep knowledge of software domains.

CLASSIFICATION TASK:
Categorize this requirement into exactly one of:
Energy | Entertainment | Health | Safety | Other

CONTEXT GUIDELINES:
• Consider primary business purpose and end-user impact.
• Weigh explicit signals (keywords like “battery” or “fitness”) over incidental mentions.
• If ambiguous, choose the category that best aligns with the main functional intent.

REQUIREMENT: "{req}"

STRUCTURED RESPONSE:
1. Reasoning: [Briefly state (in 1–2 sentences) why you assigned the category.]
2. Category: [One word: Energy, Entertainment, Health, Safety, or Other]
"""),

            ("Domain Expert Persona", """
As a senior requirements engineer with 15+ years in software specification, classify this requirement precisely.

DOMAIN EXPERTISE:
• Energy management (e.g., power optimization, battery usage)
• Entertainment platforms (e.g., media streaming, game features)
• Health software (e.g., patient monitoring, wellness tracking)
• Safety systems (e.g., encryption, compliance, risk analysis)

REQUIREMENT TO CLASSIFY: "{req}"

RESPONSE FORMAT:
1. Expert Justification: [One clear sentence linking requirement to chosen domain.]
2. Final Category: [One word: Energy | Entertainment | Health | Safety | Other]
"""),

            ("Structured Chain of Thought", """
REQUIREMENT CLASSIFICATION SYSTEM

CATEGORIES (choose exactly one):
• Energy – Power/battery management or energy efficiency
• Entertainment – Media streaming, gaming, or user engagement features
• Health – Clinical, medical device, fitness, or wellness applications
• Safety – Data protection, compliance, or risk mitigation
• Other – Requirements that do not clearly fit above

CLASSIFICATION PROCESS:
1. Extract Key Terms: [Identify three to five primary verbs/nouns]
2. Primary Stakeholder: [Who benefits most? e.g., end user, clinician, operator]
3. Core Function: [What capability does this requirement enable?]
4. Business Value: [Why is this requirement needed? e.g., cost reduction, increased safety]

REQUIREMENT: "{req}"

ANALYSIS:
• Key Terms:
• Primary Stakeholder:
• Core Function:
• Business Value:

FINAL CATEGORY: [Energy | Entertainment | Health | Safety | Other]
"""),

            ("Enhanced Zero-Shot", """
SOFTWARE REQUIREMENT CLASSIFICATION

Choose exactly one category based on the requirement’s primary purpose and domain:

• Energy – References to power, battery, consumption, or energy management
• Entertainment – Mentions of games, media, streaming, social, or recreational features
• Health – Indicators of medical/clinical, fitness, wellness, or patient care
• Safety – Concerns about security, protection, compliance, or risk mitigation
• Other – Requirements not primarily about the above domains

REQUIREMENT: "{req}"

CLASSIFICATION (one word only): [Energy | Entertainment | Health | Safety | Other]
"""),

            ("Few-Shot with Reasoning", """
CLASSIFICATION EXAMPLES WITH EXPLANATION:

Example 1:
Requirement: "The smart thermostat shall learn user preferences and optimize heating schedules to minimize energy consumption while maintaining comfort."
Reasoning: The emphasis is clearly on reducing energy usage via automated scheduling.
Category: Energy

Example 2:
Requirement: "The fitness app shall provide real-time heart rate monitoring and alert users when heart rate exceeds safe thresholds during exercise."
Reasoning: Core functionality relates to health monitoring and user safety during workouts.
Category: Health

Example 3:
Requirement: "The streaming service shall recommend personalized content based on viewing history and allow users to create custom playlists."
Reasoning: Focuses on user engagement by delivering entertainment-oriented features.
Category: Entertainment

TASK: Classify the following requirement following the same format:

Requirement: "{req}"

Reasoning: [One sentence explaining why]
Category: [Energy | Entertainment | Health | Safety | Other]
""")
        ]

# -----------------------------------------------------------------------------
# ENHANCED CLASSIFICATION SYSTEM
# -----------------------------------------------------------------------------

class RequirementsClassifier:
    """Enhanced requirements classification system with improved error handling and response parsing."""

    def __init__(self, config: Config):
        """Store the configuration, create the Ollama client and load the prompts.

        Args:
            config: Settings object kept as ``self.config``.
        """
        self.client = Client()
        self.prompt_templates = PromptTemplates.get_templates()
        self.config = config

    def _extract_category_from_response(self, response: str) -> str:
        """Enhanced category extraction with multiple fallback strategies."""
        response = response.strip()

        # Strategy 1: Look for exact category matches (case-insensitive)
        for category in self.config.valid_categories:
            pattern = rf'\b{re.escape(category)}\b'
            if re.search(pattern, response, re.IGNORECASE):
                return category

        # Strategy 2: Look for category after common prefixes
        prefixes = ['category:', 'classification:', 'answer:', 'final:', 'result:', 'consensus category:']
        for prefix in prefixes:
            pattern = rf'{prefix}\s*(\w+)'
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                candidate = match.group(1).capitalize().strip()
                if candidate in self.config.valid_categories:
                    return candidate

        # Strategy 3: Check last line for category
        lines = response.split('\n')
        for line in reversed(lines):
            line = line.strip()
            if line:
                for category in self.config.valid_categories:
                    if category.lower() in line.lower():
                        return category

        # Strategy 4: Return first token if it matches a category
        first_token = response.split()[0] if response.split() else ""
        first_token_cap = first_token.capitalize()
        if first_token_cap in self.config.valid_categories:
            return first_token_cap

        logger.warning(f"Could not extract category from response: {response[:100]}...")
        return "Other"

    def classify_requirement(self, prompt_template: str, requirement_text: str) -> Dict:
        """Classify a single requirement with enhanced error handling."""
        for attempt in range(self.config.max_retries):
            try:
                prompt = prompt_template.format(req=requirement_text.strip())

                response = self.client.generate(
                    model=self.config.model_name,
                    prompt=prompt,
                    options={
                        'temperature': 1.0,
                        'top_p': 0.9,
                        'num_ctx': 16384
                    }
                )

                raw_response = response.get('response', '').strip()
                predicted_category = self._extract_category_from_response(raw_response)

                return {
                    'predicted_category': predicted_category,
                    'raw_response': raw_response,
                    'success': True,
                    'attempt': attempt + 1
                }

            except Exception as e:
                logger.error(f"Attempt {attempt + 1} failed for model '{self.config.model_name}': {str(e)}")
                if attempt < self.config.max_retries - 1:
                    time.sleep(1)
                else:
                    return {
                        'predicted_category': 'Other',
                        'raw_response': f"Error: {str(e)}",
                        'success': False,
                        'attempt': attempt + 1
                    }

        return {
            'predicted_category': 'Other',
            'raw_response': "Max retries exceeded",
            'success': False,
            'attempt': self.config.max_retries
        }

# -----------------------------------------------------------------------------
# COMPREHENSIVE EXPERIMENT RUNNER
# -----------------------------------------------------------------------------

class ComprehensiveExperimentRunner:
    """Runs comprehensive experiments across all techniques and sample sizes."""

    def __init__(self, config: Config):
        """Set up the runner and make sure the output directory exists.

        Args:
            config: Settings object kept as ``self.config`` and passed on to
                the ``RequirementsClassifier`` created here.
        """
        self.config = config
        self.classifier = RequirementsClassifier(config)
        self.df = None      # filled by load_dataset()
        self.results = []   # one dict per completed experiment run

        # parents=True lets a nested output_dir (e.g. "runs/2025/results") be
        # created in one go instead of raising FileNotFoundError.
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)

    def load_dataset(self) -> None:
        """Read, validate and normalise the requirements CSV into ``self.df``.

        Steps: read ``config.csv_path``; require the ``requirements`` and
        ``application_domain`` columns; drop rows with a missing value in
        either column; strip and capitalise the domain labels; drop rows whose
        label is not in ``config.valid_categories``.

        Raises:
            FileNotFoundError: Logged and re-raised when the CSV is missing.
            Exception: Any other failure, including the ``ValueError`` raised
                for a missing column, is logged and re-raised.
        """
        needed_columns = ['requirements', 'application_domain']
        try:
            self.df = pd.read_csv(self.config.csv_path)

            # Report the first required column that is absent, if any.
            absent = next((name for name in needed_columns if name not in self.df.columns), None)
            if absent is not None:
                raise ValueError(f"CSV must contain '{absent}' column")

            logger.info(f"Loaded dataset with {len(self.df)} requirements")

            self.df.dropna(subset=needed_columns, inplace=True)
            tidy_labels = self.df['application_domain'].str.strip().str.capitalize()
            self.df['application_domain'] = tidy_labels

            allowed = self.df['application_domain'].isin(self.config.valid_categories)
            if (~allowed).any():
                invalid_categories = self.df.loc[~allowed, 'application_domain'].unique()
                logger.warning(f"Found and filtered out invalid categories: {invalid_categories}")
                self.df = self.df[allowed]

            logger.info(f"Dataset ready with {len(self.df)} valid requirements.")

        except FileNotFoundError:
            logger.error(f"Dataset file not found at: {self.config.csv_path}")
            raise
        except Exception as exc:
            logger.error(f"Failed to load dataset: {str(exc)}")
            raise

    def run_comprehensive_experiments(self) -> None:
        """Run every prompt technique for every sample size and repeat.

        The dataset is loaded on demand. For each (sample size, repeat) pair
        one random sample is drawn the first time it is needed and then reused
        for all techniques, so techniques are compared on the same
        requirements rather than on samples of differing difficulty.
        Results are appended to ``self.results``.
        """
        if self.df is None:
            self.load_dataset()

        templates = self.classifier.prompt_templates
        total_experiments = len(templates) * len(self.config.sample_sizes) * self.config.num_repeats
        current_experiment = 0

        logger.info(f"Starting comprehensive experiments: {total_experiments} total runs")

        # Samples keyed by (position in sample_sizes, repeat index). Keying on
        # the position keeps duplicate sizes in the config independent.
        shared_samples = {}

        for technique_idx, (technique_name, prompt_template) in enumerate(templates, 1):
            logger.info(f"\n{'='*60}\nTECHNIQUE {technique_idx}/{len(templates)}: {technique_name}\n{'='*60}")

            for size_position, sample_size in enumerate(self.config.sample_sizes):
                for repeat_idx in range(1, self.config.num_repeats + 1):
                    current_experiment += 1
                    logger.info(f"  Running: Sample Size={sample_size}, Repeat={repeat_idx}/{self.config.num_repeats} (Exp {current_experiment}/{total_experiments})")

                    sample_key = (size_position, repeat_idx)
                    if sample_key not in shared_samples:
                        # Never ask for more rows than the dataset holds.
                        actual_sample_size = min(sample_size, len(self.df))
                        shared_samples[sample_key] = self.df.sample(
                            n=actual_sample_size,
                            random_state=random.randint(1, 1_000_000),
                        )
                    sample_df = shared_samples[sample_key]

                    experiment_result = self._run_single_experiment(
                        technique_idx, technique_name, prompt_template,
                        sample_size, repeat_idx, sample_df
                    )

                    self.results.append(experiment_result)
                    accuracy = experiment_result['accuracy']
                    logger.info(f"    -> Accuracy: {accuracy:.2%} ({experiment_result['num_correct']}/{experiment_result['num_samples']})")

    def _run_single_experiment(self, technique_idx: int, technique_name: str,
                             prompt_template: str, sample_size: int, repeat_idx: int,
                             sample_df: pd.DataFrame) -> Dict:
        """Run a single experiment configuration."""
        correct_count = 0
        failed_count = 0
        per_req_outcomes = []
        category_confusion = {cat: {cat2: 0 for cat2 in self.config.valid_categories}
                            for cat in self.config.valid_categories}

        for idx, row in sample_df.iterrows():
            req_text = row['requirements']
            true_label = row['application_domain']

            result = self.classifier.classify_requirement(prompt_template, req_text)
            predicted_label = result['predicted_category']

            is_correct = (predicted_label == true_label)
            if is_correct:
                correct_count += 1
            if not result['success']:
                failed_count += 1

            if true_label in category_confusion and predicted_label in category_confusion[true_label]:
                category_confusion[true_label][predicted_label] += 1

            per_req_outcomes.append({
                'requirement_id': idx,
                'requirement': req_text,
                'true_label': true_label,
                'predicted_label': predicted_label,
                'correct': is_correct,
                'success': result['success']
            })

            time.sleep(self.config.delay_between_requests)

        total = len(sample_df)
        accuracy = correct_count / total if total > 0 else 0

        return {
            'technique_index': technique_idx, 'technique_name': technique_name,
            'sample_size': sample_size, 'repeat_index': repeat_idx,
            'num_samples': total, 'num_correct': correct_count, 'num_failed': failed_count,
            'accuracy': accuracy, 'failure_rate': failed_count / total if total > 0 else 0,
            'confusion_matrix': category_confusion, 'per_requirement_results': per_req_outcomes
        }

    def generate_comprehensive_report(self) -> None:
        """Build all report artefacts from ``self.results``.

        Does nothing apart from logging a warning when there are no results.
        Otherwise it builds the summary and detailed dataframes, renders the
        HTML sections, then writes the plots, the HTML report and the raw data
        files, in that order.
        """
        logger.info("Generating comprehensive report...")

        if not self.results:
            logger.warning("No results to report. Aborting report generation.")
            return

        summary = self._create_summary_dataframe()
        detailed = self._create_detailed_dataframe()
        per_technique_metrics = self._analyze_technique_metrics()

        # Sections are rendered in the order they were originally declared.
        sections = {}
        sections['executive_summary'] = self._generate_executive_summary(summary)
        sections['technique_comparison'] = self._generate_technique_comparison(summary, per_technique_metrics)
        sections['sample_size_analysis'] = self._generate_sample_size_analysis(summary)
        sections['confusion_analysis'] = self._generate_confusion_analysis()
        sections['recommendations'] = self._generate_recommendations(summary)

        self._create_visualizations(summary)
        self._generate_html_report(sections, summary)
        self._save_raw_data(summary, detailed)

        logger.info(f"Report and data saved in '{self.config.output_dir}/'")

    # --- Dataframe and Metric Calculation ---

    def _create_summary_dataframe(self) -> pd.DataFrame:
        """Create summary dataframe for analysis."""
        return pd.DataFrame([{
            'technique_index': r['technique_index'], 'technique_name': r['technique_name'],
            'sample_size': r['sample_size'], 'repeat_index': r['repeat_index'],
            'accuracy': r['accuracy'], 'failure_rate': r['failure_rate'],
            'num_samples': r['num_samples'], 'num_correct': r['num_correct']
        } for r in self.results])

    def _create_detailed_dataframe(self) -> pd.DataFrame:
        """Create detailed dataframe with per-requirement results."""
        detailed_data = []
        for r in self.results:
            for req in r['per_requirement_results']:
                detailed_data.append({
                    'technique_name': r['technique_name'], 'sample_size': r['sample_size'],
                    'repeat_index': r['repeat_index'], 'requirement_id': req['requirement_id'],
                    'true_label': req['true_label'], 'predicted_label': req['predicted_label'],
                    'correct': req['correct'], 'success': req['success']
                })
        return pd.DataFrame(detailed_data)

    def _calculate_metrics_from_confusion_matrix(self, cm: Dict[str, Dict[str, int]]) -> Dict[str, float]:
        """Calculates precision, recall, and F-scores from a confusion matrix."""
        metrics = {}
        total_tp, total_fp, total_fn, total_samples = 0, 0, 0, 0

        for cat in self.config.valid_categories:
            tp = cm.get(cat, {}).get(cat, 0)
            fp = sum(cm.get(other_cat, {}).get(cat, 0) for other_cat in self.config.valid_categories if other_cat != cat)
            fn = sum(cm.get(cat, {}).get(other_cat, 0) for other_cat in self.config.valid_categories if other_cat != cat)
            support = tp + fn

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if support > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            f2 = 5 * precision * recall / (4 * precision + recall) if (4 * precision + recall) > 0 else 0

            metrics[cat] = {'precision': precision, 'recall': recall, 'f1': f1, 'f2': f2, 'support': support}
            total_tp += tp
            total_fp += fp
            total_fn += fn
            total_samples += support

        # Overall metrics
        macro_f1 = sum(m['f1'] for m in metrics.values()) / len(self.config.valid_categories)
        macro_precision = sum(m['precision'] for m in metrics.values()) / len(self.config.valid_categories)
        macro_recall = sum(m['recall'] for m in metrics.values()) / len(self.config.valid_categories)

        weighted_f1 = sum(m['f1'] * m['support'] for m in metrics.values()) / total_samples if total_samples > 0 else 0

        return {
            'per_category': metrics,
            'macro_f1': macro_f1, 'macro_precision': macro_precision, 'macro_recall': macro_recall,
            'weighted_f1': weighted_f1, 'accuracy': total_tp / total_samples if total_samples > 0 else 0
        }

    def _analyze_technique_metrics(self) -> Dict[str, Dict[str, float]]:
        """Aggregates confusion matrices and calculates metrics for each technique."""
        technique_cms = {}
        for result in self.results:
            tech_name = result['technique_name']
            if tech_name not in technique_cms:
                technique_cms[tech_name] = {cat: {c2: 0 for c2 in self.config.valid_categories} for cat in self.config.valid_categories}

            cm = result['confusion_matrix']
            for true_cat, preds in cm.items():
                for pred_cat, count in preds.items():
                    if true_cat in technique_cms[tech_name] and pred_cat in technique_cms[tech_name][true_cat]:
                        technique_cms[tech_name][true_cat][pred_cat] += count

        technique_metrics = {}
        for tech_name, agg_cm in technique_cms.items():
            metrics = self._calculate_metrics_from_confusion_matrix(agg_cm)
            technique_metrics[tech_name] = {k: v for k, v in metrics.items() if k != 'per_category'}

        return technique_metrics

    # --- Report Generation Sections ---

    def _generate_executive_summary(self, summary_df: pd.DataFrame) -> str:
        """Generate executive summary section."""
        avg_accuracy = summary_df['accuracy'].mean()
        technique_perf = summary_df.groupby('technique_name')['accuracy'].mean()
        best_technique = technique_perf.idxmax()
        best_accuracy = technique_perf.max()

        return f"""
        <h2>Executive Summary</h2>
        <p>This report analyzes the performance of {summary_df['technique_name'].nunique()} prompt engineering techniques for software requirements classification.</p>
        <h3>Key Findings</h3>
        <ul>
            <li><b>Overall Average Accuracy:</b> {avg_accuracy:.2%} across all experiments.</li>
            <li><b>Best Performing Technique:</b> <b>{best_technique}</b>, with an average accuracy of <b>{best_accuracy:.2%}</b>.</li>
            <li><b>Experiment Scope:</b> {len(summary_df)} individual runs were conducted across {len(self.config.sample_sizes)} sample size(s) ({self.config.sample_sizes}) with {self.config.num_repeats} repeat(s) each.</li>
        </ul>
        """

    def _generate_technique_comparison(self, summary_df: pd.DataFrame, technique_metrics: Dict) -> str:
        """Generate technique comparison analysis, now with F1 scores."""
        technique_stats = summary_df.groupby('technique_name').agg(
            accuracy_mean=('accuracy', 'mean'),
            accuracy_std=('accuracy', 'std'),
        ).reset_index()

        technique_stats['accuracy_std'] = technique_stats['accuracy_std'].fillna(0)

        metrics_df = pd.DataFrame.from_dict(technique_metrics, orient='index').reset_index().rename(columns={'index': 'technique_name'})
        technique_stats = pd.merge(technique_stats, metrics_df, on='technique_name')

        technique_stats = technique_stats.sort_values('accuracy_mean', ascending=False)

        comparison = """
        <h2>Technique Comparison Analysis</h2>
        <h3>Performance Ranking</h3>
        <p>Techniques are ranked by their average accuracy. Macro-averaged metrics provide a balanced view of performance across all categories.</p>
        <table>
            <thead>
                <tr>
                    <th>Rank</th><th>Technique</th><th>Avg Accuracy</th><th>Std Dev</th><th>Macro F1-Score</th><th>Macro Precision</th><th>Macro Recall</th>
                </tr>
            </thead>
            <tbody>
        """

        for rank, row in enumerate(technique_stats.itertuples(), 1):
            comparison += (f"<tr><td>{rank}</td><td>{row.technique_name}</td><td>{row.accuracy_mean:.2%}</td><td>{row.accuracy_std:.3f}</td>"
                           f"<td><b>{row.macro_f1:.2%}</b></td><td>{row.macro_precision:.2%}</td><td>{row.macro_recall:.2%}</td></tr>\n")

        comparison += "</tbody></table>"

        best_technique = technique_stats.iloc[0]
        if self.config.num_repeats > 1:
            most_consistent = technique_stats.sort_values('accuracy_std').iloc[0]
            consistency_text = f"<b>Most Consistent:</b> {most_consistent.technique_name} (Std Dev: {most_consistent.accuracy_std:.3f})"
        else:
            consistency_text = "<b>Consistency:</b> Not applicable (requires >1 repeat to measure)."

        comparison += f"""
        <h3>Key Observations</h3>
        <ul>
            <li><b>Top Performer:</b> {best_technique.technique_name} leads in both accuracy and F1-score.</li>
            <li>{consistency_text}</li>
        </ul>
        """
        return comparison

    def _generate_sample_size_analysis(self, summary_df: pd.DataFrame) -> str:
        """Generate sample size impact analysis."""
        if len(self.config.sample_sizes) < 2:
            return "<h2>Sample Size Impact Analysis</h2><p>Not applicable (only one sample size was tested).</p>"

        size_stats = summary_df.groupby('sample_size')['accuracy'].agg(['mean', 'std']).reset_index()
        analysis = "<h2>Sample Size Impact Analysis</h2><table><thead><tr><th>Sample Size</th><th>Avg Accuracy</th><th>Std Dev</th></tr></thead><tbody>"
        for row in size_stats.itertuples():
            analysis += f"<tr><td>{row.sample_size}</td><td>{row.mean:.2%}</td><td>{row.std:.3f}</td></tr>"
        analysis += "</tbody></table>"
        return analysis

    def _generate_confusion_analysis(self) -> str:
        """Generate confusion matrix analysis with overall metrics."""
        total_cm = {cat: {c2: 0 for c2 in self.config.valid_categories} for cat in self.config.valid_categories}
        for result in self.results:
            for true_cat, preds in result['confusion_matrix'].items():
                for pred_cat, count in preds.items():
                     if true_cat in total_cm and pred_cat in total_cm[true_cat]:
                        total_cm[true_cat][pred_cat] += count

        metrics = self._calculate_metrics_from_confusion_matrix(total_cm)

        analysis = "<h2>Overall Classification Performance</h2><h3>Confusion Matrix</h3><table><thead><tr><th>True \\ Pred</th>"
        for cat in self.config.valid_categories:
            analysis += f"<th>{cat}</th>"
        analysis += "</tr></thead><tbody>"

        for true_cat in self.config.valid_categories:
            analysis += f"<tr><td><b>{true_cat}</b></td>"
            for pred_cat in self.config.valid_categories:
                analysis += f"<td>{total_cm[true_cat][pred_cat]}</td>"
            analysis += "</tr>"
        analysis += "</tbody></table>"

        analysis += "<h3>Per-Category Performance</h3><table><thead><tr><th>Category</th><th>Precision</th><th>Recall</th><th>F1-Score</th><th>F2-Score</th><th>Support</th></tr></thead><tbody>"
        for cat, m in metrics['per_category'].items():
            analysis += f"<tr><td>{cat}</td><td>{m['precision']:.2%}</td><td>{m['recall']:.2%}</td><td>{m['f1']:.2%}</td><td>{m['f2']:.2%}</td><td>{m['support']}</td></tr>"

        analysis += f"""
            </tbody><tfoot>
                <tr style="font-weight:bold;"><td>Macro Avg</td><td>{metrics['macro_precision']:.2%}</td><td>{metrics['macro_recall']:.2%}</td><td>{metrics['macro_f1']:.2%}</td><td>-</td><td>{sum(m['support'] for m in metrics['per_category'].values())}</td></tr>
                <tr style="font-weight:bold;"><td>Weighted Avg</td><td>-</td><td>-</td><td>{metrics['weighted_f1']:.2%}</td><td>-</td><td>-</td></tr>
            </tfoot></table>
        """
        return analysis

    def _generate_recommendations(self, summary_df: pd.DataFrame) -> str:
        """Generate recommendations based on analysis."""
        technique_perf = summary_df.groupby('technique_name')['accuracy'].mean()
        best_technique = technique_perf.idxmax()

        return f"""
        <h2>Recommendations</h2>
        <ol>
            <li><b>Optimal Technique:</b> Based on this analysis, the <b>{best_technique}</b> prompt is recommended for its superior performance in both accuracy and F1-score.</li>
            <li><b>Further Testing:</b> To improve confidence, consider re-running experiments with a larger number of repeats (e.g., `num_repeats = 5`) and more varied sample sizes.</li>
            <li><b>Error Analysis:</b> A detailed review of misclassifications (available in `detailed_results.csv`) for the top-performing techniques can provide insights for further prompt refinement.</li>
        </ol>
        """

    # --- Visualization and File Saving ---

    def _create_visualizations(self, summary_df: pd.DataFrame) -> None:
        """Write the report plots as PNG files into ``config.output_dir``.

        Always writes ``technique_accuracy.png`` (horizontal bars of mean
        accuracy per technique). Writes ``sample_size_impact.png`` only when
        more than one sample size is configured.
        """
        plt.style.use('seaborn-v0_8-whitegrid')
        target_dir = self.config.output_dir

        # Plot 1: mean accuracy per technique, lowest first.
        bar_fig, bar_ax = plt.subplots(figsize=(12, 8))
        mean_accuracy = summary_df.groupby('technique_name')['accuracy'].mean().sort_values()
        bar_ax.barh(mean_accuracy.index, mean_accuracy.values, color='skyblue')
        bar_ax.set_xlabel('Mean Accuracy')
        bar_ax.set_title('Technique Performance by Mean Accuracy')
        bar_ax.set_xlim(0, 1)
        # Print the percentage just to the right of each bar.
        for bar_position, bar_value in enumerate(mean_accuracy.values):
            bar_ax.text(bar_value + 0.01, bar_position, f"{bar_value:.2%}", va='center', fontweight='bold')
        plt.tight_layout()
        plt.savefig(f'{target_dir}/technique_accuracy.png', dpi=150)
        plt.close(bar_fig)

        # Plot 2: accuracy against sample size, only with several sizes.
        if len(self.config.sample_sizes) <= 1:
            return

        line_fig, line_ax = plt.subplots(figsize=(10, 6))
        sns.lineplot(data=summary_df, x='sample_size', y='accuracy', marker='o', ax=line_ax, errorbar='sd')
        line_ax.set_title('Impact of Sample Size on Accuracy')
        line_ax.set_xlabel('Sample Size')
        line_ax.set_ylabel('Mean Accuracy (with Std Dev)')
        plt.tight_layout()
        plt.savefig(f'{target_dir}/sample_size_impact.png', dpi=150)
        plt.close(line_fig)

    def _generate_html_report(self, report_sections: Dict[str, str], summary_df: pd.DataFrame) -> None:
        """Assemble the HTML report and write it to ``comprehensive_report.html``.

        The page embeds inline CSS, a generation timestamp, the
        ``technique_accuracy.png`` image (plus ``sample_size_impact.png`` when
        more than one sample size is configured) and the pre-rendered section
        fragments.

        Args:
            report_sections: HTML fragments keyed by ``executive_summary``,
                ``technique_comparison``, ``sample_size_analysis``,
                ``confusion_analysis`` and ``recommendations``; a missing key
                renders as an empty string.
            summary_df: Not referenced in the body of this method.
        """
        # Doubled braces ({{ }}) in the CSS are literal braces inside the f-string.
        html_content = f"""
        <!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Requirements Classification Analysis</title>
        <style>
            body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; margin: 0; background: #f8f9fa; }}
            .container {{ max-width: 1200px; margin: 2em auto; background: #fff; padding: 2em; box-shadow: 0 0 20px rgba(0,0,0,0.05); border-radius: 8px; }}
            h1, h2, h3 {{ color: #343a40; border-bottom: 2px solid #dee2e6; padding-bottom: 0.3em; }}
            h1 {{ font-size: 2.5em; }} h2 {{ font-size: 1.8em; margin-top: 1.5em;}}
            table {{ width: 100%; border-collapse: collapse; margin-top: 1em; }}
            th, td {{ padding: 0.8em; text-align: left; border: 1px solid #dee2e6; }}
            th {{ background-color: #f8f9fa; font-weight: 600; }}
            tbody tr:nth-child(odd) {{ background-color: #f8f9fa; }}
            img {{ max-width: 100%; height: auto; display: block; margin: 2em 0; border: 1px solid #dee2e6; border-radius: 4px; }}
            .timestamp {{ color: #6c757d; font-size: 0.9em; }}
        </style></head><body><div class="container">
            <h1>Requirements Classification Analysis Report</h1>
            <p class="timestamp">Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            {report_sections.get('executive_summary', '')}
            <h2>Visual Analysis</h2>
            <img src="technique_accuracy.png" alt="Technique Accuracy Comparison">
            { '<img src="sample_size_impact.png" alt="Sample Size Impact">' if len(self.config.sample_sizes) > 1 else '' }
            {report_sections.get('technique_comparison', '')}
            {report_sections.get('sample_size_analysis', '')}
            {report_sections.get('confusion_analysis', '')}
            {report_sections.get('recommendations', '')}
        </div></body></html>
        """
        # Written as UTF-8 to match the charset declared in the page head.
        with open(f'{self.config.output_dir}/comprehensive_report.html', 'w', encoding='utf-8') as f:
            f.write(html_content)

    def _save_raw_data(self, summary_df: pd.DataFrame, detailed_df: pd.DataFrame) -> None:
        """Persist the experiment data next to the report.

        Writes ``summary_results.csv``, ``detailed_results.csv`` and
        ``experiment_log.json`` (configuration plus the full result list) into
        ``config.output_dir``.
        """
        target_dir = self.config.output_dir

        summary_df.to_csv(f'{target_dir}/summary_results.csv', index=False)
        detailed_df.to_csv(f'{target_dir}/detailed_results.csv', index=False)

        log_payload = {
            'config': dict(vars(self.config)),
            'results': self.results,
        }
        # default=str turns any value json cannot encode into its string form.
        with open(f'{target_dir}/experiment_log.json', 'w') as log_file:
            json.dump(log_payload, log_file, indent=2, default=str)

        logger.info("Raw data saved to CSV and JSON files.")

# -----------------------------------------------------------------------------
# MAIN EXECUTION FUNCTION
# -----------------------------------------------------------------------------

def main():
    """Run the experiments and generate the report, logging any failure.

    Exceptions are logged rather than propagated: a missing dataset file gets
    a dedicated message, anything else is logged with its traceback.
    """
    try:
        logger.info("Starting Requirements Classification Analysis")
        logger.info(f"Configuration: {config}")

        experiment_runner = ComprehensiveExperimentRunner(config)
        experiment_runner.run_comprehensive_experiments()
        experiment_runner.generate_comprehensive_report()

        logger.info("Analysis completed successfully!")

    except FileNotFoundError:
        logger.error("CRITICAL ERROR: The dataset file was not found at the specified path in the configuration. Please check `config.csv_path`.")
    except Exception as exc:
        logger.error(f"An unexpected error occurred during the analysis: {str(exc)}", exc_info=True)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

2025-06-12 17:54:16,720 - INFO - Starting Requirements Classification Analysis
2025-06-12 17:54:16,721 - INFO - Configuration: Config(csv_path='/Users/SONY/Documents/College/Study_pdeu/Summer_internship/IISER_B/Arpit sir/LLM_test/requirements_merged.csv', model_name='phi4-mini-reasoning:3.8b', valid_categories=['Energy', 'Entertainment', 'Health', 'Safety', 'Other'], sample_sizes=[10], num_repeats=1, delay_between_requests=0.1, max_retries=1, output_dir='classification_results')
2025-06-12 17:54:16,742 - INFO - Loaded dataset with 2966 requirements
2025-06-12 17:54:16,747 - INFO - Dataset ready with 2966 valid requirements.
2025-06-12 17:54:16,747 - INFO - Starting comprehensive experiments: 5 total runs
2025-06-12 17:54:16,747 - INFO - 
TECHNIQUE 1/5: Context Manager
2025-06-12 17:54:16,747 - INFO -   Running: Sample Size=10, Repeat=1/1 (Exp 1/5)
2025-06-12 17:54:39,815 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-06-12 17:55:24,948 - INFO - H