In [None]:
# Cell 1: Install and Run Ollama
import os
import asyncio

# # Step 1: Install Ollama
!curl -fsSL https://ollama.com/install.sh | sh

# Step 2: Run Ollama in the background
# We use nohup and & to run it in the background and keep it running.
os.environ['OLLAMA_HOST'] = '0.0.0.0'
!nohup ollama serve > ollama.log 2>&1 &

# Give it a few seconds to start
await asyncio.sleep(5)

# # Step 3: Pull the model
# # This will download the model. It can take a while.
!ollama pull phi4:14b

print("✅ Ollama is installed and the model is ready.")

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25

In [None]:
# List the models available to the local Ollama server (confirms the pull above succeeded).
!ollama list

NAME        ID              SIZE      MODIFIED           
phi4:14b    ac896e5b8b34    9.1 GB    About a minute ago    


In [None]:
# Cell 2: Upload and Unzip Data
# Colab-only cell: relies on google.colab's browser upload widget.
from google.colab import files
import os

# Prompt to upload the zip file
print("Please upload the 'Data_for_llm.zip' file.")
uploaded = files.upload()

# Unzip the file
# The first uploaded file name is taken as the archive; any further uploads are ignored.
# NOTE(review): if nothing is uploaded, `uploaded` is presumably empty and the [0]
# below would raise IndexError — confirm and guard if that matters.
zip_name = list(uploaded.keys())[0]
# -q: quiet, -o: overwrite existing files without prompting (makes the cell re-runnable).
!unzip -q -o "{zip_name}"

# Check if the directory exists
# The experiment cell reads from a folder named exactly "Data_for_llm" in the working directory.
if os.path.isdir("Data_for_llm"):
    print("\n✅ Data successfully uploaded and unzipped.")
    !ls -R Data_for_llm # Show the file structure to confirm
else:
    print("\n❌ Error: 'Data_for_llm' directory not found after unzipping.")
    print("Please ensure your zip file contains a single root folder named 'Data_for_llm'.")

Please upload the 'Data_for_llm.zip' file.


Saving Data_for_llm.zip to Data_for_llm.zip

✅ Data successfully uploaded and unzipped.
Data_for_llm:
Binary	Quaternary  Quinary  Tertiary

Data_for_llm/Binary:
binary_cluster_Energy_Entertainment.csv
binary_cluster_Energy_Other.csv
binary_cluster_Energy_Safety.csv
binary_cluster_Entertainment_Other.csv
binary_cluster_Entertainment_Safety.csv
binary_cluster_Health_Energy.csv
binary_cluster_Health_Entertainment.csv
binary_cluster_Health_Other.csv
binary_cluster_Health_Safety.csv
binary_cluster_Safety_Other.csv

Data_for_llm/Quaternary:
quaternary_cluster_Energy_Entertainment_Safety_Other.csv
quaternary_cluster_Health_Energy_Entertainment_Other.csv
quaternary_cluster_Health_Energy_Entertainment_Safety.csv
quaternary_cluster_Health_Energy_Safety_Other.csv
quaternary_cluster_Health_Entertainment_Safety_Other.csv

Data_for_llm/Quinary:
quinary_cluster_health_energy_entertainment_safety_other.csv

Data_for_llm/Tertiary:
tertiary_cluster_Energy_Entertainment_Other.csv
tertiary_cluster_Energy_

In [None]:
# Cell 3: The Full, Modified Experiment Script (with added Precision/Recall in summaries)

# =============================================================================
# IMPORTS AND LOGGING SETUP
# =============================================================================
import os
import random
import time
import pandas as pd
from ollama import Client
import json
import logging
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass, field
from pathlib import Path
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import defaultdict
import shutil

# Configure logging: INFO level and above, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every function and class in this cell.
logger = logging.getLogger(__name__)

# =============================================================================
# DISCOVER DATASET PATHS FROM THE DATA DIRECTORY
# =============================================================================
def generate_dataset_paths(base_dir: str) -> Dict[str, List[str]]:
    """Build the experiment-name -> CSV-path-list mapping from a data directory.

    Looks for ``*.csv`` files directly inside the ``Binary``, ``Tertiary``,
    ``Quaternary`` and ``Quinary`` sub-folders of ``base_dir``.

    Args:
        base_dir: Root folder that holds the per-experiment sub-folders.

    Returns:
        A dict keyed by experiment name whose values are the CSV paths as
        strings, sorted alphabetically. Experiments without any CSV file are
        left out. An empty dict is returned (and an error is logged) when
        ``base_dir`` does not exist.
    """
    root = Path(base_dir)
    if not root.exists():
        logger.error(f"Base data directory '{base_dir}' not found!")
        return {}

    found: Dict[str, List[str]] = {}
    for experiment in ("Binary", "Tertiary", "Quaternary", "Quinary"):
        # Sort the string form of the paths (not the Path objects) to keep plain string ordering.
        csv_files = [str(csv_path) for csv_path in root.glob(f"{experiment}/*.csv")]
        csv_files.sort()
        if csv_files:
            found[experiment] = csv_files
    return found

# Folder (relative to the working directory) that holds the per-experiment CSV sub-folders.
COLAB_DATA_DIR = "Data_for_llm"
# Experiment name -> sorted list of CSV paths; built once when this cell runs.
EXPERIMENT_DATASETS = generate_dataset_paths(COLAB_DATA_DIR)
# Log the discovered mapping so a missing or misnamed folder is visible immediately.
logger.info(f"Dynamically found datasets: {json.dumps(EXPERIMENT_DATASETS, indent=2)}")

# =============================================================================
# CONFIGURATION, PROMPT TEMPLATES, AND CLASSIFIER
# =============================================================================
@dataclass
class ExperimentConfig:
    """Settings shared by the classifier and the experiment runner for one run."""
    # NOTE(review): the comment previously here said the model had been changed from
    # 'phi4:14b' to 'phi3' because the 14B model ran out of memory on Colab
    # ('llama runner process has terminated'). The default below is still 'phi4:14b',
    # so that change is not reflected in the code — confirm which model is intended.
    model_name: str = "phi4:14b"  # passed as `model` to the Ollama generate call
    num_repeats_per_technique: int = 1  # not read anywhere in the code visible in this notebook
    delay_between_requests: float = 0.1  # seconds slept after each classified requirement
    max_retries: int = 2  # total generate attempts per requirement before giving up
    base_output_dir: str = "phi4:14b"  # root output folder; one sub-folder per experiment is created under it
    temperature: float = 1.0  # forwarded in the generate `options`
    top_p: float = 0.9  # forwarded in the generate `options`
    num_ctx: int = 16384  # forwarded in the generate `options`
    current_experiment_dir: str = ""  # filled in by ComprehensiveExperimentRunner.__init__

class PromptTemplates:
    """Factory for the prompt texts used by each classification technique."""

    @staticmethod
    def get_dynamic_templates(valid_categories: List[str]) -> List[Tuple[str, str]]:
        """Build the (technique name, prompt template) pairs for a category set.

        The category names are baked into every prompt. The requirement text is
        left as a literal ``{req}`` placeholder, to be filled in later with
        ``str.format(req=...)``.

        Args:
            valid_categories: Category names the model may choose from.

        Returns:
            Five ``(technique_name, template)`` tuples, in a fixed order.
        """
        # Three renderings of the same category list, used by different prompts.
        bullets = "\n".join(f"• {name}" for name in valid_categories)
        piped = " | ".join(valid_categories)
        bracketed = "[" + ", ".join(valid_categories) + "]"

        # Plain instruction, no examples.
        zero_shot = f"""
SOFTWARE REQUIREMENT CLASSIFICATION
Choose exactly one category based on the requirement’s primary purpose and domain:
{bullets}
REQUIREMENT: "{{req}}"
FINAL CATEGORY (one word only): [{piped}]
"""

        # Two worked examples, then the task.
        few_shot = f"""
REQUIREMENTS CLASSIFICATION EXAMPLES & TASK:
Example 1:
Requirement: "The system shall automatically reduce screen brightness when battery level drops below 20% to extend device runtime."
Analysis: Focuses on energy optimization to conserve battery life.
FINAL CATEGORY: Energy
Example 2:
Requirement: "The application must encrypt all patient data using AES-256 encryption and maintain audit logs for compliance."
Analysis: Emphasizes data protection and compliance for patient safety.
FINAL CATEGORY: Safety
Now classify the following requirement into one of the available categories:
Requirement: "{{req}}"
Follow this pattern:
Analysis: [One sentence describing primary focus]
FINAL CATEGORY: [{piped}]
"""

        # Two examples that include an explicit reasoning sentence.
        few_shot_reasoning = f"""
CLASSIFICATION EXAMPLES WITH EXPLANATION:
Example 1:
Requirement: "The smart thermostat shall learn user preferences and optimize heating schedules to minimize energy consumption while maintaining comfort."
Reasoning: The emphasis is clearly on reducing energy usage via automated scheduling.
Category: Energy
Example 2:
Requirement: "The fitness app shall provide real-time heart rate monitoring and alert users when heart rate exceeds safe thresholds during exercise."
Reasoning: Core functionality relates to health monitoring and user safety during workouts.
Category: Health
TASK: Classify the following requirement following the same format, choosing from {bracketed}:
Requirement: "{{req}}"
Reasoning: [One sentence explaining why]
FINAL CATEGORY: [{piped}]
"""

        # Classification followed by three supporting facts.
        fact_check = f"""
You are analyzing software requirements.
Step 1: Classify the requirement into one of these domains:
{bracketed}
Step 2: List 3 key facts from the requirement that support your classification(One sentence).
Requirement: "{{req}}"
Output Format:
FINAL CATEGORY: [Domain]
Facts:
1. ...
2. ...
3. ...
"""

        # Guided step-by-step analysis before the final answer.
        chain_of_thought = f"""
REQUIREMENT CLASSIFICATION SYSTEM
CATEGORIES (choose exactly one):
{bullets}
CLASSIFICATION PROCESS:
1. Extract Key Terms: [Identify three to five primary verbs/nouns]
2. Primary Stakeholder: [Who benefits most? e.g., end user, clinician, operator]
3. Core Function: [What capability does this requirement enable?(One sentence)]
4. Business Value: [Why is this requirement needed? e.g., cost reduction, increased safety (One sentence)]
REQUIREMENT: "{{req}}"
ANALYSIS:
• Key Terms:
• Primary Stakeholder:
• Core Function:
• Business Value:
FINAL CATEGORY: [{piped}]
"""

        return [
            ("Enhanced Zero-Shot", zero_shot),
            ("Optimized Few-Shot", few_shot),
            ("Few-Shot with Reasoning", few_shot_reasoning),
            ("Fact Check List Prompt", fact_check),
            ("Structured Chain of Thought", chain_of_thought),
        ]

class RequirementsClassifier:
    """Sends classification prompts to an Ollama model and parses the category out of the reply."""

    def __init__(self, config: "ExperimentConfig"):
        """Store the run configuration and create the Ollama client.

        Args:
            config: Supplies the model name, sampling options and retry count
                read by ``classify_requirement``.
        """
        self.config = config
        # Client is created with its default arguments.
        self.client = Client()

    def _extract_category_from_response(self, response: str, valid_categories: List[str]) -> str:
        """Extract the predicted category from a (possibly verbose) model reply.

        Strategies are tried in order of reliability:

        1. Drop any ``<think>...</think>`` reasoning block.
        2. Use the LAST explicit ``FINAL CATEGORY: X`` label. Taking the last one
           matters because models sometimes echo the few-shot examples (which
           carry their own ``FINAL CATEGORY`` lines) before giving their answer.
        3. Use the last line if, stripped of punctuation, it is exactly a category.
        4. Use the category mentioned last anywhere in the text (whole-word match).

        Args:
            response: Raw text returned by the model.
            valid_categories: Category names to look for (matched case-insensitively).

        Returns:
            The matching entry of ``valid_categories`` in its original casing,
            or ``"Unclassified"`` when nothing can be extracted.
        """
        # Nothing can be matched against an empty category list (an empty regex
        # alternation would otherwise match the empty string).
        if not valid_categories:
            return "Unclassified"

        # Case-insensitive lookup back to the original category casing.
        category_map = {cat.lower(): cat for cat in valid_categories}
        category_regex_part = "|".join(re.escape(cat) for cat in valid_categories)

        # Step 1: Remove the "thought process" block so categories mentioned while
        # reasoning are not mistaken for the answer. DOTALL lets '.' span newlines.
        cleaned_response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL | re.IGNORECASE).strip()

        if not cleaned_response:
            logger.warning(f"Response was empty after removing <think> block. Original: {response[:150]}...")
            return "Unclassified"

        # Step 2: Explicit label, e.g. "FINAL CATEGORY: Safety" or "FINAL CATEGORY: **[Health]**".
        # Optional markdown (**) and brackets ([]) around the category are tolerated.
        label_pattern = re.compile(
            rf"(?:FINAL CATEGORY)\s*[:\s]*\**\s*\[?({category_regex_part})\]?\s*\**",
            re.IGNORECASE
        )
        labelled_categories = label_pattern.findall(cleaned_response)
        if labelled_categories:
            # The final label is the model's conclusion; earlier ones may be echoed examples.
            return category_map[labelled_categories[-1].lower()]

        # Step 3: The last line on its own. cleaned_response was stripped, so the
        # last line is never blank.
        lines = cleaned_response.splitlines()
        if lines:
            last_line = lines[-1].strip()
            # Drop markdown/punctuation around the word (e.g., "**Entertainment**.").
            potential_category = re.sub(r"[^\w\s]", "", last_line).strip()
            if potential_category.lower() in category_map:
                return category_map[potential_category.lower()]

        # Step 4: Fallback — the whole-word category mention with the highest start
        # index, i.e. the one closest to the end of the text.
        last_match_category = None
        last_match_position = -1

        for category in valid_categories:
            for m in re.finditer(rf'\b{re.escape(category)}\b', cleaned_response, re.IGNORECASE):
                if m.start() > last_match_position:
                    last_match_position = m.start()
                    last_match_category = category  # keep the original-cased name

        if last_match_category:
            return last_match_category

        # Step 5: Nothing matched.
        logger.warning(f"Could not extract a valid category from cleaned response: {cleaned_response[:200]}...")
        return "Unclassified"

    def classify_requirement(self, prompt_template: str, requirement_text: str, valid_categories: List[str]) -> Dict:
        """Classify one requirement with the configured model.

        Args:
            prompt_template: Template containing a ``{req}`` placeholder.
            requirement_text: Requirement to classify; surrounding whitespace is stripped.
            valid_categories: Category names the answer is parsed against.

        Returns:
            A dict with ``predicted_category``, ``raw_response`` and ``success``.
            ``success`` is True whenever the model call returned, even if the
            parsed category is ``"Unclassified"``. After ``config.max_retries``
            failed attempts the result is ``"Unclassified"`` with ``success`` False.
        """
        for attempt in range(self.config.max_retries):
            try:
                prompt = prompt_template.format(req=requirement_text.strip())
                response = self.client.generate(
                    model=self.config.model_name,
                    prompt=prompt,
                    options={
                        'temperature': self.config.temperature,
                        'top_p': self.config.top_p,
                        'num_ctx': self.config.num_ctx
                    }
                )
                raw_response = response.get('response', '').strip()
                predicted_category = self._extract_category_from_response(raw_response, valid_categories)
                return {'predicted_category': predicted_category, 'raw_response': raw_response, 'success': True}
            except Exception as e:
                # Best effort: log, pause briefly, and try again until attempts run out.
                logger.error(f"Attempt {attempt + 1} failed for model '{self.config.model_name}': {str(e)}")
                if attempt < self.config.max_retries - 1:
                    time.sleep(1)
        return {'predicted_category': 'Unclassified', 'raw_response': f"Error after {self.config.max_retries} retries", 'success': False}


# =============================================================================
# EXPERIMENT RUNNER: PER-DATASET EVALUATION, METRICS, AND REPORTS
# =============================================================================
class ComprehensiveExperimentRunner:
    """Runs every prompt technique over every dataset of one experiment and writes reports.

    Per dataset it produces a JSON result file and an HTML report; across all
    datasets it produces a summary HTML report, a bar chart and three CSV files.
    Everything is written under ``<base_output_dir>/<experiment_name>``.
    """

    def __init__(self, config: "ExperimentConfig", experiment_name: str, dataset_paths: List[str]):
        """Set up the classifier and create the experiment output directory.

        Args:
            config: Run settings; its ``current_experiment_dir`` is overwritten here.
            experiment_name: Name of the experiment; used as the output sub-folder.
            dataset_paths: CSV files to evaluate.
        """
        self.config = config
        self.experiment_name = experiment_name
        self.dataset_paths = dataset_paths
        self.classifier = RequirementsClassifier(self.config)
        self.results_per_dataset = []
        self.config.current_experiment_dir = os.path.join(self.config.base_output_dir, self.experiment_name)
        Path(self.config.current_experiment_dir).mkdir(parents=True, exist_ok=True)

    def run_all_experiments(self):
        """Evaluate every technique on every dataset, reusing saved results when present.

        A dataset whose ``<name>_results.json`` already exists is not re-run; the
        saved file is loaded as-is (its metrics are not recomputed). Errors on one
        dataset are logged and do not stop the remaining datasets.
        """
        for path in self.dataset_paths:
            try:
                dataset_name = Path(path).stem
                logger.info(f"\n{'='*80}\nProcessing Dataset: {dataset_name} (from: {path})\n{'='*80}")
                dataset_result_path = os.path.join(self.config.current_experiment_dir, f"{dataset_name}_results.json")
                if os.path.exists(dataset_result_path):
                    logger.info(f"Results for {dataset_name} already exist. Skipping.")
                    with open(dataset_result_path, 'r') as f:
                        self.results_per_dataset.append(json.load(f))
                    continue
                df = pd.read_csv(path)
                df.dropna(subset=['requirements', 'application_domain'], inplace=True)
                # Normalise label casing so e.g. "energy" and "Energy" are one category.
                df['application_domain'] = df['application_domain'].str.strip().str.capitalize()
                valid_categories = sorted(df['application_domain'].unique().tolist())
                # 'Unclassified' is reserved for predictions that could not be parsed.
                if 'Unclassified' in valid_categories:
                    raise ValueError("'Unclassified' cannot be a valid category name in the source data.")
                logger.info(f"Loaded {len(df)} samples. Inferred categories: {valid_categories}")
                dynamic_prompts = PromptTemplates.get_dynamic_templates(valid_categories)
                dataset_results = {
                    'dataset_name': dataset_name, 'valid_categories': valid_categories,
                    'technique_results': []
                }
                for tech_name, prompt_template in dynamic_prompts:
                    logger.info(f"--- Running Technique: {tech_name} ---")
                    experiment_result = self._run_single_experiment(tech_name, prompt_template, df, valid_categories)
                    dataset_results['technique_results'].append(experiment_result)
                self.results_per_dataset.append(dataset_results)
                self._save_and_report_for_dataset(dataset_results)
            except FileNotFoundError:
                logger.error(f"Dataset file not found: {path}. Skipping.")
            except Exception as e:
                logger.error(f"An unexpected error occurred while processing {path}: {e}", exc_info=True)

    def _run_single_experiment(self, technique_name: str, prompt_template: str, sample_df: pd.DataFrame, valid_categories: List[str]) -> Dict:
        """Classify every row of ``sample_df`` with one prompt technique.

        Args:
            technique_name: Label stored in the result.
            prompt_template: Template with a ``{req}`` placeholder.
            sample_df: Frame with ``requirements`` and ``application_domain`` columns.
            valid_categories: Category names present in ``application_domain``.

        Returns:
            Dict with the technique name, sample count, metrics, confusion matrix
            (true label -> predicted label -> count) and per-requirement outcomes.
        """
        per_req_outcomes = []
        # Rows are true labels; columns additionally include 'Unclassified' predictions.
        all_categories_for_cm = valid_categories + ['Unclassified']
        confusion_matrix = {cat: {cat2: 0 for cat2 in all_categories_for_cm} for cat in valid_categories}
        for _, row in sample_df.iterrows():
            req_text, true_label = row['requirements'], row['application_domain']
            result = self.classifier.classify_requirement(prompt_template, req_text, valid_categories)
            predicted_label = result['predicted_category']
            confusion_matrix[true_label][predicted_label] += 1
            per_req_outcomes.append({
                'requirement': req_text, 'true_label': true_label,
                'predicted_label': predicted_label, 'correct': (predicted_label == true_label),
                'raw_response': result['raw_response']
            })
            time.sleep(self.config.delay_between_requests)
        metrics = self._calculate_metrics_from_confusion_matrix(confusion_matrix, valid_categories)
        return {'technique_name': technique_name, 'num_samples': len(sample_df),
                'metrics': metrics, 'confusion_matrix': confusion_matrix,
                'per_requirement_results': per_req_outcomes}

    def _calculate_metrics_from_confusion_matrix(self, cm: Dict, categories: List[str]) -> Dict:
        """Compute per-category and aggregate metrics from a confusion matrix.

        Every prediction that differs from the true label counts as a false
        negative for that label, including ``'Unclassified'`` predictions. A
        category's support is therefore its full number of samples, and accuracy
        is measured over all samples rather than only the parseable ones.

        Args:
            cm: Mapping ``true_label -> predicted_label -> count``.
            categories: The real category names (rows of ``cm``).

        Returns:
            Dict with ``per_category`` (precision, recall, f1_score, support),
            ``macro_avg`` (precision, recall, f1_score), ``weighted_avg``
            (f1_score), ``accuracy`` and ``unclassified_count``.
        """
        metrics = {'per_category': {}, 'macro_avg': {}, 'weighted_avg': {}}
        total_samples, total_correct = 0, 0
        all_precisions, all_recalls, all_f1s = [], [], []
        for cat in categories:
            row = cm.get(cat, {})
            tp = row.get(cat, 0)
            # Samples of other true categories that were predicted as `cat`.
            fp = sum(cm.get(other_cat, {}).get(cat, 0) for other_cat in categories if other_cat != cat)
            # Every off-diagonal cell in this row is a miss, 'Unclassified' included.
            fn = sum(count for predicted, count in row.items() if predicted != cat)
            support = tp + fn
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / support if support > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            total_samples += support
            total_correct += tp
            metrics['per_category'][cat] = {'precision': precision, 'recall': recall, 'f1_score': f1, 'support': support}
            all_precisions.append(precision)
            all_recalls.append(recall)
            all_f1s.append(f1)
        metrics['macro_avg']['precision'] = np.mean(all_precisions) if all_precisions else 0
        metrics['macro_avg']['recall'] = np.mean(all_recalls) if all_recalls else 0
        metrics['macro_avg']['f1_score'] = np.mean(all_f1s) if all_f1s else 0
        metrics['weighted_avg']['f1_score'] = sum(m['f1_score'] * m['support'] for m in metrics['per_category'].values()) / total_samples if total_samples > 0 else 0
        metrics['accuracy'] = total_correct / total_samples if total_samples > 0 else 0
        metrics['unclassified_count'] = sum(cm.get(c, {}).get('Unclassified', 0) for c in categories)
        return metrics

    def _save_and_report_for_dataset(self, dataset_result: Dict):
        """Write the JSON results and the HTML report for one dataset.

        The JSON goes to ``<experiment_dir>/<dataset>_results.json`` (the file
        ``run_all_experiments`` checks to skip finished datasets); the HTML goes
        to ``<experiment_dir>/<dataset>/report.html``.
        """
        dataset_name = dataset_result['dataset_name']
        output_dir = os.path.join(self.config.current_experiment_dir, dataset_name)
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        json_path = os.path.join(self.config.current_experiment_dir, f"{dataset_name}_results.json")
        with open(json_path, 'w') as f:
            json.dump(dataset_result, f, indent=2)
        logger.info(f"Saved full results for {dataset_name} to {json_path}")
        html_report = self._generate_single_dataset_html_report(dataset_result)
        report_path = os.path.join(output_dir, "report.html")
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(html_report)
        logger.info(f"Generated HTML report for {dataset_name} at {report_path}")

    def generate_final_summary_report(self):
        """Aggregate all dataset results into a summary HTML report and three CSV files.

        Produces one table averaged per technique, one row per dataset/technique
        pair, and one table averaged per technique and category. Does nothing
        (apart from a warning) when no results were collected.
        """
        if not self.results_per_dataset:
            logger.warning("No results to summarize. Skipping final report.")
            return
        logger.info("Generating final summary report with aggregated, per-category, and per-dataset views...")

        # technique -> metric name -> list of per-dataset values
        summary_data = defaultdict(lambda: defaultdict(list))
        detailed_performance_list = []
        # technique -> category -> per-dataset metric lists plus summed support
        per_category_metrics = defaultdict(lambda: defaultdict(lambda: {'precision': [], 'recall': [], 'f1_score': [], 'support': 0}))

        for result in self.results_per_dataset:
            for tech_result in result['technique_results']:
                tech_name, metrics = tech_result['technique_name'], tech_result['metrics']

                summary_data[tech_name]['accuracy'].append(metrics['accuracy'])
                summary_data[tech_name]['macro_precision'].append(metrics['macro_avg']['precision'])
                summary_data[tech_name]['macro_recall'].append(metrics['macro_avg']['recall'])
                summary_data[tech_name]['macro_f1'].append(metrics['macro_avg']['f1_score'])

                detailed_performance_list.append({
                    'Dataset': result['dataset_name'],
                    'Technique': tech_name,
                    'Accuracy': metrics['accuracy'],
                    'Macro Precision': metrics['macro_avg']['precision'],
                    'Macro Recall': metrics['macro_avg']['recall'],
                    'Macro F1': metrics['macro_avg']['f1_score'],
                    'Unclassified': metrics['unclassified_count']
                })

                for category, cat_metrics in metrics['per_category'].items():
                    per_category_metrics[tech_name][category]['precision'].append(cat_metrics['precision'])
                    per_category_metrics[tech_name][category]['recall'].append(cat_metrics['recall'])
                    per_category_metrics[tech_name][category]['f1_score'].append(cat_metrics['f1_score'])
                    per_category_metrics[tech_name][category]['support'] += cat_metrics['support']

        summary_list = [{
            'Technique': n,
            'Avg Accuracy': np.mean(d['accuracy']),
            'Std Dev Accuracy': np.std(d['accuracy']),
            'Avg Macro Precision': np.mean(d['macro_precision']),
            'Avg Macro Recall': np.mean(d['macro_recall']),
            'Avg Macro F1': np.mean(d['macro_f1'])
        } for n, d in summary_data.items()]
        summary_df = pd.DataFrame(summary_list).sort_values(by='Avg Accuracy', ascending=False)

        detailed_df = pd.DataFrame(detailed_performance_list)
        per_category_list = []
        for tech, categories in per_category_metrics.items():
            for cat, metrics_lists in categories.items():
                per_category_list.append({
                    'Technique': tech, 'Category': cat,
                    'Avg Precision': np.mean(metrics_lists['precision']), 'Avg Recall': np.mean(metrics_lists['recall']),
                    'Avg F1-Score': np.mean(metrics_lists['f1_score']), 'Total Support': metrics_lists['support']})
        per_category_df = pd.DataFrame(per_category_list).sort_values(by=['Technique', 'Category'])

        html = self._generate_summary_html_report(summary_df, detailed_df, per_category_df)
        report_path = os.path.join(self.config.current_experiment_dir, "summary_report.html")
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(html)

        summary_df.to_csv(os.path.join(self.config.current_experiment_dir, "summary_aggregated_metrics.csv"), index=False)
        detailed_df.to_csv(os.path.join(self.config.current_experiment_dir, "summary_detailed_performance.csv"), index=False)
        per_category_df.to_csv(os.path.join(self.config.current_experiment_dir, "summary_per_category_metrics.csv"), index=False)

        logger.info(f"Final summary report and all CSVs saved in: {self.config.current_experiment_dir}")

    def _generate_single_dataset_html_report(self, dataset_result: Dict) -> str:
        """Render one dataset's results as a standalone HTML page.

        For each technique the page shows the overall accuracy, the number of
        unclassified predictions, the per-category metrics table and the
        confusion matrix.
        """
        name = dataset_result['dataset_name']
        html = f"<html><head><title>Report for {name}</title>{self._get_html_style()}</head><body><div class='container'><h1>Report for {name}</h1>"
        for res in dataset_result['technique_results']:
            tech, m, cm = res['technique_name'], res['metrics'], res['confusion_matrix']
            html += f"<h2>Technique: {tech}</h2><p><b>Overall Accuracy: {m['accuracy']:.2%}</b> | <b>Unclassified: {m['unclassified_count']}</b></p>"
            df = pd.DataFrame(m['per_category']).T.reset_index().rename(columns={'index':'Category'})
            for col in ['f1_score', 'precision', 'recall']:
                df[col] = df[col].apply(lambda x: f"{x:.2%}")
            html += "<h3>Performance Metrics</h3>" + df.to_html(index=False, classes='styled-table')
            # Transposed so rows are true labels and columns are predicted labels.
            cm_df = pd.DataFrame(cm).T.fillna(0).astype(int)
            html += "<h3>Confusion Matrix</h3>" + cm_df.to_html(classes='styled-table')
        html += "</div></body></html>"
        return html

    def _generate_summary_html_report(self, summary_df: pd.DataFrame, detailed_df: pd.DataFrame, per_category_df: pd.DataFrame) -> str:
        """Render the cross-dataset summary page and save its accuracy bar chart.

        Args:
            summary_df: One row per technique with averaged metrics.
            detailed_df: One row per dataset/technique pair.
            per_category_df: One row per technique/category pair.

        Returns:
            The HTML document as a string. As a side effect,
            ``summary_accuracy_chart.png`` is written to the experiment directory.
        """
        name, num = self.experiment_name, len(self.dataset_paths)
        html = f"<html><head><title>Experiment Summary: {name}</title>{self._get_html_style()}</head><body><div class='container'><h1>Experiment Summary: {name}</h1>"
        html += f"<p>This report summarizes avg performance of {summary_df.shape[0]} techniques across {num} datasets.</p>"

        # Format a copy for display; the numeric frame is still needed for the chart.
        df_agg_display = summary_df.copy()
        agg_percent_cols = ['Avg Accuracy', 'Avg Macro Precision', 'Avg Macro Recall', 'Avg Macro F1']
        for col in df_agg_display.columns:
            if col in agg_percent_cols:
                df_agg_display[col] = df_agg_display[col].apply(lambda x: f"{x:.2%}")
            elif 'Std Dev' in col:
                df_agg_display[col] = df_agg_display[col].apply(lambda x: f"{x:.4f}")
        html += "<h2>Aggregated Performance Metrics (Averages)</h2>" + df_agg_display.to_html(index=False, classes='styled-table')

        # Horizontal bar chart of average accuracy, with the std dev as error bars.
        fig, ax = plt.subplots(figsize=(12, 8))
        s_df = summary_df.sort_values('Avg Accuracy')
        ax.barh(s_df['Technique'], s_df['Avg Accuracy'], xerr=s_df['Std Dev Accuracy'], capsize=5, color='skyblue')
        ax.set_xlabel('Average Accuracy')
        ax.set_title(f'Technique Performance Comparison ({name})')
        ax.set_xlim(0, 1)
        for i, v in enumerate(s_df['Avg Accuracy']):
            ax.text(v + 0.02, i, f"{v:.2%}", va='center', fontweight='bold')
        plt.tight_layout()
        img_path = os.path.join(self.config.current_experiment_dir, "summary_accuracy_chart.png")
        plt.savefig(img_path, dpi=150)
        plt.close(fig)
        # The report sits next to the image, so a bare file name is enough.
        html += f"<h2>Performance Chart</h2><img src='{os.path.basename(img_path)}' alt='Technique Performance Chart'>"

        html += "<h2>Detailed Performance by Category & Technique</h2>"
        df_cat_display = per_category_df.copy()
        for col in ['Avg Precision', 'Avg Recall', 'Avg F1-Score']:
            df_cat_display[col] = df_cat_display[col].apply(lambda x: f"{x:.2%}")
        html += df_cat_display.to_html(index=False, classes='styled-table')

        html += "<h2>Detailed Performance per Dataset & Technique</h2>"
        df_detail_display = detailed_df.copy()
        detail_percent_cols = ['Accuracy', 'Macro Precision', 'Macro Recall', 'Macro F1']
        for col in detail_percent_cols:
            df_detail_display[col] = df_detail_display[col].apply(lambda x: f"{x:.2%}")
        html += df_detail_display.to_html(index=False, classes='styled-table')

        html += "</div></body></html>"
        return html

    def _get_html_style(self) -> str:
        """Return the inline ``<style>`` block shared by all generated HTML reports."""
        return """<style>body{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,Helvetica,Arial,sans-serif;margin:0;background:#f8f9fa;color:#343a40}.container{max-width:1200px;margin:2em auto;background:#fff;padding:2em;box-shadow:0 0 20px rgba(0,0,0,.05);border-radius:8px}h1,h2,h3{color:#343a40;border-bottom:2px solid #dee2e6;padding-bottom:.3em}h1{font-size:2.5em}h2{font-size:1.8em;margin-top:1.5em}table.styled-table{width:100%;border-collapse:collapse;margin-top:1em;box-shadow:0 0 20px rgba(0,0,0,.1)}.styled-table thead tr{background-color:#007bff;color:#fff;text-align:left}.styled-table th,.styled-table td{padding:12px 15px;border:1px solid #dee2e6}.styled-table tbody tr{border-bottom:1px solid #ddd}.styled-table tbody tr:nth-of-type(even){background-color:#f3f3f3}.styled-table tbody tr:last-of-type{border-bottom:2px solid #007bff}img{max-width:100%;height:auto;display:block;margin:2em 0;border:1px solid #dee2e6;border-radius:4px}</style>"""

# =============================================================================
# MAIN EXECUTION FUNCTION
# =============================================================================
def main():
    """Ask which experiment to run, then run it and write the summary report.

    The user picks an experiment by its number in the printed menu or by its
    name (case-insensitive via title-casing). The prompt repeats until a valid
    choice is entered. Any error raised during the run is logged, not re-raised.
    """
    experiment_names = list(EXPERIMENT_DATASETS.keys())
    if not any(EXPERIMENT_DATASETS.values()):
        logger.error("No datasets were found. Please check the 'Data_for_llm' directory and its subdirectories.")
        return

    # Keep prompting until a selection resolves to a known experiment name.
    experiment_type = None
    while experiment_type is None:
        print("Please choose an experiment to run:")
        for position, label in enumerate(experiment_names, start=1):
            print(f"  {position}. {label}")
        try:
            choice = input(f"Enter the number (1-{len(experiment_names)}) or name of the experiment: ").strip()
            titled_choice = choice.title()
            if choice.isdigit() and 1 <= int(choice) <= len(experiment_names):
                experiment_type = experiment_names[int(choice) - 1]
            elif titled_choice in experiment_names:
                experiment_type = titled_choice
            else:
                print(f"--- Invalid input '{choice}'. Please try again. ---\n")
        except (ValueError, IndexError):
            print("--- Invalid input. Please enter a valid number or name. ---\n")

    dataset_paths = EXPERIMENT_DATASETS[experiment_type]
    paths_look_valid = bool(dataset_paths) and all(isinstance(p, str) for p in dataset_paths)
    if not paths_look_valid:
        logger.error(f"Dataset list for '{experiment_type}' is empty or misconfigured. Please check EXPERIMENT_DATASETS.")
        return

    try:
        config = ExperimentConfig()
        planned_output_dir = os.path.join(config.base_output_dir, experiment_type)
        logger.info(f"Starting Analysis for '{experiment_type}' Experiment")
        logger.info(f"Model: {config.model_name}, Temp: {config.temperature}, Top-P: {config.top_p}, Ctx: {config.num_ctx}")
        logger.info(f"Output Dir: {planned_output_dir}")
        logger.info(f"Processing {len(dataset_paths)} dataset(s).")
        runner = ComprehensiveExperimentRunner(config, experiment_type, dataset_paths)
        runner.run_all_experiments()
        runner.generate_final_summary_report()
        logger.info("Analysis completed successfully!")
    except Exception as e:
        logger.error(f"An unexpected error occurred during the analysis: {e}", exc_info=True)

# Run the main function.
# In a notebook cell __name__ is "__main__", so this starts the interactive
# experiment prompt as soon as the cell is executed.
if __name__ == "__main__":
    main()

Please choose an experiment to run:
  1. Binary
  2. Tertiary
  3. Quaternary
  4. Quinary
