# Agreement V8: FiQA + InsuranceQA + MedQA + MedCQA
Reviewed Alpha

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from scipy.cluster.hierarchy import dendrogram, linkage
# Core scientific and ML libraries
from scipy.optimize import linear_sum_assignment
from scipy.stats import ttest_rel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

# Key libraries for this implementation
from sentence_transformers import SentenceTransformer
import simpledorff # For Krippendorff's Alpha

from datetime import datetime
import sys

sys.path.append("./../../../../")
from aimw.app.services.factory import runnable_system
from aimw.app.core.ai_config import get_ai_settings
from aimw.app.utils import common_utils
from aimw.app.schemas.enum.ai_enums import Role
from loguru import logger
import json



# Load the JSON prediction records; `document_list` is the input for the whole analysis.
PREDICTIONS_PATH = "./output/predictions/classifier_cross_model_prediction_random_400_ALL_M5_20250706_091512.json"
with open(PREDICTIONS_PATH, encoding="utf-8") as predictions_file:
    document_list = json.load(predictions_file)
print(f"Total: {len(document_list)}")


# --- Use this function to visualize ---
def plot_dendrogram(subtopics, output_path="subtopic_dendrogram.png"):
    """
    Plot and save a hierarchical-clustering dendrogram for a list of subtopics.

    The subtopics are embedded with the module-level ``sentence_model`` and
    linked with Ward linkage on Euclidean distance.

    Args:
        subtopics (list): Subtopic strings; they are also used as leaf labels.
        output_path (str): Where the figure is written. Defaults to the
            file name that was previously hard-coded.

    Returns:
        None. With fewer than two subtopics a message is printed and nothing
        is plotted.
    """
    # A linkage matrix needs at least two observations.
    if len(subtopics) < 2:
        print("Need at least 2 subtopics to create a dendrogram.")
        return

    print("Encoding subtopics for dendrogram...")
    embeddings = sentence_model.encode(subtopics)

    # The 'ward' linkage method is often better for visualization and creating balanced clusters.
    print("Creating linkage matrix...")
    linked = linkage(embeddings, method='ward', metric='euclidean')

    plt.figure(figsize=(15, 10))
    print("Plotting dendrogram...")
    dendrogram(
        linked,
        orientation='top',
        labels=subtopics,
        distance_sort='descending',
        show_leaf_counts=True,
        leaf_rotation=90,
        leaf_font_size=8
    )
    plt.title('Hierarchical Clustering Dendrogram of Subtopics', fontsize=16)
    plt.xlabel('Subtopics', fontsize=12)
    plt.ylabel('Euclidean Distance', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig(output_path, dpi=300)
    plt.show()
    print(f"Dendrogram saved as '{output_path}'")

# --- 1. SETUP: LOAD MODEL AND DATA ---

# Sentence-embedding model; the functions below reach it through the
# module-level name `sentence_model`.
EMBEDDING_MODEL_NAME = 'BAAI/bge-large-en-v1.5'
print("Loading sentence embedding model...")
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model loaded.")

# Evaluate the full document list; slice it (e.g. document_list[:50]) for a quick trial run.
data = document_list


# --- 2. PAIRWISE AGREEMENT FUNCTIONS (FROM ORIGINAL SCRIPT) ---

def get_subtopic_embeddings(subtopics):
    """Return sentence embeddings for ``subtopics``; an empty array when there is nothing to encode."""
    # Skip the model call entirely for empty input.
    return sentence_model.encode(subtopics) if subtopics else np.array([])

def jaccard_similarity(list1, list2):
    """
    Return |A ∩ B| / |A ∪ B| for the sets of strings in the two lists.

    Two empty inputs count as identical (1.0); exactly one empty input
    gives 0.0.
    """
    if not (list1 or list2):
        return 1.0
    if not (list1 and list2):
        return 0.0

    first, second = set(list1), set(list2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

def calculate_semantic_agreement(subtopics1, subtopics2, similarity_threshold=0.7, method="soft-f1"):
    """
    Score how well two lists of subtopics agree.

    Args:
        subtopics1 (list): First list of subtopic strings.
        subtopics2 (list): Second list of subtopic strings.
        similarity_threshold (float): Minimum cosine similarity for two
            subtopics to count as a match ("soft-f1" and "bipartite" only).
        method (str): One of "jaccard" (exact string overlap), "soft-f1"
            (best-match precision/recall), "bipartite" (one-to-one Hungarian
            matching) or "avg-cosine" (mean of the similarity matrix).

    Returns:
        The agreement score. For every method except "jaccard" the score is
        0.0 when either list is empty.

    Raises:
        ValueError: If ``method`` is not recognised (checked after the
            embeddings have been computed).
    """
    # Jaccard is purely lexical, so it never touches the embedding model.
    if method == "jaccard":
        return jaccard_similarity(subtopics1, subtopics2)

    if not (subtopics1 and subtopics2):
        return 0.0

    # Rows follow subtopics1, columns follow subtopics2.
    sims = cosine_similarity(
        get_subtopic_embeddings(subtopics1),
        get_subtopic_embeddings(subtopics2),
    )
    n_first, n_second = len(subtopics1), len(subtopics2)

    if method == "avg-cosine":
        return np.mean(sims)

    if method == "soft-f1":
        # A subtopic is "matched" when its best counterpart clears the threshold.
        forward = (sims.max(axis=1) >= similarity_threshold).sum() / n_first
        backward = (sims.max(axis=0) >= similarity_threshold).sum() / n_second
    elif method == "bipartite":
        # Hungarian algorithm: minimise total (1 - similarity) over one-to-one pairs.
        rows, cols = linear_sum_assignment(1 - sims)
        matched = (sims[rows, cols] >= similarity_threshold).sum()
        forward = matched / n_first
        backward = matched / n_second
    else:
        raise ValueError("Invalid method specified.")

    # Harmonic mean of the two directional scores (F1-style).
    if forward + backward == 0:
        return 0.0
    return 2 * (forward * backward) / (forward + backward)

# --- 3. KRIPPENDORFF'S ALPHA IMPLEMENTATION ---

def get_conceptual_topics_for_doc(doc_data, distance_threshold=0.7, n_clusters=None):
    """
    Cluster every model's subtopics for one document into shared "conceptual
    topics" so the models can be compared with Krippendorff's Alpha.

    Args:
        doc_data (dict): One document record. Reads ``docid`` and, under the
            ``perpectives`` key (spelling as in the data),
            ``cir3_classifier_subtopics`` and ``cross_model_classification``.
        distance_threshold (float): Cosine-distance (1 - cosine similarity)
            linkage threshold. A lower value creates more, finer-grained
            clusters. Ignored when ``n_clusters`` is given.
        n_clusters (int | None): Fixed number of clusters, or None to let
            ``distance_threshold`` decide.

    Returns:
        tuple: ``(krippendorff_data, corpus)``.
            ``krippendorff_data`` is a list of
            ``{'docid', 'model_name', 'conceptual_topic'}`` rows, one per
            distinct conceptual topic per model; ``corpus`` is the flat list
            of all subtopics that were clustered. Both are empty lists when
            no model produced any subtopic (previously this path returned a
            bare ``[]``, which broke callers that unpack two values).
    """
    docid = doc_data['docid']
    perspectives = doc_data["perpectives"]

    # Collect every model that produced at least one subtopic, our classifier first.
    all_models_data = []
    cir3_subtopics = perspectives["cir3_classifier_subtopics"]
    if cir3_subtopics:
        all_models_data.append({"model_name": "cir3_classifier", "subtopics": cir3_subtopics})
    all_models_data.extend(
        cross_model
        for cross_model in perspectives["cross_model_classification"]
        if cross_model["subtopics"]
    )

    # Step 1: Consolidate all subtopics from all models for the document
    corpus = [subtopic for model_data in all_models_data for subtopic in model_data['subtopics']]
    if not corpus:
        return [], []

    # Step 2: Cluster the embeddings (average linkage on cosine distance).
    if len(corpus) < 2:
        # AgglomerativeClustering rejects fewer than two samples; a lone
        # subtopic trivially forms its own cluster.
        cluster_labels = np.zeros(len(corpus), dtype=int)
    else:
        corpus_embeddings = sentence_model.encode(corpus)
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            # Exactly one of n_clusters and distance_threshold has to be set, and the other needs to be None.
            distance_threshold=distance_threshold if n_clusters is None else None,
            metric='cosine',
            linkage='average'
        ).fit(corpus_embeddings)
        cluster_labels = clustering.labels_

    # Step 3: Name each cluster after its most frequent member.
    members_by_cluster = {}
    for label, subtopic in zip(cluster_labels, corpus):
        members_by_cluster.setdefault(label, []).append(subtopic)
    # dict.fromkeys keeps first-occurrence order, so ties are broken the same
    # way on every run (iterating a set of strings is hash-randomised).
    cluster_map = {
        cluster_id: max(dict.fromkeys(members), key=members.count)
        for cluster_id, members in members_by_cluster.items()
    }

    # Step 4: Create the "rating matrix" rows for this document
    krippendorff_data = []
    corpus_idx = 0
    for model_data in all_models_data:
        num_subtopics = len(model_data['subtopics'])

        # The corpus was built model by model, so each model owns a contiguous slice.
        model_cluster_ids = cluster_labels[corpus_idx : corpus_idx + num_subtopics]
        corpus_idx += num_subtopics

        # One row per distinct conceptual topic, in first-seen order.
        conceptual_topics = dict.fromkeys(cluster_map[cid] for cid in model_cluster_ids)
        for topic in conceptual_topics:
            krippendorff_data.append({
                'docid': docid,
                'model_name': model_data['model_name'],
                'conceptual_topic': topic
            })

    return krippendorff_data, corpus


# --- 3.1 TWO-STAGE EVALUATION FUNCTIONS ---

def establish_cross_model_consensus_for_doc(doc_data, distance_threshold=0.7, n_clusters=None):
    """
    STAGE 1: Establishes conceptual topics using ONLY cross-model data (excluding cir3).
    This creates the consensus that cir3 is later evaluated against.

    Args:
        doc_data (dict): One document record. Reads ``docid`` and
            ``perpectives.cross_model_classification`` (key spelling as in the data).
        distance_threshold (float): Cosine-distance linkage threshold; ignored
            when ``n_clusters`` is given.
        n_clusters (int | None): Fixed number of clusters, or None to let
            ``distance_threshold`` decide.

    Returns:
        tuple: ``(consensus_data, cluster_map, corpus_embeddings, clustering)``.
            consensus_data: List of ``{'docid', 'model_name', 'conceptual_topic'}``
                rows for Krippendorff's Alpha among cross-models.
            cluster_map: Mapping from cluster IDs to canonical topic names.
            corpus_embeddings: Embeddings of the cross-model corpus.
            clustering: The fitted clustering object, or None when nothing
                was clustered (empty corpus, or a corpus of a single subtopic).
        With no cross-model subtopics the result is
        ``([], {}, np.array([]), None)``.
    """
    docid = doc_data['docid']

    # ONLY collect cross-model data (exclude cir3)
    cross_model_data = [
        cross_model
        for cross_model in doc_data["perpectives"]["cross_model_classification"]
        if cross_model["subtopics"]
    ]

    # Step 1: Create corpus from cross-models only
    corpus = [subtopic for model_data in cross_model_data for subtopic in model_data['subtopics']]
    if not corpus:
        return [], {}, np.array([]), None

    corpus_embeddings = sentence_model.encode(corpus)

    # Step 2: Cluster cross-model subtopics to establish consensus concepts
    if len(corpus) < 2:
        # AgglomerativeClustering rejects fewer than two samples; a lone
        # subtopic trivially forms its own cluster.
        clustering = None
        cluster_labels = np.zeros(len(corpus), dtype=int)
    else:
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            # Exactly one of n_clusters and distance_threshold may be set.
            distance_threshold=distance_threshold if n_clusters is None else None,
            metric='cosine',
            linkage='average'
        ).fit(corpus_embeddings)
        cluster_labels = clustering.labels_

    # Step 3: Name each cluster after its most frequent member.
    members_by_cluster = {}
    for label, subtopic in zip(cluster_labels, corpus):
        members_by_cluster.setdefault(label, []).append(subtopic)
    # dict.fromkeys keeps first-occurrence order, so ties are broken the same
    # way on every run (iterating a set of strings is hash-randomised).
    cluster_map = {
        cluster_id: max(
            dict.fromkeys(members_by_cluster[cluster_id]),
            key=members_by_cluster[cluster_id].count,
        )
        for cluster_id in sorted(members_by_cluster)
    }

    # Step 4: Create Krippendorff's data for cross-models only
    consensus_data = []
    corpus_idx = 0
    for model_data in cross_model_data:
        num_subtopics = len(model_data['subtopics'])

        # The corpus was built model by model, so each model owns a contiguous slice.
        model_cluster_ids = cluster_labels[corpus_idx : corpus_idx + num_subtopics]
        corpus_idx += num_subtopics

        # One row per distinct conceptual topic, in first-seen order.
        conceptual_topics = dict.fromkeys(cluster_map[cid] for cid in model_cluster_ids)
        for topic in conceptual_topics:
            consensus_data.append({
                'docid': docid,
                'model_name': model_data['model_name'],
                'conceptual_topic': topic
            })

    return consensus_data, cluster_map, corpus_embeddings, clustering


def evaluate_cir3_against_consensus(doc_data, cluster_map, corpus_embeddings, clustering_obj,
                                    similarity_threshold=0.7, membership_threshold=0.4750):
    """
    STAGE 2: Evaluates cir3's subtopics against the established cross-model consensus.
    Uses the soft-F1 semantic agreement score as the primary evaluation metric.

    Args:
        doc_data (dict): One document record. Reads
            ``perpectives.cir3_classifier_subtopics`` and
            ``perpectives.cross_model_classification`` (key spelling as in the data).
        cluster_map (dict): Cluster ID -> canonical topic name from stage 1.
        corpus_embeddings: Embeddings of the cross-model subtopics, in the
            same order as the flattened cross-model subtopic lists.
        clustering_obj: Fitted clustering object from stage 1. Accepted for
            interface compatibility; not used here.
        similarity_threshold (float): Minimum cosine similarity for a match.
        membership_threshold (float): Minimum cosine similarity between a
            corpus subtopic and a cluster's canonical label for the subtopic
            to be treated as a member of that cluster.
            NOTE(review): membership is approximated by similarity to the
            canonical label, not taken from the actual cluster labels.

    Returns:
        tuple: ``(semantic_agreement_score, cir3_conceptual_topics, consensus_topics)``.
            semantic_agreement_score: Primary metric for CIR3 performance evaluation.
            cir3_conceptual_topics: The consensus topics that cir3 identified.
            consensus_topics: All available consensus topics for this document.
        ``(0.0, set(), set())`` when cir3 or the cross-models have no subtopics.
    """
    perspectives = doc_data["perpectives"]
    cir3_subtopics = perspectives["cir3_classifier_subtopics"]

    if not cir3_subtopics or not cluster_map:
        return 0.0, set(), set()

    # Get all consensus topics available for this document
    consensus_topics = set(cluster_map.values())

    # Flatten the cross-model subtopics; built once and reused for both the
    # score and the topic mapping below.
    consensus_subtopics = [
        subtopic
        for cross_model in perspectives["cross_model_classification"]
        if cross_model["subtopics"]
        for subtopic in cross_model["subtopics"]
    ]
    if not consensus_subtopics:
        return 0.0, set(), set()

    # Primary metric: soft-F1 between cir3 and the pooled cross-model subtopics.
    semantic_agreement_score = calculate_semantic_agreement(
        cir3_subtopics,
        consensus_subtopics,
        similarity_threshold=similarity_threshold,
        method="soft-f1"
    )

    # Also identify which conceptual topics CIR3 covered for interpretability.
    cir3_embeddings = sentence_model.encode(cir3_subtopics)
    # Row i holds the similarity of cir3 subtopic i to every corpus embedding.
    cir3_to_corpus = cosine_similarity(cir3_embeddings, corpus_embeddings)

    # Only corpus positions that have both a subtopic and an embedding are usable.
    n_usable = min(len(consensus_subtopics), len(corpus_embeddings))
    canonical_names = list(cluster_map.values())
    membership = None  # (n_usable, n_clusters) boolean mask, built on first use

    cir3_conceptual_topics = set()
    for similarities in cir3_to_corpus:
        best_match_idx = int(np.argmax(similarities))
        # Skip cir3 subtopics whose closest corpus subtopic is not similar enough.
        if not (similarities[best_match_idx] >= similarity_threshold):
            continue
        if best_match_idx >= len(consensus_subtopics):
            continue

        if membership is None:
            # Loop-invariant: each canonical label is encoded once instead of
            # once per corpus item per cluster per cir3 subtopic.
            canonical_embeddings = np.array(
                [sentence_model.encode([name])[0] for name in canonical_names]
            )
            membership = cosine_similarity(
                corpus_embeddings[:n_usable], canonical_embeddings
            ) >= membership_threshold

        # Pick the cluster whose members are, on average, most similar to this
        # cir3 subtopic; the first cluster wins ties.
        best_cluster = None
        max_avg_similarity = 0
        for column, canonical_name in enumerate(canonical_names):
            member_similarities = similarities[:n_usable][membership[:, column]]
            if member_similarities.size:
                avg_similarity = np.mean(member_similarities)
                if avg_similarity > max_avg_similarity:
                    max_avg_similarity = avg_similarity
                    best_cluster = canonical_name

        if best_cluster:
            cir3_conceptual_topics.add(best_cluster)

    return semantic_agreement_score, cir3_conceptual_topics, consensus_topics


def analyze_raw_cross_model_data(data_sample):
    """
    Print a diagnostic report of exact-string overlap between cross-models.

    For each document it lists every model's unique subtopics (previewing up
    to three) and the pairwise Jaccard similarity between models; afterwards
    it prints the same pairwise comparison over the subtopics pooled across
    all documents. Nothing is returned.
    """
    print("\n--- RAW CROSS-MODEL DATA ANALYSIS ---")
    print(f"Analyzing {len(data_sample)} documents")

    # model name -> every unique subtopic it produced across the sample
    pooled_by_model = {}

    for document in data_sample:
        print(f"\nDocument {document['docid']}:")

        # model name -> unique subtopics for this document only
        per_model = {}
        for entry in document["perpectives"]["cross_model_classification"]:
            if not entry["subtopics"]:
                continue
            name = entry["model_name"]
            unique = set(entry["subtopics"])
            per_model[name] = unique
            preview = list(unique)[:3]
            suffix = '...' if len(unique) > 3 else ''
            print(f"  {name}: {len(unique)} subtopics - {preview}{suffix}")
            pooled_by_model.setdefault(name, set()).update(unique)

        # Every unordered pair of models seen in this document.
        for left, right in itertools.combinations(per_model, 2):
            left_set, right_set = per_model[left], per_model[right]
            if left_set and right_set:
                jaccard = len(left_set & right_set) / len(left_set | right_set)
                print(f"  {left} ↔ {right}: {jaccard:.4f} Jaccard similarity")

    print("\n--- GLOBAL ANALYSIS ---")
    for name, pooled in pooled_by_model.items():
        print(f"{name}: {len(pooled)} unique subtopics total")

    print("\nGlobal Pairwise Jaccard Similarities:")
    for left, right in itertools.combinations(pooled_by_model, 2):
        left_set, right_set = pooled_by_model[left], pooled_by_model[right]
        if left_set and right_set:
            jaccard = len(left_set & right_set) / len(left_set | right_set)
            print(f"  {left} ↔ {right}: {jaccard:.4f}")





def generate_research_tables(pairwise_scores, cross_model_alpha, cir3_results, num_documents,
                             clustering_threshold=None):
    """
    Print publication-ready summary tables for the research paper.

    Args:
        pairwise_scores (dict): Metric name -> list of pairwise agreement scores.
        cross_model_alpha (float): Krippendorff's alpha among the cross-models.
        cir3_results (list): Per-document dicts; only
            ``'semantic_agreement_score'`` is read.
        num_documents (int): Number of documents, quoted in the Table 1 note.
        clustering_threshold (float | None): Clustering distance threshold to
            quote in the Table 2 notes. When None, the module-level
            ``CLUSTERING_DISTANCE_THRESHOLD`` is used if it exists, otherwise
            the previously hard-coded 0.425.

    Returns:
        None. Everything is written to stdout.
    """
    if clustering_threshold is None:
        # Quote the threshold the run actually used rather than a stale literal.
        clustering_threshold = globals().get("CLUSTERING_DISTANCE_THRESHOLD", 0.425)

    print("\n" + "="*80)
    print("PUBLICATION-READY RESEARCH TABLES")
    print("="*80)

    # CIR3 performance metrics, computed once; NaN when there is nothing to
    # summarise (avoids numpy's empty-slice RuntimeWarning).
    cir3_semantic_scores = [result['semantic_agreement_score'] for result in cir3_results]
    if cir3_semantic_scores:
        mean_semantic = float(np.mean(cir3_semantic_scores))
        std_semantic = float(np.std(cir3_semantic_scores))
        if mean_semantic >= 0.8:
            mean_label = 'Excellent'
        elif mean_semantic >= 0.6:
            mean_label = 'Good'
        else:
            mean_label = 'Moderate'
        std_label = 'Low variance' if std_semantic < 0.1 else 'Moderate variance'
    else:
        mean_semantic = std_semantic = float("nan")
        mean_label = std_label = 'N/A'

    # Table 1: Cross-Model Agreement Analysis
    print("\n--- TABLE 1: Cross-Model Agreement Analysis ---")
    print("| Metric      | Mean  | Std Dev | Min   | Max   | n   |")
    print("|-------------|-------|---------|-------|-------|-----|")

    for method_name, scores in pairwise_scores.items():
        if scores:
            mean_score = np.mean(scores)
            std_score = np.std(scores)
            min_score = np.min(scores)
            max_score = np.max(scores)
            n_scores = len(scores)

            formatted_method = method_name.replace('_', '-').title()
            print(f"| {formatted_method:<11} | {mean_score:.4f} | {std_score:.4f}   | {min_score:.4f} | {max_score:.4f} | {n_scores:<3} |")

    print(f"\nNote: Agreement scores between CIR3 and cross-model classifications across {num_documents} documents.")
    print("- Jaccard: Set-based similarity")
    print("- Soft-F1: Semantic precision-recall")
    print("- Bipartite: Hungarian algorithm matching")
    print("- Avg-Cosine: Average pairwise similarity")

    # Table 2: Two-Stage Semantic Evaluation Results
    if cross_model_alpha >= 0.8:
        alpha_label = 'Excellent'
    elif cross_model_alpha >= 0.67:
        alpha_label = 'Good'
    else:
        alpha_label = 'Moderate'

    print("\n--- TABLE 2: Two-Stage Semantic Evaluation Results ---")
    print("| Metric                        | Value | Interpretation |")
    print("|-------------------------------|-------|----------------|")
    print(f"| Cross-Model Reliability (α)   | {cross_model_alpha:.4f} | {alpha_label:<14} |")
    print(f"| CIR3 Semantic Agreement (Mean) | {mean_semantic:.4f} | {mean_label:<14} |")
    print(f"| CIR3 Semantic Agreement (Std Dev) | {std_semantic:.4f} | {std_label:<14} |")
    print(f"| Documents Evaluated           | {len(cir3_results):<5} | --             |")

    print("\nNote: Two-stage evaluation process:")
    print("- Stage 1: Establish consensus from cross-models (Llama3-8B, GPT-4o-mini, Gemma-27B)")
    print("- Stage 2: Evaluate CIR3 alignment against consensus using semantic agreement score")
    print("- Krippendorff's α measures cross-model reliability")
    print("- Semantic Agreement Score uses soft-F1 method for nuanced evaluation")
    print(f"- Clustering threshold: {clustering_threshold} (cosine distance)")

    # Table 3: Performance Distribution Analysis
    # Bins are lower-inclusive, matching the ">=" cut-offs used for the labels
    # above; only the last bin also includes its upper edge, so a score of
    # exactly 0.95 or 1.0 is counted exactly once.
    semantic_bins = [(0.0, 0.6, "Poor"), (0.6, 0.8, "Good"), (0.8, 0.95, "Excellent"), (0.95, 1.0, "Outstanding")]
    last_bin = len(semantic_bins) - 1

    print("\n--- TABLE 3: CIR3 Performance Distribution Analysis ---")
    print("| Performance Range | Semantic Agreement Range | Count | Percentage |")
    print("|-------------------|--------------------------|-------|------------|")

    for position, (min_val, max_val, label) in enumerate(semantic_bins):
        if position == last_bin:
            count = sum(1 for score in cir3_semantic_scores if min_val <= score <= max_val)
        else:
            count = sum(1 for score in cir3_semantic_scores if min_val <= score < max_val)
        percentage = (count / len(cir3_semantic_scores)) * 100 if cir3_semantic_scores else 0
        # Two decimals: with one decimal the 0.95 edge would be printed as 0.9.
        range_text = f"{min_val:.2f}--{max_val:.2f}"
        print(f"| {label:<17} | {range_text:<24} | {count:>5} | {percentage:>8.1f}% |")

    total_percentage = '100.0' if cir3_semantic_scores else '0.0'
    print("|-------------------|--------------------------|-------|------------|")
    print(f"| {'Total':<17} | --                       | {len(cir3_semantic_scores):>5} | {total_percentage:>8}% |")

    print("\nPerformance Categories:")
    print("- Outstanding (≥0.95): Near-perfect agreement")
    print("- Excellent (0.8-0.95): Strong agreement")
    print("- Good (0.6-0.8): Reasonable agreement")
    print("- Poor (<0.6): Weak agreement")
# --- 4. MAIN EVALUATION SCRIPT ---

if __name__ == "__main__":
    # Driver. Part A scores CIR3 against each cross-model pairwise with four
    # metrics; Part B builds a per-document cross-model consensus, scores CIR3
    # against it, and reports Krippendorff's alpha among the cross-models.

    # --- Part A: Pairwise Agreement Analysis (Original Method) ---
    print("\n" + "="*80)
    print("PART A: PAIRWISE INTER-MODEL AGREEMENT ANALYSIS")
    print("="*80)

    number_of_models = len(get_ai_settings().cross_classifier_agents_params)
    evaluation_methods = ["jaccard", "soft-f1", "bipartite", "avg-cosine"]
    all_scores_by_metric = {method: [] for method in evaluation_methods}

    # One entry per document: its bipartite score against every cross-model.
    list_of_scores_bipartite = []
    for doc_data in data:
        docid = doc_data["docid"]

        list_of_scores_per_doc = []

        cir3_subtopics = doc_data["perpectives"]["cir3_classifier_subtopics"]
        print(f"\n--- Document ID: {docid} ---")

        for cross_model_data in doc_data["perpectives"]["cross_model_classification"]:
            ref_model_name = cross_model_data["model_name"]
            ref_subtopics = cross_model_data["subtopics"]

            print(f"  Agreement with {ref_model_name}:")
            for method_name in evaluation_methods:
                score = calculate_semantic_agreement(cir3_subtopics, ref_subtopics, method=method_name)
                all_scores_by_metric[method_name].append(score)
                print(f"    - {method_name.title():<12}: {score:.4f}")
                if method_name == "bipartite":
                    list_of_scores_per_doc.append(score)

        # A document without cross-model entries has no average; store NaN
        # explicitly instead of letting np.mean([]) emit a RuntimeWarning.
        avg_bipartite_score = float(np.mean(list_of_scores_per_doc)) if list_of_scores_per_doc else float("nan")
        list_of_scores_bipartite.append({"docid": docid, "scores": list_of_scores_per_doc, "avg_score": avg_bipartite_score, "method": "bipartite"})

    print("\n--- Overall Pairwise Agreement Statistics ---")
    for method_name, scores in all_scores_by_metric.items():
        if scores:
            avg_score = np.mean(scores)
            std_dev = np.std(scores)
            print(f"  {method_name.title():<12} | Average: {avg_score:.4f}, Std Dev: {std_dev:.4f}")

    # Paired T-tests (Comparing Metrics)
    # The score lists are aligned: every metric is appended once per
    # (document, cross-model) pair, in the same order.
    print("\n--- Paired T-Tests: Comparing Agreement Metrics ---")
    metrics_to_compare_pairs = list(itertools.combinations(evaluation_methods, 2))
    for m1, m2 in metrics_to_compare_pairs:
        scores1 = all_scores_by_metric[m1]
        scores2 = all_scores_by_metric[m2]
        if len(scores1) > 1 and len(scores1) == len(scores2):
            stat, p_value = ttest_rel(scores1, scores2)
            print(f"\n  Comparing {m1.title()} vs {m2.title()}:")
            print(f"    T-statistic: {stat:.4f}, P-value: {p_value:.4f}")
            if p_value < 0.05:
                print("    Result: Statistically significant difference.")
            else:
                print("    Result: No statistically significant difference.")


    # --- Part B: Two-Stage Semantic Evaluation ---
    print("\n\n" + "="*80)
    print("PART B: TWO-STAGE SEMANTIC EVALUATION")
    print("="*80)

    # Cosine-distance threshold passed to the stage-1 clustering.
    CLUSTERING_DISTANCE_THRESHOLD = 0.4750
    print(f"\nClustering subtopics with distance threshold: {CLUSTERING_DISTANCE_THRESHOLD}")
    # The note quotes the constant so it cannot drift from the value in use.
    print(f"⚠ Note: Using a reduced threshold of {CLUSTERING_DISTANCE_THRESHOLD} for more fine-grained clustering and specific topic separation")

    # DIAGNOSTIC: Analyze raw cross-model data first
    analyze_raw_cross_model_data(data[:5])  # Analyze first 5 documents in detail

    # STAGE 1: Establish cross-model consensus
    print("\n--- STAGE 1: Establishing Cross-Model Consensus ---")
    cross_model_consensus_data = []
    all_consensus_corpus = []
    cir3_evaluation_results = []

    for doc_data in data:
        docid = doc_data["docid"]

        # Establish consensus from cross-models only
        consensus_data, cluster_map, corpus_embeddings, clustering_obj = establish_cross_model_consensus_for_doc(
            doc_data=doc_data,
            distance_threshold=CLUSTERING_DISTANCE_THRESHOLD,
            n_clusters=None  # Use distance threshold
        )

        cross_model_consensus_data.extend(consensus_data)

        # Collect corpus for visualization
        if len(consensus_data) > 0:
            cross_model_subtopics = []
            for cross_model in doc_data["perpectives"]["cross_model_classification"]:
                if cross_model["subtopics"]:
                    cross_model_subtopics.extend(cross_model["subtopics"])
            all_consensus_corpus.extend(cross_model_subtopics)

        # STAGE 2: Evaluate cir3 against this consensus
        semantic_agreement_score, cir3_topics, consensus_topics = evaluate_cir3_against_consensus(
            doc_data=doc_data,
            cluster_map=cluster_map,
            corpus_embeddings=corpus_embeddings,
            clustering_obj=clustering_obj,
            similarity_threshold=0.7
        )

        cir3_evaluation_results.append({
            'docid': docid,
            'semantic_agreement_score': semantic_agreement_score,
            'cir3_topics': cir3_topics,
            'consensus_topics': consensus_topics
        })

        print(f"\nDocument {docid}:")
        print(f"  Consensus Topics: {consensus_topics}")
        print(f"  CIR3 Topics: {cir3_topics}")
        print(f"  CIR3 Semantic Agreement Score: {semantic_agreement_score:.4f}")

    # Calculate cross-model consensus reliability (Krippendorff's Alpha)
    if cross_model_consensus_data:
        consensus_df = pd.DataFrame(cross_model_consensus_data)
        print(f"\n--- Cross-Model Consensus Reliability ---")
        print(f"Sample of consensus data:")
        print(consensus_df.head())

        # DIAGNOSTIC: Check clustering results
        print(f"\n--- CLUSTERING DIAGNOSTICS ---")
        unique_topics = consensus_df['conceptual_topic'].unique()
        print(f"Number of unique conceptual topics: {len(unique_topics)}")
        print(f"Conceptual topics: {unique_topics}")

        # Check distribution of topics per model
        topic_counts = consensus_df.groupby('model_name')['conceptual_topic'].nunique()
        print(f"\nTopics per model:")
        for model, count in topic_counts.items():
            print(f"  {model}: {count} topics")

        # Check if all models have the same topics (indicating over-clustering)
        model_topic_sets = {}
        for model in consensus_df['model_name'].unique():
            model_topics = set(consensus_df[consensus_df['model_name'] == model]['conceptual_topic'].unique())
            model_topic_sets[model] = model_topics
            print(f"  {model} topics: {model_topics}")


        # Best effort: a failure here is reported and alpha falls back to 0.0
        # so the remaining summaries can still be printed.
        try:
            cross_model_alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
                consensus_df,
                experiment_col='docid',
                annotator_col='model_name',
                class_col='conceptual_topic'
            )
            print(f"\nCross-Model Krippendorff's Alpha: {cross_model_alpha:.4f}")

            # Set alpha_score for compatibility with existing summary code
            alpha_score = cross_model_alpha

        except Exception as e:
            print(f"Could not calculate cross-model Alpha: {e}")
            alpha_score = 0.0
    else:
        alpha_score = 0.0

    # Analyze CIR3's performance against consensus
    print(f"\n--- CIR3 Performance Against Cross-Model Consensus ---")
    semantic_agreement_scores = [result['semantic_agreement_score'] for result in cir3_evaluation_results]

    if semantic_agreement_scores:
        print(f"CIR3 Semantic Agreement Statistics:")
        print(f"  Average Semantic Agreement Score: {np.mean(semantic_agreement_scores):.4f}")
        print(f"  Std Dev Semantic Agreement Score: {np.std(semantic_agreement_scores):.4f}")
        print(f"  Min Semantic Agreement Score: {np.min(semantic_agreement_scores):.4f}")
        print(f"  Max Semantic Agreement Score: {np.max(semantic_agreement_scores):.4f}")

        # Performance interpretation
        avg_semantic_score = np.mean(semantic_agreement_scores)
        print(f"\n  CIR3 Performance Interpretation:")
        if avg_semantic_score >= 0.8:
            print("  Excellent: CIR3 strongly aligns with cross-model consensus")
        elif avg_semantic_score >= 0.6:
            print("  Good: CIR3 reasonably aligns with cross-model consensus")
        elif avg_semantic_score >= 0.4:
            print("  Moderate: CIR3 has partial alignment with cross-model consensus")
        else:
            print("  Poor: CIR3 weakly aligns with cross-model consensus")

    # Visualization of consensus
    print(f"\n--- Consensus Visualization ---")
    if all_consensus_corpus:
        print(f"Plotting dendrogram for {len(all_consensus_corpus)} consensus subtopics...")
        plot_dendrogram(all_consensus_corpus)

        # Also create corpus_list for compatibility with existing code
        corpus_list = all_consensus_corpus
    else:
        corpus_list = []

    print("\n" + "="*80)
    print("TWO-STAGE ANALYSIS COMPLETE")
    print("="*80)

    generate_research_tables(
        pairwise_scores=all_scores_by_metric,
        cross_model_alpha=alpha_score,
        cir3_results=cir3_evaluation_results,
        num_documents=len(data)
    )




# NOTE: this section reads list_of_scores_bipartite, all_scores_by_metric and
# alpha_score, which are only defined when the `__main__` section has run.
print(f"\n \n \n ----------------- RESTRUCTURE ALL SCORES ------------------")

# Bin documents by their average bipartite agreement score:
# [0, 0.50]  ]0.50, 0.66]  ]0.66, 0.75]  ]0.75, 1]
list_of_bins_0_50 = []
list_of_bins_50_066 = []
list_of_bins_066_075 = []
list_of_bins_075_1 = []

# Documents without any cross-model score have a NaN average. Every "<="
# test is False for NaN, so they used to fall through to the top bin and
# turn its statistics into NaN; they are counted separately instead.
docs_without_scores = 0

for item in list_of_scores_bipartite:
    doc_avg = item["avg_score"]
    if np.isnan(doc_avg):
        docs_without_scores += 1
    elif doc_avg <= 0.50:
        list_of_bins_0_50.append(item)
    elif doc_avg <= 0.66:
        list_of_bins_50_066.append(item)
    elif doc_avg <= 0.75:
        list_of_bins_066_075.append(item)
    else:
        list_of_bins_075_1.append(item)

print(len(list_of_bins_0_50))
print(len(list_of_bins_50_066))
print(len(list_of_bins_066_075))
print(len(list_of_bins_075_1))
if docs_without_scores:
    print(f"Skipped {docs_without_scores} document(s) without cross-model scores")

# Define intervals and corresponding lists
intervals = [
    "[0, 0.50]",
    "]0.50, 0.66]",
    "]0.66, 0.75]",
    "]0.75, 1]"
]
lists = [
    list_of_bins_0_50,
    list_of_bins_50_066,
    list_of_bins_066_075,
    list_of_bins_075_1
]

# Print table header
print(f"{'Interval':<15} {'Count':<10} {'Avg':<10} {'Median':<10} {'Min':<10} {'Max':<10}")
print("-" * 70)


# Per-bin statistics, kept in parallel lists in the same order as `intervals`.
count_list = []
avg_scores_list = []
median_scores_list = []
min_scores_list = []
max_scores_list = []

# Print each row; an empty bin reports 0 for every statistic.
for interval, bin_list in zip(intervals, lists):
    scores = [item["avg_score"] for item in bin_list]
    count = len(scores)
    count_list.append(count)
    avg_score = np.mean(scores) if scores else 0
    avg_scores_list.append(avg_score)
    median_score = np.median(scores) if scores else 0
    median_scores_list.append(median_score)
    min_score = np.min(scores) if scores else 0
    min_scores_list.append(min_score)
    max_score = np.max(scores) if scores else 0
    max_scores_list.append(max_score)
    print(f"{interval:<15} {count:<10} {avg_score:<10.4f} {median_score:<10.4f} {min_score:<10.4f} {max_score:<10.4f}")



print("\n============== Overall Pairwise Agreement Statistics ==============")
for method_name, scores in all_scores_by_metric.items():
    if scores:
        avg_score = np.mean(scores)
        std_dev = np.std(scores)
        print(f"  {method_name.title():<12} | Average: {avg_score:.4f}, Std Dev: {std_dev:.4f}")


print("\n============== Krippendorff's Alpha Result ==============")
print(f"  Alpha Score: {alpha_score:.4f}")




