In [1]:
!pip install transformers datasets torch accelerate evaluate scikit-learn pandas numpy tqdm

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import json
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import random
import pickle
from torch.utils.data import DataLoader

In [3]:
# Seed the stdlib, NumPy and PyTorch RNGs with a single shared value so that
# sampling, shuffling and weight initialisation repeat across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7c4fb723f230>

In [4]:
# Report whether a CUDA device is visible and, when one is, which device it is.
has_gpu = torch.cuda.is_available()
print(f"GPU available: {has_gpu}")
if has_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU: NVIDIA A100-SXM4-80GB


In [4]:
class ErrorAnalysisTrainer(Trainer):
    """Trainer with optional class-weighted loss and per-example prediction logging.

    Args:
        class_weights: Optional 1-D tensor with one weight per class. When given,
            the training loss is a weighted cross-entropy; otherwise the loss
            computed by the model itself is used.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

    Note:
        ``class_weights`` is the first positional parameter, so all ``Trainer``
        arguments should be passed by keyword (as the callers in this notebook do).

    Attributes:
        predictions_log: List of dicts (``prediction``, ``confidence``,
            ``true_label``) appended to by ``prediction_step`` whenever it runs
            with ``prediction_loss_only=False``.
    """
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.predictions_log = []

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the (optionally class-weighted) cross-entropy loss.

        Extra keyword arguments passed by the base class are accepted and ignored.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        if self.class_weights is not None and labels is not None:
            # Keep the weight tensor on the same device as the logits so the loss
            # works regardless of where the weights were created or loaded.
            weight = self.class_weights.to(device=logits.device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        else:
            loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Run the normal prediction step and log prediction/confidence/label per example.

        The return value of the base implementation is passed through untouched.
        """
        result = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

        if not prediction_loss_only:
            # Handle both tuple (loss, predictions, labels) and PredictionOutput formats
            logits = result[1] if isinstance(result, tuple) else result.predictions
            # Some models return several prediction tensors; the first is the logits.
            if isinstance(logits, (tuple, list)):
                logits = logits[0] if len(logits) > 0 else None

            labels = inputs.get('labels')
            if logits is not None and labels is not None:
                # Move everything to CPU float32 first: the tensors may live on
                # the GPU and/or be half precision, neither of which NumPy accepts.
                logits = torch.as_tensor(logits).detach().float().cpu()
                probs = torch.nn.functional.softmax(logits, dim=-1)
                predictions = logits.argmax(dim=-1).view(-1).tolist()
                confidences = probs.max(dim=-1).values.view(-1).tolist()
                true_labels = torch.as_tensor(labels).detach().cpu().view(-1).tolist()

                for pred, conf, true_label in zip(predictions, confidences, true_labels):
                    self.predictions_log.append({
                        'prediction': int(pred),
                        'confidence': float(conf),
                        'true_label': int(true_label)
                    })

        return result

def load_full_dataset(pickle_path="dataset_full.pickle"):
    """Load the full cleaned dataset from a pickle file.

    The pickle must contain a two-element list ``[texts, labels]``.

    Args:
        pickle_path: Path of the pickle file to read.

    Returns:
        A DataFrame with columns ``text`` and ``label``, or ``None`` when the
        file cannot be read or does not have the expected structure (the reason
        is printed).
    """
    label_names = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}
    try:
        # NOTE(security): pickle.load can execute arbitrary code; only use this
        # on dataset files from a trusted source.
        with open(pickle_path, 'rb') as f:
            data = pickle.load(f)

        if isinstance(data, list) and len(data) == 2:
            texts, labels = data[0], data[1]

            # Create DataFrame
            df = pd.DataFrame({
                'text': texts,
                'label': labels
            })

            print(f"‚úÖ Loaded full dataset: {len(df):,} samples")

            # Show distribution
            label_counts = df['label'].value_counts()
            print("Label distribution:")
            for label, count in label_counts.items():
                # Fall back to the raw code for unrecognised labels: a display
                # lookup must not turn a successful load into a reported failure.
                label_name = label_names.get(label, label)
                print(f"   {label_name} ({label}): {count:,} ({count/len(df)*100:.1f}%)")

            return df
        else:
            raise ValueError(f"Expected list with 2 elements [texts, labels], got {type(data)}")

    except Exception as e:
        print(f"‚ùå Error loading dataset from {pickle_path}: {e}")
        return None

def create_train_test_split(df, test_size=0.2, random_state=42, val_size=0.3):
    """Create a stratified train/val/test split and verify the parts share no text.

    Args:
        df: DataFrame with at least ``text`` and ``label`` columns.
        test_size: Fraction of ``df`` held out as the test set.
        random_state: Seed used for both splits.
        val_size: Fraction of the remaining (non-test) rows used for validation.
            Defaults to 0.3, the value that was previously hard-coded.

    Returns:
        ``(train_df, val_df, test_df)``, each with a fresh 0..n-1 index.

    Raises:
        AssertionError: If the same text appears in more than one split (for
            example because ``df`` contains duplicated texts).
    """

    # Stratified split to maintain class distribution
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['label']
    )

    # Further split training into train/val
    train_df, val_df = train_test_split(
        train_df,
        test_size=val_size,  # fraction of the non-test rows used for validation
        random_state=random_state,
        stratify=train_df['label']
    )

    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    print(f"Dataset split:")
    print(f"   Train: {len(train_df):,} samples ({len(train_df)/len(df)*100:.1f}%)")
    print(f"   Val:   {len(val_df):,} samples ({len(val_df)/len(df)*100:.1f}%)")
    print(f"   Test:  {len(test_df):,} samples ({len(test_df)/len(df)*100:.1f}%)")

    # Verify no overlap (compared by exact text)
    train_texts = set(train_df['text'])
    val_texts = set(val_df['text'])
    test_texts = set(test_df['text'])

    assert len(train_texts & val_texts) == 0, "Train/Val overlap detected!"
    assert len(train_texts & test_texts) == 0, "Train/Test overlap detected!"
    assert len(val_texts & test_texts) == 0, "Val/Test overlap detected!"

    print("‚úÖ No data leakage detected")

    return train_df, val_df, test_df

def create_dataset_with_indices(df, tokenizer, max_length):
    """Build a tokenized Hugging Face dataset that keeps each row's index.

    Args:
        df: DataFrame with ``text`` and ``label`` columns; labels must be one of
            ``'o'``, ``'p'`` or ``'n'``.
        tokenizer: Callable tokenizer applied to batches of texts.
        max_length: Maximum sequence length; longer texts are truncated.

    Returns:
        ``(tokenized_dataset, df_copy)`` where ``df_copy`` is a copy of ``df``
        with added integer ``labels`` and ``original_index`` columns.

    Raises:
        ValueError: If ``df['label']`` contains a value outside the label map.
    """
    label_map = {'o': 0, 'p': 1, 'n': 2}
    df = df.copy()
    df['labels'] = df['label'].map(label_map)

    # Unmapped labels become NaN, which would silently turn the whole column
    # into floats; fail early with a clear message instead.
    unmapped = df.loc[df['labels'].isna(), 'label'].unique().tolist()
    if unmapped:
        raise ValueError(f"Unknown label value(s) {unmapped}; expected one of {sorted(label_map)}")
    df['labels'] = df['labels'].astype('int64')
    df['original_index'] = df.index

    def tokenize_function(examples):
        # No padding here: sequences are padded per batch by the data collator.
        return tokenizer(examples['text'], truncation=True, padding=False, max_length=max_length)

    # preserve_index=False keeps a non-default DataFrame index from being added
    # to the dataset as an extra column; the index is already in original_index.
    dataset = Dataset.from_pandas(df[['text', 'labels', 'original_index']], preserve_index=False)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset, df

def calculate_class_weights(train_df):
    """Calculate 'balanced' class weights for imbalanced data.

    Args:
        train_df: DataFrame whose ``label`` column holds ``'o'``/``'p'``/``'n'``.

    Returns:
        A float tensor with exactly one weight per entry of the label map
        (index 0 = 'o', 1 = 'p', 2 = 'n'), placed on CUDA when available and on
        CPU otherwise. A class that does not occur in ``train_df`` gets the
        neutral weight 1.0, so the tensor length always matches the model's
        number of labels.

    Raises:
        KeyError: If ``train_df['label']`` contains an unknown label code.
    """
    label_map = {'o': 0, 'p': 1, 'n': 2}
    labels = np.array([label_map[label] for label in train_df['label']], dtype=np.int64)

    # compute_class_weight only accepts classes that occur in y, so compute the
    # weights for the present classes and scatter them into a full-size vector.
    present_classes = np.unique(labels)
    class_weights = np.ones(len(label_map), dtype=np.float64)
    class_weights[present_classes] = compute_class_weight(
        'balanced',
        classes=present_classes,
        y=labels
    )

    return torch.FloatTensor(class_weights).to('cuda' if torch.cuda.is_available() else 'cpu')

def analyze_text_patterns(df, label_col='label', text_col='text'):
    """Summarise the texts of each class.

    For every distinct value in ``label_col`` the result holds the row count,
    mean and standard deviation of the whitespace-token length, the ten most
    frequent lower-cased words, and up to five randomly sampled texts.

    Returns:
        Dict keyed by label value, in order of first appearance.
    """
    def summarize(class_value):
        rows = df[df[label_col] == class_value]
        documents = rows[text_col].tolist()

        # Length is measured in whitespace-separated tokens.
        token_counts = [len(document.split()) for document in documents]

        # Vocabulary is taken from the lower-cased concatenation of all texts.
        vocabulary = re.findall(r'\b\w+\b', ' '.join(documents).lower())

        return {
            'count': len(rows),
            'avg_length': np.mean(token_counts),
            'std_length': np.std(token_counts),
            'common_words': Counter(vocabulary).most_common(10),
            'sample_texts': random.sample(documents, min(5, len(documents))),
        }

    return {class_value: summarize(class_value) for class_value in df[label_col].unique()}

def create_error_analysis_report(df_test, predictions, probabilities, true_labels):
    """Print an error-analysis report and return per-example results.

    Args:
        df_test: DataFrame with one row per evaluated example, in the same order
            as the prediction arrays.
        predictions: Predicted class ids (0 = neutral, 1 = positive, 2 = negative).
        probabilities: Array of shape (n_examples, 3) with class probabilities.
        true_labels: Ground-truth class ids, same encoding as ``predictions``.

    Returns:
        ``(results_df, errors_df, correct_df)``: ``df_test`` extended with
        prediction columns, and its misclassified / correctly classified subsets.

    Raises:
        ValueError: If the inputs are empty or their lengths disagree.
    """
    label_names = ['neutral', 'positive', 'negative']
    label_map = {0: 'o', 1: 'p', 2: 'n'}
    code_names = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}
    class_ids = list(range(len(label_names)))

    # Accept plain lists as well as arrays; element-wise comparison needs arrays.
    predictions = np.asarray(predictions)
    probabilities = np.asarray(probabilities)
    true_labels = np.asarray(true_labels)

    n_rows = len(df_test)
    if n_rows == 0:
        raise ValueError("Cannot build an error report for an empty test set")
    if not (len(predictions) == len(probabilities) == len(true_labels) == n_rows):
        raise ValueError(
            f"Length mismatch: df_test={n_rows}, predictions={len(predictions)}, "
            f"probabilities={len(probabilities)}, true_labels={len(true_labels)}"
        )

    is_correct = predictions == true_labels

    # Create results dataframe
    results_df = df_test.copy()
    results_df['predicted_label'] = [label_map[int(p)] for p in predictions]
    results_df['predicted_numeric'] = predictions
    results_df['confidence'] = np.max(probabilities, axis=1)
    results_df['correct'] = is_correct

    # Add prediction probabilities
    for i, label in enumerate(label_names):
        results_df[f'prob_{label}'] = probabilities[:, i]

    # Error analysis
    errors_df = results_df[~results_df['correct']].copy()
    correct_df = results_df[results_df['correct']].copy()
    # True class of each misclassified row, aligned with errors_df row order.
    error_true_labels = true_labels[~is_correct]

    print("="*80)
    print("üìä XLNET ERROR ANALYSIS REPORT")
    print("="*80)

    # Overall metrics
    accuracy = np.mean(results_df['correct'])
    macro_f1 = f1_score(true_labels, predictions, average='macro')

    print(f"\nüéØ Overall Performance:")
    print(f"   ‚Ä¢ Accuracy: {accuracy:.4f} ({np.sum(results_df['correct'])}/{len(results_df)})")
    print(f"   ‚Ä¢ Macro F1: {macro_f1:.4f}")

    # Confusion matrix; labels= pins it to 3x3 even if a class never occurs,
    # so it always matches the three row/column names.
    cm = confusion_matrix(true_labels, predictions, labels=class_ids)
    print(f"\nüìà Confusion Matrix:")
    cm_df = pd.DataFrame(cm, index=label_names, columns=label_names)
    print(cm_df)

    # Class-wise performance (labels= keeps target_names aligned with class ids)
    report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_names,
        output_dict=True, zero_division=0
    )
    print(f"\nüìã Per-Class Performance:")
    for i, label in enumerate(label_names):
        f1 = report[label]['f1-score']
        precision = report[label]['precision']
        recall = report[label]['recall']
        support = int(report[label]['support'])  # a count, so print it as an integer
        print(f"   ‚Ä¢ {label.capitalize()}: F1={f1:.3f}, P={precision:.3f}, R={recall:.3f}, N={support}")

    # Error patterns by class
    print(f"\nüîç Error Patterns:")
    for true_label in class_ids:
        true_name = label_names[true_label]
        # Select by the true_labels argument rather than a 'labels' column that
        # df_test may or may not carry.
        class_errors = errors_df[error_true_labels == true_label]

        if len(class_errors) > 0:
            print(f"\n   {true_name.capitalize()} misclassified as:")
            pred_counts = class_errors['predicted_label'].value_counts()
            for pred_label, count in pred_counts.items():
                pred_name = code_names[pred_label]
                pct = (count / len(class_errors)) * 100
                print(f"     - {pred_name}: {count} ({pct:.1f}%)")

    # Confidence analysis (a mean over an empty subset prints as nan)
    print(f"\nüé≤ Confidence Analysis:")
    print(f"   ‚Ä¢ Correct predictions avg confidence: {correct_df['confidence'].mean():.3f}")
    print(f"   ‚Ä¢ Incorrect predictions avg confidence: {errors_df['confidence'].mean():.3f}")

    # Low confidence predictions (potential uncertain cases)
    low_conf_threshold = 0.6
    low_conf = results_df[results_df['confidence'] < low_conf_threshold]
    print(f"   ‚Ä¢ Low confidence predictions (<{low_conf_threshold}): {len(low_conf)} ({len(low_conf)/len(results_df)*100:.1f}%)")

    return results_df, errors_df, correct_df

def sample_error_cases(errors_df, n_samples=5):
    """Print a few misclassified examples per true class for manual inspection.

    For each true label in ``errors_df`` (order of first appearance), up to
    ``n_samples`` rows are drawn with a fixed seed and shown with the predicted
    class, its confidence, and the first 200 characters of the text.
    """
    names = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}

    print(f"\nüî¨ Sample Error Cases for Manual Review:")
    print("="*80)

    for code in errors_df['label'].unique():
        mistakes = errors_df.loc[errors_df['label'] == code]
        if mistakes.empty:
            continue

        true_name = names[code]
        print(f"\n{true_name.upper()} Examples (True: {true_name}):")
        print("-" * 60)

        # Uniform random draw with a fixed seed, so the selection is repeatable
        picked = mistakes.sample(n=min(n_samples, len(mistakes)), random_state=42)

        columns = (picked['predicted_label'], picked['confidence'], picked['text'])
        for predicted, confidence, text in zip(*columns):
            print(f"\nPredicted: {names[predicted]} (confidence: {confidence:.3f})")
            preview = text[:200]
            ellipsis = '...' if len(text) > 200 else ''
            print(f"Text: {preview}{ellipsis}")
            print()

def train_xlnet_model(test_size=0.2, save_model=True):
    """
    Train XLNet model using full dataset with proper train/test split

    Loads the pickled dataset, makes a stratified train/val/test split, writes
    the three splits and the class weights into the output directory, fine-tunes
    ``xlnet-base-cased`` with class-weighted cross-entropy, and optionally saves
    the final model, tokenizer and a ``training_info.json`` summary.

    Args:
        test_size: Proportion of data to use for testing (default 0.2 = 20%)
        save_model: Whether to save the trained model

    Returns:
        output_dir: Directory where model and datasets are saved, or None when
        the dataset could not be loaded
    """
    model_name = 'xlnet-base-cased'
    print(f"üöÄ Training XLNet Model on Full Dataset")
    print(f"   Test size: {test_size*100}% ({100-test_size*100}% for train/val)")

    # Load full dataset
    full_df = load_full_dataset()
    if full_df is None:
        print("‚ùå Failed to load full dataset")
        return None

    # Create proper train/val/test split
    train_df, val_df, test_df = create_train_test_split(full_df, test_size=test_size)

    print(f"   ‚Ä¢ Train: {len(train_df):,} samples")
    print(f"   ‚Ä¢ Val: {len(val_df):,} samples")
    print(f"   ‚Ä¢ Test: {len(test_df):,} samples")

    # Text pattern analysis before training
    print("\nüìù Pre-training text analysis:")
    text_patterns = analyze_text_patterns(train_df)
    for label, stats in text_patterns.items():
        label_name = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}[label]
        print(f"   {label_name}: {stats['count']} samples, avg {stats['avg_length']:.1f} words")

    # Load model and tokenizer
    print(f"\nü§ñ Loading XLNet...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # Fallback for tokenizers that define no pad token: reuse EOS for padding
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    # Create datasets with indices for error tracking
    print("üî§ Tokenizing data...")
    train_dataset, train_df_indexed = create_dataset_with_indices(train_df, tokenizer, 256)
    val_dataset, val_df_indexed = create_dataset_with_indices(val_df, tokenizer, 256)
    test_dataset, test_df_indexed = create_dataset_with_indices(test_df, tokenizer, 256)

    # Training setup
    output_dir = f'xlnet_model_full_dataset'
    os.makedirs(output_dir, exist_ok=True)

    # Save datasets for later analysis
    train_df_indexed.to_csv(f'{output_dir}/train_data.csv', index=False)
    val_df_indexed.to_csv(f'{output_dir}/val_data.csv', index=False)
    test_df_indexed.to_csv(f'{output_dir}/test_data.csv', index=False)

    # Calculate class weights
    class_weights = calculate_class_weights(train_df)
    print(f"‚öñÔ∏è  Class weights: {class_weights.tolist()}")

    # Save class weights for later use. Store a CPU copy so the file can be
    # loaded on a machine without a GPU.
    torch.save(class_weights.cpu(), f'{output_dir}/class_weights.pt')

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f'{output_dir}/checkpoints',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead
        # of failing when the training arguments are built.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
        report_to="none",
    )

    # Initialize trainer
    trainer_kwargs = {
        'class_weights': class_weights,
        'model': model,
        'args': training_args,
        'train_dataset': train_dataset,
        'eval_dataset': val_dataset,
        'data_collator': DataCollatorWithPadding(tokenizer=tokenizer),
    }

    # Use processing_class instead of tokenizer for newer versions
    import transformers
    if hasattr(transformers, '__version__') and tuple(map(int, transformers.__version__.split('.')[:2])) >= (4, 46):
        trainer_kwargs['processing_class'] = tokenizer
    else:
        trainer_kwargs['tokenizer'] = tokenizer

    trainer = ErrorAnalysisTrainer(**trainer_kwargs)

    # Train
    print(f"\nüî• Training XLNet...")
    trainer.train()

    # Save the trained model
    if save_model:
        print(f"üíæ Saving trained model...")
        trainer.save_model(f'{output_dir}/final_model')
        tokenizer.save_pretrained(f'{output_dir}/final_model')

        # Save training info
        training_info = {
            'model_name': model_name,
            'data_split': 'full_dataset_proper_split',
            'total_samples': len(full_df),
            'train_samples': len(train_df),
            'val_samples': len(val_df),
            'test_samples': len(test_df),
            'test_size': test_size,
            'class_weights': class_weights.tolist(),
            'training_args': training_args.to_dict(),
            'timestamp': datetime.now().isoformat()
        }

        with open(f'{output_dir}/training_info.json', 'w') as f:
            json.dump(training_info, f, indent=2)

        print(f"‚úÖ Training complete! Model saved to: {output_dir}")
    else:
        # Only claim the model was saved when it actually was.
        print(f"Training complete! Final model not saved (save_model=False); "
              f"datasets and checkpoints are in: {output_dir}")

    return output_dir

def analyze_xlnet_errors(model_dir, n_error_samples=5):
    """
    Analyze errors from a pre-trained XLNet model

    Reads ``test_data.csv`` and ``final_model`` from ``model_dir``, predicts on
    the test set, prints the error report and sample error cases, and writes
    ``error_analysis_results.csv`` and ``analysis_summary.json`` under
    ``<model_dir>/analysis_results``.

    Args:
        model_dir: Directory containing trained model and data
        n_error_samples: Number of error samples to display per class

    Returns:
        results_df, errors_df, summary: Analysis results, or
        (None, None, None) when ``model_dir`` does not exist
    """
    print(f"üîç Starting XLNet Error Analysis")
    print(f"   Using model from: {model_dir}")

    # Check if model directory exists
    if not os.path.exists(model_dir):
        print(f"‚ùå Model directory not found: {model_dir}")
        print("   Please run train_xlnet_model() first")
        return None, None, None

    # Load training info (optional metadata; a missing or malformed file is not fatal)
    try:
        with open(f'{model_dir}/training_info.json', 'r') as f:
            training_info = json.load(f)
        print(f"   Model trained on: {training_info['data_split']}")
        print(f"   Training date: {training_info['timestamp'][:19]}")
    except (OSError, ValueError, KeyError, TypeError):
        # ValueError also covers json.JSONDecodeError
        print("   ‚ö†Ô∏è  Training info not found, proceeding anyway...")

    # Load test data
    test_df = pd.read_csv(f'{model_dir}/test_data.csv')
    print(f"   ‚Ä¢ Test samples: {len(test_df):,}")

    # Load model and tokenizer
    print(f"ü§ñ Loading trained model...")
    model_path = f'{model_dir}/final_model'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Load class weights. They are only reported here, not used for prediction.
    # map_location='cpu' lets a tensor saved from a GPU load on a CPU-only machine.
    try:
        class_weights = torch.load(f'{model_dir}/class_weights.pt', map_location='cpu')
        print(f"‚öñÔ∏è  Class weights: {class_weights.tolist()}")
    except Exception:
        # Best effort, but unlike a bare except this lets KeyboardInterrupt through
        print("   ‚ö†Ô∏è  Class weights not found, proceeding without...")
        class_weights = None

    # Create test dataset
    print("üî§ Preparing test data...")
    test_dataset, test_df_indexed = create_dataset_with_indices(test_df, tokenizer, 256)

    # Setup for prediction
    training_args = TrainingArguments(
        output_dir=f'{model_dir}/analysis',
        per_device_eval_batch_size=16,
        dataloader_num_workers=2,
        report_to="none",
    )

    # Initialize trainer for prediction
    trainer_kwargs = {
        'model': model,
        'args': training_args,
        'data_collator': DataCollatorWithPadding(tokenizer=tokenizer),
    }

    # Handle tokenizer parameter
    import transformers
    if hasattr(transformers, '__version__') and tuple(map(int, transformers.__version__.split('.')[:2])) >= (4, 46):
        trainer_kwargs['processing_class'] = tokenizer
    else:
        trainer_kwargs['tokenizer'] = tokenizer

    trainer = Trainer(**trainer_kwargs)

    # Get predictions with probabilities
    print(f"üîç Generating predictions...")
    predictions_output = trainer.predict(test_dataset)
    predictions = np.argmax(predictions_output.predictions, axis=-1)
    probabilities = torch.nn.functional.softmax(torch.tensor(predictions_output.predictions), dim=-1).numpy()
    true_labels = predictions_output.label_ids

    # Error Analysis
    results_df, errors_df, correct_df = create_error_analysis_report(
        test_df_indexed, predictions, probabilities, true_labels
    )

    # Sample error cases for manual review
    sample_error_cases(errors_df, n_samples=n_error_samples)

    # Save detailed results
    analysis_dir = f'{model_dir}/analysis_results'
    os.makedirs(analysis_dir, exist_ok=True)

    results_file = f'{analysis_dir}/error_analysis_results.csv'
    results_df.to_csv(results_file, index=False)
    print(f"\nüíæ Detailed results saved to: {results_file}")

    # Save error analysis summary
    summary = {
        'model_dir': model_dir,
        'total_samples': int(len(results_df)),
        'accuracy': float(np.mean(results_df['correct'])),
        'macro_f1': float(f1_score(true_labels, predictions, average='macro')),
        'error_count': int(len(errors_df)),
        'error_rate': float(len(errors_df) / len(results_df)),
        'class_distribution': {str(k): int(v) for k, v in test_df['label'].value_counts().items()},
        # labels= keeps the matrix 3x3 (neutral, positive, negative) even if a
        # class never occurs in the test data or the predictions
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=[0, 1, 2]).tolist(),
        'analysis_timestamp': datetime.now().isoformat()
    }

    with open(f'{analysis_dir}/analysis_summary.json', 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\n‚úÖ Error analysis complete! Results saved to: {analysis_dir}")

    return results_df, errors_df, summary

In [11]:
print("üìö Step 1 - Train Model (run once):")
print("   model_dir = train_xlnet_model()")
print("   # or custom test size: model_dir = train_xlnet_model(test_size=0.15)")
print()
print("üîç Step 2 - Analyze Errors (run multiple times with same model):")
print("   results_df, errors_df, summary = analyze_xlnet_errors(model_dir)")
print("   # or: results_df, errors_df, summary = analyze_xlnet_errors('xlnet_model_full_dataset')")
print()
print("‚ö†Ô∏è  IMPORTANT: Previous results using fold combination had data leakage!")
print("   This version ensures proper train/test separation.")
print()
print("üöÄ Quick start - Train and analyze:")

# Train model first
print("\n" + "="*50)
print("Training model on full dataset...")
model_dir = train_xlnet_model()

üìö Step 1 - Train Model (run once):
   model_dir = train_xlnet_model()
   # or custom test size: model_dir = train_xlnet_model(test_size=0.15)

üîç Step 2 - Analyze Errors (run multiple times with same model):
   results_df, errors_df, summary = analyze_xlnet_errors(model_dir)
   # or: results_df, errors_df, summary = analyze_xlnet_errors('xlnet_model_full_dataset')

‚ö†Ô∏è  IMPORTANT: Previous results using fold combination had data leakage!
   This version ensures proper train/test separation.

üöÄ Quick start - Train and analyze:

Training model on full dataset...
üöÄ Training XLNet Model on Full Dataset
   Test size: 20.0% (80.0% for train/val)
‚úÖ Loaded full dataset: 7,980 samples
Label distribution:
   neutral (o): 6,999 (87.7%)
   positive (p): 728 (9.1%)
   negative (n): 253 (3.2%)
Dataset split:
   Train: 4,468 samples (56.0%)
   Val:   1,916 samples (24.0%)
   Test:  1,596 samples (20.0%)
‚úÖ No data leakage detected
   ‚Ä¢ Train: 4,468 samples
   ‚Ä¢ Val: 1,916 samples

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üî§ Tokenizing data...


Map:   0%|          | 0/4468 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1596 [00:00<?, ? examples/s]

‚öñÔ∏è  Class weights: [0.38002893328666687, 3.650326728820801, 10.562647819519043]

üî• Training XLNet...


Epoch,Training Loss,Validation Loss
1,0.813,1.089115
2,0.9614,1.372187
3,0.6341,1.378162


üíæ Saving trained model...
‚úÖ Training complete! Model saved to: xlnet_model_full_dataset


In [13]:
# Run the error analysis only when training returned an output directory.
if model_dir:
    print("\n" + "=" * 50, "Analyzing errors...", sep="\n")
    results_df, errors_df, summary = analyze_xlnet_errors(model_dir, n_error_samples=5)


Analyzing errors...
üîç Starting XLNet Error Analysis
   Using model from: xlnet_model_full_dataset
   Model trained on: full_dataset_proper_split
   Training date: 2025-09-22T02:26:57
   ‚Ä¢ Test samples: 1,596
ü§ñ Loading trained model...
‚öñÔ∏è  Class weights: [0.38002893328666687, 3.650326728820801, 10.562647819519043]
üî§ Preparing test data...


Map:   0%|          | 0/1596 [00:00<?, ? examples/s]

üîç Generating predictions...


üìä XLNET ERROR ANALYSIS REPORT

üéØ Overall Performance:
   ‚Ä¢ Accuracy: 0.7769 (1240/1596)
   ‚Ä¢ Macro F1: 0.5278

üìà Confusion Matrix:
          neutral  positive  negative
neutral      1106       247        47
positive       22       118         5
negative       14        21        16

üìã Per-Class Performance:
   ‚Ä¢ Neutral: F1=0.870, P=0.968, R=0.790, N=1400.0
   ‚Ä¢ Positive: F1=0.444, P=0.306, R=0.814, N=145.0
   ‚Ä¢ Negative: F1=0.269, P=0.235, R=0.314, N=51.0

üîç Error Patterns:

   Neutral misclassified as:
     - positive: 247 (84.0%)
     - negative: 47 (16.0%)

   Positive misclassified as:
     - neutral: 22 (81.5%)
     - negative: 5 (18.5%)

   Negative misclassified as:
     - positive: 21 (60.0%)
     - neutral: 14 (40.0%)

üé≤ Confidence Analysis:
   ‚Ä¢ Correct predictions avg confidence: 0.950
   ‚Ä¢ Incorrect predictions avg confidence: 0.812
   ‚Ä¢ Low confidence predictions (<0.6): 127 (8.0%)

üî¨ Sample Error Cases for Manual Review:

NEUTRAL Exam

In [None]:
# =========================
# XLNet Citation Sentiment - Full Pipeline
# =========================

import os, json, time, random, re, pickle, hashlib
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

from torch.utils.data import DataLoader, WeightedRandomSampler

# --------------- Utilities ---------------

def load_full_dataset(pickle_path="dataset_full.pickle"):
    """
    Read the cleaned dataset from a pickle file into a DataFrame.

    The pickle must hold a two-element list ``[texts, labels]`` with labels
    'o'/'p'/'n'. Returns a DataFrame with ``text`` and ``label`` columns, or
    None (after printing the reason) when reading or validation fails.
    """
    try:
        with open(pickle_path, 'rb') as handle:
            payload = pickle.load(handle)

        # Guard clause: anything but a two-element list is rejected.
        if not (isinstance(payload, list) and len(payload) == 2):
            raise ValueError(f"Expected list [texts, labels], got {type(payload)}")

        frame = pd.DataFrame({'text': payload[0], 'label': payload[1]})
        total = len(frame)
        print(f"‚úÖ Loaded full dataset: {total:,} samples")

        # Per-class counts; unknown codes are shown as-is.
        readable = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}
        per_class = frame['label'].value_counts()
        print("Label distribution:")
        for code, n_rows in per_class.items():
            share = n_rows / total * 100
            print(f"   {readable.get(code, code)} ({code}): {n_rows:,} ({share:.1f}%)")
        return frame
    except Exception as err:
        print(f"‚ùå Error loading dataset from {pickle_path}: {err}")
        return None


def create_train_test_split(df, test_size=0.2, random_state=42):
    """
    Stratified train/val/test split with a text-overlap check.

    ``test_size`` of the rows become the test set; 30% of the remainder becomes
    the validation set. All three frames are returned with a fresh index.
    """
    remainder, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df['label']
    )
    train_df, val_df = train_test_split(
        remainder, test_size=0.3, random_state=random_state, stratify=remainder['label']
    )

    train_df, val_df, test_df = (
        part.reset_index(drop=True) for part in (train_df, val_df, test_df)
    )

    total = len(df)
    print("Dataset split:")
    for title, part in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
        print(f"   {title} {len(part):,} ({len(part)/total*100:.1f}%)")

    # No text may appear in more than one split.
    text_sets = {
        'Train': set(train_df['text']),
        'Val': set(val_df['text']),
        'Test': set(test_df['text']),
    }
    for left, right in (('Train', 'Val'), ('Train', 'Test'), ('Val', 'Test')):
        assert text_sets[left].isdisjoint(text_sets[right]), f"{left}/{right} overlap detected!"
    print("‚úÖ No data leakage detected")
    return train_df, val_df, test_df


def analyze_text_patterns(df, label_col='label', text_col='text'):
    """
    Per-class text statistics (used for optional logging).

    Returns a dict keyed by label with the row count, mean/std word count,
    the ten most common lower-cased words, and up to five sampled texts.
    """
    def _profile(rows):
        documents = rows[text_col].tolist()
        word_counts = [len(document.split()) for document in documents]
        vocabulary = re.findall(r'\b\w+\b', ' '.join(documents).lower())
        has_rows = bool(word_counts)
        return {
            'count': len(rows),
            # Plain floats; 0.0 when the class has no rows.
            'avg_length': float(np.mean(word_counts)) if has_rows else 0.0,
            'std_length': float(np.std(word_counts)) if has_rows else 0.0,
            'common_words': Counter(vocabulary).most_common(10),
            'sample_texts': random.sample(documents, min(5, len(documents))),
        }

    return {
        class_value: _profile(df[df[label_col] == class_value])
        for class_value in df[label_col].unique()
    }


def create_dataset_with_indices(df, tokenizer, max_length):
    """
    Build a tokenized HF dataset from ``df``.

    The raw ``text`` column is dropped after tokenization so the collator never
    sees strings. Returns the tokenized dataset together with a copy of ``df``
    that carries int64 ``labels`` and ``original_index`` columns.
    """
    codes = {'o': 0, 'p': 1, 'n': 2}
    df = df.copy()
    df['labels'] = df['label'].map(codes).astype('int64')
    df['original_index'] = df.index.astype('int64')

    def _encode(batch):
        # Truncate only; padding is applied per batch by the collator.
        return tokenizer(
            batch['text'], truncation=True, padding=False, max_length=max_length
        )

    wanted_columns = ['text', 'labels', 'original_index']
    raw_dataset = Dataset.from_pandas(df[wanted_columns])
    # Drop the raw strings once they have been tokenized.
    tokenized = raw_dataset.map(_encode, batched=True, remove_columns=['text'])
    return tokenized, df


def calculate_class_weights(train_df):
    """
    'Balanced' class weights for cross-entropy, as a float tensor on CUDA when
    available and on CPU otherwise.
    """
    codes = {'o': 0, 'p': 1, 'n': 2}
    encoded = [codes[raw_label] for raw_label in train_df['label']]
    present = np.unique(encoded)
    balanced = compute_class_weight('balanced', classes=present, y=encoded)
    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.FloatTensor(balanced).to(target_device)


def build_sampling_weights(hf_dataset, num_labels=3, alpha=0.5):
    """
    Per-example sampling weights for WeightedRandomSampler.

    Each example gets w_i = 1 / (count(label_i) ** alpha); the weights are then
    rescaled to have mean 1 and returned as a double tensor.
    """
    label_ids = np.array(hf_dataset['labels'])
    class_counts = np.bincount(label_ids, minlength=num_labels)
    # The small epsilon keeps empty classes from dividing by zero.
    per_class = np.power(1.0 / (class_counts + 1e-12), alpha)
    per_example = per_class[label_ids]
    per_example = per_example / per_example.mean()  # mean-1 normalisation
    return torch.DoubleTensor(per_example)


# --------------- Trainer ---------------

class ErrorAnalysisTrainer(Trainer):
    """
    Trainer subclass for imbalanced classification with prediction logging.

    Adds three optional behaviours on top of `Trainer`:
      - optional weighted sampler for training batches
      - optional focal loss (recommended) instead of cross-entropy
      - per-example prediction logging in `self.predictions_log`

    Args:
        use_weighted_sampler: if True, training examples are drawn with a
            `WeightedRandomSampler` whose weights come from
            `build_sampling_weights`.
        focal_gamma: None selects plain cross-entropy; a float selects
            multiclass focal loss with that gamma.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """
    def __init__(self, use_weighted_sampler=False, focal_gamma=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_weighted_sampler = use_weighted_sampler
        self.focal_gamma = focal_gamma  # None -> CE; float -> focal gamma
        self.predictions_log = []       # dicts: prediction / confidence / true_label

    def get_train_dataloader(self):
        """
        Return the training DataLoader.

        Falls back to the stock Trainer dataloader unless the weighted sampler
        is enabled and a non-empty training dataset is attached.
        """
        if (
            not self.use_weighted_sampler
            or self.train_dataset is None
            or len(self.train_dataset) == 0
        ):
            return super().get_train_dataloader()

        # Imported locally so this method does not depend on which import cell
        # of the notebook has been executed.
        from torch.utils.data import WeightedRandomSampler

        # Weights are computed on the full dataset (needs the `labels` column).
        weights = build_sampling_weights(
            self.train_dataset,
            num_labels=self.model.config.num_labels,
            alpha=0.5
        )
        sampler = WeightedRandomSampler(
            weights=weights,
            num_samples=len(self.train_dataset),
            replacement=True
        )

        # Building the DataLoader by hand bypasses the Trainer's usual removal
        # of columns the model's forward() does not accept (e.g. bookkeeping
        # columns such as `original_index`); apply it explicitly so those
        # columns are not fed to the model.
        train_dataset = self.train_dataset
        if hasattr(train_dataset, 'column_names') and hasattr(self, '_remove_unused_columns'):
            train_dataset = self._remove_unused_columns(train_dataset, description="training")

        return DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute cross-entropy (focal_gamma is None) or multiclass focal loss.

        Extra keyword arguments passed by the Trainer are accepted and ignored.
        Returns `loss`, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')

        if self.focal_gamma is None:
            # Standard CE (no class weights here to avoid double balancing with sampler)
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        else:
            # Multiclass focal loss: -(1 - p_true) ** gamma * log(p_true), batch mean
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
            true_logp = log_probs.gather(dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)
            true_p = true_logp.exp()
            focal_factor = (1.0 - true_p) ** float(self.focal_gamma)
            loss = -(focal_factor * true_logp).mean()

        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        Run the stock prediction step and, when logits are produced, append one
        entry per example to `self.predictions_log`. The parent's result is
        returned unchanged.
        """
        result = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
        if prediction_loss_only:
            return result

        logits = result[1] if isinstance(result, tuple) else result.predictions
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        labels = inputs.get('labels')
        if logits is None or labels is None:
            # Nothing to log (e.g. unlabeled inference batch).
            return result

        # Detach and move to CPU/float32 first: the tensors may live on the GPU
        # and be half precision, neither of which NumPy conversion accepts.
        logits = torch.as_tensor(logits).detach().float().cpu()
        probs = torch.nn.functional.softmax(logits, dim=-1)
        confidences, predictions = probs.max(dim=-1)
        labels = torch.as_tensor(labels).detach().cpu().reshape(-1)

        for pred, conf, true_label in zip(
            predictions.tolist(), confidences.tolist(), labels.tolist()
        ):
            self.predictions_log.append({
                'prediction': int(pred),
                'confidence': float(conf),
                'true_label': int(true_label)
            })
        return result


# --------------- Reporting ---------------

def create_error_analysis_report(df_test, predictions, probabilities, true_labels):
    """
    Print an error-analysis report and return per-example result frames.

    Args:
        df_test: DataFrame of test examples, row-aligned with `predictions`.
            It is not modified.
        predictions: predicted class ids (0=neutral, 1=positive, 2=negative).
        probabilities: array of shape (n_examples, 3) with class probabilities.
        true_labels: ground-truth class ids, same length as `predictions`.

    Returns:
        (results_df, errors_df, correct_df): a copy of `df_test` with
        prediction columns added, and its misclassified / correctly
        classified subsets.
    """
    label_names = ['neutral', 'positive', 'negative']
    inv_map = {0: 'o', 1: 'p', 2: 'n'}
    code_to_name = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}
    class_ids = list(range(len(label_names)))

    # Work on arrays so `==` below is element-wise even for list inputs.
    predictions = np.asarray(predictions)
    true_labels = np.asarray(true_labels)
    probabilities = np.asarray(probabilities)

    results_df = df_test.copy()
    # The per-class error breakdown needs numeric labels; add them to the copy
    # (never to the caller's frame) when the input does not carry them.
    if 'labels' not in results_df.columns:
        results_df['labels'] = true_labels
    results_df['predicted_label'] = [inv_map[int(p)] for p in predictions]
    results_df['predicted_numeric'] = predictions
    results_df['confidence'] = np.max(probabilities, axis=1)
    results_df['correct'] = (predictions == true_labels)

    # attach per-class probs
    for i, lbl in enumerate(label_names):
        results_df[f'prob_{lbl}'] = probabilities[:, i]

    errors_df = results_df[~results_df['correct']].copy()
    correct_df = results_df[results_df['correct']].copy()

    print("="*80)
    print("üìä XLNET ERROR ANALYSIS REPORT")
    print("="*80)

    accuracy = np.mean(results_df['correct'])
    macro_f1 = f1_score(true_labels, predictions, average='macro')
    print(f"\nüéØ Overall Performance:")
    print(f"   ‚Ä¢ Accuracy: {accuracy:.4f} ({np.sum(results_df['correct'])}/{len(results_df)})")
    print(f"   ‚Ä¢ Macro F1: {macro_f1:.4f}")

    # Pin the label set so the matrix is always 3x3, even if a class never
    # appears in this split (otherwise the 3-name DataFrame below fails).
    cm = confusion_matrix(true_labels, predictions, labels=class_ids)
    print(f"\nüìà Confusion Matrix:")
    cm_df = pd.DataFrame(cm, index=label_names, columns=label_names)
    print(cm_df)

    report = classification_report(
        true_labels, predictions,
        labels=class_ids, target_names=label_names,
        output_dict=True, zero_division=0
    )
    print(f"\nüìã Per-Class Performance:")
    for label in label_names:
        f1 = report[label]['f1-score']
        precision = report[label]['precision']
        recall = report[label]['recall']
        support = report[label]['support']
        print(f"   ‚Ä¢ {label.capitalize()}: F1={f1:.3f}, P={precision:.3f}, R={recall:.3f}, N={support}")

    print(f"\nüîç Error Patterns:")
    for true_label in class_ids:
        true_name = label_names[true_label]
        class_errors = errors_df[errors_df['labels'] == true_label]
        if len(class_errors) > 0:
            print(f"\n   {true_name.capitalize()} misclassified as:")
            pred_counts = class_errors['predicted_label'].value_counts()
            for pred_label, count in pred_counts.items():
                pred_name = code_to_name[pred_label]
                pct = (count / len(class_errors)) * 100
                print(f"     - {pred_name}: {count} ({pct:.1f}%)")

    print(f"\nüé≤ Confidence Analysis:")
    print(f"   ‚Ä¢ Correct predictions avg confidence: {correct_df['confidence'].mean():.3f}")
    print(f"   ‚Ä¢ Incorrect predictions avg confidence: {errors_df['confidence'].mean():.3f}")
    low_conf_threshold = 0.6
    low_conf = results_df[results_df['confidence'] < low_conf_threshold]
    print(f"   ‚Ä¢ Low confidence predictions (<{low_conf_threshold}): {len(low_conf)} ({len(low_conf)/len(results_df)*100:.1f}%)")

    return results_df, errors_df, correct_df


def sample_error_cases(errors_df, n_samples=5):
    """
    Print a few misclassified examples per true class for manual review.

    Args:
        errors_df: DataFrame of misclassified rows with columns `label`,
            `predicted_label`, `confidence` and `text`.
        n_samples: maximum number of examples printed per true class.

    Rows are drawn with a fixed random_state, and each text is cut to its
    first 200 characters.
    """
    readable = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}

    print("\nüî¨ Sample Error Cases for Manual Review:")
    print("=" * 80)

    for true_code in errors_df['label'].unique():
        subset = errors_df[errors_df['label'] == true_code]
        if subset.empty:
            continue

        true_name = readable[true_code]
        print(f"\n{true_name.upper()} Examples (True: {true_name}):")
        print("-" * 60)

        n_draw = min(n_samples, len(subset))
        picked = subset.sample(n=n_draw, random_state=42)
        for _, case in picked.iterrows():
            guessed_name = readable[case['predicted_label']]
            print(f"\nPredicted: {guessed_name} (confidence: {case['confidence']:.3f})")
            body = str(case['text'])
            ellipsis = '...' if len(body) > 200 else ''
            print(f"Text: {body[:200]}{ellipsis}")


# --------------- Train / Evaluate ---------------

def train_xlnet_model(
    pickle_path="dataset_full.pickle",
    test_size=0.2,
    save_model=True,
    use_weighted_sampler=True,
    use_focal_loss=True,
    focal_gamma=2.0,
    max_length=256
):
    """
    Fine-tune `xlnet-base-cased` for 3-class classification.

    Loads the dataset, splits it into train/val/test, tokenizes the splits,
    writes them as CSV files under the output directory and trains with
    `ErrorAnalysisTrainer`.

    Args:
        pickle_path: path handed to `load_full_dataset`.
        test_size: test fraction handed to `create_train_test_split`.
        save_model: if True, save the final model, tokenizer and a
            `training_info.json` under `<output_dir>/final_model`.
        use_weighted_sampler: enable the class-balanced training sampler.
        use_focal_loss: use focal loss; otherwise plain cross-entropy.
        focal_gamma: focal-loss gamma (ignored when `use_focal_loss` is False).
        max_length: tokenizer truncation length.

    Returns:
        The output directory name, or None if the dataset could not be loaded.
    """
    model_name = 'xlnet-base-cased'
    print(f"üöÄ Training XLNet Model on Full Dataset")
    print(f"   Test size: {test_size*100:.0f}%  |  weighted sampler: {use_weighted_sampler}  |  focal loss: {use_focal_loss} (Œ≥={focal_gamma})")

    full_df = load_full_dataset(pickle_path)
    if full_df is None:
        print("‚ùå Failed to load full dataset")
        return None

    train_df, val_df, test_df = create_train_test_split(full_df, test_size=test_size)

    print("\nüìù Pre-training text analysis (train):")
    stats = analyze_text_patterns(train_df)
    for lab, s in stats.items():
        name = {'o': 'neutral', 'p': 'positive', 'n': 'negative'}[lab]
        print(f"   {name}: {s['count']} samples, avg {s['avg_length']:.1f} words")

    print(f"\nü§ñ Loading XLNet...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    if tokenizer.pad_token is None:
        # Fall back to EOS as padding for tokenizers that define no pad token.
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    print("üî§ Tokenizing data...")
    train_dataset, train_df_indexed = create_dataset_with_indices(train_df, tokenizer, max_length)
    val_dataset,   val_df_indexed   = create_dataset_with_indices(val_df, tokenizer, max_length)
    # Only the indexed frame of the test split is needed here (for the CSV);
    # the tokenized test set is rebuilt at analysis time.
    _,             test_df_indexed  = create_dataset_with_indices(test_df, tokenizer, max_length)

    output_dir = 'xlnet_model_full_dataset'
    os.makedirs(output_dir, exist_ok=True)

    # Save splits for analysis
    train_df_indexed.to_csv(f'{output_dir}/train_data.csv', index=False)
    val_df_indexed.to_csv(  f'{output_dir}/val_data.csv',   index=False)
    test_df_indexed.to_csv( f'{output_dir}/test_data.csv',  index=False)

    # Older transformers releases call this argument `evaluation_strategy`;
    # use whichever name this installation's TrainingArguments declares.
    declared_fields = getattr(TrainingArguments, '__dataclass_fields__', None)
    if declared_fields is None or 'eval_strategy' in declared_fields:
        eval_strategy_key = 'eval_strategy'
    else:
        eval_strategy_key = 'evaluation_strategy'

    training_args = TrainingArguments(
        output_dir=f'{output_dir}/checkpoints',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        logging_steps=50,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        # fp16 mixed precision needs a CUDA device; fall back to fp32 on CPU
        # instead of failing at argument validation.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
        report_to="none",
        **{eval_strategy_key: "epoch"}
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

    # Build trainer
    import transformers
    trainer_kwargs = {
        'model': model,
        'args': training_args,
        'train_dataset': train_dataset,
        'eval_dataset':  val_dataset,
        'data_collator': data_collator,
        'use_weighted_sampler': use_weighted_sampler,
        'focal_gamma': (focal_gamma if use_focal_loss else None),
    }
    # Trainer takes the tokenizer as `processing_class` from 4.46 on, and as
    # `tokenizer` before that.
    if hasattr(transformers, '__version__') and tuple(map(int, transformers.__version__.split('.')[:2])) >= (4, 46):
        trainer_kwargs['processing_class'] = tokenizer
    else:
        trainer_kwargs['tokenizer'] = tokenizer

    trainer = ErrorAnalysisTrainer(**trainer_kwargs)

    print("\nüî• Training XLNet...")
    trainer.train()

    if save_model:
        print("üíæ Saving trained model...")
        trainer.save_model(f'{output_dir}/final_model')
        tokenizer.save_pretrained(f'{output_dir}/final_model')
        training_info = {
            'model_name': model_name,
            'data_split': 'full_dataset_proper_split',
            'total_samples': len(full_df),
            'train_samples': len(train_df),
            'val_samples': len(val_df),
            'test_samples': len(test_df),
            'test_size': test_size,
            'sampler_alpha': (0.5 if use_weighted_sampler else None),
            'focal_gamma': (focal_gamma if use_focal_loss else None),
            'timestamp': datetime.now().isoformat()
        }
        with open(f'{output_dir}/training_info.json', 'w') as f:
            json.dump(training_info, f, indent=2)

    print(f"‚úÖ Training complete! Model saved to: {output_dir}")
    return output_dir


def analyze_xlnet_errors(model_dir, n_error_samples=5, max_length=256):
    """
    Load a saved model and run prediction + error analysis on the saved test split.

    Args:
        model_dir: directory containing `test_data.csv`, `final_model/` and,
            optionally, `training_info.json`.
        n_error_samples: number of misclassified examples printed per class.
        max_length: tokenizer truncation length.

    Returns:
        (results_df, errors_df, summary), or (None, None, None) if `model_dir`
        does not exist. Also writes `error_analysis_results.csv` and
        `analysis_summary.json` under `<model_dir>/analysis_results`.
    """
    print("üîç Starting XLNet Error Analysis")
    print(f"   Using model from: {model_dir}")
    if not os.path.exists(model_dir):
        print(f"‚ùå Model dir not found: {model_dir}")
        return None, None, None

    # The training metadata is informational only, so problems reading it are
    # reported and skipped, but only the failures this block can produce are
    # caught, and a missing file is told apart from an unreadable one.
    try:
        with open(f'{model_dir}/training_info.json', 'r') as f:
            info = json.load(f)
        print(f"   Model trained on: {info['data_split']} at {info['timestamp'][:19]}")
    except FileNotFoundError:
        print("   ‚ö†Ô∏è training_info.json not found; continuing...")
    except (OSError, ValueError, KeyError, TypeError) as exc:
        print(f"   training_info.json could not be used ({exc!r}); continuing...")

    test_df = pd.read_csv(f'{model_dir}/test_data.csv')
    # read_csv turns empty cells (and strings such as "NA" or "null") into
    # NaN; the tokenizer needs real strings, so coerce the text column back.
    test_df['text'] = test_df['text'].fillna('').astype(str)
    print(f"   ‚Ä¢ Test samples: {len(test_df):,}")

    model_path = f'{model_dir}/final_model'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Build dataset
    print("üî§ Preparing test data...")
    test_dataset, test_df_indexed = create_dataset_with_indices(test_df, tokenizer, max_length)

    # Predict
    pred_args = TrainingArguments(
        output_dir=f'{model_dir}/analysis',
        per_device_eval_batch_size=16,
        dataloader_num_workers=2,
        report_to="none",
    )
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

    import transformers
    pkwargs = {
        'model': model,
        'args': pred_args,
        'data_collator': data_collator,
    }
    # Trainer takes the tokenizer as `processing_class` from 4.46 on, and as
    # `tokenizer` before that.
    if hasattr(transformers, '__version__') and tuple(map(int, transformers.__version__.split('.')[:2])) >= (4, 46):
        pkwargs['processing_class'] = tokenizer
    else:
        pkwargs['tokenizer'] = tokenizer

    pred_trainer = Trainer(**pkwargs)

    print("üîç Generating predictions...")
    out = pred_trainer.predict(test_dataset)
    preds = np.argmax(out.predictions, axis=-1)
    probs = torch.nn.functional.softmax(torch.tensor(out.predictions), dim=-1).numpy()
    y_true = out.label_ids

    results_df, errors_df, correct_df = create_error_analysis_report(
        test_df_indexed, preds, probs, y_true
    )

    # Sample errors
    sample_error_cases(errors_df, n_error_samples)

    # Save artifacts
    analysis_dir = f'{model_dir}/analysis_results'
    os.makedirs(analysis_dir, exist_ok=True)
    results_file = f'{analysis_dir}/error_analysis_results.csv'
    results_df.to_csv(results_file, index=False)
    summary = {
        'model_dir': model_dir,
        'total_samples': int(len(results_df)),
        'accuracy': float(np.mean(results_df['correct'])),
        'macro_f1': float(f1_score(y_true, preds, average='macro')),
        'error_count': int(len(errors_df)),
        'error_rate': float(len(errors_df) / len(results_df)),
        'class_distribution': {str(k): int(v) for k, v in test_df['label'].value_counts().items()},
        'confusion_matrix': confusion_matrix(y_true, preds).tolist(),
        'analysis_timestamp': datetime.now().isoformat()
    }
    with open(f'{analysis_dir}/analysis_summary.json', 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"\nüíæ Detailed results saved to: {results_file}")
    print(f"‚úÖ Error analysis complete! Results saved to: {analysis_dir}")
    return results_df, errors_df, summary


# --------------- Run ---------------

if __name__ == "__main__":
    # Stage 1: fine-tune the model. The call returns the directory holding the
    # saved splits and model, or None when the dataset could not be loaded.
    model_dir = train_xlnet_model(
        "dataset_full.pickle",      # pickle_path; change if needed
        test_size=0.2,
        use_weighted_sampler=True,  # False disables the balanced sampler
        use_focal_loss=True,        # False falls back to plain cross-entropy
        focal_gamma=2.0,
        max_length=256,
    )

    # Stage 2: error analysis on the saved test split, only if training ran.
    if model_dir:
        print(f"\n{'=' * 50}")
        print("Analyzing errors...")
        results_df, errors_df, summary = analyze_xlnet_errors(
            model_dir,
            n_error_samples=5,
            max_length=256,
        )


üöÄ Training XLNet Model on Full Dataset
   Test size: 20%  |  weighted sampler: True  |  focal loss: True (Œ≥=2.0)
‚úÖ Loaded full dataset: 7,980 samples
Label distribution:
   neutral (o): 6,999 (87.7%)
   positive (p): 728 (9.1%)
   negative (n): 253 (3.2%)
Dataset split:
   Train: 4,468 (56.0%)
   Val:   1,916 (24.0%)
   Test:  1,596 (20.0%)
‚úÖ No data leakage detected

üìù Pre-training text analysis (train):
   neutral: 3919 samples, avg 36.4 words
   positive: 408 samples, avg 33.6 words
   negative: 141 samples, avg 35.0 words

ü§ñ Loading XLNet...


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üî§ Tokenizing data...


Map:   0%|          | 0/4468 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1596 [00:00<?, ? examples/s]


üî• Training XLNet...


Epoch,Training Loss,Validation Loss
1,0.1826,0.180424
2,0.0323,0.281812
3,0.0308,0.266906


üíæ Saving trained model...
‚úÖ Training complete! Model saved to: xlnet_model_full_dataset

Analyzing errors...
üîç Starting XLNet Error Analysis
   Using model from: xlnet_model_full_dataset
   Model trained on: full_dataset_proper_split at 2025-09-22T04:23:04
   ‚Ä¢ Test samples: 1,596
üî§ Preparing test data...


Map:   0%|          | 0/1596 [00:00<?, ? examples/s]

üîç Generating predictions...


üìä XLNET ERROR ANALYSIS REPORT

üéØ Overall Performance:
   ‚Ä¢ Accuracy: 0.8503 (1357/1596)
   ‚Ä¢ Macro F1: 0.6029

üìà Confusion Matrix:
          neutral  positive  negative
neutral      1244        98        58
positive       48        84        13
negative       17         5        29

üìã Per-Class Performance:
   ‚Ä¢ Neutral: F1=0.918, P=0.950, R=0.889, N=1400.0
   ‚Ä¢ Positive: F1=0.506, P=0.449, R=0.579, N=145.0
   ‚Ä¢ Negative: F1=0.384, P=0.290, R=0.569, N=51.0

üîç Error Patterns:

   Neutral misclassified as:
     - positive: 98 (62.8%)
     - negative: 58 (37.2%)

   Positive misclassified as:
     - neutral: 48 (78.7%)
     - negative: 13 (21.3%)

   Negative misclassified as:
     - neutral: 17 (77.3%)
     - positive: 5 (22.7%)

üé≤ Confidence Analysis:
   ‚Ä¢ Correct predictions avg confidence: 0.890
   ‚Ä¢ Incorrect predictions avg confidence: 0.685
   ‚Ä¢ Low confidence predictions (<0.6): 141 (8.8%)

üî¨ Sample Error Cases for Manual Review:

NEUTRAL Examp