# Extrinsic Evaluation: Sentiment Analysis

This notebook evaluates the impact of dialect normalization on sentiment analysis performance.

## Overview
- Compares sentiment analysis performance on:
  - Original dialect texts
  - Normalized/translated texts  
  - Standard texts (reference)
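- Uses a pre-trained sentiment model: `5CD-AI/Vietnamese-Sentiment-visobert`
- Normalized texts are produced by a `VietAI/vit5-base` model that is fine-tuned in this notebook on the ViDia2Std training split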

## Reproducibility
- **Random Seed**: 42 (set for Python, NumPy, and PyTorch)
- All experiments use the same seed for reproducibility

## Metrics
- Accuracy
- Precision (weighted)
- Recall (weighted)
- F1-score (weighted)


In [None]:
!pip install datasets==4.0.0 evaluate==0.4.6 accelerate==1.11.0 rouge_score==0.1.2 jiwer==4.0.0 editdistance==0.8.1 -q

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)
import evaluate
import torch
from jiwer import wer as calculate_wer
import editdistance
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import random
import os

# One seed shared by every experiment in this notebook
RANDOM_SEED = 42


def set_seed(seed):
    """Seed Python, NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        seed: Integer applied to every random number generator touched here.
    """
    # NOTE(review): presumably only affects interpreters started after this
    # point; the hash seed of the running process is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # every visible GPU
    )
    for seeder in seeders:
        seeder(seed)

    # Give up cuDNN autotuning in exchange for repeatable kernels
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed everything before any data or model is loaded
set_seed(RANDOM_SEED)
print(f"Random seed set to: {RANDOM_SEED}")

# Use a pipeline as a high-level helper
from transformers import pipeline

# Download necessary NLTK resources (WordNet corpora, used by nltk's METEOR scorer)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Load the ViDia2Std train/dev/test splits from the Hugging Face Hub.
# Rows containing any missing value are dropped.
train_df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/train.csv").dropna()
valid_df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/dev.csv").dropna()
test_df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/test.csv").dropna()

# Wrap the DataFrames as Hugging Face Datasets so they can be tokenized with .map()
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

# Seq2seq checkpoint that is fine-tuned below (dialect -> standard text)
model_name = "VietAI/vit5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model loaded on device: {device}")


max_length = 50


def preprocess_function(examples):
    """Tokenize a batch of dialect/standard sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping (as passed by ``Dataset.map(batched=True)``)
            with a "dialect" column (source text) and a "standard" column
            (target text).

    Returns:
        The source encoding with an extra "labels" entry holding the target
        token ids. Both are truncated/padded to ``max_length``; padding
        positions in the labels are set to -100 so the loss ignores them.
    """
    inputs = examples["dialect"]
    targets = examples["standard"]

    # Language codes must be set before encoding to have any effect.
    # NOTE(review): these attributes matter for multilingual (mBART-style)
    # tokenizers and are presumably a no-op for the ViT5 tokenizer — confirm.
    tokenizer.src_lang = "vi_VN"
    tokenizer.tgt_lang = "vi_VN"

    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded here rather than by the data collator, so padding must
    # be masked explicitly; otherwise the loss is also computed on pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


# Tokenize every split up front; batched=True passes lists of examples to preprocess_function
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_valid = valid_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Collator that assembles seq2seq batches for the trainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
# NOTE(review): compute_metrics scores METEOR with nltk's meteor_score, so this
# handle appears to be unused in the visible code — confirm before removing.
meteor = evaluate.load("meteor")

def compute_metrics(eval_preds):
    """Decode predictions and references, then score them with text metrics.

    Args:
        eval_preds: ``(predictions, labels)`` pair supplied by the trainer.
            ``predictions`` may be a tuple, in which case its first element
            holds the generated token ids.

    Returns:
        dict with the keys "rouge_l", "bleu", "meteor", "wer" and "cer".
        Values are rounded to 4 decimals; a metric that fails or has no valid
        sentence pair is reported as NaN instead of aborting evaluation.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded; swap it for the pad id
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = {}

    # ROUGE-L. The default rouge_score tokenizer keeps only ASCII [a-z0-9],
    # which breaks Vietnamese words apart at every accented character, and its
    # stemmer is English-only. Use lower-cased whitespace tokens instead,
    # matching the tokenization of the METEOR and fallback BLEU code below.
    try:
        rouge_result = rouge.compute(
            predictions=decoded_preds,
            references=decoded_labels,
            tokenizer=lambda text: text.lower().split(),
        )
        result["rouge_l"] = rouge_result["rougeL"]
    except Exception as e:
        print(f"ROUGE calculation error: {e}")
        result["rouge_l"] = float('nan')

    # Corpus BLEU via `evaluate`; fall back to mean sentence BLEU from NLTK
    try:
        bleu_result = bleu.compute(
            predictions=decoded_preds,
            references=decoded_labels
        )
        result["bleu"] = bleu_result["bleu"]
    except Exception as e:
        print(f"BLEU calculation error: {e}")
        try:
            smoothing_function = SmoothingFunction().method1
            bleu_scores = []
            for pred, label in zip(decoded_preds, decoded_labels):
                label_tokens = label.split()
                if label_tokens:  # skip empty references
                    bleu_scores.append(
                        sentence_bleu(
                            [label_tokens],
                            pred.split(),
                            smoothing_function=smoothing_function,
                        )
                    )
            result["bleu"] = np.mean(bleu_scores) if bleu_scores else float('nan')
        except Exception as e2:
            print(f"Manual BLEU calculation also failed: {e2}")
            result["bleu"] = float('nan')

    # METEOR (sentence level, averaged); nltk expects pre-tokenized input
    try:
        meteor_scores = []
        for pred, label in zip(decoded_preds, decoded_labels):
            label_tokens = label.split()
            if label_tokens:  # skip empty references
                meteor_scores.append(meteor_score([label_tokens], pred.split()))
        result["meteor"] = np.mean(meteor_scores) if meteor_scores else float('nan')
    except Exception as e:
        print(f"METEOR calculation error: {e}")
        result["meteor"] = float('nan')

    # WER (sentence level, averaged) over pairs with a non-blank reference
    try:
        wer_scores = [
            calculate_wer(ref, pred)
            for ref, pred in zip(decoded_labels, decoded_preds)
            if ref.strip()
        ]
        result["wer"] = np.mean(wer_scores) if wer_scores else float('nan')
    except Exception as e:
        print(f"WER calculation error: {e}")
        result["wer"] = float('nan')

    # CER (sentence level, averaged)
    def calculate_cer(ref, pred):
        # An empty reference counts as fully wrong unless the prediction is empty too
        if len(ref) == 0:
            return 1.0 if len(pred) > 0 else 0.0
        return editdistance.eval(ref, pred) / len(ref)

    try:
        cer_scores = [
            calculate_cer(ref, pred)
            for ref, pred in zip(decoded_labels, decoded_preds)
        ]
        result["cer"] = np.mean(cer_scores)
    except Exception as e:
        print(f"CER calculation error: {e}")
        result["cer"] = float('nan')

    # Round for readability; NaN placeholders are passed through untouched
    return {
        name: value if isinstance(value, float) and np.isnan(value) else round(value, 4)
        for name, value in result.items()
    }

# Training arguments: evaluate and checkpoint once per epoch, keep the checkpoint
# with the best validation BLEU, and let early stopping cut training short.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,  # Maximum epochs - early stopping will terminate earlier if needed
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none",
    metric_for_best_model="eval_bleu",  # Metric used for best-checkpoint selection and early stopping
    greater_is_better=True,  # Higher BLEU scores are better
    save_strategy="epoch",  # Save model after each epoch
    load_best_model_at_end=True,  # Load the best performing model at the end
    eval_accumulation_steps=1,  # Offload accumulated predictions to CPU after every eval step
    save_steps=500,  # NOTE(review): presumably ignored while save_strategy="epoch" — confirm
    logging_first_step=True,  # Log the first training step
    dataloader_pin_memory=torch.cuda.is_available(),  # Performance optimization
    seed=RANDOM_SEED,  # Set seed for data shuffling and training
    data_seed=RANDOM_SEED,  # Set seed for data sampling
)

# Early stopping on the metric configured above
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=1,  # Stop after 1 evaluation without sufficient improvement. NOTE(review): this comment previously said 3 — confirm the intended patience
    early_stopping_threshold=0.01  # Minimum improvement that counts as progress
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Print the effective training configuration before the run starts
print("Starting training with early stopping...")
print(f"Maximum epochs: {training_args.num_train_epochs}")
print(f"Early stopping patience: {early_stopping_callback.early_stopping_patience}")
print(f"Monitoring metric: {training_args.metric_for_best_model}")

In [None]:
# Fine-tune the model; evaluation runs every epoch and early stopping may end the run before the epoch limit
trainer.train()

In [None]:
# Score the trained model on the held-out test split; metric_key_prefix="test" labels the reported metrics
test_results = trainer.evaluate(tokenized_test, metric_key_prefix="test")
print(f"Test results: {test_results}")

# Function for inference on new text
def normalize_text(text, max_length=25):
    """Run the seq2seq model on a single text and return the decoded output.

    Args:
        text: Input string to feed to the model.
        max_length: Token budget used both to truncate/pad the input and to
            cap the generated sequence.
            NOTE(review): the default (25) is smaller than the ``max_length``
            of 50 used when tokenizing the training data — confirm intended.

    Returns:
        The generated text with special tokens removed.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    batch = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Greedy, single-beam decoding keeps the output reproducible
    generation_kwargs = dict(
        max_length=max_length,
        do_sample=False,
        num_beams=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    generated = model.generate(**batch, **generation_kwargs)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Persist the model held by the trainer together with its tokenizer so both can be reloaded from one directory
model_save_path = "./best_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

In [None]:
def translate(text, max_length=128):
    """Generate the model's output for one input text.

    Args:
        text: Input string to feed to the model.
        max_length: Maximum length of the generated sequence. The input itself
            is truncated to 512 tokens.

    Returns:
        The decoded generation with special tokens removed.
    """
    # padding=True pads only up to the longest sequence in the call, so a
    # single sentence is not inflated to 512 tokens before reaching the encoder.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Some tokenizers emit token_type_ids, which generate() does not accept here
    inputs.pop("token_type_ids", None)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Inference only: no gradients needed (mirrors translate_batch)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=False,  # Use greedy decoding for reproducibility
            num_beams=1,  # No beam search for deterministic results
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke test: run the fine-tuned model on one sample sentence
print(translate("Cấy mấn ni nỏ hở rứa mô, hấn có phần lót bên trong là áo da màu nude mà"))

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import random

# Set random seed for reproducibility (same as Cell 2)
RANDOM_SEED = 42

# Device index for the pipeline: 0 when CUDA is available, otherwise -1.
# NOTE: this rebinds `device`, which earlier in the notebook held a torch.device.
device = 0 if torch.cuda.is_available() else -1
# Pre-trained sentiment classifier used for every text group in the evaluation
pipe = pipeline("text-classification",
                model="5CD-AI/Vietnamese-Sentiment-visobert",
                device=device,
                batch_size=64,
                truncation=True,
                padding=True)

def translate_batch(texts, batch_size=32, max_length=128):
    """Generate the model's output for a batch of texts in one forward pass.

    Args:
        texts: Sequence of input strings.
        batch_size: Unused; kept so existing callers keep working. The whole
            of ``texts`` is processed as a single batch.
        max_length: Maximum length of each generated sequence. Inputs are
            truncated to 512 tokens.

    Returns:
        List of decoded strings (special tokens removed), one per input, in
        input order. An empty input yields an empty list.
    """
    if len(texts) == 0:
        return []

    # Pad only to the longest sequence in this batch; padding everything to
    # 512 tokens makes the encoder process mostly padding.
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Some tokenizers emit token_type_ids, which generate() does not accept here
    inputs.pop("token_type_ids", None)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=1,  # No beam search for deterministic results
            do_sample=False,  # Use greedy decoding for reproducibility
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def translate_all_texts(texts, batch_size=32, max_length=128):
    """Translate ``texts`` chunk by chunk and return the results in input order.

    Args:
        texts: Sliceable sequence of input strings.
        batch_size: Number of texts handed to ``translate_batch`` per call.
        max_length: Forwarded to ``translate_batch``.

    Returns:
        List with one translated string per input text.
    """
    print(f"Translating {len(texts)} texts in batches of {batch_size}...")

    has_cuda = torch.cuda.is_available()
    translations = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Translation"):
        chunk = texts[start:start + batch_size]
        translations += translate_batch(chunk, batch_size, max_length)

        # Release cached GPU memory between chunks
        if has_cuda:
            torch.cuda.empty_cache()

    return translations

def predict_sentiment_optimized(texts, batch_size=64):
    """Run the sentiment pipeline over ``texts`` in chunks.

    A chunk that fails as a whole is retried text by text. A text that still
    fails gets the placeholder ``{'label': 'NEU', 'score': 0.0}`` so that the
    output stays aligned with the input; every such substitution is reported.

    Args:
        texts: Sliceable sequence of input strings.
        batch_size: Number of texts passed to the pipeline per call.

    Returns:
        List of prediction dicts (with at least 'label' and 'score'), one per
        input text, in input order. An empty input yields an empty list.
    """
    if len(texts) == 0:
        return []

    predictions = []
    fallback_count = 0

    print(f"Analyzing sentiment for {len(texts)} texts in batches of {batch_size}...")

    for i in tqdm(range(0, len(texts), batch_size), desc="Sentiment Analysis"):
        batch = texts[i:i+batch_size]

        try:
            results = pipe(batch)
        except Exception as e:
            print(f"Error processing batch {i//batch_size}: {e}")
        else:
            predictions.extend(results)
            continue

        # Batch failed: retry each text on its own to isolate the bad input
        for text in batch:
            try:
                result = pipe(text)
            except Exception as text_error:
                # `Exception` (not a bare except) so Ctrl-C still interrupts the run
                fallback_count += 1
                print(f"  Using NEU placeholder for one text: {text_error}")
                predictions.append({'label': 'NEU', 'score': 0.0})
            else:
                predictions.extend(result if isinstance(result, list) else [result])

    if fallback_count:
        print(f"WARNING: {fallback_count} texts received a placeholder NEU prediction")

    return predictions

def parallel_sentiment_analysis(text_groups, group_names, max_workers=3):
    """Classify several groups of texts concurrently, one worker thread per group.

    Args:
        text_groups: Iterable of text collections, one per group.
        group_names: Names paired positionally with ``text_groups``.
        max_workers: Size of the thread pool.

    Returns:
        Dict mapping each group name to its list of converted sentiment labels.
    """
    # NOTE(review): every thread calls the same module-level `pipe`; confirm
    # the pipeline and its tokenizer tolerate concurrent use.
    def _label_group(group_texts, group_name):
        print(f"Starting analysis for {group_name}...")
        raw_predictions = predict_sentiment_optimized(group_texts)
        labels = [convert_sentiment_label(item['label']) for item in raw_predictions]
        return group_name, labels

    labels_by_group = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for group_texts, group_name in zip(text_groups, group_names):
            pending[pool.submit(_label_group, group_texts, group_name)] = group_name

        # Store each group's labels as soon as its worker finishes
        for finished in as_completed(pending):
            group_name, labels = finished.result()
            labels_by_group[group_name] = labels
            print(f"Completed analysis for {group_name}")

    return labels_by_group

def convert_sentiment_label(label):
    """Translate a model label ('POS', 'NEG', 'NEU') into its long form.

    Labels outside that set are returned unchanged.
    """
    long_names = dict(POS='positive', NEG='negative', NEU='neutral')
    try:
        return long_names[label]
    except KeyError:
        return label

def evaluate_predictions(y_true, y_pred, label_name):
    """Print and return accuracy plus weighted precision, recall and F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        label_name: Heading used in the printed report.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # The fourth return value (support) is not used
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    print(f"\n=== {label_name} ===")
    headings = (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1-score", 'f1'),
    )
    for heading, key in headings:
        print(f"{heading}: {metrics[key]:.4f}")

    print(f"\nClassification Report for {label_name}:")
    print(classification_report(y_true, y_pred))

    return metrics

def plot_confusion_matrix(y_true, y_pred, title, labels=None):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Text appended to the figure title.
        labels: Class labels fixing the row/column order. When omitted, the
            sorted set of labels found in ``y_true`` and ``y_pred`` is used.
    """
    if labels is None:
        # The same list is reused as tick labels below, and the heatmap cannot
        # take None there, so derive the class order from the data.
        labels = sorted(set(y_true) | set(y_pred))

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.title(f'Confusion Matrix - {title}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

def analyze_improvement_cases(analysis_df, max_per_sentiment=None, max_examples=None):
    """Report the samples that translation fixed (dialect wrong, translated right).

    Args:
        analysis_df: DataFrame with the columns 'dialect', 'translated',
            'true_sentiment', 'dialect_pred', 'translated_pred',
            'standard_pred', 'dialect_correct' and 'translated_correct'.
        max_per_sentiment: Optional cap on printed examples per true
            sentiment. ``None`` (default) prints every improved sample.
        max_examples: Optional cap on the total number of printed examples.
            ``None`` (default) applies no cap.

    Returns:
        A copy of the improved rows with added 'dialect_length' and
        'translated_length' word-count columns, or ``None`` when there are no
        improved rows.
    """
    # Filter improvement cases (dialect wrong, translated correct)
    improved_samples = analysis_df[
        (~analysis_df['dialect_correct']) &
        (analysis_df['translated_correct'])
    ].copy()

    total_samples = len(analysis_df)
    banner = "=" * 70

    print("\n" + banner)
    print("DETAILED ANALYSIS OF IMPROVEMENT CASES")
    print(banner)
    print(f"Total improvement cases: {len(improved_samples)} out of {total_samples} samples")
    if total_samples:  # an empty frame has no meaningful rate
        print(f"Improvement rate: {len(improved_samples)/total_samples*100:.2f}%")

    if len(improved_samples) == 0:
        print("No improvement cases found.")
        return None

    # Analyze by sentiment categories
    print("\nIMPROVEMENT BY TRUE SENTIMENT:")
    sentiment_improvement = improved_samples['true_sentiment'].value_counts()
    for sentiment, count in sentiment_improvement.items():
        total_sentiment = len(analysis_df[analysis_df['true_sentiment'] == sentiment])
        improvement_rate = count / total_sentiment * 100
        print(f"  {sentiment}: {count}/{total_sentiment} cases ({improvement_rate:.1f}%)")

    # Analyze patterns of incorrect dialect predictions
    print("\nDIALECT PREDICTION ERRORS FIXED BY TRANSLATION:")
    error_patterns = improved_samples.groupby(['true_sentiment', 'dialect_pred']).size()
    for (true_sent, pred_sent), count in error_patterns.items():
        print(f"  True: {true_sent} → Dialect predicted: {pred_sent} ({count} cases)")

    # Display detailed examples
    print("\n" + banner)
    print("DETAILED EXAMPLES OF IMPROVEMENTS")
    print(banner)

    # Collect examples grouped by true sentiment, honoring the optional caps
    examples_to_show = []
    for sentiment in improved_samples['true_sentiment'].unique():
        sentiment_samples = improved_samples[improved_samples['true_sentiment'] == sentiment]
        if max_per_sentiment is not None:
            sentiment_samples = sentiment_samples.head(max_per_sentiment)
        examples_to_show.extend(sentiment_samples.to_dict('records'))

    if max_examples is not None:
        examples_to_show = examples_to_show[:max_examples]

    for i, example in enumerate(examples_to_show, 1):
        print(f"\n{'='*50}")
        print(f"EXAMPLE {i}")
        print(f"{'='*50}")
        print(f"True Sentiment: {example['true_sentiment']}")
        print("Original Dialect Text:")
        print(f"   '{example['dialect']}'")
        print("Translated Text:")
        print(f"   '{example['translated']}'")
        print(f"[INCORRECT] Dialect Prediction: {example['dialect_pred']}")
        print(f"[CORRECT] Translated Prediction: {example['translated_pred']}")
        print(f"Standard Text Prediction: {example['standard_pred']}")

        # Add analysis about text length
        dialect_len = len(example['dialect'].split())
        translated_len = len(example['translated'].split())
        print(f"Text Length: Dialect ({dialect_len} words) → Translated ({translated_len} words)")

    # Overall statistics about improvement patterns
    print("\n" + banner)
    print("IMPROVEMENT STATISTICS")
    print(banner)

    # Compare text lengths (whitespace-separated word counts)
    improved_samples['dialect_length'] = improved_samples['dialect'].apply(lambda x: len(x.split()))
    improved_samples['translated_length'] = improved_samples['translated'].apply(lambda x: len(x.split()))

    print("Average text length in improved cases:")
    print(f"  Dialect: {improved_samples['dialect_length'].mean():.1f} words")
    print(f"  Translated: {improved_samples['translated_length'].mean():.1f} words")

    # Sentiment distribution in improved cases. Labels are taken from the data
    # instead of a hard-coded upper-case list, so the counts match whatever
    # label spelling the frame actually uses.
    print("\nSentiment distribution in improvement cases:")
    for sentiment in sorted(analysis_df['true_sentiment'].unique()):
        count = int((improved_samples['true_sentiment'] == sentiment).sum())
        percentage = count / len(improved_samples) * 100
        print(f"  {sentiment}: {count} cases ({percentage:.1f}%)")

    return improved_samples

def analyze_regression_cases(analysis_df):
    """Report the samples that translation broke (dialect right, translated wrong).

    Args:
        analysis_df: DataFrame with the columns 'dialect', 'translated',
            'true_sentiment', 'dialect_pred', 'translated_pred',
            'standard_pred', 'dialect_correct' and 'translated_correct'.

    Returns:
        A copy of the regressed rows, or ``None`` when there are none.
    """
    wide_rule = "=" * 70
    narrow_rule = "=" * 50
    total_samples = len(analysis_df)

    # Rows the classifier got right on dialect text but wrong after translation
    was_right = analysis_df['dialect_correct']
    now_wrong = ~analysis_df['translated_correct']
    regression_samples = analysis_df[was_right & now_wrong].copy()
    regression_count = len(regression_samples)

    print("\n" + wide_rule)
    print("DETAILED ANALYSIS OF REGRESSION CASES")
    print(wide_rule)
    print(f"Total regression cases: {regression_count} out of {total_samples} samples")
    print(f"Regression rate: {regression_count/total_samples*100:.2f}%")

    if regression_count == 0:
        print("No regression cases found.")
        return

    # Share of each true sentiment that regressed
    print("\nREGRESSION BY TRUE SENTIMENT:")
    regressions_per_sentiment = regression_samples['true_sentiment'].value_counts()
    for sentiment, count in regressions_per_sentiment.items():
        total_sentiment = int((analysis_df['true_sentiment'] == sentiment).sum())
        regression_rate = count / total_sentiment * 100
        print(f"  {sentiment}: {count}/{total_sentiment} cases ({regression_rate:.1f}%)")

    # What the translated text was misclassified as
    print("\nTRANSLATED PREDICTION ERRORS (ORIGINALLY CORRECT):")
    confusion_counts = regression_samples.groupby(['true_sentiment', 'translated_pred']).size()
    for (true_sent, pred_sent), count in confusion_counts.items():
        print(f"  True: {true_sent} → Translated predicted: {pred_sent} ({count} cases)")

    print("\n" + wide_rule)
    print("DETAILED EXAMPLES OF REGRESSIONS")
    print(wide_rule)

    # Show at most the first 10 regressed rows
    first_rows = regression_samples.head(10).to_dict('records')
    for number, row in enumerate(first_rows, start=1):
        dialect_text = row['dialect']
        translated_text = row['translated']

        print("\n" + narrow_rule)
        print(f"REGRESSION EXAMPLE {number}")
        print(narrow_rule)
        print(f"True Sentiment: {row['true_sentiment']}")
        print("Original Dialect Text:")
        print(f"   '{dialect_text}'")
        print("Translated Text:")
        print(f"   '{translated_text}'")
        print(f"[CORRECT] Dialect Prediction: {row['dialect_pred']}")
        print(f"[INCORRECT] Translated Prediction: {row['translated_pred']}")
        print(f"Standard Text Prediction: {row['standard_pred']}")

        # Word counts before and after translation
        dialect_len = len(dialect_text.split())
        translated_len = len(translated_text.split())
        print(f"Text Length: Dialect ({dialect_len} words) → Translated ({translated_len} words)")

    return regression_samples

def analyze_translated_errors(analysis_df, max_examples=20):
    """Print a report on every sample whose translated-text prediction is wrong.

    The report covers an error breakdown (still wrong vs. newly wrong),
    per-sentiment error rates, confusion patterns, a comparison with the
    dialect predictions, detailed examples and summary statistics.

    Args:
        analysis_df: DataFrame with the columns 'dialect', 'standard',
            'translated', 'true_sentiment', 'dialect_pred', 'translated_pred',
            'standard_pred', 'dialect_correct' and 'translated_correct'.
        max_examples: Upper bound on the number of detailed examples printed;
            up to ``max_examples // 3`` rows are taken per true sentiment.

    Returns:
        A copy of the erroneous rows with added 'dialect_length' and
        'translated_length' word-count columns, or ``None`` when there are no
        errors.

    Raises:
        ZeroDivisionError: If ``analysis_df`` is empty (the error rate divides
            by the number of rows).
    """

    # Filter cases where translated prediction is wrong
    translated_errors = analysis_df[~analysis_df['translated_correct']].copy()

    print(f"\n" + "="*80)
    print("COMPREHENSIVE ANALYSIS OF TRANSLATED TEXT PREDICTION ERRORS")
    print("="*80)
    print(f"Total error cases: {len(translated_errors)} out of {len(analysis_df)} samples")
    print(f"Error rate: {len(translated_errors)/len(analysis_df)*100:.2f}%")

    if len(translated_errors) == 0:
        print("No prediction errors found in translated texts.")
        return

    # Classify errors by type
    # NOTE: despite its name, `improvement_cases` holds the rows that stayed wrong
    improvement_cases = translated_errors[~translated_errors['dialect_correct']]  # dialect wrong, translated also wrong
    regression_cases = translated_errors[translated_errors['dialect_correct']]   # dialect correct, translated wrong

    print(f"\nERROR BREAKDOWN:")
    print(f"  Still wrong after translation: {len(improvement_cases)} cases")
    print(f"  Became wrong after translation: {len(regression_cases)} cases")

    # Statistics of errors by sentiment
    print(f"\nERRORS BY TRUE SENTIMENT:")
    for sentiment in sorted(translated_errors['true_sentiment'].unique()):
        sentiment_errors = translated_errors[translated_errors['true_sentiment'] == sentiment]
        total_sentiment = len(analysis_df[analysis_df['true_sentiment'] == sentiment])
        error_rate = len(sentiment_errors) / total_sentiment * 100
        print(f"  {sentiment}: {len(sentiment_errors)}/{total_sentiment} errors ({error_rate:.1f}%)")

        # Detailed prediction patterns for this sentiment
        prediction_patterns = sentiment_errors['translated_pred'].value_counts()
        for pred, count in prediction_patterns.items():
            print(f"    → Predicted as {pred}: {count} cases")

    # Confusion patterns (true sentiment vs. translated prediction), most frequent first
    print(f"\nCONFUSION PATTERNS IN TRANSLATED ERRORS:")
    error_patterns = translated_errors.groupby(['true_sentiment', 'translated_pred']).size().sort_values(ascending=False)
    for (true_sent, pred_sent), count in error_patterns.items():
        percentage = count / len(translated_errors) * 100
        print(f"  True: {true_sent} → Predicted: {pred_sent} ({count} cases, {percentage:.1f}%)")

    # Compare with dialect predictions to understand patterns
    print(f"\nCOMPARISON WITH DIALECT PREDICTIONS:")
    print("Cases where both dialect and translated are wrong:")
    # Same filter as `improvement_cases` above
    both_wrong = translated_errors[~translated_errors['dialect_correct']]
    same_wrong_prediction = both_wrong[both_wrong['dialect_pred'] == both_wrong['translated_pred']]
    different_wrong_prediction = both_wrong[both_wrong['dialect_pred'] != both_wrong['translated_pred']]

    print(f"  Same wrong prediction: {len(same_wrong_prediction)} cases")
    print(f"  Different wrong predictions: {len(different_wrong_prediction)} cases")

    # Detailed examples
    print(f"\n" + "="*80)
    print("DETAILED ERROR EXAMPLES")
    print("="*80)

    # Get diverse examples
    examples_to_show = []

    # Get examples from each sentiment (the per-sentiment quota assumes three classes)
    for sentiment in sorted(translated_errors['true_sentiment'].unique()):
        sentiment_errors = translated_errors[translated_errors['true_sentiment'] == sentiment]
        examples_to_show.extend(sentiment_errors.head(max_examples//3).to_dict('records'))

    # Limit total number of examples
    examples_to_show = examples_to_show[:max_examples]

    for i, example in enumerate(examples_to_show, 1):
        print(f"\n{'='*60}")
        print(f"ERROR EXAMPLE {i}")
        print(f"{'='*60}")
        print(f"True Sentiment: {example['true_sentiment']}")
        print(f"Original Dialect Text:")
        print(f"   '{example['dialect']}'")
        print(f"Translated Text:")
        print(f"   '{example['translated']}'")
        print(f"Standard Text:")
        print(f"   '{example['standard']}'")

        # Predictions comparison
        dialect_status = "[CORRECT]" if example['dialect_correct'] else "[INCORRECT]"
        translated_status = "[INCORRECT]"  # Always wrong because these are error cases
        standard_status = "[CORRECT]" if example['standard_pred'] == example['true_sentiment'] else "[INCORRECT]"

        print(f"Predictions:")
        print(f"   Dialect: {example['dialect_pred']} {dialect_status}")
        print(f"   Translated: {example['translated_pred']} {translated_status}")
        print(f"   Standard: {example['standard_pred']} {standard_status}")

        # Text characteristics (word counts; empty text counts as 0 words)
        dialect_len = len(example['dialect'].split()) if example['dialect'] else 0
        translated_len = len(example['translated'].split()) if example['translated'] else 0
        standard_len = len(example['standard'].split()) if example['standard'] else 0

        print(f"Text Lengths:")
        print(f"   Dialect: {dialect_len} words")
        print(f"   Translated: {translated_len} words")
        print(f"   Standard: {standard_len} words")

        # Error type classification
        if example['dialect_correct']:
            error_type = "REGRESSION (was correct, became wrong)"
        else:
            if example['dialect_pred'] == example['translated_pred']:
                error_type = "PERSISTENT ERROR (same wrong prediction)"
            else:
                error_type = "DIFFERENT ERROR (different wrong prediction)"

        print(f"Error Type: {error_type}")

    # Final statistics
    print(f"\n" + "="*80)
    print("ERROR STATISTICS SUMMARY")
    print("="*80)

    # Text length analysis (str() guards against non-string cells)
    translated_errors['dialect_length'] = translated_errors['dialect'].apply(lambda x: len(str(x).split()))
    translated_errors['translated_length'] = translated_errors['translated'].apply(lambda x: len(str(x).split()))

    print(f"Average text length in error cases:")
    print(f"  Dialect: {translated_errors['dialect_length'].mean():.1f} words")
    print(f"  Translated: {translated_errors['translated_length'].mean():.1f} words")

    # Most problematic sentiments: error rate per true sentiment, highest first
    error_rates_by_sentiment = {}
    for sentiment in analysis_df['true_sentiment'].unique():
        total = len(analysis_df[analysis_df['true_sentiment'] == sentiment])
        errors = len(translated_errors[translated_errors['true_sentiment'] == sentiment])
        error_rates_by_sentiment[sentiment] = errors / total * 100

    print(f"\nError rates by sentiment:")
    for sentiment, rate in sorted(error_rates_by_sentiment.items(), key=lambda x: x[1], reverse=True):
        print(f"  {sentiment}: {rate:.1f}%")

    return translated_errors

def _print_banner(title, width=50):
    """Print a blank line, then *title* framed by two '=' rules of *width* characters."""
    rule = "=" * width
    print("\n" + rule)
    print(title)
    print(rule)


def main():
    """Run the end-to-end extrinsic sentiment evaluation and save the results.

    Steps:
      1. Load the test CSV and drop rows that contain missing values.
      2. Translate the dialect texts with ``translate_all_texts``.
      3. Predict sentiment for the dialect, translated and standard texts
         with ``parallel_sentiment_analysis``.
      4. Score each group against the gold labels with
         ``evaluate_predictions`` and print summary / count / timing tables.
      5. Run the improvement, regression and translated-error analyses and
         write every result table to CSV in the current working directory.

    Returns:
        None. Results are printed and written to CSV files.

    Raises:
        ValueError: If no rows remain after dropping missing values, since
            every rate reported below divides by the number of samples.
    """
    start_time = time.time()

    # Re-apply the seed through the shared helper so this run uses exactly
    # the same seeding procedure as the module-level setup.
    set_seed(RANDOM_SEED)

    # Read CSV file
    print("Reading CSV file...")
    df = pd.read_csv("https://huggingface.co/datasets/Biu3010/ViDia2Std/resolve/main/test.csv").dropna()

    # Fail early with a clear message instead of a ZeroDivisionError after
    # the (expensive) translation and inference steps.
    if df.empty:
        raise ValueError("No samples left after dropping rows with missing values; cannot evaluate.")

    n_samples = len(df)
    print(f"Loaded {n_samples} samples")
    print("Sentiment distribution:")
    print(df['sentiment'].value_counts())

    # Get data
    dialect_texts = df['dialect'].tolist()
    standard_texts = df['standard'].tolist()
    true_labels = df['sentiment'].tolist()

    _print_banner("PERFORMING SENTIMENT ANALYSIS WITH BATCH OPTIMIZATION")

    # 1. Translate dialect texts with batch processing
    print("\n1. Translating dialect texts to standard...")
    translation_start = time.time()
    translated_texts = translate_all_texts(dialect_texts, batch_size=32)
    translation_time = time.time() - translation_start
    print(f"Translation completed in {translation_time:.2f} seconds")

    # 2. Parallel sentiment analysis for 3 text groups
    print("\n2. Performing parallel sentiment analysis...")
    sentiment_start = time.time()

    text_groups = [dialect_texts, translated_texts, standard_texts]
    group_names = ['dialect', 'translated', 'standard']

    # The result is indexed by the group names passed in.
    sentiment_results = parallel_sentiment_analysis(text_groups, group_names)

    dialect_pred_labels = sentiment_results['dialect']
    translated_pred_labels = sentiment_results['translated']
    standard_pred_labels = sentiment_results['standard']

    sentiment_time = time.time() - sentiment_start
    print(f"Sentiment analysis completed in {sentiment_time:.2f} seconds")

    # Build the per-sample table once; the count statistics and the three
    # detailed analyses below all read from it.
    analysis_df = pd.DataFrame({
        'dialect': dialect_texts,
        'standard': standard_texts,
        'translated': translated_texts,
        'true_sentiment': true_labels,
        'dialect_pred': dialect_pred_labels,
        'translated_pred': translated_pred_labels,
        'standard_pred': standard_pred_labels,
    })
    analysis_df['dialect_correct'] = (analysis_df['true_sentiment'] == analysis_df['dialect_pred'])
    analysis_df['translated_correct'] = (analysis_df['true_sentiment'] == analysis_df['translated_pred'])
    # "Improvement" = wrong on the dialect text but right after translation.
    analysis_df['improvement'] = analysis_df['translated_correct'] & ~analysis_df['dialect_correct']

    # 3. Evaluate results
    _print_banner("EVALUATION RESULTS")

    # Compare with true labels
    metrics_dialect = evaluate_predictions(true_labels, dialect_pred_labels, "Dialect Texts vs True Labels")
    metrics_translated = evaluate_predictions(true_labels, translated_pred_labels, "Translated Texts vs True Labels")
    metrics_standard = evaluate_predictions(true_labels, standard_pred_labels, "Standard Texts vs True Labels")

    # Direct comparison between dialect and translated
    _print_banner("DIRECT COMPARISON", width=30)

    # Agreement between dialect and translated predictions
    agreement = sum(1 for a, b in zip(dialect_pred_labels, translated_pred_labels) if a == b)
    agreement_rate = agreement / n_samples
    print(f"\nAgreement between dialect and translated predictions: {agreement_rate:.4f} ({agreement}/{n_samples})")

    # Create summary table (one row per evaluated text group)
    metrics_by_method = [
        ('Dialect Texts', metrics_dialect),
        ('Translated Texts', metrics_translated),
        ('Standard Texts', metrics_standard),
    ]
    summary_df = pd.DataFrame({
        'Method': [method for method, _ in metrics_by_method],
        'Accuracy': [m['accuracy'] for _, m in metrics_by_method],
        'Precision': [m['precision'] for _, m in metrics_by_method],
        'Recall': [m['recall'] for _, m in metrics_by_method],
        'F1-Score': [m['f1'] for _, m in metrics_by_method],
    })

    _print_banner("SUMMARY TABLE")
    print(summary_df.to_string(index=False, float_format='%.4f'))

    # Correct / incorrect sample counts before and after normalization
    _print_banner("PREDICTION CORRECT/INCORRECT COUNT STATISTICS")

    dialect_correct_count = analysis_df['dialect_correct'].sum()
    dialect_incorrect_count = n_samples - dialect_correct_count

    translated_correct_count = analysis_df['translated_correct'].sum()
    translated_incorrect_count = n_samples - translated_correct_count

    stats_df = pd.DataFrame({
        'Status': ['Correct Predictions', 'Incorrect Predictions', 'Total'],
        'Before Normalization (Dialect)': [dialect_correct_count, dialect_incorrect_count, n_samples],
        'After Normalization (Translated)': [translated_correct_count, translated_incorrect_count, n_samples],
    })

    print(stats_df.to_string(index=False))

    # Performance metrics
    total_time = time.time() - start_time
    _print_banner("PERFORMANCE METRICS", width=30)
    print(f"Total execution time: {total_time:.2f} seconds")
    print(f"Translation time: {translation_time:.2f} seconds")
    print(f"Sentiment analysis time: {sentiment_time:.2f} seconds")
    print(f"Average time per sample: {total_time / n_samples:.4f} seconds")

    # Improvement calculation
    acc_improvement = metrics_translated['accuracy'] - metrics_dialect['accuracy']
    f1_improvement = metrics_translated['f1'] - metrics_dialect['f1']

    _print_banner("IMPROVEMENT ANALYSIS", width=30)
    print(f"Accuracy improvement: {acc_improvement:+.4f}")
    print(f"F1-score improvement: {f1_improvement:+.4f}")

    # A tie (zero change) is reported as "does not improve".
    if acc_improvement > 0:
        print("Translation improves sentiment analysis accuracy.")
    else:
        print("Translation does not improve sentiment analysis accuracy.")

    # DETAILED ANALYSIS OF IMPROVEMENT CASES
    improved_cases = analyze_improvement_cases(analysis_df)

    # DETAILED ANALYSIS OF REGRESSION CASES (CASES THAT BECAME WORSE)
    regression_cases = analyze_regression_cases(analysis_df)

    # COMPREHENSIVE ANALYSIS OF ALL PREDICTION ERROR CASES AFTER TRANSLATION
    translated_error_cases = analyze_translated_errors(analysis_df, max_examples=15)

    # Save results
    analysis_df.to_csv('sentiment_analysis_results.csv', index=False, encoding='utf-8')
    summary_df.to_csv('sentiment_metrics_summary.csv', index=False)

    # Each detailed case table is written only when it has at least one row.
    if len(improved_cases) > 0:
        improved_cases.to_csv('improvement_cases_detailed.csv', index=False, encoding='utf-8')
        print("\nImprovement cases saved to 'improvement_cases_detailed.csv'")

    if len(regression_cases) > 0:
        regression_cases.to_csv('regression_cases_detailed.csv', index=False, encoding='utf-8')
        print("Regression cases saved to 'regression_cases_detailed.csv'")

    if len(translated_error_cases) > 0:
        translated_error_cases.to_csv('translated_error_cases_detailed.csv', index=False, encoding='utf-8')
        print("All translated error cases saved to 'translated_error_cases_detailed.csv'")

    print("\nDetailed results saved to 'sentiment_analysis_results.csv'")
    print("Summary metrics saved to 'sentiment_metrics_summary.csv'")

In [None]:
# Entry point: run the full evaluation only when this code is executed as the
# main program, not when it is imported as a module.
if __name__ == "__main__":
    main()