In [6]:
# ============================================================================
# HCR-SPA: Arabic Sentiment Swap - Complete Pipeline
# AraSentEval 2026: Subtask 2
# Task: Rewrite Arabic sentences to invert sentiment while preserving meaning
# ============================================================================
# Column formats:
#   Train/Val: id, source_polarity, source, target
#   Test:      id, text
# ============================================================================

# ============================================================================
# CELL 1: Install Required Packages
# ============================================================================

# !pip install -q transformers torch pandas openpyxl scikit-learn sentencepiece sacrebleu nltk

import nltk
nltk.download('punkt', quiet=True)
print("✓ All packages installed successfully!")


# ============================================================================
# CELL 2: Upload and Extract Dataset
# ============================================================================

from google.colab import files
import zipfile
import os

print("Please upload your dataset ZIP file:")
uploaded = files.upload()

for filename in uploaded.keys():
    print(f"\nExtracting {filename}...")
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall('/content/data')
    print("✓ Extraction complete!")

print("\nFiles in /content/data:")
for root, dirs, file_list in os.walk('/content/data'):
    for file in file_list:
        print(f"  - {os.path.join(root, file)}")


# ============================================================================
# CELL 3: Imports & Seeds
# ============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    MT5ForConditionalGeneration,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
import pandas as pd
import numpy as np
from dataclasses import dataclass
from tqdm.auto import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

torch.manual_seed(42)
np.random.seed(42)

print("✓ Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


# ============================================================================
# CELL 4: Configuration
# ============================================================================

@dataclass
class Config:
    # Model
    model_name: str = "google/mt5-small"   # Fits on T4 (16GB)

    # Training
    batch_size: int = 2
    gradient_accumulation_steps: int = 8   # Effective batch = 16
    learning_rate: float = 5e-5
    num_epochs: int = 5
    warmup_steps: int = 200
    gradient_clip: float = 1.0

    # Sequence
    max_length: int = 96

    # Generation
    num_beams: int = 4
    no_repeat_ngram_size: int = 2

    # Paths
    train_path: str = '/content/SentimentSwapSharedTaskTrain.xlsx'
    val_path:   str = '/content/SentimentSwapSharedTaskVal.xlsx'
    test_path:  str = '/content/SentimentSwapSharedTassTest.xlsx'
    save_path:  str = '/content/best_hcr_spa_model.pt'

config = Config()
print("✓ Configuration initialized")
print(f"  Model: {config.model_name}")
print(f"  Effective batch size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"  Max sequence length: {config.max_length}")



✓ All packages installed successfully!
Please upload your dataset ZIP file:


Saving SentimentSwapSharedTaskTrain.xlsx to SentimentSwapSharedTaskTrain.xlsx
Saving SentimentSwapSharedTaskVal.xlsx to SentimentSwapSharedTaskVal.xlsx
Saving SentimentSwapSharedTassTest.xlsx to SentimentSwapSharedTassTest.xlsx

Extracting SentimentSwapSharedTaskTrain.xlsx...
✓ Extraction complete!

Extracting SentimentSwapSharedTaskVal.xlsx...
✓ Extraction complete!

Extracting SentimentSwapSharedTassTest.xlsx...
✓ Extraction complete!

Files in /content/data:
  - /content/data/[Content_Types].xml
  - /content/data/_rels/.rels
  - /content/data/customXml/item1.xml
  - /content/data/customXml/itemProps1.xml
  - /content/data/customXml/_rels/item1.xml.rels
  - /content/data/xl/styles.xml
  - /content/data/xl/sharedStrings.xml
  - /content/data/xl/workbook.xml
  - /content/data/xl/connections.xml
  - /content/data/xl/_rels/workbook.xml.rels
  - /content/data/xl/tables/table1.xml
  - /content/data/xl/tables/_rels/table1.xml.rels
  - /content/data/xl/theme/theme1.xml
  - /content/data/xl/q

In [None]:

# ============================================================================
# CELL 5: Dataset Class
# ============================================================================

class SentimentSwapDataset(Dataset):
    """
    Handles both train/val (has source_polarity, source, target)
    and test (has id, text) splits.
    """

    POLARITY_AR = {"Positive": "سلبي", "Negative": "إيجابي"}

    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_length: int = 96, split: str = "train"):
        """
        Args:
            split: 'train', 'val', or 'test'
        """
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split

        # ---- Detect column names robustly ----
        cols = {c.lower().strip(): c for c in self.data.columns}

        # id column
        self.id_col = cols.get('id', self.data.columns[0])

        # source text column — train/val use 'source', test uses 'text'
        if 'source' in cols:
            self.source_col = cols['source']
        elif 'text' in cols:
            self.source_col = cols['text']
        else:
            # fallback: first object column that isn't id
            for c in self.data.columns:
                if c != self.id_col and self.data[c].dtype == object:
                    self.source_col = c
                    break

        # polarity column (train/val only)
        self.polarity_col = cols.get('source_polarity', None)

        # target column (train/val only)
        self.target_col = cols.get('target', None)

        print(f"  [{split}] Columns detected → id='{self.id_col}', "
              f"source='{self.source_col}', polarity='{self.polarity_col}', "
              f"target='{self.target_col}' | rows={len(self.data)}")

    def _build_prompt(self, source_text: str, polarity: str) -> str:
        target_pol_ar = self.POLARITY_AR.get(polarity, "سلبي")
        return f"حول النص التالي إلى {target_pol_ar}: {source_text}"

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        source_text = str(row[self.source_col])
        item_id = row[self.id_col]

        # Polarity (test data has no polarity — default to Positive so we flip to Negative)
        if self.polarity_col and self.polarity_col in row.index and pd.notna(row[self.polarity_col]):
            polarity = str(row[self.polarity_col]).strip()
        else:
            polarity = "Positive"  # default for test set

        input_text = self._build_prompt(source_text, polarity)

        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'input_ids':      encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'source_text':    source_text,
            'polarity':       polarity,
            'id':             item_id,
        }

        # Labels for train/val
        if self.split in ('train', 'val') and self.target_col and pd.notna(row.get(self.target_col, None)):
            target_text = str(row[self.target_col])
            target_enc = self.tokenizer(
                target_text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            labels = target_enc['input_ids'].squeeze()
            # Replace padding token id with -100 so loss ignores padding
            labels[labels == self.tokenizer.pad_token_id] = -100
            item['labels'] = labels
            item['target_text'] = target_text

        return item

print("✓ Dataset class defined")


# ============================================================================
# CELL 6: Model Wrapper
# ============================================================================

class HCRSPA_Model:
    """Lightweight MT5 wrapper with OOM protection."""

    def __init__(self, config: Config, device: torch.device):
        self.config = config
        self.device = device

        print(f"  Loading {config.model_name} ...")
        self.model = MT5ForConditionalGeneration.from_pretrained(config.model_name)
        self.model.gradient_checkpointing_enable()
        self.model = self.model.to(device)

        total = sum(p.numel() for p in self.model.parameters())
        print(f"  ✓ Parameters: {total:,}")
        if torch.cuda.is_available():
            print(f"  ✓ GPU memory used: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    # ---- delegation helpers ----
    def train(self):  self.model.train()
    def eval(self):   self.model.eval()
    def parameters(self): return self.model.parameters()
    def state_dict(self): return self.model.state_dict()
    def load_state_dict(self, sd): self.model.load_state_dict(sd)

    def forward(self, input_ids, attention_mask, labels=None):
        try:
            return self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                return_dict=True,
            )
        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache(); gc.collect()
                print("⚠ OOM — skipping batch")
                return None
            raise

    def generate(self, input_ids, attention_mask):
        try:
            return self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=self.config.max_length,
                num_beams=self.config.num_beams,
                no_repeat_ngram_size=self.config.no_repeat_ngram_size,
                early_stopping=True,
            )
        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache()
                # greedy fallback
                return self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=self.config.max_length,
                    num_beams=1,
                )
            raise

print("✓ Model wrapper defined")


# ============================================================================
# CELL 7: Training & Validation Functions
# ============================================================================

def run_train_epoch(model, loader, optimizer, scheduler, device, config, epoch):
    model.train()
    total_loss, valid_batches, oom_count = 0.0, 0, 0
    optimizer.zero_grad()

    pbar = tqdm(loader, desc=f"Train Epoch {epoch}")
    for step, batch in enumerate(pbar):
        if 'labels' not in batch:
            continue  # skip samples without target

        outputs = model.forward(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )

        if outputs is None:
            oom_count += 1
            optimizer.zero_grad()
            continue

        loss = outputs.loss
        if torch.isnan(loss) or torch.isinf(loss):
            optimizer.zero_grad()
            continue

        (loss / config.gradient_accumulation_steps).backward()
        total_loss += loss.item()

        if (step + 1) % config.gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            valid_batches += 1
            pbar.set_postfix({'loss': f'{loss.item():.4f}', 'oom': oom_count})

        if (step + 1) % 50 == 0:
            torch.cuda.empty_cache()

    avg_loss = total_loss / max(valid_batches * config.gradient_accumulation_steps, 1)
    print(f"  → Train loss: {avg_loss:.4f}  (oom: {oom_count})")
    return avg_loss


def run_validation(model, loader, device):
    model.eval()
    total_loss, count = 0.0, 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            if 'labels' not in batch:
                continue
            outputs = model.forward(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            if outputs is not None and not torch.isnan(outputs.loss):
                total_loss += outputs.loss.item()
                count += 1

    avg_loss = total_loss / max(count, 1)
    print(f"  → Val loss: {avg_loss:.4f}")
    return avg_loss

print("✓ Training functions defined")


# ============================================================================
# CELL 8: Main Training Loop
# ============================================================================

def train_model():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"\n{'='*70}")
    print("HCR-SPA TRAINING")
    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
    print(f"Device: {gpu_name}")
    if torch.cuda.is_available():
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"{'='*70}\n")

    torch.cuda.empty_cache(); gc.collect()

    # ---- Load data ----
    print("[1/4] Loading data ...")
    train_df = pd.read_excel(config.train_path)
    val_df   = pd.read_excel(config.val_path)
    print(f"  Train: {len(train_df)} rows | Val: {len(val_df)} rows")
    print(f"  Train columns: {train_df.columns.tolist()}")
    print(f"  Val columns:   {val_df.columns.tolist()}")

    # ---- Tokenizer ----
    print("\n[2/4] Loading tokenizer ...")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    print("  ✓ Tokenizer loaded")

    # ---- Datasets ----
    print("\n[3/4] Creating datasets ...")
    train_ds = SentimentSwapDataset(train_df, tokenizer, config.max_length, split='train')
    val_ds   = SentimentSwapDataset(val_df,   tokenizer, config.max_length, split='val')

    train_loader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True,  num_workers=2, pin_memory=True)
    val_loader   = DataLoader(val_ds,   batch_size=config.batch_size, shuffle=False, num_workers=2, pin_memory=True)
    print(f"  Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

    # ---- Model ----
    print("\n[4/4] Initializing model ...")
    model = HCRSPA_Model(config, device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0.01)
    total_steps = len(train_loader) * config.num_epochs // config.gradient_accumulation_steps
    scheduler = get_linear_schedule_with_warmup(optimizer, config.warmup_steps, total_steps)

    print(f"  Total optimizer steps: {total_steps} | Warmup: {config.warmup_steps}")

    # ---- Training ----
    print(f"\n{'='*70}")
    print(f"Training for {config.num_epochs} epochs ...")
    print(f"{'='*70}\n")

    best_val_loss = float('inf')
    history = []

    for epoch in range(1, config.num_epochs + 1):
        print(f"\nEpoch {epoch}/{config.num_epochs}")
        train_loss = run_train_epoch(model, train_loader, optimizer, scheduler, device, config, epoch)
        val_loss   = run_validation(model, val_loader, device)

        history.append({'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss})

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), config.save_path)
            print(f"  ✓ Saved best model (val_loss={best_val_loss:.4f})")

        torch.cuda.empty_cache(); gc.collect()

    print(f"\n{'='*70}")
    print(f"Training complete! Best val loss: {best_val_loss:.4f}")
    print(f"{'='*70}\n")

    return model, tokenizer


# Run training
model, tokenizer = train_model()


# ============================================================================
# CELL 9: Inference Function
# ============================================================================

def run_inference(model, tokenizer, dataframe: pd.DataFrame, split: str = 'test'):
    """
    Run inference on any dataframe.
    Returns a DataFrame with columns: id, source, polarity, prediction
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load best weights
    try:
        model.load_state_dict(torch.load(config.save_path, map_location=device))
        print("✓ Loaded best model weights")
    except Exception as e:
        print(f"⚠ Could not load saved weights ({e}) — using current model")

    model.eval()

    dataset = SentimentSwapDataset(dataframe, tokenizer, config.max_length, split=split)
    loader  = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

    ids, sources, polarities, predictions = [], [], [], []

    print(f"\nGenerating predictions for {len(dataset)} samples ...")
    with torch.no_grad():
        for batch in tqdm(loader, desc="Inference"):
            generated = model.generate(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

            predictions.extend(decoded)
            sources.extend(batch['source_text'])
            polarities.extend(batch['polarity'])

            bid = batch['id']
            ids.extend(bid.tolist() if isinstance(bid, torch.Tensor) else list(bid))

            if len(predictions) % 50 == 0:
                torch.cuda.empty_cache()

    print(f"✓ Generated {len(predictions)} predictions")

    result_df = pd.DataFrame({
        'id':         ids,
        'source':     sources,
        'polarity':   polarities,
        'prediction': predictions,
    })
    return result_df

print("✓ Inference function defined")


# ============================================================================
# CELL 10: Generate Predictions on Validation Set (with Quality Check)
# ============================================================================

val_df = pd.read_excel(config.val_path)
val_preds_df = run_inference(model, tokenizer, val_df, split='val')

# Sample display
print("\n" + "="*80)
print("VALIDATION SAMPLE PREDICTIONS")
print("="*80)

val_with_gt = val_df.copy()
for i in range(min(5, len(val_preds_df))):
    row = val_preds_df.iloc[i]
    print(f"\nExample {i+1} | ID: {row['id']} | Polarity: {row['polarity']}")
    print(f"  Source     : {row['source'][:120]}")
    print(f"  Prediction : {row['prediction'][:120]}")
    # Ground truth if available
    if 'target' in val_df.columns:
        gt_rows = val_df[val_df.iloc[:, 0] == row['id']]
        if not gt_rows.empty:
            gt = gt_rows['target'].values[0]
            if pd.notna(gt):
                print(f"  Ground Truth: {str(gt)[:120]}")

# Quality check
print("\n" + "="*80)
print("QUALITY CHECK (Validation)")
print("="*80)
empty = sum(1 for p in val_preds_df['prediction'] if not str(p).strip())
avg_len = val_preds_df['prediction'].apply(lambda x: len(str(x).split())).mean()
same = sum(1 for s, p in zip(val_preds_df['source'], val_preds_df['prediction']) if str(s).strip() == str(p).strip())
print(f"  Total predictions : {len(val_preds_df)}")
print(f"  Empty predictions : {empty}")
print(f"  Avg output length : {avg_len:.1f} words")
print(f"  Identical to source: {same}")


# ============================================================================
# CELL 11: Generate Predictions on TEST Set
# ============================================================================

print("\n" + "="*80)
print("GENERATING TEST SET PREDICTIONS")
print("="*80)

test_df = pd.read_excel(config.test_path)
print(f"Test columns: {test_df.columns.tolist()}")
print(f"Test samples: {len(test_df)}")
print(test_df.head(3))

test_preds_df = run_inference(model, tokenizer, test_df, split='test')

# Sample display
print("\nTest Sample Predictions:")
for i in range(min(5, len(test_preds_df))):
    row = test_preds_df.iloc[i]
    print(f"\n{i+1}. ID: {row['id']}")
    print(f"   Source     : {row['source'][:120]}")
    print(f"   Prediction : {row['prediction'][:120]}")

# Quality check
empty_t = sum(1 for p in test_preds_df['prediction'] if not str(p).strip())
avg_len_t = test_preds_df['prediction'].apply(lambda x: len(str(x).split())).mean()
same_t = sum(1 for s, p in zip(test_preds_df['source'], test_preds_df['prediction']) if str(s).strip() == str(p).strip())
print(f"\n  Total test predictions : {len(test_preds_df)}")
print(f"  Empty predictions      : {empty_t}")
print(f"  Avg output length      : {avg_len_t:.1f} words")
print(f"  Identical to source    : {same_t}")



✓ Dataset class defined
✓ Model wrapper defined
✓ Training functions defined

HCR-SPA TRAINING
Device: Tesla T4
GPU Memory: 15.64 GB

[1/4] Loading data ...
  Train: 6263 rows | Val: 1315 rows
  Train columns: ['id', 'source_polarity', 'source', 'target']
  Val columns:   ['id', 'source_polarity', 'source', 'target']

[2/4] Loading tokenizer ...




config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

  ✓ Tokenizer loaded

[3/4] Creating datasets ...
  [train] Columns detected → id='id', source='source', polarity='source_polarity', target='target' | rows=6263
  [val] Columns detected → id='id', source='source', polarity='source_polarity', target='target' | rows=1315
  Train batches: 3132 | Val batches: 658

[4/4] Initializing model ...
  Loading google/mt5-small ...


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/192 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  ✓ Parameters: 556,291,456
  ✓ GPU memory used: 2.23 GB
  Total optimizer steps: 1957 | Warmup: 200

Training for 5 epochs ...


Epoch 1/5


Train Epoch 1:   0%|          | 0/3132 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


  → Train loss: 11.9341  (oom: 0)


Validation:   0%|          | 0/658 [00:00<?, ?it/s]

  → Val loss: 2.0419
  ✓ Saved best model (val_loss=2.0419)

Epoch 2/5


Train Epoch 2:   0%|          | 0/3132 [00:00<?, ?it/s]

  → Train loss: 2.8546  (oom: 0)


Validation:   0%|          | 0/658 [00:00<?, ?it/s]

  → Val loss: 1.5868
  ✓ Saved best model (val_loss=1.5868)

Epoch 3/5


Train Epoch 3:   0%|          | 0/3132 [00:00<?, ?it/s]

  → Train loss: 2.3262  (oom: 0)


Validation:   0%|          | 0/658 [00:00<?, ?it/s]

  → Val loss: 1.4150
  ✓ Saved best model (val_loss=1.4150)

Epoch 4/5


Train Epoch 4:   0%|          | 0/3132 [00:00<?, ?it/s]

  → Train loss: 2.1330  (oom: 0)


Validation:   0%|          | 0/658 [00:00<?, ?it/s]

  → Val loss: 1.3662
  ✓ Saved best model (val_loss=1.3662)

Epoch 5/5


Train Epoch 5:   0%|          | 0/3132 [00:00<?, ?it/s]

  → Train loss: 2.0534  (oom: 0)


Validation:   0%|          | 0/658 [00:00<?, ?it/s]

  → Val loss: 1.3453
  ✓ Saved best model (val_loss=1.3453)

Training complete! Best val loss: 1.3453

✓ Inference function defined
✓ Loaded best model weights
  [val] Columns detected → id='id', source='source', polarity='source_polarity', target='target' | rows=1315

Generating predictions for 1315 samples ...


Inference:   0%|          | 0/1315 [00:00<?, ?it/s]

✓ Generated 1315 predictions

VALIDATION SAMPLE PREDICTIONS

Example 1 | ID: 2 | Polarity: Positive
  Source     : انصح كل شخص يعاني من زيادة في شحوم البطن والام في الظهر باقتناء هذه السلعة الرائعة
  Prediction : لا انصح كل شخص يعاني من زيادة في شحوم البطن والام في الظهر باقتناء هذه السلعة الرائعة
  Ground Truth: لا انصح كل شخص يعاني من زيادة في شحوم البطن والام في الظهر باقتناء هذه السلعة

Example 2 | ID: 3 | Polarity: Positive
  Source     : لعبة تستحق التجربة وبسعر قليل
  Prediction : لعبة تستحق التجربة وبسعر قليل
  Ground Truth: لعبة لا تستحق التجربة وبسعر مرتفع

Example 3 | ID: 4 | Polarity: Positive
  Source     : الافطار لذيذ جميل ومتنوع بكل شي خاصة ان فيه اطلالة على الحرم
  Prediction : لا الافطار سيء ومتنوع بكل شي خاصة ان فيه اطلالة على الحرم
  Ground Truth: الإفطار غير لذيذ وغير جميل وغير متنوع ولا يطل على الحرم

Example 4 | ID: 5 | Polarity: Positive
  Source     : رواية يلفها الغموض من البداية. من نوع الخيال العلمي الرائع
  Prediction : رواية يلفها الغموض من البداية. من نوع

Inference:   0%|          | 0/646 [00:00<?, ?it/s]

In [8]:
# ============================================================================
# CELL 12: Save and Download Submission Files (FIXED)
# ============================================================================

import zipfile as ziplib

# ---- Validation submission ----
val_csv_path = '/content/HCR_SPA_subtask2_dev_pred_1.csv'
val_zip_path = '/content/HCR_SPA_subtask2_dev_pred_1.zip'

# FIX: Use utf-8-sig encoding to prevent corruption
val_preds_df.to_csv(val_csv_path, index=False, encoding='utf-8-sig')
with ziplib.ZipFile(val_zip_path, 'w', ziplib.ZIP_DEFLATED) as z:
    z.write(val_csv_path, 'HCR_SPA_subtask2_dev_pred_1.csv')
print(f"✓ Validation submission saved: {val_zip_path}")

# ---- Test submission ----
test_csv_path = '/content/HCR_SPA_subtask2_test_pred_1.csv'
test_zip_path = '/content/HCR_SPA_subtask2_test_pred_1.zip'

# FIX: Use utf-8-sig encoding to prevent corruption
test_preds_df.to_csv(test_csv_path, index=False, encoding='utf-8-sig')
with ziplib.ZipFile(test_zip_path, 'w', ziplib.ZIP_DEFLATED) as z:
    z.write(test_csv_path, 'HCR_SPA_subtask2_test_pred_1.csv')
print(f"✓ Test submission saved: {test_zip_path}")

# Verify the files are readable
print("\n" + "="*80)
print("VERIFICATION: Reading saved files to check encoding")
print("="*80)

# Test reading the validation file
val_check = pd.read_csv(val_csv_path, encoding='utf-8-sig')
print(f"\nValidation file check:")
print(f"  Rows: {len(val_check)}")
print(f"  Sample text (first row): {val_check.iloc[0]['prediction'][:50]}")

# Test reading the test file
test_check = pd.read_csv(test_csv_path, encoding='utf-8-sig')
print(f"\nTest file check:")
print(f"  Rows: {len(test_check)}")
print(f"  Sample text (first row): {test_check.iloc[0]['prediction'][:50]}")

# ---- Download ----
try:
    from google.colab import files
    files.download(val_zip_path)
    files.download(test_zip_path)
    print("\n✓ Files downloaded successfully!")
except Exception as e:
    print(f"\n⚠ Auto-download failed ({e})")
    print(f"  Manually download from: {val_zip_path} and {test_zip_path}")

print("\n" + "="*80)
print("COMPLETE! Submission files ready.")
print("="*80)

✓ Validation submission saved: /content/HCR_SPA_subtask2_dev_pred_1.zip
✓ Test submission saved: /content/HCR_SPA_subtask2_test_pred_1.zip

VERIFICATION: Reading saved files to check encoding

Validation file check:
  Rows: 1315
  Sample text (first row): لا انصح كل شخص يعاني من زيادة في شحوم البطن والام 

Test file check:
  Rows: 646
  Sample text (first row): سويفل حاجة سيئة جدا 😃


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✓ Files downloaded successfully!

COMPLETE! Submission files ready.
