# PIXEL-M4 Finetuning for SIB-200 Classification

This notebook demonstrates how to finetune a PIXEL-M4 model for SIB-200 topic classification (for a single chosen language) using bigrams rendering. It is based on the `run_sib_bigrams.py` script, but is configured for notebook use with a simple in-notebook configuration object instead of command-line arguments.

## 1. Import Required Libraries

First, let's import all the necessary libraries including transformers, torch, datasets, and PIXEL-specific components.

In [1]:
import os
import sys
import copy
import random
import logging
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import numpy as np
import datasets
import transformers
from datasets import load_dataset, get_dataset_config_names, load_from_disk
from evaluate import load as load_metric
from PIL import Image
from sklearn.metrics import f1_score as compute_f1_score

# PIXEL imports
from pixel import (
    AutoConfig,
    AutoModelForSequenceClassification,
    Modality,
    PangoCairoTextRenderer,
    PIXELForSequenceClassification,
    PIXELTrainer,
    PIXELTrainingArguments,
    PoolingMode,
    PyGameTextRenderer,
    get_attention_mask,
    get_transforms,
    glue_strip_spaces,
    resize_model_embeddings,
)
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    EvalPrediction,
    PretrainedConfig,
    PreTrainedTokenizerFast,
    set_seed,
)
from pixel.data.rendering.pangocairo_renderer_bigrams_iso_char import PangoCairoTextRenderer as PangoCairoBigramsRenderer

# Enable auto-reload for development: with mode 2, IPython re-imports modified
# modules before running each cell, so edits to imported source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Only reached if every import above resolved.
print("All libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


All libraries imported successfully!


## 2. Configuration Setup

Configure all training parameters directly in the notebook instead of using command-line arguments.

In [2]:
# Configuration constants
SIB_200_HF_ID = "Davlan/sib200"

# Configuration parameters
@dataclass
class Config:
    """Model, data and training settings for this notebook.

    Stands in for the command-line arguments of the original script: every
    tunable value lives here as a field with a default. Override any of them
    by passing keyword arguments, e.g. ``Config(language="eng_Latn")``.

    ``output_dir`` may be left as ``None``; it is then derived from
    ``language`` so that runs for different languages do not share a
    directory.
    """

    # Model arguments
    model_name_or_path: str = "../Team-PIXEL/pixel-m4"
    config_name: Optional[str] = None
    processor_name: Optional[str] = None
    rendering_backend: str = "bigrams"  # Use bigrams rendering
    fallback_fonts_dir: str = "../fallback_fonts"
    render_rgb: bool = False
    cache_dir: Optional[str] = None
    model_revision: str = "main"
    use_auth_token: Optional[str] = None
    pooling_mode: str = "mean"
    pooler_add_layer_norm: bool = True
    dropout_prob: float = 0.1

    # Data arguments
    data_dir: Optional[str] = None
    language: str = "arz_Arab"  # Arabic (Egypt) as example
    dataset_name: str = SIB_200_HF_ID
    dataset_config_name: Optional[str] = None
    max_seq_length: int = 196
    overwrite_cache: bool = False
    pad_to_max_length: bool = True
    max_train_samples: Optional[int] = None
    max_eval_samples: Optional[int] = None
    max_predict_samples: Optional[int] = None
    train_file: Optional[str] = None
    validation_file: Optional[str] = None
    test_file: Optional[str] = None

    # Training arguments
    # None means "derive from `language`"; resolved in __post_init__ so the
    # directory always matches the language being trained.
    output_dir: Optional[str] = None
    overwrite_output_dir: bool = True
    do_train: bool = True
    do_eval: bool = True
    do_predict: bool = True
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16
    gradient_accumulation_steps: int = 1
    learning_rate: float = 5e-5
    num_train_epochs: int = 10
    max_steps: int = -1
    warmup_steps: int = 100
    logging_steps: int = 100
    eval_steps: int = 500
    save_steps: int = 500
    evaluation_strategy: str = "steps"
    save_strategy: str = "steps"
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_f1"
    greater_is_better: bool = True
    early_stopping: bool = True
    early_stopping_patience: int = 3
    seed: int = 42
    # Evaluated once, when the class body runs: on by default only with CUDA.
    fp16: bool = torch.cuda.is_available()
    dataloader_num_workers: int = 4
    remove_unused_columns: bool = False
    log_predictions: bool = False
    report_to: str = "none"  # Can be "wandb" or "tensorboard" if desired

    def __post_init__(self) -> None:
        """Fill in ``output_dir`` from ``language`` when it was not given."""
        if self.output_dir is None:
            self.output_dir = f"../logs/pixel-m4/sib-200/{self.language}"

# Instantiate the settings object with its defaults
config = Config()

# Echo the key settings so the cell output records what this run used
device_label = "GPU" if torch.cuda.is_available() else "CPU"
summary_lines = [
    f"Configuration set for language: {config.language}",
    f"Model path: {config.model_name_or_path}",
    f"Output directory: {config.output_dir}",
    f"Rendering backend: {config.rendering_backend}",
    f"Max sequence length: {config.max_seq_length}",
    f"Device: {device_label}",
]
print("\n".join(summary_lines))

Configuration set for language: arz_Arab
Model path: ../Team-PIXEL/pixel-m4
Output directory: ../logs/pixel-m4/sib-200/arz_Arab
Rendering backend: bigrams
Max sequence length: 196
Device: GPU


## 3. Setup Logging and Seed

Initialize logging and set random seed for reproducibility.

In [3]:
# Root logging: timestamped records written to stdout
log_level = logging.INFO
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=log_level,
    handlers=[stdout_handler],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)
logger.setLevel(log_level)

# Bring the datasets / transformers loggers to the same verbosity
for library_logging in (datasets.utils.logging, transformers.utils.logging):
    library_logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Seed the random number generators for reproducibility
set_seed(config.seed)

# Report the run environment
device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else 'CPU'
startup_messages = (
    f"Training parameters configured",
    f"Seed set to: {config.seed}",
    f"Device: {device_name}",
    f"Using FP16: {config.fp16}",
)
for startup_message in startup_messages:
    logger.info(startup_message)

print("Logging and seed setup complete!")

11/15/2025 04:58:24 - INFO - __main__ - Training parameters configured
11/15/2025 04:58:24 - INFO - __main__ - Seed set to: 42
11/15/2025 04:58:24 - INFO - __main__ - Seed set to: 42
11/15/2025 04:58:24 - INFO - __main__ - Device: NVIDIA GeForce RTX 3090
11/15/2025 04:58:24 - INFO - __main__ - Using FP16: True
Logging and seed setup complete!
11/15/2025 04:58:24 - INFO - __main__ - Device: NVIDIA GeForce RTX 3090
11/15/2025 04:58:24 - INFO - __main__ - Using FP16: True
Logging and seed setup complete!


## 4. Data Loading

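Load the SIB-200 dataset for the specified language and prepare the data: each example's `category` is mapped to an integer `label`, and its `text` is transliterated with the `ar2hsb` character mapper.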

In [16]:
from camel_tools.utils.charmap import CharMapper

# Character mapper applied to every example's text. 'ar2hsb' is a CAMeL Tools
# built-in; the sample printed by this cell shows Arabic-script input coming out
# romanised.
# NOTE(review): the mapper is applied whatever `config.language` is — confirm
# that this is intended for languages not written in Arabic script.
transliterator = CharMapper.builtin_mapper('ar2hsb')


def add_label_id(example):
    """Return the integer ``label`` for an example's ``category``.

    Looks the category up in the module-level ``category2id``, which is built
    from the training split before this function is mapped over the data.
    """
    return {"label": category2id[example["category"]]}


def transliterate_text(example):
    """Return the example's ``text`` passed through ``transliterator``."""
    return {"text": transliterator(example["text"])}


# Both data sources are organised per language, so one is always required.
if config.language is None:
    raise ValueError("config.language must be set to load the SIB-200 dataset")

# Load the SIB-200 dataset
if config.data_dir is None:
    # Load from HuggingFace Hub (the language code is the dataset config name)
    logger.info(f"Loading SIB-200 dataset for language: {config.language}")
    raw_datasets = load_dataset(config.dataset_name, config.language, cache_dir=config.cache_dir)
else:
    # Load from local directory: <data_dir>/<language>
    lang_data_dir = os.path.join(os.path.abspath(config.data_dir), config.language)
    raw_datasets = load_from_disk(lang_data_dir)

# Create label mapping; sorting keeps the category -> id assignment stable
# across runs. The mapping is built from the training split only.
categories = sorted(raw_datasets["train"].unique("category"))
category2id = {category: idx for idx, category in enumerate(categories)}

# Shared post-processing for both sources. The cell reloads `raw_datasets`
# from scratch each time it runs, so re-running never transliterates twice.
raw_datasets = raw_datasets.map(add_label_id)
raw_datasets = raw_datasets.map(transliterate_text)

# Get labels and number of classes
label_list = sorted(raw_datasets["train"].unique("label"))
num_labels = len(label_list)

print("Dataset loaded successfully!")
print(f"Number of labels: {num_labels}")
print(f"Categories: {categories}")
print(f"Dataset splits: {list(raw_datasets.keys())}")
print(f"Training examples: {len(raw_datasets['train'])}")
print(f"Validation examples: {len(raw_datasets['validation'])}")
print(f"Test examples: {len(raw_datasets['test'])}")

# Show a sample from the dataset (text truncated to 100 characters)
sample = raw_datasets["train"][0]
print("\nSample data point:")
print(f"Text: '{sample['text'][:100]}...'")
print(f"Category: {sample['category']}")
print(f"Label: {sample['label']}")

11/15/2025 05:11:07 - INFO - __main__ - Loading SIB-200 dataset for language: arz_Arab


Loading Dataset Infos from /home/bens/miniconda3/envs/pixel-m4/lib/python3.9/site-packages/datasets/packaged_modules/csv


11/15/2025 05:11:09 - INFO - datasets.info - Loading Dataset Infos from /home/bens/miniconda3/envs/pixel-m4/lib/python3.9/site-packages/datasets/packaged_modules/csv


Overwrite dataset info from restored data version if exists.


11/15/2025 05:11:09 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358


11/15/2025 05:11:09 - INFO - datasets.info - Loading Dataset info from /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358


Found cached dataset sib200 (/home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358)


11/15/2025 05:11:09 - INFO - datasets.builder - Found cached dataset sib200 (/home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358)


Loading Dataset info from /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358


11/15/2025 05:11:09 - INFO - datasets.info - Loading Dataset info from /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358


Loading cached processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-2777f9a8998d3d39.arrow


11/15/2025 05:11:09 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-2777f9a8998d3d39.arrow


Loading cached processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-3ee3cba1a0d6deb6.arrow


11/15/2025 05:11:09 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-3ee3cba1a0d6deb6.arrow


Loading cached processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-805e85a19d1b07f4.arrow


11/15/2025 05:11:09 - INFO - datasets.arrow_dataset - Loading cached processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-805e85a19d1b07f4.arrow


Map:   0%|          | 0/701 [00:00<?, ? examples/s]Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-96feb659c6969735.arrow
Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-96feb659c6969735.arrow


11/15/2025 05:11:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-96feb659c6969735.arrow


Map: 100%|██████████| 701/701 [00:00<00:00, 10783.18 examples/s]
Map:   0%|          | 0/99 [00:00<?, ? examples/s]Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-fb103a6da4cc846c.arrow

Map:   0%|          | 0/99 [00:00<?, ? examples/s]Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-fb103a6da4cc846c.arrow


11/15/2025 05:11:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-fb103a6da4cc846c.arrow


Map: 100%|██████████| 99/99 [00:00<00:00, 8184.09 examples/s]
Map:   0%|          | 0/204 [00:00<?, ? examples/s]Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-530fcfa9d4af2b48.arrow

Map:   0%|          | 0/204 [00:00<?, ? examples/s]Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-530fcfa9d4af2b48.arrow


11/15/2025 05:11:09 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/bens/.cache/huggingface/datasets/Davlan___sib200/arz_Arab/0.0.0/38977a667f6fc264d5c26ec57a01e16db040b358/cache-530fcfa9d4af2b48.arrow


Map: 100%|██████████| 204/204 [00:00<00:00, 10590.50 examples/s]

Dataset loaded successfully!
Number of labels: 7
Categories: ['entertainment', 'geography', 'health', 'politics', 'science/technology', 'sports', 'travel']
Dataset splits: ['train', 'validation', 'test']
Training examples: 701
Validation examples: 99
Test examples: 204

Sample data point:
Text: 'trkyA mtHAwTħ bAlbHAr mn jhAt tlAtħ: bHr Ăyjh mn Alγrb ، wAlbHr AlÂswd mn AlšmAl wAlbHr AlÂbyD Almtw...'
Category: geography
Label: 1





## 5. Model Setup

Load and configure the PIXEL model for sequence classification.

In [5]:
# Keyword arguments shared by the configuration and model loaders
config_kwargs = dict(
    cache_dir=config.cache_dir,
    revision=config.model_revision,
    use_auth_token=config.use_auth_token or None,
)

# Build the model configuration: an explicit config name wins over the model
# path, and the classification head size and dropout come from this notebook.
model_config = AutoConfig.from_pretrained(
    config.config_name or config.model_name_or_path,
    num_labels=num_labels,
    attention_probs_dropout_prob=config.dropout_prob,
    hidden_dropout_prob=config.dropout_prob,
    **config_kwargs,
)

logger.info(f"Using dropout with probability {config.dropout_prob}")

# Pick the model class from the configured model type
text_model_types = ("bert", "roberta", "xlm-roberta")
image_model_types = ("vit_mae", "pixel")

# Reject unknown architectures before attempting to load any weights
if model_config.model_type not in text_model_types + image_model_types:
    raise ValueError(f"Model type {model_config.model_type} not supported.")

if model_config.model_type in text_model_types:
    # Text encoders
    model = AutoModelForSequenceClassification.from_pretrained(
        config.model_name_or_path,
        config=model_config,
        **config_kwargs,
    )
    modality = Modality.TEXT
else:
    # Pixel-based encoders additionally take the pooling settings
    pooling_mode = PoolingMode.from_string(config.pooling_mode)
    model = PIXELForSequenceClassification.from_pretrained(
        config.model_name_or_path,
        config=model_config,
        pooling_mode=pooling_mode,
        add_layer_norm=config.pooler_add_layer_norm,
        **config_kwargs,
    )
    modality = Modality.IMAGE

# Replace the generic label names in the model config with the dataset categories
model.config.label2id = {name: index for index, name in enumerate(categories)}
model.config.id2label = dict(enumerate(categories))

print(f"Model loaded successfully!")
print(f"Model type: {model_config.model_type}")
print(f"Modality: {modality}")
print(f"Number of parameters: {model.num_parameters():,}")
print(f"Label mappings: {model.config.label2id}")

[INFO|configuration_utils.py:666] 2025-11-15 04:58:34,208 >> loading configuration file ../Team-PIXEL/pixel-m4/config.json
[INFO|configuration_utils.py:720] 2025-11-15 04:58:34,210 >> Model config PIXELConfig {
  "_name_or_path": "../Team-PIXEL/pixel-m4",
  "architectures": [
    "PIXELForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "decoder_hidden_size": 512,
  "decoder_intermediate_size": 2048,
  "decoder_num_attention_heads": 16,
  "decoder_num_hidden_layers": 8,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "image_size": [
    16,
    8464
  ],
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "mask_

11/15/2025 04:58:34 - INFO - __main__ - Using dropout with probability 0.1


[INFO|modeling_utils.py:2531] 2025-11-15 04:58:34,213 >> loading weights file ../Team-PIXEL/pixel-m4/pytorch_model.bin
- This IS expected if you are initializing PIXELForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing PIXELForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
- This IS expected if you are initializing PIXELForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if 

Model loaded successfully!
Model type: pixel
Modality: Modality.IMAGE
Number of parameters: 86,651,911
Label mappings: {'entertainment': 0, 'geography': 1, 'health': 2, 'politics': 3, 'science/technology': 4, 'sports': 5, 'travel': 6}
