In [None]:
%pip install transformers datasets conllu torch huggingface_hub

In [None]:
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from transformers import RobertaConfig, XLMRobertaConfig, Trainer, TrainingArguments
from transformers import RobertaForMaskedLM, XLMRobertaForMaskedLM

# Login to Hugging Face
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook. When the variable is unset, `token=None`
# makes `login` fall back to its interactive prompt.
# NOTE(review): the token that used to be hard-coded on this line is exposed in
# the file history and should be revoked.
login(token=os.environ.get("HF_TOKEN"))




In [None]:
# MicroBERT configuration: the student-encoder sizes, kept as a plain mapping so
# they can be splatted into a Roberta-style config class as keyword arguments.
# NOTE(review): RoBERTa-style models offset position ids past the padding index,
# so 512 position embeddings presumably leave room for 510 tokens - confirm.
MICROBERT_CONFIG = dict(
    hidden_size=100,
    num_hidden_layers=3,
    num_attention_heads=5,
    intermediate_size=400,
    max_position_embeddings=512,
    vocab_size=32000,  # Adjust based on actual tokenizer
)

def create_student_model(teacher_model):
    """Create a randomly initialised student with the MicroBERT architecture.

    The student belongs to the same model family as the teacher: XLM-RoBERTa
    when the teacher's ``config.model_type`` contains ``"xlm"``, RoBERTa
    otherwise. Its sizes come from ``MICROBERT_CONFIG``, except for the
    vocabulary size, which is copied from the teacher: distillation compares
    student and teacher logits element-wise, so both models must score the same
    vocabulary.

    Args:
        teacher_model: Model exposing ``config.model_type`` and, normally,
            ``config.vocab_size``.

    Returns:
        A new ``XLMRobertaForMaskedLM`` or ``RobertaForMaskedLM`` instance.
    """
    # Copy first so the module-level defaults are never mutated.
    student_kwargs = dict(MICROBERT_CONFIG)
    teacher_vocab_size = getattr(teacher_model.config, "vocab_size", None)
    if teacher_vocab_size is not None:
        student_kwargs["vocab_size"] = teacher_vocab_size

    if "xlm" in teacher_model.config.model_type:
        config = XLMRobertaConfig(**student_kwargs)
        student = XLMRobertaForMaskedLM(config)
    else:
        config = RobertaConfig(**student_kwargs)
        student = RobertaForMaskedLM(config)
    return student

In [None]:
# Distillation training setup
class DistillationTrainer(Trainer):
    """Trainer whose loss pulls a student's masked-LM logits toward a teacher's.

    The loss is the KL divergence between the teacher's and the student's
    per-token distributions, averaged over non-padding tokens, plus one
    cross-entropy term for each auxiliary label tensor (``pos_labels`` /
    ``parse_labels``) present in the batch.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        teacher_model: Model that produces the target logits. Required. It is
            switched to eval mode and moved to the training device.
        **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If ``teacher_model`` is missing, or if teacher and student
            configs declare different vocabulary sizes.
    """

    # Batch keys that carry auxiliary supervision rather than model inputs.
    _AUX_LABEL_KEYS = ("pos_labels", "parse_labels")

    def __init__(self, *args, teacher_model=None, **kwargs):
        # Fail early with a clear message instead of an AttributeError on None.
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a `teacher_model`.")
        super().__init__(*args, **kwargs)

        self.teacher = teacher_model
        self.teacher.eval()
        # Trainer only places the student on the training device; the teacher
        # has to follow it or the two forward passes run on different devices.
        self.teacher.to(self.args.device)

        # The KL term compares logits element-wise, so both models must score
        # the same vocabulary. Skipped when either size cannot be determined.
        student_vocab = getattr(getattr(self.model, "config", None), "vocab_size", None)
        teacher_vocab = getattr(getattr(self.teacher, "config", None), "vocab_size", None)
        if (
            student_vocab is not None
            and teacher_vocab is not None
            and student_vocab != teacher_vocab
        ):
            raise ValueError(
                f"Student vocab_size ({student_vocab}) does not match "
                f"teacher vocab_size ({teacher_vocab})."
            )

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the distillation loss for one batch.

        Args:
            model: The student being trained.
            inputs: Mapping of batch tensors. ``pos_labels`` / ``parse_labels``
                entries are treated as auxiliary targets and are not passed to
                either model.
            return_outputs: When true, also return the student's output object.
            num_items_in_batch: Accepted because newer ``Trainer`` releases pass
                it to ``compute_loss``; unused here.

        Returns:
            ``loss``, or ``(loss, student_output)`` when ``return_outputs`` is
            true.
        """
        # Shallow copy so the caller's batch keeps its auxiliary labels.
        model_inputs = dict(inputs)
        aux_labels = {
            key: model_inputs.pop(key)
            for key in self._AUX_LABEL_KEYS
            if key in model_inputs
        }

        student_output = model(**model_inputs)
        with torch.no_grad():
            teacher_output = self.teacher(**model_inputs)

        student_logits = student_output.logits
        vocab_size = student_logits.size(-1)

        # Per-token KL, computed in float32 for stability under mixed precision.
        # `reduction="batchmean"` would divide by the batch size only and would
        # also count padding positions, so the reduction is done by hand.
        token_kl = torch.nn.functional.kl_div(
            torch.nn.functional.log_softmax(student_logits.float(), dim=-1),
            torch.nn.functional.softmax(teacher_output.logits.float(), dim=-1),
            reduction="none",
        ).sum(dim=-1)

        attention_mask = model_inputs.get("attention_mask")
        if attention_mask is None:
            loss = token_kl.mean()
        else:
            weights = attention_mask.to(token_kl.dtype)
            # clamp guards against a batch made entirely of padding.
            loss = (token_kl * weights).sum() / weights.sum().clamp(min=1.0)

        # Add multitask losses (POS + Parsing).
        # NOTE(review): these terms score the masked-LM vocabulary logits
        # against the label ids, exactly as before; a dedicated tagging /
        # parsing head is probably what is intended - confirm.
        # NOTE(review): with Trainer's default column pruning these keys are
        # presumably dropped before they reach this method - confirm.
        for labels in aux_labels.values():
            loss = loss + torch.nn.functional.cross_entropy(
                student_logits.view(-1, vocab_size),
                labels.view(-1),
            )

        return (loss, student_output) if return_outputs else loss

In [None]:
# Training arguments matching paper specs
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=200,
    per_device_train_batch_size=32,
    save_steps=1000,
    logging_steps=100,
    learning_rate=5e-5,
    gradient_accumulation_steps=1,
    # Mixed precision is only requested when a CUDA device is present;
    # asking for fp16 on a CPU-only runtime makes TrainingArguments fail.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Load dataset from MicroBERT repo format
def load_microbert_dataset(language="wolof", data_dir="data"):
    """Load the masked-LM text splits for one language.

    Files are expected at ``{data_dir}/{language}/mlm/train.txt`` and
    ``{data_dir}/{language}/mlm/dev.txt``.

    Args:
        language: Name of the language sub-directory.
        data_dir: Root directory holding the per-language folders. Defaults to
            ``"data"``, the location that was previously hard-coded.

    Returns:
        Whatever ``load_dataset`` returns for the ``"text"`` loader, with
        ``train`` and ``validation`` splits.
    """
    mlm_dir = f"{data_dir}/{language}/mlm"
    return load_dataset(
        "text",
        data_files={
            "train": f"{mlm_dir}/train.txt",
            "validation": f"{mlm_dir}/dev.txt",
        },
    )

In [None]:
# Wolof walkthrough, step 1: read the MLM text splits.
dataset = load_microbert_dataset(language="wolof")

# Step 2: load the pretrained teacher, then derive a student from it.
teacher = XLMRobertaForMaskedLM.from_pretrained(
    "xlm-roberta-base",
)
student = create_student_model(teacher_model=teacher)

In [None]:
# Train
# The "text" loader yields a single column of raw strings, while the models
# consume token ids. The corpus is therefore tokenised with the teacher's
# tokenizer, so teacher and student see the same ids.
tokenizer = AutoTokenizer.from_pretrained(teacher.config.name_or_path)

# RoBERTa-style position ids start after the padding index, so the longest
# sequence the student can embed is two short of max_position_embeddings.
max_length = student.config.max_position_embeddings - 2


def tokenize_batch(batch):
    """Tokenise a batch of raw text lines, truncating to the student's limit."""
    return tokenizer(batch["text"], truncation=True, max_length=max_length)


tokenized_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=["text"],
)

trainer = DistillationTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # Pads each batch and applies the collator's default random masking.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer),
)

trainer.train()

In [None]:
# Publish the student's weights and config to the Hugging Face Hub
student.push_to_hub(repo_id="microbert-xlmr-wolof")

In [None]:
# Full MicroBERT architecture details, spelled out as one explicit settings
# mapping that is expanded into the RobertaConfig constructor.
config = RobertaConfig(
    **{
        # Model dimensions
        "vocab_size": 32000,
        "hidden_size": 100,
        "num_hidden_layers": 3,
        "num_attention_heads": 5,
        "intermediate_size": 400,
        # Activation and dropout
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        # Embedding tables
        "max_position_embeddings": 512,
        "type_vocab_size": 1,
        # Initialisation and numerics
        "initializer_range": 0.02,
        "layer_norm_eps": 1e-12,
        "use_cache": True,
        # Special token ids
        "pad_token_id": 1,
        "bos_token_id": 0,
        "eos_token_id": 2,
    }
)


In [None]:
LANGUAGES = ["wolof", "coptic", "maltese", "uyghur", "tamil", "indonesian"]

# Every student is distilled from the same teacher, so one tokenizer (the
# teacher's) and one masked-LM collator are shared by all runs.
tokenizer = AutoTokenizer.from_pretrained(teacher.config.name_or_path)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

# NOTE(review): all runs share `training_args`, so their checkpoints land in the
# same output_dir; use a per-language directory if they must be kept apart.
for lang in LANGUAGES:
    dataset = load_microbert_dataset(lang)
    student = create_student_model(teacher)

    # RoBERTa-style position ids start after the padding index, so usable
    # sequence length is two short of max_position_embeddings.
    max_length = student.config.max_position_embeddings - 2
    tokenized_dataset = dataset.map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=max_length),
        batched=True,
        remove_columns=["text"],
    )

    trainer = DistillationTrainer(
        model=student,
        teacher_model=teacher,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )
    trainer.train()
    student.push_to_hub(f"microbert-xlmr-{lang}")