In [None]:
# Installing core dependencies

!pip install -q transformers[torch] datasets sentencepiece pyconll wandb


In [None]:
# Cloning repositories

!git clone -q https://github.com/lgessler/microbert

In [None]:
# Preparing the data
import inspect
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import (
    ClassLabel, Dataset, DatasetDict,
    Features, Sequence, Value,
    load_dataset
)
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
    RobertaConfig, XLMRobertaConfig,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)

In [None]:
def _read_conllu(path):
    """Read one CoNLL-U file and return ``(sentences, tags)``.

    ``sentences`` is a list of word-form lists and ``tags`` the parallel list
    of UPOS-string lists. Comment lines are skipped, and so is every row whose
    ID column is not a plain integer (multiword-token ranges such as ``3-4``
    and empty nodes such as ``5.1``).
    """
    sentences, tags = [], []
    forms, upos = [], []
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            line = raw.rstrip("\r\n")
            if not line.strip():
                # A blank line terminates the current sentence.
                if forms:
                    sentences.append(forms)
                    tags.append(upos)
                    forms, upos = [], []
                continue
            if line.startswith("#"):
                continue
            columns = line.split("\t")
            if len(columns) < 4 or not columns[0].isdigit():
                continue
            forms.append(columns[1])  # FORM
            upos.append(columns[3])   # UPOS
    if forms:  # file that does not end with a blank line
        sentences.append(forms)
        tags.append(upos)
    return sentences, tags


def load_conllu_data(lang: str):
    """Load the treebank splits for ``lang`` as a ``DatasetDict``.

    Every ``*.conllu`` file under ``microbert/data/<lang>/UD_*/`` is parsed and
    assigned to a split according to the last dash-separated part of its file
    name (``...-train.conllu``, ``...-dev.conllu``, ``...-test.conllu``); files
    with any other suffix are ignored. Several treebanks of one language are
    concatenated per split.

    Args:
        lang: Name of the language directory below ``microbert/data``.

    Returns:
        A ``DatasetDict`` whose splits carry the columns ``tokens`` (word
        forms), ``upos`` (integer-encoded tags), ``text`` (the forms joined by
        single spaces) and ``pos_tags`` (same values as ``upos``).

    Raises:
        FileNotFoundError: if no ``.conllu`` file, or none with a recognised
            split suffix, is found.
        ValueError: raised by ``ClassLabel.str2int`` when a tag is missing from
            the label file.
    """
    # NOTE(review): the directory layout and the per-language upos_labels.txt
    # are taken over from the original code; confirm they exist in the checkout.
    data_dir = Path(f"microbert/data/{lang}")
    # The wildcard has to live in the glob pattern: a Path never expands
    # "UD_*" on its own. Sorting makes the file order deterministic.
    conll_files = sorted(data_dir.glob("UD_*/*.conllu"))
    if not conll_files:
        raise FileNotFoundError(f"No .conllu files found under {data_dir}/UD_*/")

    label_feature = ClassLabel(names_file=f"microbert/data/{lang}/upos_labels.txt")
    features = Features({
        'tokens': Sequence(Value('string')),
        'upos': Sequence(label_feature)
    })

    columns_by_split = {}
    for file in conll_files:
        # Derive the split from the file name instead of from the glob order.
        split = file.stem.rsplit("-", 1)[-1]
        if split not in ('train', 'dev', 'test'):
            continue
        sentences, tags = _read_conllu(file)
        columns = columns_by_split.setdefault(split, {'tokens': [], 'upos': []})
        columns['tokens'].extend(sentences)
        columns['upos'].extend(label_feature.str2int(sentence_tags) for sentence_tags in tags)

    if not columns_by_split:
        raise FileNotFoundError(
            f"No train/dev/test .conllu files found under {data_dir}/UD_*/"
        )

    raw_datasets = DatasetDict({
        split: Dataset.from_dict(columns, features=features)
        for split, columns in columns_by_split.items()
    })

    # With batched=True each column arrives as a list of rows, so the join has
    # to run once per sentence.
    return raw_datasets.map(
        lambda batch: {
            'text': [' '.join(tokens) for tokens in batch['tokens']],
            'pos_tags': batch['upos'],
        },
        batched=True,
    )

In [None]:
class MicroBERT(torch.nn.Module):
    """Pairs a frozen teacher masked-LM with a smaller student to be distilled.

    Attributes:
        config: Configuration the student is built from.
        teacher: The teacher model; its parameters are frozen and it is kept in
            eval mode even while the wrapper is in train mode.
        student: Masked-LM created from ``config`` with an additional
            ``pos_classifier`` linear head attached.
        hidden_proj: Maps student hidden states to the teacher's hidden size so
            the two can be compared (identity when the sizes already match).
    """

    def __init__(self, teacher, model_type='roberta', num_labels=None):
        """Build the student for ``teacher``.

        Args:
            teacher: Pretrained masked-LM exposing ``config.vocab_size`` and
                ``config.hidden_size``.
            model_type: ``'roberta'`` selects the RoBERTa-style student; any
                other value selects the XLM-RoBERTa-style one.
            num_labels: Size of the POS tag set. When omitted it is read from
                the English label file, as before.
        """
        super().__init__()
        self.config = self._get_config(teacher, model_type, num_labels)
        self.teacher = teacher
        # The teacher only provides targets: keep it out of the optimizer and
        # disable its dropout.
        self.teacher.requires_grad_(False)
        self.teacher.eval()
        self.student = self._create_student()

        teacher_width = teacher.config.hidden_size
        if teacher_width == self.config.hidden_size:
            self.hidden_proj = torch.nn.Identity()
        else:
            self.hidden_proj = torch.nn.Linear(self.config.hidden_size, teacher_width)

    def _get_config(self, teacher, model_type, num_labels=None):
        """Return the student configuration for ``model_type``."""
        if num_labels is None:
            # ClassLabel exposes the tag-set size as `num_classes`.
            num_labels = ClassLabel(
                names_file="microbert/data/english/upos_labels.txt"
            ).num_classes

        if model_type == 'roberta':
            return RobertaConfig(
                vocab_size=teacher.config.vocab_size,
                hidden_size=512,
                num_hidden_layers=6,
                num_attention_heads=8,
                intermediate_size=2048,
                num_labels=num_labels
            )

        return XLMRobertaConfig(
            vocab_size=teacher.config.vocab_size,
            hidden_size=768,
            num_hidden_layers=8,
            num_attention_heads=12,
            intermediate_size=3072,
            num_labels=num_labels
        )

    def _create_student(self):
        """Create the student with dual MLM + POS heads."""
        model = AutoModelForMaskedLM.from_config(self.config)
        # Assigning a module as an attribute registers it on the student, so
        # the head is trained and saved together with it.
        model.pos_classifier = torch.nn.Linear(self.config.hidden_size, self.config.num_labels)
        return model

    def train(self, mode=True):
        """Switch the student's mode while leaving the teacher in eval mode."""
        super().train(mode)
        self.teacher.eval()
        return self

    def forward(self, input_ids=None, attention_mask=None, labels=None, pos_tags=None, **kwargs):
        """Run the student and add ``pos_logits`` to its output.

        ``pos_tags`` is accepted so that batches carrying it can be passed in
        unchanged; it is not used here. The returned object is the student's
        own output with hidden states included and an extra ``pos_logits``
        entry computed from the last hidden state.
        """
        kwargs['output_hidden_states'] = True  # the POS head reads the last layer
        kwargs['return_dict'] = True
        outputs = self.student(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )
        outputs['pos_logits'] = self.student.pos_classifier(outputs.hidden_states[-1])
        return outputs

In [None]:
def _char_to_word_index(words, text_length):
    """Map every character position of ``' '.join(words)`` to its word index.

    Positions of the separating spaces map to ``None``.
    """
    lookup = [None] * text_length
    cursor = 0
    for word_idx, word in enumerate(words):
        for position in range(cursor, min(cursor + len(word), text_length)):
            lookup[position] = word_idx
        cursor += len(word) + 1  # skip the single separating space
    return lookup


def tokenize_with_pos(examples, tokenizer, label_all_subwords=True):
    """Tokenize text while aligning POS tags to subwords.

    Args:
        examples: Batch with ``text`` (sentences whose words are joined by
            single spaces) and ``pos_tags`` (one tag per word). When a
            ``tokens`` column is present it supplies the word boundaries;
            otherwise the text is split on single spaces.
        tokenizer: Callable tokenizer that returns ``offset_mapping`` when
            called with ``return_offsets_mapping=True``.
        label_all_subwords: If true (default) every subword receives the tag
            of the word it belongs to; if false only the first subword of each
            word is labelled.

    Returns:
        The tokenizer output with an added ``pos_tags`` entry that has one
        label per subword. Special tokens, and subwords that cannot be mapped
        to a word, are labelled ``-100``.
    """
    texts = examples['text']
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        return_offsets_mapping=True,
        return_special_tokens_mask=True
    )

    if 'tokens' in examples:
        batch_words = examples['tokens']
    else:
        batch_words = [text.split(' ') for text in texts]

    pos_tags = []
    for i, offsets in enumerate(tokenized['offset_mapping']):
        lookup = _char_to_word_index(batch_words[i], len(texts[i]))
        word_tags = examples['pos_tags'][i]
        aligned_pos = [-100] * len(offsets)

        previous_word = None
        for subword_idx, (start, end) in enumerate(offsets):
            # Use the first character of the span that lies inside a word, so a
            # span that includes the preceding space still resolves correctly.
            # Special tokens have an empty span and therefore find nothing.
            word_idx = next(
                (lookup[p] for p in range(start, min(end, len(lookup)))
                 if lookup[p] is not None),
                None,
            )
            if word_idx is None or word_idx >= len(word_tags):
                continue
            if label_all_subwords or word_idx != previous_word:
                aligned_pos[subword_idx] = word_tags[word_idx]
            previous_word = word_idx

        pos_tags.append(aligned_pos)

    tokenized['pos_tags'] = pos_tags
    return tokenized

In [None]:
class MicroBERTTrainer(Trainer):
    """Trainer whose loss combines MLM, POS tagging and teacher distillation.

    The model handed to the trainer must expose ``student`` and ``teacher``
    sub-models. A ``pos_classifier`` head on the student and a ``hidden_proj``
    module on the model are used when present.
    """

    def __init__(self, *args, alpha=0.7, temperature=4.0, **kwargs):
        """
        Args:
            alpha: Share of the distillation term given to the KL loss; the
                hidden-state MSE receives ``1 - alpha``.
            temperature: Softmax temperature applied to both logit sets before
                the KL loss.
        """
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature
        self.mse_loss = torch.nn.MSELoss()
        self.ce_loss = torch.nn.CrossEntropyLoss(ignore_index=-100)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined loss, optionally together with the student outputs.

        ``num_items_in_batch`` is accepted because newer Trainer versions pass
        it as a keyword; it is not used.
        """
        # A DataParallel / DistributedDataParallel wrapper hides the sub-models
        # behind `.module`.
        core = getattr(model, "module", model)

        # The tag labels belong to the POS head only; the masked-LM forward
        # does not accept them.
        inputs = dict(inputs)
        pos_tags = inputs.pop('pos_tags', None)
        attention_mask = inputs.get('attention_mask')
        if attention_mask is None:
            attention_mask = torch.ones_like(inputs['input_ids'])
        token_mask = attention_mask.bool()

        # Forward passes (hidden states are only returned on request)
        student_outputs = core.student(**inputs, output_hidden_states=True)
        with torch.no_grad():
            teacher_outputs = core.teacher(
                input_ids=inputs['input_ids'],
                attention_mask=attention_mask,
                output_hidden_states=True
            )
        student_hidden = student_outputs.hidden_states[-1]
        teacher_hidden = teacher_outputs.hidden_states[-1]
        zero = student_hidden.new_zeros(())

        # Loss calculations
        mlm_loss = student_outputs.loss if student_outputs.loss is not None else zero

        pos_head = getattr(core.student, 'pos_classifier', None)
        # Cross-entropy over a batch without a single labelled position is NaN.
        if pos_head is not None and pos_tags is not None and bool((pos_tags != -100).any()):
            pos_logits = pos_head(student_hidden)
            pos_loss = self.ce_loss(
                pos_logits.view(-1, pos_logits.size(-1)),
                pos_tags.view(-1)
            )
        else:
            pos_loss = zero

        # Distillation losses, restricted to attended tokens. Selecting the
        # tokens first makes 'batchmean' an average per token rather than a
        # per-sequence sum that also counts padding.
        kl_loss = torch.nn.functional.kl_div(
            torch.log_softmax(student_outputs.logits[token_mask] / self.temperature, dim=-1),
            torch.softmax(teacher_outputs.logits[token_mask] / self.temperature, dim=-1),
            reduction='batchmean'
        ) * (self.temperature ** 2)

        hidden_proj = getattr(core, 'hidden_proj', None)
        if hidden_proj is not None:
            student_hidden = hidden_proj(student_hidden)
        if student_hidden.shape == teacher_hidden.shape:
            mse_loss = self.mse_loss(student_hidden[token_mask], teacher_hidden[token_mask])
        else:
            warnings.warn(
                "Student and teacher hidden sizes differ and the model has no "
                "`hidden_proj`; the hidden-state MSE term is skipped."
            )
            mse_loss = zero

        # Combined loss. The outer weights are fixed; `alpha` splits the
        # distillation term (0.7 / 0.3 with the default alpha).
        # NOTE(review): confirm these weights against the intended recipe.
        total_loss = (
            0.3 * mlm_loss +
            0.2 * pos_loss +
            0.5 * (self.alpha * kl_loss + (1 - self.alpha) * mse_loss)
        )

        return (total_loss, student_outputs) if return_outputs else total_loss

In [None]:
def _pos_aware_collator(tokenizer, mlm_probability=0.15):
    """Return a collate function that applies MLM masking and pads ``pos_tags``.

    The tag lists are taken out before the masking collator pads the batch and
    are then padded with ``-100`` to the batch width.
    """
    mlm_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=mlm_probability)
    pad_left = getattr(tokenizer, 'padding_side', 'right') == 'left'

    def collate(features):
        tag_rows = [feature['pos_tags'] for feature in features]
        batch = mlm_collator([
            {key: value for key, value in feature.items() if key != 'pos_tags'}
            for feature in features
        ])
        width = batch['input_ids'].shape[1]
        padded = torch.full((len(tag_rows), width), -100, dtype=torch.long)
        for row, tags in enumerate(tag_rows):
            tags = torch.tensor(list(tags)[:width], dtype=torch.long)
            if pad_left:
                padded[row, width - len(tags):] = tags
            else:
                padded[row, :len(tags)] = tags
        batch['pos_tags'] = padded
        return batch

    return collate


def train_microbert(lang: str, model_type='roberta'):
    """Distil a student for ``lang`` and save it.

    Loads the treebank, builds the teacher/student pair, trains with
    ``MicroBERTTrainer`` and writes the student and tokenizer to the local
    directory ``microbert-<lang>-distilled``. The POS head is additionally
    stored there as ``pos_classifier.pt``. When the environment variable
    ``HF_TOKEN`` is set, student and tokenizer are also pushed to the Hub
    under the same name.

    Note that the output name does not depend on ``model_type``, so training a
    second model type for the same language overwrites the first.

    Args:
        lang: Language directory name understood by ``load_conllu_data``.
        model_type: ``'xlm'`` distils from ``xlm-roberta-base``; any other
            value distils from ``roberta-base``.

    Returns:
        The trainer after training.
    """
    # Load data and models
    dataset = load_conllu_data(lang)
    teacher_name = 'xlm-roberta-base' if model_type == 'xlm' else 'roberta-base'
    teacher = AutoModelForMaskedLM.from_pretrained(teacher_name)
    model = MicroBERT(teacher, model_type)

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(teacher_name)
    tokenized_ds = dataset.map(
        lambda ex: tokenize_with_pos(ex, tokenizer),
        batched=True,
        batch_size=32
    )
    # Keep only what the collator can turn into tensors; text columns and the
    # offset mapping would break batching.
    model_columns = {'input_ids', 'attention_mask', 'special_tokens_mask', 'pos_tags'}
    tokenized_ds = DatasetDict({
        name: split.remove_columns(
            [column for column in split.column_names if column not in model_columns]
        )
        for name, split in tokenized_ds.items()
    })
    eval_split = tokenized_ds['dev'] if 'dev' in tokenized_ds else None

    # The evaluation-strategy argument has been renamed across transformers
    # releases, so use whichever name the installed version accepts.
    supported = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = 'eval_strategy' if 'eval_strategy' in supported else 'evaluation_strategy'
    version_dependent = {strategy_key: "epoch" if eval_split is not None else "no"}
    if 'save_safetensors' in supported:
        # NOTE(review): disabled on the assumption that the safetensors writer
        # rejects the tied embedding / LM-head weights of this plain-module
        # wrapper; confirm against the installed transformers version.
        version_dependent['save_safetensors'] = False

    args = TrainingArguments(
        output_dir=f"results/{lang}_{model_type}",
        learning_rate=5e-5,
        num_train_epochs=10,
        per_device_train_batch_size=32,
        save_strategy="epoch",
        logging_steps=100,
        fp16=torch.cuda.is_available(),  # half precision needs a CUDA device
        report_to="wandb",
        # Columns were filtered explicitly above, so the Trainer must not
        # filter them again by forward signature.
        remove_unused_columns=False,
        label_names=["labels"],
        prediction_loss_only=True,  # do not collect vocabulary-sized logits during evaluation
        **version_dependent
    )

    trainer = MicroBERTTrainer(
        model=model,
        args=args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=eval_split,
        data_collator=_pos_aware_collator(tokenizer, mlm_probability=0.15),
        alpha=0.7
    )

    trainer.train()

    # Save locally, then push to the Hub when credentials are available.
    repo_id = f"microbert-{lang}-distilled"
    student = trainer.model.student
    student.save_pretrained(repo_id)
    tokenizer.save_pretrained(repo_id)
    # Reloading the checkpoint as a plain masked-LM drops the POS head, so keep
    # a separate copy of its weights next to it.
    torch.save(student.pos_classifier.state_dict(), Path(repo_id) / "pos_classifier.pt")

    # Never embed an access token in the notebook: read it from the
    # environment. Any token that has been committed should be revoked.
    hub_token = os.environ.get("HF_TOKEN")
    if hub_token:
        student.push_to_hub(repo_id, token=hub_token)
        tokenizer.push_to_hub(repo_id, token=hub_token)
    else:
        warnings.warn("HF_TOKEN is not set; the model was saved locally but not pushed to the Hub.")

    return trainer

In [None]:
def _load_pos_head(model_name):
    """Return the POS head stored as ``<model_name>/pos_classifier.pt``.

    Returns ``None`` when ``model_name`` is not a local directory containing
    that file.
    """
    head_file = Path(model_name) / "pos_classifier.pt"
    if not head_file.is_file():
        return None
    state = torch.load(head_file, map_location="cpu", weights_only=True)
    num_labels, hidden_size = state["weight"].shape
    head = torch.nn.Linear(hidden_size, num_labels)
    head.load_state_dict(state)
    return head


def evaluate_model(lang: str, model_type: str, batch_size: int = 32):
    """Score the saved student for ``lang`` on the test split.

    Args:
        lang: Language whose test split and checkpoint
            (``microbert-<lang>-distilled``) are used.
        model_type: Accepted for symmetry with ``train_microbert``; the
            checkpoint name does not depend on it.
        batch_size: Number of sentences per forward pass.

    Returns:
        A dict with ``'POS Accuracy'`` (share of labelled subwords tagged
        correctly, NaN when no POS head is available) and ``'Perplexity'``
        (exponential of the mean masked-LM loss over randomly masked test
        tokens, NaN when nothing was masked).
    """
    # Load test data
    test_ds = load_conllu_data(lang)['test']
    model_name = f"microbert-{lang}-distilled"
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    # The masked-LM class has no POS output of its own; the head has to be
    # loaded separately.
    pos_head = _load_pos_head(model_name)
    if pos_head is None:
        warnings.warn(
            f"No pos_classifier.pt found in '{model_name}'; POS accuracy is reported as NaN."
        )
    else:
        pos_head.to(device).eval()

    encoded = tokenize_with_pos(test_ds[:], tokenizer)
    collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
    pad_left = getattr(tokenizer, 'padding_side', 'right') == 'left'

    correct = tagged = 0
    nll_sum, masked_total = 0.0, 0
    n_rows = len(encoded['input_ids'])
    # Fork the RNG so that the fixed seed used for masking does not leak into
    # the caller's random state.
    with torch.no_grad(), torch.random.fork_rng():
        torch.manual_seed(0)
        for start in range(0, n_rows, batch_size):
            stop = min(start + batch_size, n_rows)
            rows = [
                {'input_ids': encoded['input_ids'][i],
                 'attention_mask': encoded['attention_mask'][i]}
                for i in range(start, stop)
            ]

            # Perplexity: MLM loss on a randomly masked copy of the batch,
            # weighted by the number of masked tokens.
            mlm_batch = {key: value.to(device) for key, value in collator(rows).items()}
            n_masked = int((mlm_batch['labels'] != -100).sum())
            if n_masked:
                nll_sum += model(**mlm_batch).loss.item() * n_masked
                masked_total += n_masked

            if pos_head is None:
                continue

            # POS accuracy: tag the unmasked input and score labelled positions only.
            clean = tokenizer.pad(rows, return_tensors="pt")
            clean = {key: value.to(device) for key, value in clean.items()}
            hidden = model(**clean, output_hidden_states=True).hidden_states[-1]
            pos_preds = torch.argmax(pos_head(hidden), dim=-1)

            width = clean['input_ids'].shape[1]
            gold = torch.full_like(clean['input_ids'], -100)
            for row, i in enumerate(range(start, stop)):
                tags = torch.tensor(encoded['pos_tags'][i][:width], dtype=gold.dtype, device=device)
                if pad_left:
                    gold[row, width - len(tags):] = tags
                else:
                    gold[row, :len(tags)] = tags
            scored = gold != -100
            correct += int((pos_preds[scored] == gold[scored]).sum())
            tagged += int(scored.sum())

    pos_accuracy = correct / tagged if tagged else float('nan')
    perplexity = float(np.exp(nll_sum / masked_total)) if masked_total else float('nan')

    return {
        'POS Accuracy': pos_accuracy,
        'Perplexity': perplexity
    }

In [None]:
LANGUAGES = ['coptic', 'english', 'greek', 'greek_old', 'indonesian',
             'latin', 'maltese', 'tamil', 'uyghur', 'wolof', 'wolof_old']

results = []
for lang in LANGUAGES:
    # Both model types are saved under the same name
    # ("microbert-<lang>-distilled"), so each one has to be evaluated before
    # the next one is trained; otherwise both rows would score the last model.
    train_microbert(lang, 'roberta')
    roberta_metrics = evaluate_model(lang, 'roberta')

    train_microbert(lang, 'xlm')
    xlm_metrics = evaluate_model(lang, 'xlm')

    results.append({
        'Language': lang,
        'RoBERTa POS Acc': roberta_metrics['POS Accuracy'],
        'XLM-R POS Acc': xlm_metrics['POS Accuracy'],
        'RoBERTa PPL': roberta_metrics['Perplexity'],
        'XLM-R PPL': xlm_metrics['Perplexity']
    })

In [None]:
# Collect the per-language metrics into one table and persist it.
results_df = pd.DataFrame(results)
results_df.to_csv('microbert_results.csv', index=False)
# Bare last expression: the notebook renders the frame as a rich table.
results_df