In [None]:
# Installing core dependencies

!pip install transformers[torch] datasets sentencepiece wandb pyconll


In [None]:
# Cloning repositories
!git clone https://github.com/lgessler/microbert
!git clone https://github.com/huggingface/transformers

In [None]:
# Preparing the data

from pathlib import Path
from datasets import load_dataset, DatasetDict
import pyconll

# Language identifiers iterated by the experiment loop; each one is also
# interpolated into the data path read by load_conllu_data.
LANGUAGES = [
    'coptic',
    'english',
    'greek',
    'greek_old',
    'indonesian',
    'latin',
    'maltese',
    'tamil',
    'uyghur',
    'wolof',
    'wolof_old',
]

def _read_conllu_sentences(paths):
    """Return one list of surface forms per sentence found in ``paths``."""
    sentences = []
    for path in paths:
        for sentence in pyconll.load_from_file(str(path)):
            # Multiword ranges (ids like "1-2") duplicate the words that follow
            # them and empty nodes (ids like "1.1") are not part of the surface
            # text, so both are left out of the joined sentence.
            forms = [
                token.form
                for token in sentence
                if not token.is_multiword()
                and not token.is_empty_node()
                and token.form is not None
            ]
            if forms:
                sentences.append(forms)
    return sentences


def load_conllu_data(lang: str):
    """Load the UD treebank(s) of ``lang`` as a ``DatasetDict``.

    Files are looked up under ``microbert/data/{lang}/UD_*/*.conllu`` and
    assigned to a split by file name, not by directory listing order: a file
    whose stem is, or ends in ``-``/``_`` plus, ``train``, ``dev`` or ``test``
    goes to that split (assumes UD release naming such as
    ``xx_name-ud-train.conllu`` -- TODO confirm against the cloned data).
    Several files for the same split are concatenated.

    Args:
        lang: Name of the language directory below ``microbert/data``.

    Returns:
        A ``DatasetDict`` holding one ``Dataset`` per split that was found.
        Every dataset has a ``tokens`` column (list of word forms) and a
        ``text`` column (the forms joined by single spaces).

    Raises:
        FileNotFoundError: If no ``.conllu`` file exists for ``lang`` or none
            of the files can be assigned to a split.
    """
    lang_dir = Path(f"microbert/data/{lang}")
    # The wildcard has to be part of the glob pattern: a "*" inside the Path
    # itself is treated as a literal directory name and matches nothing.
    conll_files = sorted(lang_dir.glob("UD_*/*.conllu"))
    if not conll_files:
        raise FileNotFoundError(
            f"No .conllu files found under {lang_dir}/UD_*/"
        )

    files_by_split = {}
    for path in conll_files:
        for split in ('train', 'dev', 'test'):
            if path.stem == split or path.stem.endswith((f"-{split}", f"_{split}")):
                files_by_split.setdefault(split, []).append(path)
                break
    if not files_by_split:
        raise FileNotFoundError(
            f"No train/dev/test .conllu files found under {lang_dir}/UD_*/"
        )

    # Function-scope import: the notebook's import cell only brings in
    # load_dataset and DatasetDict from this package.
    from datasets import Dataset

    splits = {}
    for split, paths in files_by_split.items():
        token_lists = _read_conllu_sentences(paths)
        splits[split] = Dataset.from_dict({
            'tokens': token_lists,
            'text': [' '.join(tokens) for tokens in token_lists],
        })
    return DatasetDict(splits)

In [None]:
# Distilling the Models


from transformers import RobertaConfig, RobertaForMaskedLM, XLMRobertaConfig, XLMRobertaForMaskedLM

def create_student_model(teacher_model, model_type='roberta'):
    """Create a randomly initialised, smaller student masked-LM.

    The student shares the teacher's vocabulary size, position-embedding
    table size, token-type vocabulary size and special-token ids, so both
    models accept exactly the same tokenised input.

    Args:
        teacher_model: Pretrained teacher; only ``teacher_model.config`` is read.
        model_type: ``'roberta'`` or ``'xlm-roberta'``.

    Returns:
        A ``RobertaForMaskedLM`` or ``XLMRobertaForMaskedLM`` with fresh weights.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type == 'roberta':
        config_cls, model_cls = RobertaConfig, RobertaForMaskedLM
        size_kwargs = dict(
            hidden_size=512,        # narrower than roberta-base (768)
            num_hidden_layers=6,    # half the depth of roberta-base (12)
            num_attention_heads=8,
            intermediate_size=2048,
        )
    elif model_type == 'xlm-roberta':
        config_cls, model_cls = XLMRobertaConfig, XLMRobertaForMaskedLM
        size_kwargs = dict(
            hidden_size=768,        # same width as xlm-roberta-base
            num_hidden_layers=8,    # reduced from 12
            num_attention_heads=12,
            intermediate_size=3072,
        )
    else:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'roberta' or 'xlm-roberta'"
        )

    teacher_config = teacher_model.config
    configuration = config_cls(
        vocab_size=teacher_config.vocab_size,
        # RoBERTa-style models offset position ids by the padding index, so the
        # library default of 512 slots is smaller than what the teacher's
        # tokenizer allows; reuse the teacher's table size instead.
        max_position_embeddings=teacher_config.max_position_embeddings,
        type_vocab_size=teacher_config.type_vocab_size,
        pad_token_id=teacher_config.pad_token_id,
        bos_token_id=teacher_config.bos_token_id,
        eos_token_id=teacher_config.eos_token_id,
        **size_kwargs,
    )
    return model_cls(configuration)


In [None]:
# Training Pipeline

from transformers import TrainingArguments, Trainer
import torch
from torch.nn import KLDivLoss, MSELoss

class DistillationTrainer(Trainer):
    """Trainer that adds knowledge-distillation terms to the student's MLM loss.

    The loss is
    ``(1 - alpha) * CE + alpha * (0.7 * KL + 0.3 * MSE)`` where ``CE`` is the
    student's own masked-LM loss, ``KL`` is the temperature-scaled divergence
    between teacher and student token distributions, and ``MSE`` compares the
    last hidden states of both models.

    Args:
        teacher_model: Frozen teacher whose predictions are imitated (required).
        alpha: Weight of the distillation terms relative to the MLM loss.
        temperature: Softmax temperature applied to both sets of logits.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.

    Raises:
        ValueError: If no ``teacher_model`` is given.
    """

    def __init__(self, *args, teacher_model=None, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        if teacher_model is None:
            raise ValueError("DistillationTrainer requires a teacher_model")
        self.teacher = teacher_model
        self.teacher.eval()  # the teacher only predicts; keep dropout off
        self.alpha = alpha  # Weight between the MLM loss and the distillation terms
        self.temperature = temperature
        self.kl_loss = KLDivLoss(reduction='batchmean')
        self.mse_loss = MSELoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined MLM + distillation loss for one batch.

        ``num_items_in_batch`` is accepted (and ignored) because newer
        ``Trainer`` versions pass it to ``compute_loss``.
        """
        import warnings

        # Hidden states are only returned when explicitly requested.
        student_outputs = model(**inputs, output_hidden_states=True)

        # Trainer places only the student on the training device.
        device = inputs['input_ids'].device
        if next(self.teacher.parameters()).device != device:
            self.teacher.to(device)

        # Get teacher predictions
        attention_mask = inputs.get('attention_mask')
        with torch.no_grad():
            teacher_outputs = self.teacher(
                input_ids=inputs['input_ids'],
                attention_mask=attention_mask,
                output_hidden_states=True,
            )

        # Only real (non-padding) tokens take part in the distillation terms.
        if attention_mask is None:
            token_mask = torch.ones_like(inputs['input_ids'], dtype=torch.bool)
        else:
            token_mask = attention_mask.bool()

        # Calculate losses
        loss_ce = student_outputs.loss
        # Selecting tokens flattens the logits to (n_tokens, vocab), so that
        # 'batchmean' averages per token rather than dividing the sum over
        # every sequence position by the batch size alone.
        student_logits = student_outputs.logits[token_mask]
        teacher_logits = teacher_outputs.logits[token_mask]
        loss_kl = self.kl_loss(
            torch.log_softmax(student_logits / self.temperature, dim=-1),
            torch.softmax(teacher_logits / self.temperature, dim=-1)
        ) * (self.temperature ** 2)

        # Hidden states MSE loss
        student_hidden = student_outputs.hidden_states[-1]
        teacher_hidden = teacher_outputs.hidden_states[-1]
        if student_hidden.shape[-1] == teacher_hidden.shape[-1]:
            loss_mse = self.mse_loss(
                student_hidden[token_mask],
                teacher_hidden[token_mask]
            )
        else:
            # A narrower student cannot be compared element-wise with the
            # teacher; without a projection layer the term is left out.
            warnings.warn(
                "Student and teacher hidden sizes differ "
                f"({student_hidden.shape[-1]} vs {teacher_hidden.shape[-1]}); "
                "the hidden-state MSE term is omitted."
            )
            loss_mse = student_hidden.new_zeros(())

        total_loss = (1 - self.alpha) * loss_ce + \
                     self.alpha * (0.7 * loss_kl + 0.3 * loss_mse)

        return (total_loss, student_outputs) if return_outputs else total_loss


In [1]:
# workflow

from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling

def run_experiment(lang: str, model_type='roberta'):
    """Distil a student model for ``lang`` from a pretrained teacher and save it.

    The whole workflow lives in this one function (data loading, tokenisation,
    training and saving) because every step depends on the locals created by
    the previous one.

    Args:
        lang: Language identifier passed to ``load_conllu_data``.
        model_type: ``'roberta'`` (teacher ``roberta-base``) or
            ``'xlm-roberta'`` (teacher ``xlm-roberta-base``).

    Returns:
        The directory the distilled student and its tokenizer were saved to.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    import inspect

    teacher_names = {'roberta': 'roberta-base', 'xlm-roberta': 'xlm-roberta-base'}
    if model_type not in teacher_names:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'roberta' or 'xlm-roberta'"
        )

    # Load data
    dataset = load_conllu_data(lang)

    # Initialize models
    teacher_name = teacher_names[model_type]
    tokenizer = AutoTokenizer.from_pretrained(teacher_name)
    teacher = AutoModelForMaskedLM.from_pretrained(teacher_name)
    student = create_student_model(teacher, model_type)

    # Tokenization
    def tokenize_fn(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            max_length=128,
            return_special_tokens_mask=True
        )

    tokenized_ds = dataset.map(tokenize_fn, batched=True)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=0.15
    )

    # Training arguments
    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; use whichever name the installed version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = 'eval_strategy' if 'eval_strategy' in accepted else 'evaluation_strategy'
    training_args = TrainingArguments(
        output_dir=f"results/{lang}_{model_type}",
        learning_rate=5e-5,
        num_train_epochs=10,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        weight_decay=0.01,
        logging_dir=f"logs/{lang}_{model_type}",
        report_to="wandb",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        **{eval_key: "epoch"},
    )

    # Train student
    trainer = DistillationTrainer(
        model=student,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["dev"],
        data_collator=data_collator,
        teacher_model=teacher,
        alpha=0.7,
        temperature=4.0
    )

    trainer.train()

    # Save final model
    model_dir = f"models/{lang}_{model_type}_distilled"
    student.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    return model_dir

In [None]:
from transformers import pipeline
import numpy as np

def evaluate_model(lang: str, model_type='roberta', num_examples=100, seed=42):
    """Score a saved distilled model on the test split of ``lang``.

    Args:
        lang: Language identifier passed to ``load_conllu_data``.
        model_type: Model family used in the saved model's directory name.
        num_examples: Upper bound on the number of test sentences used for the
            masked-word accuracy (fewer are used if the test split is smaller).
        seed: Seed for the random masking and the sentence sample.

    Returns:
        A dict with ``'perplexity'`` (``exp`` of the model's own masked-LM loss
        on the test split) and ``'masked_accuracy'`` (share of scored sentences
        whose masked word appears among the fill-mask predictions; ``nan`` if
        no sentence could be scored). Only words the tokenizer represents as a
        single token can ever be predicted by a single mask.
    """
    # Load test data
    test_ds = load_conllu_data(lang)['test']

    # Initialize models
    model_path = f"models/{lang}_{model_type}_distilled"
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Perplexity calculation
    # A plain Trainer is built around the loaded model, so the loss is that
    # model's masked-LM loss on this language's test split and does not depend
    # on variables left over from a training run.
    tokenized_test = test_ds.map(
        lambda batch: tokenizer(
            batch['text'],
            truncation=True,
            max_length=128,
            return_special_tokens_mask=True
        ),
        batched=True,
        remove_columns=test_ds.column_names,
    )
    eval_trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"results/{lang}_{model_type}_eval",
            per_device_eval_batch_size=32,
            report_to="none",
            seed=seed,
        ),
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm_probability=0.15
        ),
    )
    eval_results = eval_trainer.evaluate(tokenized_test)
    perplexity = float(np.exp(eval_results["eval_loss"]))

    # Masked prediction accuracy
    fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
    rng = np.random.default_rng(seed)
    # RoBERTa-style position ids start after the padding index, which leaves
    # two slots of the position table unusable for input tokens.
    max_tokens = model.config.max_position_embeddings - 2

    sample_size = min(num_examples, len(test_ds))
    sample = test_ds.shuffle(seed=seed).select(range(sample_size))

    correct = 0
    scored = 0
    for example in sample:
        tokens = example['text'].split()
        if not tokens:
            continue
        # The gold word is stored together with its own masked sentence so that
        # prediction and reference always belong to the same example.
        mask_pos = int(rng.integers(len(tokens)))
        gold = tokens[mask_pos]
        tokens[mask_pos] = tokenizer.mask_token
        masked = ' '.join(tokens)
        if len(tokenizer(masked)['input_ids']) > max_tokens:
            continue  # too long for the model's position table
        predictions = fill_mask(masked)
        scored += 1
        # Decoded predictions may carry a leading space; strip before comparing.
        if any(pred['token_str'].strip() == gold for pred in predictions):
            correct += 1

    accuracy = correct / scored if scored else float('nan')

    return {
        'perplexity': perplexity,
        'masked_accuracy': accuracy
    }


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

results = []

for lang in LANGUAGES:
    # Distil and score the RoBERTa student, then the XLM-R student.
    run_experiment(lang, 'roberta')
    roberta_metrics = evaluate_model(lang, 'roberta')
    run_experiment(lang, 'xlm-roberta')
    xlmr_metrics = evaluate_model(lang, 'xlm-roberta')

    # FIXME: MicroBERT numbers are hypothetical placeholders, identical for
    # every language -- replace with measured baselines before reporting.
    microbert_metrics = dict(perplexity=15.2, masked_accuracy=0.62)

    # One flat record per language: perplexity columns first, then accuracy.
    row = {'language': lang}
    for metric_key, column_suffix in (('perplexity', 'perplexity'),
                                      ('masked_accuracy', 'accuracy')):
        row[f'roberta_{column_suffix}'] = roberta_metrics[metric_key]
        row[f'xlmr_{column_suffix}'] = xlmr_metrics[metric_key]
        row[f'microbert_{column_suffix}'] = microbert_metrics[metric_key]
    results.append(row)


In [None]:
# Create results dataframe
df = pd.DataFrame(results)

# Generate visualizations
# The axes are created explicitly and handed to pandas; otherwise pandas opens
# a figure of its own and the 12x6 figure is shown empty.
fig, ax = plt.subplots(figsize=(12, 6))
df.set_index('language')[
    ['roberta_perplexity', 'xlmr_perplexity', 'microbert_perplexity']
].plot.bar(ax=ax, rot=45)
ax.set_title('Model Comparison by Language Perplexity')
ax.set_ylabel('Perplexity (lower is better)')
fig.tight_layout()
plt.show()