## Install dependencies

In [None]:
# Fetch the xstance repository; later cells use its data archive and its evaluate.py script.
!git clone https://github.com/ZurichNLP/xstance.git

Cloning into 'xstance'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 80 (delta 36), reused 56 (delta 17), pack-reused 0[K
Unpacking objects: 100% (80/80), 6.94 MiB | 9.42 MiB/s, done.


In [None]:
!unzip xstance/data/xstance-data-v1.0.zip -d xstance/data

Archive:  xstance/data/xstance-data-v1.0.zip
  inflating: xstance/data/valid.jsonl  
  inflating: xstance/data/train.jsonl  
  inflating: xstance/data/test.jsonl  
  inflating: xstance/data/questions.it.jsonl  
  inflating: xstance/data/questions.fr.jsonl  
  inflating: xstance/data/questions.en.jsonl  
  inflating: xstance/data/questions.de.jsonl  
  inflating: xstance/data/LICENSE    


In [None]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.27.1
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf==3.20.0
  Downloading protobuf-3.20.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.10.1
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate==0.4.0
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencep

In [None]:
import shutil
from pathlib import Path

import evaluate
import jsonlines
import numpy as np
import transformers
from datasets import load_dataset, disable_caching
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

## Load model and pre-process data

In [None]:
# Identifier of the pre-trained checkpoint that the tokenizer and model are loaded from.
model_name = "ZurichNLP/swissbert"

In [None]:
# Output directory for checkpoints and predictions. model_name contains a "/",
# so this is a nested directory and the parent folders must be created as well.
model_dir = Path("finetuned_models", model_name)
model_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Load the x-stance dataset (train, validation and test splits) via `datasets`.
dataset = load_dataset("x_stance")

Downloading builder script:   0%|          | 0.00/4.32k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

Downloading and preparing dataset x_stance/default to /root/.cache/huggingface/datasets/x_stance/default/0.1.0/a5dbbc65742772e61dfd52276a9de65f4789c429bf30d549e983c8c65b72347c...


Downloading data:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45640 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17705 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3926 [00:00<?, ? examples/s]

Dataset x_stance downloaded and prepared to /root/.cache/huggingface/datasets/x_stance/default/0.1.0/a5dbbc65742772e61dfd52276a9de65f4789c429bf30d549e983c8c65b72347c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the first validation example to inspect the available fields
dataset["validation"][0]

{'question': 'Sollen Ausländer/-innen, die seit mindestens zehn Jahren in der Schweiz leben, das Stimm- und Wahlrecht auf Gemeindeebene erhalten?',
 'id': 17188,
 'question_id': 3427,
 'language': 'de',
 'comment': 'Ich bin finde das geht zu wenig weit. Alle Menschen die hier leben sollen das Recht auf Mitsprache haben.',
 'label': 'FAVOR',
 'numerical_label': 100,
 'author': '89900aa69be7',
 'topic': 'Immigration'}

In [None]:
# Enumerate the distinct stance labels that occur in the validation split
set(dataset["validation"]["label"])

{'AGAINST', 'FAVOR'}

In [None]:
# Two-way mapping between class indices and stance label strings;
# label2id is derived from id2label so the two can never disagree.
id2label = dict(enumerate(["AGAINST", "FAVOR"]))
label2id = {label: label_id for label_id, label in id2label.items()}

In [None]:
# Load the tokenizer that belongs to the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/403 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

In [None]:
# Maps each dataset language code to the integer stored in the `lang_ids` column.
adapter_map = {
    "de": 0,
    "fr": 1,
    "it": 2,
}

def preprocess(examples):
    """Turn a batch of raw x-stance examples into model inputs.

    Args:
        examples: Batch (mapping of column name -> list of values) providing the
            "label", "language", "question" and "comment" columns.

    Returns:
        A dict holding the new/updated columns: "label" (class ids looked up in
        `label2id`), "lang_ids" (values looked up in `adapter_map`) and every
        field produced by the tokenizer for the (question, comment) pairs.

    Raises:
        KeyError: If a label or language has no entry in `label2id` / `adapter_map`.
    """
    # Return every new column explicitly instead of writing into the input batch
    # and then discarding it: the result no longer depends on map() picking up
    # in-place edits, and calling this twice on the same batch is safe.
    # "label" and "lang_ids" come first so the resulting column order is unchanged.
    features = {
        "label": [label2id[label] for label in examples["label"]],
        "lang_ids": [adapter_map[language] for language in examples["language"]],
    }
    # Question and comment are encoded as a sentence pair; every example is
    # padded/truncated to exactly 256 tokens.
    encoding = tokenizer(
        examples["question"],
        examples["comment"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    features.update(encoding)
    return features

In [None]:
# Apply preprocess() to every split, one batch of examples at a time.
preprocessed_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/45640 [00:00<?, ? examples/s]

Map:   0%|          | 0/17705 [00:00<?, ? examples/s]

Map:   0%|          | 0/3926 [00:00<?, ? examples/s]

In [None]:
# Show the same validation example after pre-processing
# (label mapped to an id, lang_ids and token ids added)
preprocessed_dataset["validation"][0]

{'question': 'Sollen Ausländer/-innen, die seit mindestens zehn Jahren in der Schweiz leben, das Stimm- und Wahlrecht auf Gemeindeebene erhalten?',
 'id': 17188,
 'question_id': 3427,
 'language': 'de',
 'comment': 'Ich bin finde das geht zu wenig weit. Alle Menschen die hier leben sollen das Recht auf Mitsprache haben.',
 'label': 1,
 'numerical_label': 100,
 'author': '89900aa69be7',
 'topic': 'Immigration',
 'lang_ids': 0,
 'input_ids': [0,
  11899,
  21,
  4332,
  132,
  14,
  624,
  9,
  13,
  250,
  1724,
  835,
  203,
  20,
  12,
  214,
  2664,
  9,
  41,
  20752,
  14,
  18,
  883,
  2249,
  39,
  319,
  16498,
  922,
  103,
  2,
  2,
  591,
  854,
  6100,
  41,
  391,
  38,
  847,
  1158,
  8,
  1675,
  418,
  13,
  323,
  2664,
  494,
  41,
  2164,
  39,
  208,
  8083,
  127,
  8,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [None]:
# Shuffle the training split with a fixed seed so the example order is reproducible.
train_dataset = preprocessed_dataset["train"].shuffle(seed=42)

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head;
# the number of output classes follows from the label mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/612M [00:00<?, ?B/s]

Some weights of the model checkpoint at ZurichNLP/swissbert were not used when initializing XmodForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XmodForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XmodForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XmodForSequenceClassification were not initialized from the model checkpoint at ZurichNLP/swissbert and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should 

In [None]:
# Freeze the language adapters (and the adapter layer norm, where a layer has
# one) so that fine-tuning does not update them.
for encoder_layer in model.roberta.encoder.layer:
    layer_output = encoder_layer.output
    if layer_output.adapter_layer_norm is not None:
        layer_output.adapter_layer_norm.requires_grad_(False)
    layer_output.adapter_modules.requires_grad_(False)

## Fine-tune the model

In [None]:
# Accuracy metric object; compute_metrics() below calls its compute() method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a set of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: Pair of (model scores, gold label ids). The predicted class
            of each example is the argmax of the scores along axis 1.

    Returns:
        The result of `accuracy.compute` for the predicted class ids.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=gold_ids, predictions=predicted_ids)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Collator that assembles examples into batches for the Trainer.
# NOTE(review): preprocess() already pads every example to max_length, so the
# dynamic padding offered by this collator is presumably a no-op here - confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Fine-tuning hyper-parameters; checkpoints are written below model_dir.
training_args = TrainingArguments(
    output_dir=model_dir,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # Set to 3 to reproduce results in paper
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    save_strategy="epoch",  # save a checkpoint after every epoch
    load_best_model_at_end=True,
    seed=553589,
)

# Bundle model, data, tokenizer, collator and metric function for training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=preprocessed_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5041,0.500317,0.755476


TrainOutput(global_step=2853, training_loss=0.5488554014893862, metrics={'train_runtime': 2188.6097, 'train_samples_per_second': 20.853, 'train_steps_per_second': 1.304, 'total_flos': 7992796563333120.0, 'train_loss': 0.5488554014893862, 'epoch': 1.0})

## Evaluate on the test sets

In [None]:
# Run the fine-tuned model on the test split; the class scores are read from
# `output.predictions` in the next cell.
output = trainer.predict(preprocessed_dataset["test"])

In [None]:
# Pick the highest-scoring class for every test example, then translate the
# class ids back into label strings.
predicted_label_ids = np.argmax(output.predictions, axis=1)
predicted_labels = [id2label[int(class_id)] for class_id in predicted_label_ids]

In [None]:
# Write one JSON object per prediction, in the order of predicted_labels.
out_path = model_dir / "predictions.jsonl"
with jsonlines.open(out_path, mode="w") as writer:
    writer.write_all({"label": label} for label in predicted_labels)

In [None]:
!python xstance/evaluate.py --gold xstance/data/test.jsonl --pred {out_path.resolve()}

new_comments_defr
DE 76.34064932765187
FR 77.09480122324159

new_questions_defr
DE 72.91853784248686
FR 76.23040472108377

new_topics_defr
DE 72.513362814061
FR 74.34102696068727

new_comments_it
IT 71.63860405499884

