In [4]:
import sys
from pathlib import Path
from typing import Union, Dict, Iterable

import numpy as np

import evaluate
from datasets import Dataset
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model
)
from transformers import (
    Qwen2ForSequenceClassification,
    Qwen2Tokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
    )

# The local `src` package was not importable on a fresh kernel
# (ModuleNotFoundError: No module named 'src'). Make the project root visible
# before importing from it.
# NOTE(review): assumes `src/` lives in the parent directory, next to the
# `data/` folder this notebook reads via '../data' -- TODO confirm.
# Appended (not inserted first) so installed packages keep priority.
project_root = str(Path('..').resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

from src.events import EventPair

ModuleNotFoundError: No module named 'src'

In [27]:
# Hub identifier of the base checkpoint; the tokenizer below and the
# classifier loaded later in the notebook are both created from it.
checkpoint = 'Qwen/Qwen2.5-1.5B'

# Tokenizer matching the checkpoint.
tokenizer = Qwen2Tokenizer.from_pretrained(checkpoint)

# Batch collator that pads examples using the tokenizer above.
data_collator = DataCollatorWithPadding(tokenizer)

In [28]:
# Reuse the end-of-sequence token for padding: both the id form and the
# string form of the pad token are pointed at the tokenizer's EOS token.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [5]:
def load_data(fpath: Union[str, Path]) -> Iterable[EventPair]:
    """Lazily read a dataset file, yielding one `EventPair` per line.

    Args:
        fpath: Location of the dataset file, opened as UTF-8 text. A path
            whose suffix is ``.test`` is treated as the test set.

    Yields:
        An `EventPair` built from each raw line (passed through unmodified)
        together with the test-set flag.
    """
    source = Path(fpath)
    from_test_split = source.suffix == '.test'
    with source.open('r', encoding='utf-8') as handle:
        yield from (EventPair(raw_line, from_test_split) for raw_line in handle)


def tokenize_func(sample):
    """Encode event pairs with the notebook-level `tokenizer`.

    The values under ``'event_1'`` and ``'event_2'`` are handed to the
    tokenizer as a (first, second) pair with truncation enabled.
    `build_dataset_from` maps this function with ``batched=True``, so in that
    use each value is presumably a batch of events rather than a single one.

    NOTE(review): the pair is passed without any explicit separator, and the
    first training example shown in this notebook has only three input ids for
    'elections' / 'campaign'. Worth confirming the model can still tell where
    the first event ends.
    """
    first_events = sample['event_1']
    second_events = sample['event_2']
    return tokenizer(first_events, second_events, truncation=True)


def build_dataset_from(fpath) -> Dataset:
    """Turn an event-pair file into a tokenized HF `Dataset`.

    Every pair produced by `load_data(fpath)` contributes its two events and
    its label to the ``'event_1'``, ``'event_2'`` and ``'label'`` columns.
    The resulting dataset is then passed through `tokenize_func` in batched
    mode.

    Args:
        fpath: Path of the dataset file, forwarded to `load_data`.

    Returns:
        The dataset returned by ``Dataset.map`` after tokenization.
    """
    columns = {'event_1': [], 'event_2': [], 'label': []}

    for pair in load_data(fpath):
        first, second = pair.events
        target = pair.label

        columns['event_1'].append(first)
        columns['event_2'].append(second)
        columns['label'].append(target)

    raw_dataset = Dataset.from_dict(columns)
    return raw_dataset.map(tokenize_func, batched=True)

In [6]:
# Directory holding the three split files, relative to the notebook.
data_dir = Path('../data')

# Build the splits in train -> dev -> test order. `map` is lazy, so the
# unpacking drives one `build_dataset_from` call per path, in sequence.
train_data, dev_data, test_data = map(
    build_dataset_from,
    (
        data_dir / "event_pairs.train",
        data_dir / "event_pairs.dev",
        data_dir / "event_pairs.test",
    ),
)

Map: 100%|██████████| 227328/227328 [00:04<00:00, 45762.33 examples/s]
Map: 100%|██████████| 36438/36438 [00:00<00:00, 38970.01 examples/s]
Map: 100%|██████████| 42953/42953 [00:01<00:00, 42670.88 examples/s]


In [9]:
# Sanity check: display the first training example, including the columns
# added by tokenization.
train_data[0]

{'event_1': 'elections',
 'event_2': 'campaign',
 'label': 1,
 'input_ids': [61472, 82, 37339],
 'attention_mask': [1, 1, 1]}

In [30]:
# LoRA settings for sequence classification: rank-3 adapters on the
# `q_proj` / `v_proj` modules, no bias training, training (not inference) mode.
peft_config = LoraConfig(
    r=3,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
    target_modules=['q_proj', 'v_proj'],
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

# Load the base checkpoint with a two-label classification head and wrap it
# with the LoRA adapters in one step.
model = get_peft_model(
    Qwen2ForSequenceClassification.from_pretrained(checkpoint, num_labels=2),
    peft_config,
)
model.print_trainable_parameters()

# Use the model's EOS id as its padding id.
model.config.pad_token_id = model.config.eos_token_id

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-1.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 411,648 || all params: 1,544,129,024 || trainable%: 0.0267


In [31]:
# Evaluate and checkpoint once per epoch, then restore the checkpoint with the
# best dev accuracy when training ends. In the recorded run the dev accuracy
# peaked after epoch 1 while validation loss kept rising, so keeping the
# last-epoch weights would discard the best model. `metric_for_best_model`
# refers to the 'accuracy' key returned by `compute_metrics`.
training_args = TrainingArguments(
    output_dir='test-trainer',
    eval_strategy='epoch',
    save_strategy='epoch',  # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [32]:
def compute_metrics(eval_preds):
    """Compute classification accuracy for the `Trainer` evaluation loop.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class for each
            example is the arg-max over the last axis of ``logits``.

    Returns:
        A dict with the single key ``'accuracy'`` mapping to the fraction of
        predictions equal to their label, as a Python float.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy instead of calling
    # `evaluate.load('accuracy')` on every evaluation pass: loading the metric
    # is loop-invariant work, and the result has the same
    # {'accuracy': float} shape.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {'accuracy': accuracy}

In [33]:
# Wire the LoRA-wrapped model, data splits, collator and metric function into
# the training loop. The tokenizer is passed as `processing_class`, which
# replaces the deprecated `tokenizer` keyword of `Trainer.__init__`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=dev_data,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [34]:
# Launch fine-tuning with the trainer configured above; the returned
# `TrainOutput` is displayed as this cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.17,0.273525,0.941984
2,0.1259,0.339485,0.939404
3,0.1441,0.340588,0.938937


TrainOutput(global_step=85248, training_loss=0.1536702030861342, metrics={'train_runtime': 2864.2796, 'train_samples_per_second': 238.1, 'train_steps_per_second': 29.762, 'total_flos': 2.5213123243106304e+16, 'train_loss': 0.1536702030861342, 'epoch': 3.0})