In [None]:
%pip install transformers datasets peft wandb

In [1]:
import transformers
import datasets 
import peft

In [2]:
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, DatasetDict

lora_config = LoraConfig(
    r=4, # number of the parameters to train.
    lora_alpha=1, # magnitude of the weight matrix
    target_modules=["query", "value"],
    lora_dropout=0.05,
    bias="lora_only",
    task_type="SEQ_CLS" # sequence classification
)

In [3]:
from transformers import BertTokenizer, BertModel
model_checkpoint = "google-bert/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint, use_fast=True)
foundation_model = BertModel.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
import pandas as pd

DATA_PATH = "/home/jovyan/work/ULM-25-authorship-profiling/data/"

df_train = pd.read_csv(DATA_PATH + "data_train.csv")
df_test = pd.read_csv(DATA_PATH + "data_test.csv")
df_val = pd.read_csv(DATA_PATH + "data_val.csv")

In [5]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(df_train)
dataset['validation'] = Dataset.from_pandas(df_val)
dataset['test'] = Dataset.from_pandas(df_test)

In [6]:
dataset = dataset.filter(lambda example: 
                         example["text"] is not None and 
                         example["gender"] is not None and
                         example ["age"] is not None)
dataset

Filter:   0%|          | 0/620813 [00:00<?, ? examples/s]

Filter:   0%|          | 0/68980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37919 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'age', 'gender'],
        num_rows: 620812
    })
    validation: Dataset({
        features: ['text', 'age', 'gender'],
        num_rows: 68980
    })
    test: Dataset({
        features: ['text', 'age', 'gender'],
        num_rows: 37919
    })
})

In [7]:
from transformers import AutoTokenizer, AutoModel, PreTrainedModel, PretrainedConfig
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import numpy as np

age_encoder = LabelEncoder()
age_encoder.fit(df_train["age"].tolist() + df_val["age"].tolist() + df_test["age"].tolist())
num_age_labels = len(age_encoder.classes_)

BINS = [0, 18, 23, 27, 33, 37, 43, 47, 53, 57, 100]
def preprocess_function_dict(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )
    # age_labels = age_encoder.transform(examples["age"])
    age_labels = [
            int(np.digitize(item, BINS) - 1)
            for item in examples["age"]
        ]
    gender_labels = [
        {"male": 0, "female": 1}[label] for label in examples["gender"]
    ]
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "age_labels": age_labels,
        "gender_labels": gender_labels,
        "labels": np.stack([age_labels, gender_labels], axis=1)
    }

In [8]:
tokenized_dataset = dataset.map(
    preprocess_function_dict,
    batched=True,
    remove_columns=['text', 'age', 'gender']
)

Map:   0%|          | 0/620812 [00:00<?, ? examples/s]

Map:   0%|          | 0/68980 [00:00<?, ? examples/s]

Map:   0%|          | 0/37919 [00:00<?, ? examples/s]

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'age_labels', 'gender_labels', 'labels'],
        num_rows: 620812
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'age_labels', 'gender_labels', 'labels'],
        num_rows: 68980
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'age_labels', 'gender_labels', 'labels'],
        num_rows: 37919
    })
})

In [23]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [24]:
data_dir = "ULM-25-authorship-profiling/data/"
for split in ["train", "validation", "test"]:
    tokenized_dataset[split].push_to_hub(f"KonradBRG/ULM-Profling-tokneized-{split}")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/156 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/156 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/156 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/156 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/69 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/38 [00:00<?, ?ba/s]

In [25]:
class JointClassificationConfig(PretrainedConfig):
    def __init__(self, num_age_labels=None, num_gender_labels=None, loss_alpha=0.5, **kwargs):
        super().__init__(**kwargs)
        self.num_age_labels = num_age_labels
        self.num_gender_labels = num_gender_labels

In [26]:
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import CrossEntropyLoss

class BertForJointClassification(PreTrainedModel):
    config_class = JointClassificationConfig
    
    def __init__(self, config, model):
        super().__init__(config)
        self.num_age_labels = config.num_age_labels
        self.num_gender_labels = config.num_gender_labels

        self.bert = model
        self.age_classifier = nn.Linear(config.hidden_size, self.num_age_labels)
        self.gender_classifier = nn.Linear(config.hidden_size, self.num_gender_labels)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.age_loss_fct = CrossEntropyLoss()
        self.gender_loss_fct = CrossEntropyLoss()
        
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        age_labels=None,
        gender_labels=None,
        **kwargs
    ):
        # get BERT outputs
        x = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # pool outputs to get sequence representation
        x = x.last_hidden_state[:, 0]
        x = self.dropout(x)
        # get logits from task head
        age_logits = self.age_classifier(x)
        gender_logits = self.gender_classifier(x)
        loss_age = self.age_loss_fct(age_logits, age_labels)
        loss_gender = self.gender_loss_fct(gender_logits, gender_labels)
        return SequenceClassifierOutput(loss={"loss_age": loss_age, "loss_gender": loss_gender}, 
                                        logits=torch.cat([age_logits, gender_logits], dim=1))

In [27]:
def setup_model(dataset, model_name="bert-base-uncased", peft=True):        
    config = JointClassificationConfig(
        name_or_path=model_name,
        num_age_labels=num_age_labels,
        num_gender_labels=2,
        hidden_size=768,
        hidden_dropout_prob=0.1,
    )
    bert = BertForJointClassification(config, foundation_model)
    if peft:
        bert = get_peft_model(bert, lora_config)
    return bert, config, dataset

In [None]:
# optional
peft_model.unload()

In [28]:
peft_model, config, data = setup_model(tokenized_dataset, model_checkpoint)

In [29]:
def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    age_labels, gender_labels = eval_pred.label_ids

    age_logits = logits[:, :num_age_labels]
    gender_logits = logits[:, num_age_labels:]

    age_preds = np.argmax(age_logits, axis=-1)
    gender_preds = np.argmax(gender_logits, axis=-1)

    age_acc = (age_preds == age_labels).mean()
    gender_acc = (gender_preds == gender_labels).mean()
    joint_acc = np.mean((age_preds == age_labels) & (gender_preds == gender_labels))

    return {"age_acc": age_acc, "gender_acc": gender_acc, "joint_acc": joint_acc}

In [30]:
from typing import Dict, Union, Any
from transformers import Trainer, TrainingArguments, default_data_collator

class TrainerWithCustomLoss(Trainer):
    
    def __init__(self, age_alpha: float = 0.5, scale_losses = True, **kwargs):
        super().__init__(**kwargs)
        self.age_alpha = age_alpha
        self.gender_alpha = 1 - self.age_alpha
        self._scale_losses = scale_losses
    
    def training_step(
        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None
    ) -> torch.Tensor:
        model.train()
        if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
            self.optimizer.train()
        inputs = self._prepare_inputs(inputs)
        
        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)
        self.accelerator.backward(loss)
        
        # Finally we need to normalize the loss for reporting
        if num_items_in_batch is None:
            return loss.detach() / self.args.gradient_accumulation_steps
        return loss.detach()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=8):
        outputs = model(**inputs)
        loss_age, loss_gender = outputs.loss["loss_age"], outputs.loss["loss_gender"]
        if self._scale_losses:
            loss_age = loss_age / num_items_in_batch
            loss_gender = loss_gender / num_items_in_batch
        loss = self.age_alpha * loss_age + self.gender_alpha * loss_gender
        if return_outputs:
            return loss, outputs
        return loss

In [35]:
from transformers import Trainer, TrainingArguments, default_data_collator

print(peft_model.print_trainable_parameters())
log_frequency = 1000

training_args = TrainingArguments(
    "bert-lora-for-author-profiling",
    auto_find_batch_size=True,  # Good, but try setting manually if you know your limits
    per_device_train_batch_size=32,  # Set explicitly for better control
    gradient_accumulation_steps=2,   # Effective batch size = 32 * 2 = 64
    num_train_epochs=3,
    # Speed optimizations
    dataloader_num_workers=4,        # Parallel data loading
    dataloader_pin_memory=True,      # Faster GPU transfer
    bf16=True,                       # Mixed precision (if you have newer GPU)
    # fp16=True,                     # Use this instead if bf16 not supported
    # Reduce overhead
    eval_strategy="steps",           # Less frequent evaluation
    eval_steps=500,                  # Adjust based on dataset size
    logging_strategy="steps",
    logging_steps=log_frequency,
    save_strategy="no",
    learning_rate=5e-5,
    report_to="wandb"
)

trainer = TrainerWithCustomLoss(
    model=peft_model,
    args=training_args,
    train_dataset=data['train'].with_format("torch"),
    eval_dataset=data['validation'].with_format("torch"),
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

trainable params: 187,420 || all params: 109,672,760 || trainable%: 0.1709
None


In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Age Acc,Gender Acc,Joint Acc
500,No log,0.126343,0.492969,0.606567,0.298028
1000,0.131700,0.122194,0.507915,0.625761,0.317063
1500,0.131700,0.119875,0.519397,0.635909,0.330792
2000,0.121000,0.117728,0.533662,0.643795,0.34365
2500,0.121000,0.116287,0.543259,0.649058,0.354146
3000,0.117400,0.115036,0.548347,0.653421,0.360032
3500,0.117400,0.114194,0.552131,0.658321,0.366845
4000,0.115700,0.113532,0.555146,0.659785,0.370325
4500,0.115700,0.113043,0.557973,0.662569,0.373891
5000,0.114800,0.112546,0.55893,0.66341,0.375384


In [None]:
trainer.predict(data["test"])

In [None]:
trainer.push_to_hub()