In [3]:
import transformers
import datasets 
import peft

In [4]:
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, DatasetDict

lora_config = LoraConfig(
    r=4, # number of the parameters to train.
    lora_alpha=1, # a scaling factor that adjusts the magnitude of the weight matrix
    target_modules=["query", "value"], # for bert
    lora_dropout=0.05,
    bias="lora_only", # this specifies if the bias parameter should be trained.
    task_type="SEQ_CLS" # sequence classification
)

In [6]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", use_fast=True)
foundation_model = BertModel.from_pretrained("google-bert/bert-base-uncased").to("cuda")

In [16]:
import pandas as pd

df_train = pd.read_csv("/home/jovyan/work/ULM-25-authorship-profiling/data/data_train.csv")
df_test = pd.read_csv("/home/jovyan/work/ULM-25-authorship-profiling/data/data_test.csv")
df_val = pd.read_csv("/home/jovyan/work/ULM-25-authorship-profiling/data/data_val.csv")

In [32]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(df_train)#.select(range(100))
dataset['validation'] = Dataset.from_pandas(df_val)#.select(range(100))
dataset['test'] = Dataset.from_pandas(df_test)#.select(range(100))

In [33]:
dataset = dataset.filter(lambda example: 
                         example["text"] is not None and 
                         example["gender"] is not None and
                         example ["age"] is not None)
dataset

Filter:   0%|          | 0/620813 [00:00<?, ? examples/s]

Filter:   0%|          | 0/68980 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37919 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'age', 'gender'],
        num_rows: 620812
    })
    validation: Dataset({
        features: ['text', 'age', 'gender'],
        num_rows: 68980
    })
    test: Dataset({
        features: ['text', 'age', 'gender'],
        num_rows: 37919
    })
})

In [34]:
from transformers import AutoTokenizer, AutoModel, PreTrainedModel, PretrainedConfig
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import json
from sklearn.preprocessing import LabelEncoder

age_encoder = LabelEncoder()
age_encoder.fit(df_train["age"].tolist() + df_val["age"].tolist() + df_test["age"].tolist())
num_age_labels = len(age_encoder.classes_)

def preprocess_function_dict(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )

    age_labels = age_encoder.transform(examples["age"])
    gender_labels = [
        {"male": 0, "female": 1}[label] for label in examples["gender"]
    ]

    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "age_labels": age_labels,
        "gender_labels": gender_labels,
    }

In [35]:
tokenized_dataset = dataset.map(
    preprocess_function_dict,
    batched=True,
    remove_columns=['text', 'age', 'gender']
)

Map:   0%|          | 0/620812 [00:00<?, ? examples/s]

Map:   0%|          | 0/68980 [00:00<?, ? examples/s]

Map:   0%|          | 0/37919 [00:00<?, ? examples/s]

In [36]:
class JointClassificationConfig(PretrainedConfig):
    def __init__(self, num_age_labels=None, num_gender_labels=None, **kwargs):
        super().__init__(**kwargs)
        self.num_age_labels = num_age_labels
        self.num_gender_labels = num_gender_labels

In [37]:
class BertForJointClassification(PreTrainedModel):
    config_class = JointClassificationConfig
    
    def __init__(self, config, model):
        super().__init__(config)
        self.num_age_labels = config.num_age_labels
        self.num_gender_labels = config.num_gender_labels
        
        self.bert = model
        self.age_classifier = nn.Linear(config.hidden_size, self.num_age_labels)
        self.gender_classifier = nn.Linear(config.hidden_size, self.num_gender_labels)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        self.age_loss_fct = CrossEntropyLoss()
        self.gender_loss_fct = CrossEntropyLoss()
        
        self.init_weights()
    
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        age_labels=None,
        gender_labels=None,
        **kwargs
    ):
        # get BERT outputs
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        # pool outputs to get sequence representation
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        
        # get logits for each task
        age_logits = self.age_classifier(pooled_output)
        gender_logits = self.gender_classifier(pooled_output)
        
        age_predictions = torch.argmax(age_logits, dim=-1)
        gender_predictions = torch.argmax(gender_logits, dim=-1)
        output = {
            'age_logits': age_logits,
            'gender_logits': gender_logits,
            'age_predictions': age_predictions,
            'gender_predictions': gender_predictions,
        }
        
        # get combined loss
        age_loss = self.age_loss_fct(age_logits, age_labels)
        gender_loss = self.gender_loss_fct(gender_logits, gender_labels)
        total_loss = age_loss + gender_loss # use alpha later
        
        output.update({
            'loss': total_loss,
            'age_loss': age_loss,
            'gender_loss': gender_loss
        })
        return output

In [38]:
def setup_dict_model(dataset, model_name="bert-base-uncased"):
    config = JointClassificationConfig(
        name_or_path=model_name,
        num_age_labels=num_age_labels,
        num_gender_labels=2,
        hidden_size=768,
        hidden_dropout_prob=0.1
    )
    bert = BertForJointClassification(config, foundation_model)
    peft_model = get_peft_model(bert, lora_config)
    return peft_model, config

In [42]:
# optional
peft_model.unload()

BertForJointClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [43]:
peft_model, config = setup_dict_model(dataset)

In [44]:
# custom trainer
from transformers import Trainer

class JointClassificationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs.get('loss') # get the total loss from our output dict
        return (loss, outputs) if return_outputs else loss

In [49]:
from transformers import TrainingArguments, default_data_collator

print(peft_model.print_trainable_parameters())

training_args = TrainingArguments(
    "bert-lora-for-author-profiling",
    learning_rate=3e-2,
    report_to="wandb"
)

trainer = JointClassificationTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=default_data_collator
)

trainable params: 187,420 || all params: 109,672,760 || trainable%: 0.1709
None


In [None]:
trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jovyan/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkonrad-brg[0m ([33mkonrad-brg-university-of-t-bingen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,5.3934
1000,5.7757
1500,6.391
2000,6.8622
2500,10.6515
3000,13.3921
3500,15.7702
4000,10.7724
4500,11.1014
