In [None]:
!pip install -q transformers datasets

In [None]:
from google.colab import files
import pandas as pd


# Open Colab's upload dialog; the CSV files read below must be available
# in the working directory afterwards.
uploaded = files.upload()

# File names of the three pre-processed splits.
train_file, valid_file, test_file = (
    "ecthr_b_train_preprocessed_WA.csv",
    "ecthr_b_valid_preprocessed_WA.csv",
    "ecthr_b_test_preprocessed_WA.csv",
)

# Read each split into its own DataFrame (train, then valid, then test).
train_df, valid_df, test_df = map(pd.read_csv, (train_file, valid_file, test_file))

# Report the size of every split, then preview the training rows.
for caption, frame in (
    ("Train shape:", train_df),
    ("Valid shape:", valid_df),
    ("Test shape:", test_df),
):
    print(caption, frame.shape)
print(train_df.head())


Saving ecthr_b_test_preprocessed_WA.csv to ecthr_b_test_preprocessed_WA.csv
Saving ecthr_b_train_preprocessed_WA.csv to ecthr_b_train_preprocessed_WA.csv
Saving ecthr_b_valid_preprocessed_WA.csv to ecthr_b_valid_preprocessed_WA.csv
Train shape: (9000, 2)
Valid shape: (1000, 2)
Test shape: (1000, 2)
                                                text   labels
0  11 begin event relev applic k daughter p son b...      [4]
1  9 applic monarch liechtenstein born 1945 live ...  [8 3 9]
2  9 june 1949 plot agricultur land own applicant...      [3]
3  8 1991 mr dušan slobodník research worker fiel...  [6 8 5]
4  9 applic italian citizen born 1947 live orista...    [8 3]


In [None]:
# Per-column count of null cells in the training frame
null_counts = train_df.isnull().sum()
print(null_counts)

# Show the text and the raw label value of the first training row
first_text = train_df["text"].iloc[0]
first_label = train_df["labels"].iloc[0]
print("Sample text:", first_text)
print("Sample label:", first_label)


text      0
labels    0
dtype: int64
Sample text: 11 begin event relev applic k daughter p son born 1986 1988 respect p’ father x m’ father v march may 1989 k voluntarili hospitalis three month diagnos suffer schizophrenia august novemb 1989 decemb 1989 march 1990 hospitalis period three month account ill 1991 hospitalis less week diagnos suffer atyp undefin psychosi appear social welfar health author contact famili sinc 1989 12 applic initi cohabit summer 1991 juli 1993 1991 p live 1991 1993 k x involv custodi access disput concern p may 1992 resid order made transfer custodi p x 13 k hospitalis 22 april 7 may 1992 13 may 10 june 1992 11 17 januari 1993 account psychosi compulsori care 15 may 10 june 1992 accord medic report date 15 may 1992 k paranoid psychot 14 19 march 1993 accord social welfar authorities’ record discuss took place social worker k’ mother k’ mother said daughter’ health condit realli bad k destroy childhood pictur wed photo mother broken glass “pierc eyes” appear 

In [None]:
# Preview the first ten rows of the training frame
first_ten_rows = train_df.head(10)
print(first_ten_rows)


                                                text     labels
0  11 begin event relev applic k daughter p son b...        [4]
1  9 applic monarch liechtenstein born 1945 live ...    [8 3 9]
2  9 june 1949 plot agricultur land own applicant...        [3]
3  8 1991 mr dušan slobodník research worker fiel...    [6 8 5]
4  9 applic italian citizen born 1947 live orista...      [8 3]
5  12 1987 applic associ publish book entitl eusk...    [6 8 3]
6  7 applic former member turkish nation assembl ...  [6 7 8 3]
7  7 circumst applicant’ brother disappear disput...      [0 2]
8  11 29 april 1962 applic marri mr gigliozzi rel...        [3]
9  7 applic lithuanian nation born 1974 8 5 octob...      [1 4]


In [None]:
# Print the label cell of the first ten training rows exactly as loaded
raw_label_column = train_df['labels']
for row_idx in range(10):
    raw_value = raw_label_column.iloc[row_idx]
    print(f"Row {row_idx} labels raw:", raw_value)


Row 0 labels raw: [4]
Row 1 labels raw: [8 3 9]
Row 2 labels raw: [3]
Row 3 labels raw: [6 8 5]
Row 4 labels raw: [8 3]
Row 5 labels raw: [6 8 3]
Row 6 labels raw: [6 7 8 3]
Row 7 labels raw: [0 2]
Row 8 labels raw: [3]
Row 9 labels raw: [1 4]


In [None]:
# Work on independent copies so the frames loaded from CSV stay untouched.
train_df_copy, valid_df_copy, test_df_copy = (
    frame.copy() for frame in (train_df, valid_df, test_df)
)

# Confirm the copies have the same shapes as the originals.
for caption, frame in (
    ("Train copy shape:", train_df_copy),
    ("Valid copy shape:", valid_df_copy),
    ("Test copy shape:", test_df_copy),
):
    print(caption, frame.shape)

# Peek at the label cells of the first five rows of the training copy.
train_copy_labels = train_df_copy["labels"]
for row_idx in range(5):
    print(f"Train copy Row {row_idx} labels:", train_copy_labels.iloc[row_idx])


Train copy shape: (9000, 2)
Valid copy shape: (1000, 2)
Test copy shape: (1000, 2)
Train copy Row 0 labels: [4]
Train copy Row 1 labels: [8 3 9]
Train copy Row 2 labels: [3]
Train copy Row 3 labels: [6 8 5]
Train copy Row 4 labels: [8 3]


In [None]:

# Sanity report for each split: preview, null texts, and rows without any label.
for df, name in zip([train_df_copy, valid_df_copy, test_df_copy], ["Train", "Valid", "Test"]):
    print(f"--- {name} ---")
    print(df.head(5))
    print("Missing text:", df['text'].isna().sum())
    # `labels` still holds the raw CSV strings at this point (e.g. "[8 3 9]"), so an
    # empty label set is the text "[]" whose len() is 2, never 0. Strip brackets and
    # whitespace first and count the cells that end up empty; null cells count too.
    stripped_labels = df['labels'].astype(str).str.strip('[] \t')
    no_labels = df['labels'].isna() | (stripped_labels == '')
    print("Missing labels:", no_labels.sum())
    print()


--- Train ---
                                                text   labels
0  11 begin event relev applic k daughter p son b...      [4]
1  9 applic monarch liechtenstein born 1945 live ...  [8 3 9]
2  9 june 1949 plot agricultur land own applicant...      [3]
3  8 1991 mr dušan slobodník research worker fiel...  [6 8 5]
4  9 applic italian citizen born 1947 live orista...    [8 3]
Missing text: 0
Missing labels: 0

--- Valid ---
                                                text     labels
0  5 applic born 1983 detain sztum 6 time event q...        [4]
1  5 applic born 1982 current detain cricova 6 20...        [1]
2  5 applic born 1955 life narofominsk moscow reg...      [1 3]
3  6 applic born 1977 life luton 7 applic summon ...         []
4  6 applic born 1983 2007 respect live gevgelija...  [7 2 3 4]
Missing text: 0
Missing labels: 0

--- Test ---
                                                text labels
0  5 applic journalist dnno norwegian internetbas...    [6]
1  5 applic b

In [None]:
import numpy as np
import ast

NUM_LABELS = 10

def labels_to_multihot_safe(label_list_str):
    """Convert a raw label cell into a multi-hot float vector of length NUM_LABELS.

    Parameters
    ----------
    label_list_str : str | list | tuple | numpy.ndarray
        Normally the string stored in the CSV, e.g. ``"[8 3 9]"``. The items of a
        bracketed string may be separated by any mix of whitespace and commas
        (``"[ 8  3]"``, ``"[8, 3]"``). An already materialised sequence of labels
        is accepted as well.

    Returns
    -------
    numpy.ndarray
        Float vector of shape ``(NUM_LABELS,)`` with 1.0 at every valid label
        index and 0.0 elsewhere.

    Notes
    -----
    The function never raises for bad input: unparsable cells, non-list values,
    non-integer labels and out-of-range labels are reported with ``print`` and
    skipped, so a fully invalid cell yields an all-zero vector.
    """
    vec = np.zeros(NUM_LABELS, dtype=float)

    if isinstance(label_list_str, str):
        text = label_list_str.strip()
        if text.startswith("[") and text.endswith("]"):
            # Tokenise on whitespace/commas and re-join with single commas. A plain
            # space->comma replacement breaks on double spaces, padding after "["
            # and on input that already contains commas.
            tokens = text[1:-1].replace(",", " ").split()
            text = "[" + ",".join(tokens) + "]"
        try:
            label_list = ast.literal_eval(text)
        except (SyntaxError, ValueError) as e:
            print(f"Error evaluating label string: {label_list_str} - {e}")
            return vec
    elif isinstance(label_list_str, (list, tuple, np.ndarray)):
        label_list = list(label_list_str)
    else:
        # e.g. NaN from an empty CSV cell: report it instead of failing on a str method.
        print(
            f"Error evaluating label string: {label_list_str} - "
            f"expected str, got {type(label_list_str).__name__}"
        )
        return vec

    if not isinstance(label_list, list):
        print(f"Warning: label_list is not a list after evaluation -> {label_list}")
        return vec

    for label in label_list:
        try:
            label_int = int(label)
        except (TypeError, ValueError):
            # TypeError covers items such as nested lists or None.
            print(f"Warning: invalid label {label}")
            continue
        if 0 <= label_int < NUM_LABELS:
            vec[label_int] = 1.0
        else:
            print(f"Warning: label {label_int} out of range")

    return vec

# Add a multi-hot label column to every split (the column is added in place).
for frame in (train_df_copy, valid_df_copy, test_df_copy):
    frame['labels_multihot'] = frame['labels'].apply(labels_to_multihot_safe)

# Compare the raw label cell with its encoded vector for the first five training rows.
raw_labels = train_df_copy['labels']
encoded_labels = train_df_copy['labels_multihot']
for row_idx in range(5):
    print(f"Row {row_idx} original labels: {raw_labels.iloc[row_idx]}")
    print(f"Row {row_idx} multi-hot: {encoded_labels.iloc[row_idx]}")

Row 0 original labels: [4]
Row 0 multi-hot: [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
Row 1 original labels: [8 3 9]
Row 1 multi-hot: [0. 0. 0. 1. 0. 0. 0. 0. 1. 1.]
Row 2 original labels: [3]
Row 2 multi-hot: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
Row 3 original labels: [6 8 5]
Row 3 multi-hot: [0. 0. 0. 0. 0. 1. 1. 0. 1. 0.]
Row 4 original labels: [8 3]
Row 4 multi-hot: [0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]


In [None]:
# Preview the first three rows of each split, now including `labels_multihot`.
for frame in (train_df_copy, valid_df_copy, test_df_copy):
    print(frame.head(3))


                                                text   labels  \
0  11 begin event relev applic k daughter p son b...      [4]   
1  9 applic monarch liechtenstein born 1945 live ...  [8 3 9]   
2  9 june 1949 plot agricultur land own applicant...      [3]   

                                     labels_multihot  
0  [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...  
2  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
                                                text labels  \
0  5 applic born 1983 detain sztum 6 time event q...    [4]   
1  5 applic born 1982 current detain cricova 6 20...    [1]   
2  5 applic born 1955 life narofominsk moscow reg...  [1 3]   

                                     labels_multihot  
0  [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
2  [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
                                              

In [None]:
from datasets import Dataset

# Only the text and its multi-hot target are carried into the Hugging Face datasets.
dataset_columns = ['text', 'labels_multihot']

train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame[dataset_columns])
    for frame in (train_df_copy, valid_df_copy, test_df_copy)
)


In [None]:
# Show the first training example as stored in the dataset.
first_example = train_dataset[0]
print(first_example)


{'text': '11 begin event relev applic k daughter p son born 1986 1988 respect p’ father x m’ father v march may 1989 k voluntarili hospitalis three month diagnos suffer schizophrenia august novemb 1989 decemb 1989 march 1990 hospitalis period three month account ill 1991 hospitalis less week diagnos suffer atyp undefin psychosi appear social welfar health author contact famili sinc 1989 12 applic initi cohabit summer 1991 juli 1993 1991 p live 1991 1993 k x involv custodi access disput concern p may 1992 resid order made transfer custodi p x 13 k hospitalis 22 april 7 may 1992 13 may 10 june 1992 11 17 januari 1993 account psychosi compulsori care 15 may 10 june 1992 accord medic report date 15 may 1992 k paranoid psychot 14 19 march 1993 accord social welfar authorities’ record discuss took place social worker k’ mother k’ mother said daughter’ health condit realli bad k destroy childhood pictur wed photo mother broken glass “pierc eyes” appear photo k’ mother said tire situat get sup

In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded here.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens. Returns whatever
    the tokenizer call returns for ``example["text"]``.
    """
    encode_options = dict(
        truncation=True,
        padding="max_length",   # or "longest"
        max_length=512,
    )
    return tokenizer(example["text"], **encode_options)

# Tokenize every split in batches; each name is rebound to the mapped dataset.
train_dataset, valid_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
import torch

def format_labels(example):
    """Add a float ``labels`` entry built from ``example["labels_multihot"]``.

    The mapping is modified in place and also returned, so it can be used
    directly as a ``Dataset.map`` callback.
    """
    multihot_values = example["labels_multihot"]
    label_tensor = torch.tensor(multihot_values, dtype=torch.float)
    example["labels"] = label_tensor
    return example

# Add the float `labels` column to every split, one example at a time.
train_dataset, valid_dataset, test_dataset = (
    split.map(format_labels)
    for split in (train_dataset, valid_dataset, test_dataset)
)


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Return only the model inputs and targets, as torch tensors, when indexing any split.
for split in (train_dataset, valid_dataset, test_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:

# Show the column schema of the training dataset.
train_features = train_dataset.features
print(train_features)


{'text': Value('string'), 'labels_multihot': List(Value('float64')), 'input_ids': List(Value('int32')), 'token_type_ids': List(Value('int8')), 'attention_mask': List(Value('int8')), 'labels': List(Value('float32'))}


In [None]:
# Inspect one formatted training example: available keys, tensor dtypes, and the target.
sample = train_dataset[0]
print("Keys:", sample.keys())
for caption, field in (
    ("input_ids dtype:", "input_ids"),
    ("attention_mask dtype:", "attention_mask"),
    ("labels dtype:", "labels"),
):
    print(caption, sample[field].dtype)
print("labels tensor:", sample["labels"])


Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids dtype: torch.int64
attention_mask dtype: torch.int64
labels dtype: torch.float32
labels tensor: tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])


In [None]:
!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

# Load model
# Reuse MODEL_NAME and NUM_LABELS from the earlier cells so the model can never drift
# from the checkpoint the tokenizer was loaded from or from the width of the
# multi-hot label vectors.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,     # ECTHR-B has 10 classes
    problem_type="multi_label_classification"
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the "f1" metric from the `evaluate` library.
# NOTE(review): compute_metrics in this notebook uses scikit-learn's f1_score and
# metric_f1 is not referenced in the visible cells — confirm whether it is still needed.
metric_f1 = evaluate.load("f1")

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Compute multi-label metrics from raw logits and multi-hot targets.

    Parameters
    ----------
    eval_pred
        A ``(logits, labels)`` pair; both are used as NumPy arrays of the same
        shape, with ``labels`` holding 0/1 values.

    Returns
    -------
    dict
        ``accuracy`` (as computed by ``accuracy_score`` on the full label rows),
        micro-averaged precision / recall / F1, and macro-averaged F1. Undefined
        ratios are reported as 0 (``zero_division=0``).
    """
    logits, labels = eval_pred

    # A sigmoid probability is >= 0.5 exactly when its logit is >= 0, so threshold
    # the logits directly. This gives the same decision as sigmoid + 0.5 cut-off
    # without evaluating np.exp(-logits), which overflows (RuntimeWarning) for
    # strongly negative float32 logits.
    preds = (np.asarray(logits) >= 0).astype(int)

    labels = np.asarray(labels).astype(int)

    return {
        "accuracy": accuracy_score(labels, preds),
        "precision_micro": precision_score(labels, preds, average="micro", zero_division=0),
        "recall_micro": recall_score(labels, preds, average="micro", zero_division=0),
        "f1_micro": f1_score(labels, preds, average="micro", zero_division=0),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
    }


In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, keep at most two
# checkpoints, and reload the checkpoint with the best "f1_micro" when training ends.
training_config = dict(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
)
training_args = TrainingArguments(**training_config)

In [None]:
# Wire the model, data splits and metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is deprecated
# in recent transformers releases and triggers a warning when the Trainer is built.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the configuration in `training_args`. As the last expression
# of the cell, the value returned by train() is displayed below the cell.
trainer.train()


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision Micro,Recall Micro,F1 Micro,F1 Macro
1,0.1727,0.21983,0.481,0.750438,0.616104,0.676668,0.50789
2,0.156,0.200105,0.516,0.789615,0.634076,0.703349,0.557978
3,0.1508,0.200209,0.527,0.787326,0.652049,0.713331,0.596244
4,0.1146,0.197663,0.527,0.778626,0.659957,0.714397,0.589592
5,0.101,0.199396,0.52,0.766694,0.668584,0.714286,0.603329


TrainOutput(global_step=5625, training_loss=0.1502127981821696, metrics={'train_runtime': 4659.7211, 'train_samples_per_second': 9.657, 'train_steps_per_second': 1.207, 'total_flos': 1.1908209211981824e+16, 'train_loss': 0.1502127981821696, 'epoch': 5.0})

In [None]:
# Mount Google Drive at /content/drive so the trained artifacts can be written there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Target directory on the mounted Drive for the fine-tuned model and tokenizer files.
save_path = "/content/drive/MyDrive/ecthr_b_finetuned_model"


In [None]:
# Save the trainer's model to `save_path`
trainer.save_model(save_path)

# Save the tokenizer files into the same directory; as the last expression of the
# cell, the tuple of written file paths returned by save_pretrained() is displayed.
tokenizer.save_pretrained(save_path)


('/content/drive/MyDrive/ecthr_b_finetuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/ecthr_b_finetuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/ecthr_b_finetuned_model/vocab.txt',
 '/content/drive/MyDrive/ecthr_b_finetuned_model/added_tokens.json',
 '/content/drive/MyDrive/ecthr_b_finetuned_model/tokenizer.json')