In [8]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/kaggle/input/eye-tracking-dataset')

import numpy as np
import matplotlib.pyplot as plt
import spacy
from transformers import AutoTokenizer, RobertaModel, BertTokenizer, BertModel
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from utils import *
from word_fixations import *
from word_properties import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
!pip install wordfreq
!pip install pyphen
!pip install surprisal

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [9]:
words_dict = get_merged_words_dict_from_csv(csv_path='/kaggle/input/eye-tracking-dataset/word_sentence_fixations/words_dict_romanian_merged.csv', properties_dir='/kaggle/input/eye-tracking-dataset/properties/properties_romanian_009')

In [10]:
# Create a dictionary excluding non-page stimuli
words_dict_reading = {}
for stimulus_key in words_dict:
	if 'page' in stimulus_key:
		words_dict_reading[stimulus_key] = words_dict[stimulus_key]

In [50]:
import math, numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (BertModel, BertForTokenClassification, AutoTokenizer,
                          get_cosine_schedule_with_warmup)
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr

# --------------------------------
# Data Preparation
# --------------------------------
# Get data from dict
words = [words_dict_reading[stimulus_key][word_idx]['word']
         for stimulus_key in words_dict_reading
         for word_idx in words_dict_reading[stimulus_key]]
sentences = [words_dict_reading[stimulus_key][word_idx]['sentence']
             for stimulus_key in words_dict_reading
             for word_idx in words_dict_reading[stimulus_key]]
sentence_ids = [words_dict_reading[stimulus_key][word_idx]['sentence_id']
                for stimulus_key in words_dict_reading
                for word_idx in words_dict_reading[stimulus_key]]
trt = [words_dict_reading[stimulus_key][word_idx]['average_TRT']
       for stimulus_key in words_dict_reading
       for word_idx in words_dict_reading[stimulus_key]]

data_sentences = {}
for i in range(len(words)):
    sid = sentence_ids[i]
    if sid not in data_sentences:
        data_sentences[sid] = {'sentence': sentences[i], 'words': [], 'trt': []}
    data_sentences[sid]['words'].append(words[i])
    data_sentences[sid]['trt'].append(trt[i])

# Turn into dataframe
data_sentences = pd.DataFrame(data_sentences).T

# Split
train_df = data_sentences.sample(frac=0.8, random_state=42)
tmp_df   = data_sentences.drop(train_df.index)
val_df   = tmp_df.sample(frac=0.5, random_state=42)
test_df  = tmp_df.drop(val_df.index)

# Standardize trt
train_trt = np.concatenate(train_df["trt"].values)
trt_mean, trt_std = train_trt.mean(), train_trt.std()
for df in [train_df, val_df, test_df]:
    df["trt"] = df["trt"].apply(lambda lst: [(t - trt_mean) / trt_std for t in lst])


# --------------------------------
# Dataset
# --------------------------------
class ReadingTimeDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_len: int = 128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): 
        return len(self.df)

    def __getitem__(self, idx):
        words   = self.df.loc[idx, "sentence"].split()
        targets = self.df.loc[idx, "trt"]

        enc = self.tokenizer(words, is_split_into_words=True, truncation=True,
                             padding="max_length", max_length=self.max_len,
                             return_attention_mask=True)

        wp_labels = []
        cur = None
        for w_id in enc.word_ids():
            if w_id is None:
                wp_labels.append(0.0)
            else:
                if cur != w_id: cur = w_id
                wp_labels.append(targets[cur])

        return {
            "ids":     torch.tensor(enc["input_ids"], dtype=torch.long),
            "mask":    torch.tensor(enc["attention_mask"], dtype=torch.long),
            "targets": torch.tensor(wp_labels, dtype=torch.float32)
        }


# --------------------------------
# Loss
# --------------------------------
def masked_mse_loss(preds, targets, word_ids):
    mask = torch.tensor([0 if w_id is None else 1 for w_id in word_ids],
                        dtype=torch.float32, device=preds.device)
    loss = (preds - targets) ** 2
    loss = loss * mask
    return loss.sum() / mask.sum()

def aggregate_token_preds_to_words(token_preds, word_ids):
    word_to_token_preds = {}
    for i, w_id in enumerate(word_ids):
        if w_id is None:
            continue
        word_to_token_preds.setdefault(w_id, []).append(token_preds[i])
    return torch.stack([torch.stack(v).mean() for k, v in sorted(word_to_token_preds.items())])


# --------------------------------
# Models
# --------------------------------
class BertRegression1(nn.Module):
    def __init__(self, model_name="dumitrescustefan/bert-base-romanian-uncased-v1", dropout=0.3):
        super().__init__()
        self.model = BertForTokenClassification.from_pretrained(model_name, num_labels=1, hidden_dropout_prob=dropout)

    def forward(self, ids, mask):
        out = self.model(input_ids=ids, attention_mask=mask, return_dict=True)
        return out.logits.squeeze(-1)


class BertRegressionHead(nn.Module):
    def __init__(self, hidden_size=768, dropout=0.1):
        super().__init__()
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.LayerNorm(hidden_size),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, x, mask):
        logits = self.regressor(x).squeeze(-1)
        return logits * mask

class BertRegression2(nn.Module):
    def __init__(self, bert_model_name="dumitrescustefan/bert-base-romanian-uncased-v1", dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.head = BertRegressionHead(self.bert.config.hidden_size, dropout)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.head(outputs.last_hidden_state, attention_mask)


# --------------------------------
# Train, evaluate and freezing utils
# --------------------------------
def freeze_bert_layers(model, num_layers_to_unfreeze=0):
    """
    Freeze all BERT layers except the last `num_layers_to_unfreeze.
    """
    if hasattr(model, 'model'):
        bert = model.model.bert
    elif hasattr(model, 'bert'):
        bert = model.bert
    else:
        raise AttributeError("Model has no BERT encoder attribute")

    bert_layers = list(bert.encoder.layer)
    total_layers = len(bert_layers)

    for i, layer in enumerate(bert_layers):
        requires_grad = i >= (total_layers - num_layers_to_unfreeze)
        for param in layer.parameters():
            param.requires_grad = requires_grad

    # Embeddings
    for param in bert.embeddings.parameters():
        param.requires_grad = (num_layers_to_unfreeze > 0)


def freeze_all_bert_layers(model):
    if hasattr(model, 'model'):
        bert = model.model.bert
    elif hasattr(model, 'bert'):
        bert = model.bert
    else:
        raise AttributeError("Model has no BERT encoder attribute")
    for param in bert.parameters():
        param.requires_grad = False


def unfreeze_bert_layers(model, num_layers_to_unfreeze):
    if hasattr(model, 'model'):
        bert = model.model.bert
    elif hasattr(model, 'bert'):
        bert = model.bert
    else:
        raise AttributeError("Model has no BERT encoder attribute")

    bert_layers = list(bert.encoder.layer)
    total_layers = len(bert_layers)

    for i, layer in enumerate(bert_layers):
        requires_grad = i >= (total_layers - num_layers_to_unfreeze)
        for param in layer.parameters():
            param.requires_grad = requires_grad

    for param in bert.embeddings.parameters():
        param.requires_grad = (num_layers_to_unfreeze > 0)


def train_word_level(
    model, train_loader, val_loader, tokenizer, device, epochs, model_path,
    initial_unfrozen_layers=0, base_lr=1e-4, weight_decay=1e-4
):
    model.to(device)

    # Freeze all layers except the last `initial_unfrozen_layers`
    freeze_bert_layers(model, initial_unfrozen_layers)

    # Create optimizer and scheduler once at the start
    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=base_lr,
        weight_decay=weight_decay
    )
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=10,
        num_training_steps=len(train_loader) * epochs
    )

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0
        total_train_tokens = 0

        for idx, batch in enumerate(train_loader):
            ids, mask, y = [b.to(device) for b in batch.values()]
            out = model(ids, mask)
            optimizer.zero_grad()

            batch_loss = 0.0
            batch_tokens = 0

            for b in range(ids.size(0)):
                row_idx = idx * train_loader.batch_size + b
                words = train_loader.dataset.df.loc[row_idx, "sentence"].split()
                enc = tokenizer(words, is_split_into_words=True, truncation=True, padding="max_length",
                                max_length=train_loader.dataset.max_len)
                word_ids = enc.word_ids()

                token_preds = out[b]
                token_targets = y[b]

                mask_tensor = torch.tensor([0 if wid is None else 1 for wid in word_ids],
                                           dtype=torch.float32, device=token_preds.device)
                loss = ((token_preds - token_targets) ** 2) * mask_tensor
                batch_loss += loss.sum()
                batch_tokens += mask_tensor.sum().item()

            batch_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_train_loss += batch_loss.item()
            total_train_tokens += batch_tokens

        train_loss = total_train_loss / total_train_tokens
        val_loss, r2, pearson, spearman, accuracy = evaluate_word_level(model, val_loader, tokenizer, device)

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"R2: {r2:.4f}, Pearson: {pearson:.4f}, Spearman: {spearman:.4f}, Accuracy: {accuracy:.2f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_path)
            print("→ Saved new best model.")


def gradual_unfreeze_and_train(
    model, train_dl, val_dl, tokenizer, device, model_path,
    total_epochs=20, unfreeze_every=5, max_unfrozen_layers=None,
    base_lr=1e-4, weight_decay=1e-4
):
    """
    Gradually unfreeze BERT layers every unfreeze_every epochs and train.
    """

    # Determine max layers to unfreeze
    if max_unfrozen_layers is None:
        if hasattr(model, 'model'):
            max_unfrozen_layers = len(list(model.model.bert.encoder.layer))
        elif hasattr(model, 'bert'):
            max_unfrozen_layers = len(list(model.bert.encoder.layer))
        else:
            raise AttributeError("Model has no BERT encoder attribute")

    epochs_per_phase = unfreeze_every
    phases = total_epochs // epochs_per_phase

    unfrozen_layers = 2

    for phase in range(phases):
        print(f"\n=== Training Phase {phase+1}/{phases} with {unfrozen_layers}/{max_unfrozen_layers} BERT layers unfrozen ===\n")
        freeze_bert_layers(model, unfrozen_layers)

        train_word_level(
            model=model,
            train_loader=train_dl,
            val_loader=val_dl,
            tokenizer=tokenizer,
            device=device,
            epochs=epochs_per_phase,
            model_path=model_path,
            initial_unfrozen_layers=unfrozen_layers,
            base_lr=base_lr,
            weight_decay=weight_decay,
        )

        unfrozen_layers = min(unfrozen_layers + 2, max_unfrozen_layers)


def train_head_then_full_model(
    model, train_dl, val_dl, tokenizer, device, model_path_head_only, model_path_full,
    head_only_epochs=5, total_epochs=20, unfreeze_every=5,
    base_lr=1e-4, weight_decay=1e-4
):
    # Phase 1: Freeze BERT completely and train only the head
    print("=== Phase 1: Training head only (BERT frozen) ===")
    freeze_all_bert_layers(model)
    
    train_word_level(
        model=model,
        train_loader=train_dl,
        val_loader=val_dl,
        tokenizer=tokenizer,
        device=device,
        epochs=head_only_epochs,
        model_path=model_path_head_only,
        initial_unfrozen_layers=1,  # BERT layers frozen
        base_lr=base_lr,
        weight_decay=weight_decay
    )
    
    model.load_state_dict(torch.load(model_path_head_only))
    
    # Phase 2: Gradually unfreeze and train full model
    print("\n=== Phase 2: Gradual unfreeze and full model training ===")
    gradual_unfreeze_and_train(
        model,
        train_dl,
        val_dl,
        tokenizer,
        device,
        model_path=model_path_full,
        total_epochs=total_epochs - head_only_epochs,
        unfreeze_every=unfreeze_every,
        base_lr=base_lr,
        weight_decay=weight_decay
    )


def evaluate_word_level(model, val_loader, tokenizer, device):
    model.eval()
    model.to(device)
    losses = []
    preds, golds = [], []

    with torch.no_grad():
        for idx, batch in enumerate(val_loader):
            ids, mask, y = [b.to(device) for b in batch.values()]
            out = model(ids, mask)

            for b in range(ids.size(0)):
                row_idx = idx * val_loader.batch_size + b
                words = val_loader.dataset.df.loc[row_idx, "sentence"].split()
                enc = tokenizer(words, is_split_into_words=True, truncation=True, padding="max_length",
                                max_length=val_loader.dataset.max_len)
                word_ids = enc.word_ids()

                token_preds = out[b]
                token_targets = y[b]
                loss = masked_mse_loss(token_preds, token_targets, word_ids)
                losses.append(loss.item())

                word_preds = aggregate_token_preds_to_words(token_preds.cpu(), word_ids)
                word_targets = aggregate_token_preds_to_words(token_targets.cpu(), word_ids)

                preds.extend(word_preds.tolist())
                golds.extend(word_targets.tolist())

    r2 = r2_score(golds, preds)
    pearson = pearsonr(golds, preds)[0]
    spearman = spearmanr(golds, preds)[0]
    
    golds_np = np.array(golds) * trt_std + trt_mean
    preds_np = np.array(preds) * trt_std + trt_mean
    golds_100 = (golds_np - np.min(golds_np)) / (np.max(golds_np) - np.min(golds_np)) * 100
    preds_100 = (preds_np - np.min(preds_np)) / (np.max(preds_np) - np.min(preds_np)) * 100
    mae = np.abs(golds_100 - preds_100).mean()
    accuracy = 100 - mae

    return np.mean(losses), r2, pearson, spearman, accuracy


# --------------------------------
# Execution
# --------------------------------
MAX_LEN = 256
BATCH = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")

train_dl = DataLoader(ReadingTimeDataset(train_df, tokenizer, MAX_LEN), batch_size=BATCH, shuffle=True)
val_dl   = DataLoader(ReadingTimeDataset(val_df, tokenizer, MAX_LEN), batch_size=BATCH)
test_dl  = DataLoader(ReadingTimeDataset(test_df, tokenizer, MAX_LEN), batch_size=BATCH)

# === Model 1 ===
model1 = BertRegression1("dumitrescustefan/bert-base-romanian-uncased-v1", dropout=0.3)

gradual_unfreeze_and_train(
    model1,
    train_dl,
    val_dl,
    tokenizer,
    device,
    model_path="bert_token_cls_best_1.pt",
    total_epochs=30,
    unfreeze_every=5,
    base_lr=1e-4,
    weight_decay=1e-4
)

model1.load_state_dict(torch.load("bert_token_cls_best_1.pt"))
test_loss, r2, pearson, spearman, accuracy = evaluate_word_level(model1, test_dl, tokenizer, device)
print(f"\nModel 1 Test → Loss: {test_loss:.4f}, R2: {r2:.4f}, Pearson: {pearson:.4f}, Spearman: {spearman:.4f}, Accuracy: {accuracy:.2f}\n")


# === Model 2 ===
model2 = BertRegression2("dumitrescustefan/bert-base-romanian-uncased-v1", dropout=0.3)

train_head_then_full_model(
    model2,
    train_dl,
    val_dl,
    tokenizer,
    device,
    model_path_head_only="bert_regression2_head_only.pt",
    model_path_full="bert_regression2_best.pt",
    head_only_epochs=5,
    total_epochs=35,
    unfreeze_every=5,
    base_lr=1e-4,
    weight_decay=1e-4
)

model2.load_state_dict(torch.load("bert_regression2_best.pt"))
test_loss2, r2_2, pearson_2, spearman_2, accuracy_2 = evaluate_word_level(model2, test_dl, tokenizer, device)
print(f"\nModel 2 Test → Loss: {test_loss2:.4f}, R2: {r2_2:.4f}, Pearson: {pearson_2:.4f}, Spearman: {spearman_2:.4f}, Accuracy: {accuracy_2:.2f}\n")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dumitrescustefan/bert-base-romanian-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Training Phase 1/6 with 2/12 BERT layers unfrozen ===

Epoch 1/5 - Train Loss: 1.0427, Val Loss: 1.9067, R2: 0.1857, Pearson: 0.4820, Spearman: 0.5346, Accuracy: 63.93
→ Saved new best model.
Epoch 2/5 - Train Loss: 0.7994, Val Loss: 1.6463, R2: 0.2958, Pearson: 0.5499, Spearman: 0.6038, Accuracy: 64.66
→ Saved new best model.
Epoch 3/5 - Train Loss: 0.7492, Val Loss: 1.6892, R2: 0.2612, Pearson: 0.5575, Spearman: 0.6125, Accuracy: 63.37
Epoch 4/5 - Train Loss: 0.6083, Val Loss: 1.6613, R2: 0.2862, Pearson: 0.5614, Spearman: 0.6171, Accuracy: 64.09
Epoch 5/5 - Train Loss: 0.5780, Val Loss: 1.6830, R2: 0.2722, Pearson: 0.5607, Spearman: 0.6168, Accuracy: 65.00

=== Training Phase 2/6 with 4/12 BERT layers unfrozen ===

Epoch 1/5 - Train Loss: 0.8105, Val Loss: 2.0158, R2: 0.0803, Pearson: 0.5312, Spearman: 0.6043, Accuracy: 66.49
→ Saved new best model.
Epoch 2/5 - Train Loss: 0.5997, Val Loss: 1.7357, R2: 0.2649, Pearson: 0.5775, Spearman: 0.6352, Accuracy: 65.65
→ Saved new best 

  model1.load_state_dict(torch.load("bert_token_cls_best_1.pt"))



Model 1 Test → Loss: 1.0952, R2: 0.4075, Pearson: 0.6516, Spearman: 0.6847, Accuracy: 85.10

=== Phase 1: Training head only (BERT frozen) ===
Epoch 1/10 - Train Loss: 1.0162, Val Loss: 1.8315, R2: 0.1984, Pearson: 0.5045, Spearman: 0.5389, Accuracy: 64.49
→ Saved new best model.
Epoch 2/10 - Train Loss: 0.7550, Val Loss: 1.4811, R2: 0.2923, Pearson: 0.5535, Spearman: 0.5904, Accuracy: 67.35
→ Saved new best model.
Epoch 3/10 - Train Loss: 0.6203, Val Loss: 1.5578, R2: 0.2439, Pearson: 0.5551, Spearman: 0.5776, Accuracy: 74.78
Epoch 4/10 - Train Loss: 0.4477, Val Loss: 1.5302, R2: 0.2521, Pearson: 0.5823, Spearman: 0.6182, Accuracy: 78.39
Epoch 5/10 - Train Loss: 0.3636, Val Loss: 1.6129, R2: 0.2969, Pearson: 0.5604, Spearman: 0.5963, Accuracy: 77.52
Epoch 6/10 - Train Loss: 0.3063, Val Loss: 1.5574, R2: 0.2954, Pearson: 0.5635, Spearman: 0.6097, Accuracy: 79.25
Epoch 7/10 - Train Loss: 0.2602, Val Loss: 1.5680, R2: 0.2820, Pearson: 0.5582, Spearman: 0.6050, Accuracy: 80.06
Epoch 8/10

  model.load_state_dict(torch.load(model_path_head_only))



=== Phase 2: Gradual unfreeze and full model training ===

=== Training Phase 1/6 with 2/12 BERT layers unfrozen ===

Epoch 1/5 - Train Loss: 0.6132, Val Loss: 1.6034, R2: 0.2386, Pearson: 0.5620, Spearman: 0.5948, Accuracy: 78.00
→ Saved new best model.
Epoch 2/5 - Train Loss: 0.4294, Val Loss: 1.5281, R2: 0.3090, Pearson: 0.5623, Spearman: 0.6034, Accuracy: 75.26
→ Saved new best model.
Epoch 3/5 - Train Loss: 0.4214, Val Loss: 1.6039, R2: 0.2548, Pearson: 0.5426, Spearman: 0.5885, Accuracy: 81.53
Epoch 4/5 - Train Loss: 0.2818, Val Loss: 1.5190, R2: 0.2869, Pearson: 0.5584, Spearman: 0.5994, Accuracy: 79.77
→ Saved new best model.
Epoch 5/5 - Train Loss: 0.2880, Val Loss: 1.5323, R2: 0.2867, Pearson: 0.5588, Spearman: 0.6002, Accuracy: 79.79

=== Training Phase 2/6 with 4/12 BERT layers unfrozen ===

Epoch 1/5 - Train Loss: 0.3102, Val Loss: 1.9042, R2: 0.1086, Pearson: 0.5436, Spearman: 0.5808, Accuracy: 83.11
→ Saved new best model.
Epoch 2/5 - Train Loss: 0.2741, Val Loss: 1.451

  model2.load_state_dict(torch.load("bert_regression2_best.pt"))



Model 2 Test → Loss: 1.1241, R2: 0.3956, Pearson: 0.6524, Spearman: 0.6860, Accuracy: 83.30

