In [1]:
!pip install transformers evaluate datasets accelerate
!pip install pyarrow pandas

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import os
import shutil
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    PreTrainedModel,
    BertConfig,
    BertPreTrainedModel
)
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.model_selection import train_test_split
import evaluate

In [3]:
# --- Configuration ---
# Pretrained checkpoint name; the tokenizer and the encoder weights are both loaded from it.
MODEL_NAME = "bert-base-uncased"

# Local input files: the training table and the JSON label mapping.
DATA_FILE, LABEL_MAP_FILE = "semcor_train.parquet", "label_map.json"

# Local directory the Trainer writes to, and the Drive folder the results are copied into.
OUTPUT_DIR = "./wsd_bert_custom"
DRIVE_PATH = "/content/drive/MyDrive/My_WSD_Project"

In [4]:
# Mount Drive
# Colab-only import; the mount call is skipped when the mount point path already exists.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
if not os.path.exists(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [5]:
# Check GPU
# Prefer CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    # Only query the device name when CUDA is present: the lookup raises on a
    # CPU-only runtime, which would defeat the CPU fallback chosen above.
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
else:
    print("GPU detected: none (running on CPU)")

Using device: cuda
GPU detected: Tesla T4


In [6]:
# --- 2. Load Data & Prepare Mappings ---
print("Loading data...")
df = pd.read_parquet(DATA_FILE)

# Fail fast if the table lacks a column that the dataset class reads for every row;
# otherwise the KeyError only shows up once training starts.
REQUIRED_COLUMNS = ("sentence", "char_start", "char_end", "label_id")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DATA_FILE} is missing required columns: {missing_columns}")

# Load label mapping (read explicitly as UTF-8 instead of the platform default encoding)
with open(LABEL_MAP_FILE, 'r', encoding='utf-8') as f:
    label2id = json.load(f)

# The classifier gets one output unit per entry of the mapping.
NUM_LABELS = len(label2id)
print(f"Total unique senses (classes): {NUM_LABELS}")

# A label id outside [0, NUM_LABELS) would only surface later as an opaque
# out-of-range failure inside the loss, so check the upper bound here.
max_label_id = df['label_id'].max()
if max_label_id >= NUM_LABELS:
    raise ValueError(
        f"label_id {max_label_id} in {DATA_FILE} is out of range for "
        f"{NUM_LABELS} classes defined in {LABEL_MAP_FILE}"
    )

# Split Data: 90% train / 10% validation, with a fixed seed for a repeatable split.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=111)

Loading data...
Total unique senses (classes): 25819


In [7]:
# --- 3. Custom Dataset with Offset Mapping ---
class WSDDataset(Dataset):
    """Map-style dataset producing one tokenized sentence per annotated target word.

    Every row of ``dataframe`` must provide:
      * ``sentence``   - the raw text,
      * ``char_start`` - character offset where the target word begins,
      * ``char_end``   - character offset where the target word ends
        (treated as exclusive here - TODO confirm against the data preparation step),
      * ``label_id``   - integer class id of the word's sense.

    Args:
        dataframe: Table with the columns above. Its index is reset so that
            positional lookups line up with ``__getitem__`` indices.
        tokenizer: Callable tokenizer that honours ``return_offsets_mapping=True``
            and ``return_tensors="pt"``.
        max_len: Every sequence is truncated/padded to exactly this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of annotated target words (rows)."""
        return len(self.data)

    @staticmethod
    def _locate_target_token(offsets, c_start, c_end):
        """Return the index of the token that represents the target word.

        Args:
            offsets: List of ``(start, end)`` character offsets, one per token.
            c_start: Character offset where the target word begins.
            c_end: Character offset where the target word ends.

        Returns:
            The token index, or 0 (the first position) when no token can be
            aligned, e.g. because the word was cut off by truncation.
        """
        # Pass 1: the token that starts exactly at the word, or that starts
        # earlier but still covers the word's first character.
        for i, (o_start, o_end) in enumerate(offsets):
            if o_start == 0 and o_end == 0:
                continue  # Skip [CLS], [PAD] and other (0, 0) entries
            if o_start == c_start or (o_start < c_start < o_end):
                return i

        # Pass 2 (only when pass 1 found nothing): c_start fell between tokens,
        # e.g. on whitespace, so take the first token that begins inside the
        # word's span instead of silently falling back to position 0.
        for i, (o_start, o_end) in enumerate(offsets):
            if o_start == 0 and o_end == 0:
                continue
            if c_start < o_start < c_end:
                return i

        return 0

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and locate its target token.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_len``), ``target_token_idx`` (scalar long tensor) and
            ``labels`` (scalar long tensor).
        """
        row = self.data.iloc[idx]
        sentence = row['sentence']

        # Character boundaries of the target word inside the sentence
        c_start = row['char_start']
        c_end = row['char_end']

        # Tokenization with offset mapping, so tokens can be aligned to characters
        encoding = self.tokenizer(
            sentence,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        # Drop only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse any other singleton dimension.
        offsets = encoding['offset_mapping'][0].tolist()
        target_token_idx = self._locate_target_token(offsets, c_start, c_end)

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'target_token_idx': torch.tensor(target_token_idx, dtype=torch.long),
            'labels': torch.tensor(row['label_id'], dtype=torch.long)
        }

# Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap the train and validation frames with the same tokenizer (default max_len).
print("Creating datasets...")
train_dataset, val_dataset = (
    WSDDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Creating datasets...


In [8]:
# --- 4. Custom Model Architecture ---
class BertForWSD(BertPreTrainedModel):
    """BERT encoder with a linear head that classifies one selected token per sequence.

    Instead of classifying from a pooled sentence representation, the head reads
    the final hidden state at ``target_token_idx`` (the position of the word to
    disambiguate) and maps it to ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder is built from the config only; pretrained weights are filled in
        # afterwards when the model is created through from_pretrained().
        self.bert = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, target_token_idx=None, labels=None, **kwargs):
        """Encode the batch and classify the hidden state of each target token.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder.
            target_token_idx: One token position per sequence, shape ``(batch,)``
                (a trailing singleton dimension is flattened away).
            labels: Optional class ids, shape ``(batch,)``; when given, the
                cross-entropy loss is computed.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            SequenceClassifierOutput with ``logits`` of shape
            ``(batch, num_labels)`` and ``loss`` (``None`` without ``labels``).

        Raises:
            ValueError: If ``target_token_idx`` is not provided.
        """
        # Indexing with None would silently insert a new axis and yield logits of
        # the wrong shape, so reject a missing index explicitly.
        if target_token_idx is None:
            raise ValueError("target_token_idx is required to select the token to classify")

        # Remove the num_items_in_batch argument that Trainer sends but BertModel does not accept.
        kwargs.pop("num_items_in_batch", None)

        # 1. Run BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        sequence_output = outputs.last_hidden_state

        # 2. Extract specific vector.
        # Batch size and device come from the encoder output rather than from
        # input_ids, so the lookup also works when input_ids is None
        # (e.g. inputs_embeds passed through kwargs).
        batch_indices = torch.arange(sequence_output.shape[0], device=sequence_output.device)
        # Flatten to (batch,): an index shaped (batch, 1) would otherwise
        # broadcast against batch_indices and select batch x batch vectors.
        token_positions = target_token_idx.reshape(-1).to(sequence_output.device)
        target_vectors = sequence_output[batch_indices, token_positions]

        # 3. Classification
        target_vectors = self.dropout(target_vectors)
        logits = self.classifier(target_vectors)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Load the pretrained checkpoint into the custom architecture; the head is sized
# to one output per class in the label mapping.
print("Initializing Custom Model...")
model = BertForWSD.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

Initializing Custom Model...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForWSD were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# --- 5. Training Setup ---
metric = evaluate.load("accuracy")


def preprocess_logits_for_metrics(logits, labels):
    """Reduce a batch of logits to predicted class ids before the Trainer stores them.

    The Trainer keeps the outputs of every evaluation batch until
    ``compute_metrics`` runs. With tens of thousands of classes the full logits
    of the validation set take gigabytes, while the argmax ids take one integer
    per example and are all that accuracy needs.

    Args:
        logits: Model logits for one batch (or a tuple whose first item is the logits).
        labels: Labels for the batch (unused; part of the hook's signature).

    Returns:
        Tensor of predicted class ids, one per example.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute accuracy from either predicted class ids or raw logits.

    Args:
        eval_pred: Pair ``(predictions, labels)`` supplied by the Trainer.

    Returns:
        The dict produced by the accuracy metric.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # Still accept raw (n_examples, n_classes) logits so the function keeps
    # working when it is used without the preprocessing hook.
    if predictions.ndim > 1:
        predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",         # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
    dataloader_num_workers=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
# --- 6. Train ---
# Runs the fine-tuning loop with the trainer built in the setup cell.
# The call is deliberately the cell's last expression so the notebook displays
# the returned TrainOutput (global step, training loss, runtime metrics).
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,3.1467,3.01556,0.600303
2,2.1797,2.338993,0.659042
3,1.6176,2.096758,0.689213
4,1.1577,2.002292,0.704254
5,0.9121,1.989069,0.710662


TrainOutput(global_step=63205, training_loss=2.178926537514261, metrics={'train_runtime': 8162.8512, 'train_samples_per_second': 123.881, 'train_steps_per_second': 7.743, 'total_flos': 8.193417887359488e+16, 'train_loss': 2.178926537514261, 'epoch': 5.0})

In [11]:
# --- 7. Save to Drive ---
# Write the model and the tokenizer files into the same local directory.
print("Saving model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Copying to Drive: {DRIVE_PATH}")
target_path = os.path.join(DRIVE_PATH, "bert_wsd_custom")

# copytree refuses to write into an existing directory, so an earlier export
# is removed first and then replaced wholesale.
if os.path.exists(target_path):
    shutil.rmtree(target_path)
shutil.copytree(OUTPUT_DIR, target_path)

# Keep the label mapping next to the model so predictions can be decoded later.
label_map_destination = os.path.join(DRIVE_PATH, "label_map.json")
shutil.copy(LABEL_MAP_FILE, label_map_destination)
print("Done!")

Saving model...
Copying to Drive: /content/drive/MyDrive/My_WSD_Project
Done!
