In [1]:
! git clone https://github.com/amee342/semantic_role_labeling.git

Cloning into 'semantic_role_labeling'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 53 (delta 17), reused 21 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (53/53), 3.80 MiB | 9.02 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [2]:
cd semantic_role_labeling/

/content/semantic_role_labeling


In [3]:
!pip install -q transformers datasets accelerate evaluate seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m834.0 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [4]:
# set up saving repo in drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Argument setting
# Hugging Face Hub id of the pretrained checkpoint; used to load both the tokenizer and the model.
model_checkpoint = "distilbert/distilbert-base-uncased"
# Per-device batch size; passed to TrainingArguments for both training and evaluation.
batch_size = 16
# Task tag; used to build the "<model>-finetuned-<task>" name given to TrainingArguments.
task = "SRL"
# Number of training epochs; passed to TrainingArguments as num_train_epochs.
training_epoch = 1

A folder named "SRL" in Google Drive is used to store the fine-tuned models.

In [6]:
from typing import List
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, set_seed
from transformers import DataCollatorForTokenClassification
from evaluate import load
from datasets import Dataset

In [7]:
# Set random seed!
# set_seed is imported from transformers; the same SEED is also passed to TrainingArguments(seed=...).

SEED = 0
set_seed(SEED)

## Load and parse the CoNLL-U dataset

In [8]:
def load_conll_sentences(path: str):
    """Read a tab-separated CoNLL-style file into a list of sentences.

    Blank (whitespace-only) lines separate sentences, and lines starting
    with "#" are skipped as comments. Every other line is split on tabs
    and stored as one token row.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, where each sentence is a list of rows and each
        row is the list of tab-separated column strings of one line.
    """
    collected = []
    current = []

    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            text = raw.rstrip("\n")

            if not text.strip():
                # Sentence boundary: flush the rows gathered so far (if any).
                if current:
                    collected.append(current)
                    current = []
            elif not text.startswith("#"):
                # Regular token line (comment lines fall through untouched).
                current.append(text.split("\t"))

    # The file may end without a trailing blank line.
    if current:
        collected.append(current)

    return collected

## Preprocessing



In [9]:
def count_sentences_and_tokens(sentences: List):
  """
  Return number of sentences (n_sent)
  and number of tokens from these sentences (n_token).

  Each item may be either a list of token rows (as produced by
  load_conll_sentences) or an instance dict carrying a "tokens" list
  (as produced by replicate_sentences). For a dict, the length of its
  "tokens" list is counted; len() of the dict itself would count its
  keys rather than its tokens.
  """
  def _token_count(item):
    # Instance dicts keep their tokens under the "tokens" key.
    if isinstance(item, dict) and "tokens" in item:
      return len(item["tokens"])
    return len(item)

  n_sent = len(sentences)
  n_token = sum(_token_count(s) for s in sentences)

  return n_sent, n_token





### Replicate each sentence for each predicate

In [10]:
def find_predicate_index(sent,
                           label_col,
                           predicate_markers=("V", "B-V")):
  """
  Return the position of the first row in `sent` whose value in column
  `label_col` is one of `predicate_markers`.

  Rows that are too short to have a `label_col` column are skipped.
  Returns None when no row matches.
  """
  matches = (
      position
      for position, fields in enumerate(sent)
      if label_col < len(fields) and fields[label_col] in predicate_markers
  )
  return next(matches, None)



In [11]:
def replicate_sentences(sentences,
                        base_cols: int=11):
  """
  Expand each sentence into one instance per predicate-specific label column.

  Every column at 0-based index >= base_cols is treated as the label column
  of one predicate. For each such column an instance dict is produced with:
    "tokens":          column 1 (FORM) of every row,
    "predicate_index": row position of the predicate (may be None),
    "labels":          that column's value per row, with "_" and missing
                       cells mapped to "O".

  base_cols: 0-based index of the first predicate-specific label column,
             i.e. the number of leading columns that are not label columns.

  Sentences with no column beyond base_cols, and empty sentences, produce
  no instances.
  """
  instances = []

  for sent in sentences:

    # An empty sentence has no rows to inspect; max() over it would raise.
    if not sent:
      continue

    # check the maximum columns in specific sentence
    # assume it's consistent per token row
    max_cols = max(len(r) for r in sent)

    # nr of predicate-specific label columns
    k = max(0, max_cols-base_cols)

    if k == 0 :
      # sentence has no predicate
      continue

    # rely on k
    for j in range(k):
      label_col = base_cols + j  # 0-based index

      pred_index = find_predicate_index(sent, label_col)

      # fallback if no V marker found: first row whose column 9 holds a
      # non-placeholder value.
      # NOTE(review): this returns the same row for every j, and column 9 is
      # presumably the predicate/sense column of the input format - confirm.
      if pred_index is None:
        pred_index = next((i for i,r in enumerate(sent) if len(r) > 9 and r[9] not in ("_", "-", "")), None)

      tokens = [r[1] for r in sent] # FORM column

      labels = [
                    "O" if (len(r) <= label_col or r[label_col] == "_")
                    else r[label_col]
                    for r in sent
              ]

      instances.append({
                "tokens": tokens,
                "predicate_index": pred_index,
                "labels": labels,
            })
  return instances



In [12]:
def load_and_preprocess(path:str):
  """
  Load a CoNLL-style file and expand it into per-predicate instances.

  Returns a dict with:
    "sentences": the parsed sentences from load_conll_sentences,
    "instances": the per-predicate instance dicts from replicate_sentences,
    "stats":     sentence/token counts before replication and
                 instance/token counts after replication.
  """
  sentences = load_conll_sentences(path)
  before_s, before_t = count_sentences_and_tokens(sentences)

  instances = replicate_sentences(sentences)
  # Instances are dicts, so len(instance) would count their keys; the token
  # count has to come from each instance's "tokens" list.
  after_s = len(instances)
  after_t = sum(len(inst["tokens"]) for inst in instances)

  return {
        "sentences": sentences,
        "instances": instances,
        "stats": {
            "before_sentences": before_s,
            "before_tokens": before_t,
            "after_instances": after_s,
            "after_tokens": after_t
        }
    }

### Tokenizer

In [13]:
# Load the tokenizer that belongs to the configured pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [14]:
# Check that the tokenizer is the fast (Rust-backed) implementation.
# The alignment code below calls word_ids() / sequence_ids() on the tokenizer
# output, which presumably requires a fast tokenizer - confirm in the transformers docs.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

### Add predicate encoding

In [15]:
def make_predicate_query(tokens, pred_idx, window=1):
    """Return the predicate token together with up to `window` neighbours on each side.

    The slice is clamped to the bounds of `tokens`, so a predicate near the
    start or end of the sentence yields a shorter query.
    """
    left = pred_idx - window
    if left < 0:
        left = 0

    right = pred_idx + window + 1
    if right > len(tokens):
        right = len(tokens)

    return tokens[left:right]  # list[str]

In [149]:
# NOTE(review): a later cell assigns labels_all_tokens and defines
# tokenize_and_align_labels again, so on a top-to-bottom run this version
# (which does not encode the predicate) is shadowed.
labels_all_tokens = True  # or False

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Reads examples["tokens"] and examples["labels_str"], and relies on the
    module-level `tokenizer`, `label2id` and `labels_all_tokens`.
    Positions without a word id get -100; the first sub-token of each word
    gets the word's label id; later sub-tokens get the label id only when
    `labels_all_tokens` is true, otherwise -100.

    Returns the tokenizer output with an added "labels" entry.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    batch_label_ids = []

    for row, gold in enumerate(examples["labels_str"]):
        ids = encoding.word_ids(batch_index=row)
        row_label_ids = []

        for pos, wid in enumerate(ids):
            # True when this sub-token belongs to the same word as the one before it.
            continues_word = pos > 0 and wid == ids[pos - 1]

            if wid is None or (continues_word and not labels_all_tokens):
                row_label_ids.append(-100)
            else:
                row_label_ids.append(label2id[gold[wid]])

        batch_label_ids.append(row_label_ids)

    encoding["labels"] = batch_label_ids  # <-- ints + -100
    return encoding

In [16]:
labels_all_tokens = False  # usually False is safest

def tokenize_and_align_labels(examples):
    """Tokenize (sentence, predicate query) pairs and align labels to sub-tokens.

    Reads examples["tokens"], examples["labels_str"] and
    examples["predicate_index"], and relies on the module-level `tokenizer`,
    `label2id`, `labels_all_tokens` and `make_predicate_query`.

    The sentence is passed as the first sequence and the predicate query
    (predicate plus a one-token window) as the second. Label ids are:
      * -100 for positions without a word id and for every position that is
        not in the first sequence,
      * -100 for every sub-token of the predicate word,
      * the word's label id for the first sub-token of any other word,
      * for later sub-tokens, the label id only if `labels_all_tokens` is
        true, otherwise -100.

    Returns the tokenizer output with an added "labels" entry.
    """

    sent_batch = examples["tokens"]               # list[list[str]]
    labels_batch = examples["labels_str"]         # <-- use string labels
    pred_idx_batch = examples["predicate_index"]  # list[int]

    # Build predicate query (Sequence B)
    query_batch = [
        make_predicate_query(tokens, pred_idx, window=1)
        for tokens, pred_idx in zip(sent_batch, pred_idx_batch)
    ]

    # Tokenize sentence pairs
    tokenized = tokenizer(
        sent_batch,
        query_batch,
        is_split_into_words=True,
        truncation=True,
    )

    all_label_ids = []

    for i, labels in enumerate(labels_batch):

        word_ids = tokenized.word_ids(batch_index=i)
        seq_ids = tokenized.sequence_ids(i)
        pred_idx = pred_idx_batch[i]

        prev_word = None
        label_ids = []

        for word_idx, seq_id in zip(word_ids, seq_ids):

            if word_idx is None or seq_id != 0:
                # Positions without a word id, and everything in sequence B
                label_id = -100

            elif word_idx == pred_idx:
                # Ignore the predicate token: mask ALL of its sub-tokens, so
                # that labels_all_tokens=True does not label its continuation
                # pieces while the first piece is ignored.
                label_id = -100

            elif word_idx != prev_word or labels_all_tokens:
                # First sub-token of a word, or any sub-token when all
                # sub-tokens are to be labelled
                label_id = label2id[labels[word_idx]]

            else:
                # Subsequent sub-token, not labelled
                label_id = -100

            label_ids.append(label_id)
            prev_word = word_idx

        all_label_ids.append(label_ids)

    tokenized["labels"] = all_label_ids
    return tokenized

In [17]:
# sanity check
# NOTE(review): the file loaded here is the *test* split (en_ewt-up-test.conllu);
# the label maps and the tokenized dataset below are built from it - confirm this is intended.

dataset = load_and_preprocess("/content/semantic_role_labeling/data/en_ewt-up-test.conllu")
# One Dataset row per (sentence, predicate) instance.
ds = Dataset.from_list(dataset['instances'])
# Drop instances for which no predicate position could be determined.
ds = ds.filter(lambda ex: ex["predicate_index"] is not None)
# tokenize_and_align_labels reads the string labels from "labels_str" and
# writes the integer label ids into "labels".
ds = ds.rename_column("labels", "labels_str")


Filter:   0%|          | 0/5338 [00:00<?, ? examples/s]

In [18]:
# Distinct label strings found in `ds`, sorted so the id assignment is deterministic.
label_list = sorted({l for ex in ds for l in ex["labels_str"]})
# Lookup tables: label string -> integer id, and the reverse.
label2id = {l:i for i,l in enumerate(label_list)}
id2label = {i:l for l,i in label2id.items()}

In [19]:
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/5083 [00:00<?, ? examples/s]

### Inspect tokenized sentences

In [35]:
def print_tokenized_sentence(tokenized_ds, index:list[int] = None):
  """
  Print the sub-token strings of the selected tokenized instances.

  tokenized_ds: indexable collection whose items have an 'input_ids' entry.
  index: positions to print (any iterable of ints, e.g. a range);
         None (the default) prints every instance in tokenized_ds.

  Uses the module-level `tokenizer` to map ids back to token strings.
  """
  # The default used to be iterated directly, which raised a TypeError.
  if index is None:
    index = range(len(tokenized_ds))

  for i in index:
    print(f"Instance {i}:" +" ".join(tokenizer.convert_ids_to_tokens(tokenized_ds[i]['input_ids'])))


In [33]:
print_tokenized_sentence(tokenized_ds, range(0,20))

Instance 0:[CLS] what if google mor ##ph ##ed into google ##os ? [SEP] google mor ##ph ##ed into [SEP]
Instance 1:[CLS] what if google expanded on its search - engine ( and now e - mail ) ware ##s into a full - fledged operating system ? [SEP] google expanded on [SEP]
Instance 2:[CLS] [ via microsoft watch from mary jo foley ] [SEP] [ via [SEP]
Instance 3:[CLS] ( and , by the way , is anybody else just a little nos ##tal ##gic for the days when that was a good thing ? ) [SEP] the way , [SEP]
Instance 4:[CLS] ( and , by the way , is anybody else just a little nos ##tal ##gic for the days when that was a good thing ? ) [SEP] , is anybody [SEP]
Instance 5:[CLS] ( and , by the way , is anybody else just a little nos ##tal ##gic for the days when that was a good thing ? ) [SEP] that was a [SEP]
Instance 6:[CLS] this buzz ##mac ##hine post argues that google ' s rush toward u ##bi ##qui ##ty might back ##fire - - which we ' ve all heard before , but it ' s particularly well - put in this pos

## Fine Tune Model

In [36]:
# Load the checkpoint for token classification with one output class per SRL label.
# NOTE(review): only num_labels is passed; the id2label / label2id maps are not given to the model here.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# Resize the token embeddings to the tokenizer's vocabulary size; the value returned by
# this call is what the cell displays.
model.resize_token_embeddings(len(tokenizer))



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForTokenClassification LOAD REPORT from: distilbert/distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Embedding(30522, 768, padding_idx=0)

In [115]:
# Short model name: the last path component of the checkpoint id.
model_name = model_checkpoint.split("/")[-1]
# Training configuration; batch size, epoch count and seed come from the config cells above.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",  # first positional argument; presumably the output directory - confirm
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=training_epoch,
    weight_decay=0.01,
    seed=SEED,
    report_to="none",
)

In [None]:
# batch dataset
data_collator = DataCollatorForTokenClassification(tokenizer)

In [110]:
# load metric
metric = load("seqeval")

Downloading builder script: 0.00B [00:00, ?B/s]

In [139]:
# NOTE(review): `tokenized_datasets` is not defined in any visible cell (the mapped
# dataset above is named `tokenized_ds`), so this cell likely fails on a fresh
# Restart & Run All. The recorded output also shows [PRED] / [/PRED] marker tokens,
# which the current tokenization code does not insert - presumably left over from
# an earlier experiment.
ex = tokenized_datasets[0]
print(ex["input_ids"][:30])
print(tokenizer.convert_ids_to_tokens(ex["input_ids"][:30]))

[101, 2054, 2065, 8224, 30522, 22822, 8458, 2098, 30523, 2046, 8224, 2891, 1029, 102]
['[CLS]', 'what', 'if', 'google', '[PRED]', 'mor', '##ph', '##ed', '[/PRED]', 'into', 'google', '##os', '?', '[SEP]']


In [None]:
tokenized_datasets[0]

In [None]:
tokenizer.convert_ids_to_tokens(tokenized_datasets[0]["input_ids"])

In [None]:
print(dataset["instances"][0])

In [None]:
id2label

In [None]:
from datasets import Dataset

In [None]:
# NOTE(review): rebuilds `ds` without the predicate_index filter and the
# "labels" -> "labels_str" rename applied in the sanity-check cell, while
# tokenize_and_align_labels reads examples["labels_str"] - looks like a stale
# scratch cell; confirm before running top to bottom.
ds = Dataset.from_list(dataset['instances'])

In [None]:
ds

In [None]:
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)


In [39]:
# Deep clean widgets
import json
from copy import deepcopy

def clean_notebook_widgets(in_path, out_path=None):
    """Remove ipywidgets state from a notebook file and write the result.

    Widget-related entries are removed from the notebook-level metadata,
    from each cell's metadata, and from each output's metadata and MIME
    data. Outputs that have no "metadata" or "data" field (for example
    stream and error outputs) are left without those fields, so that no
    keys are added that the notebook format does not define for them.

    Args:
        in_path: Path of the notebook (.ipynb JSON) to read.
        out_path: Path to write the cleaned notebook to. When omitted or
            empty, `in_path` is overwritten.

    Returns:
        The path the cleaned notebook was written to.
    """
    widget_meta_keys = ["widgets", "widget", "jupyter_widgets", "ipywidgets"]
    widget_mime_types = (
        "application/vnd.jupyter.widget-view+json",
        "application/vnd.jupyter.widget-state+json",
    )

    def _drop_widget_keys(mapping):
        # Delete, in place, every key whose name contains "widget" (case-insensitive).
        for key in [k for k in mapping if "widget" in k.lower()]:
            del mapping[key]

    out_path = out_path or in_path
    with open(in_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    # 1) Top-level metadata: only the exact, well-known widget keys are removed.
    nb_meta = nb.get("metadata", {})
    for k in widget_meta_keys:
        nb_meta.pop(k, None)
    nb["metadata"] = nb_meta

    # 2) Per-cell metadata and outputs
    for cell in nb.get("cells", []):
        cell_meta = cell.get("metadata", {})
        # The substring match also covers the four well-known keys above.
        _drop_widget_keys(cell_meta)
        cell["metadata"] = cell_meta

        for out in cell.get("outputs", []):
            # Clean only what is already there; the dicts are edited in place,
            # so nothing is written back and no new keys appear on the output.
            out_meta = out.get("metadata")
            if isinstance(out_meta, dict):
                _drop_widget_keys(out_meta)

            # Some widget outputs embed data that triggers nbconvert widget parsing
            data = out.get("data")
            if isinstance(data, dict):
                for mime in widget_mime_types:
                    data.pop(mime, None)

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(nb, f, ensure_ascii=False, indent=1)

    return out_path

# ---- use it ----
# No out_path is given, so the notebook file at nb_path is overwritten in place.
nb_path = "/content/semantic_role_labeling/bert_finetuning.ipynb"   # <-- change this
clean_notebook_widgets(nb_path)
print("Deep-cleaned:", nb_path)

Deep-cleaned: /content/semantic_role_labeling/bert_finetuning.ipynb
