<a href="https://colab.research.google.com/github/amee342/semantic_role_labeling/blob/main/bert_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/amee342/semantic_role_labeling.git

Cloning into 'semantic_role_labeling'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 35 (delta 5), reused 22 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (35/35), 3.78 MiB | 17.50 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [2]:
cd semantic_role_labeling/

/content/semantic_role_labeling


In [3]:
!pip install -q transformers datasets accelerate evaluate seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [4]:
# set up saving repo in drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
model_checkpoint = "distilbert/distilbert-base-uncased"

There is an repository in drive called "SRL" for storing finetuned models

In [6]:
from typing import List
import transformers
from transformers import AutoTokenizer

## Load and Parse CONLLU dataset

In [7]:
def load_conll_sentences(path: str):

    sentences = []
    sent = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:

            line = line.rstrip("\n")

            #  save previous sentence
            # at boundary between 2 sentences
            if line.strip() == "":
                if sent:
                    sentences.append(sent)
                    sent = []
                continue

            # skip comments
            if line.startswith("#"):
                continue

            cols = line.split("\t")
            sent.append(cols)

    if sent:
        sentences.append(sent)

    return sentences

## Preprocessing



In [8]:
def count_sentences_and_tokens(sentences: List):
  """
  Return number of sentences (n_sent)
  and number of tokens from these sentences (n_token)
  """
  n_sent = len(sentences)
  n_token = sum(len(s) for s in sentences)

  return n_sent, n_token





### Replicate each sentence for each predicate

In [9]:
def find_predicate_index(sent,
                           label_col,
                           predicate_markers=("V", "B-V")):
  for i, row in enumerate(sent):
    if len(row) > label_col and row[label_col] in predicate_markers:
      return i
  return None



In [10]:
def replicate_sentences(sentences,
                        base_cols: int=11):

  instances = []
  """
  base_cols: the column with specified predicates
  """
  for sent in sentences:

    # check the maximum columns in specific sentence
    # assume it's consistent per token row
    max_cols = max(len(r) for r in sent)

    # nr of predicate-specific label columns
    k = max(0, max_cols-base_cols)

    if k == 0 :
      # sentence has no predicate
      continue

    # rely on k
    for j in range(k):
      label_col = base_cols + j  # 0-based index

      pred_index = find_predicate_index(sent, label_col)

      # fallback if no V marker found
      if pred_index is None:
        pred_index = next((i for i,r in enumerate(sent) if len(r) > 9 and r[9] not in ("_", "-", "")), None)


      tokens = [r[1] for r in sent] # FORM column

      #labels = [(r[label_col] if len(r) > label_col else "O") for r in sent]
      labels = [
                    "O" if (len(r) <= label_col or r[label_col] == "_")
                    else r[label_col]
                    for r in sent
              ]

      instances.append({
                "tokens": tokens,
                "predicate_index": pred_index,
                "labels": labels,
            })
  return instances



In [11]:
def load_and_preprocess(path:str):
  sentences = load_conll_sentences(path)
  before_s, before_t = count_sentences_and_tokens(sentences)

  instances = replicate_sentences(sentences)
  after_s, after_t = count_sentences_and_tokens(instances)

  return {
        "sentences": sentences,
        "instances": instances,
        "stats": {
            "before_sentences": before_s,
            "before_tokens": before_t,
            "after_instances": after_s,
            "after_tokens": after_t
        }
    }

### Tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



tokenizer.json: 0.00B [00:00, ?B/s]

In [13]:
# check if tokenizer is backed by RUST
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [48]:
labels_all_tokens = True

def tokenize_and_align_labels(example):
  tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)

  word_ids = tokenized_inputs.word_ids()
  previous_word_idx=None
  label_ids=[]

  for word_idx in word_ids:
    if word_idx is None:
      # for special token that is ignored in Pytorch,
      # set as -100
      label_ids.append(-100)
    elif word_idx != previous_word_idx:
      label_ids.append(example["labels"][word_idx])
    else:
      label_ids.append(example["labels"][word_idx]) if labels_all_tokens else label_ids.append(-100)

    previous_word_idx = word_idx
  return label_ids


In [77]:
labels_all_tokens = True  # or False

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    aligned_labels = []

    for i, labels in enumerate(examples["labels_str"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[labels[word_idx]])
            else:
                label_ids.append(label2id[labels[word_idx]] if labels_all_tokens else -100)

            previous_word_idx = word_idx

        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels  # <-- ints + -100
    return tokenized_inputs

In [74]:
# sanity check

dataset = load_and_preprocess("/content/semantic_role_labeling/data/en_ewt-up-test.conllu")
ds = Dataset.from_list(dataset['instances'])
ds = ds.rename_column("labels", "labels_str")


In [75]:
label_list = sorted({l for ex in ds for l in ex["labels_str"]})
label2id = {l:i for i,l in enumerate(label_list)}
id2label = {i:l for l,i in label2id.items()}

In [88]:
tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5338 [00:00<?, ? examples/s]

In [83]:
tokenized_datasets[0]

{'tokens': ['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?'],
 'predicate_index': 3,
 'labels_str': ['O', 'O', 'ARG1', 'V', 'O', 'ARG2', 'O'],
 'input_ids': [101,
  2054,
  2065,
  8224,
  22822,
  8458,
  2098,
  2046,
  8224,
  2891,
  1029,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 35, 35, 2, 45, 45, 45, 35, 4, 4, 35, -100]}

In [85]:
tokenizer.convert_ids_to_tokens(tokenized_datasets[0]["input_ids"])

['[CLS]',
 'what',
 'if',
 'google',
 'mor',
 '##ph',
 '##ed',
 'into',
 'google',
 '##os',
 '?',
 '[SEP]']

In [16]:
print(dataset["instances"][0])

{'tokens': ['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?'], 'predicate_index': 3, 'labels': ['O', 'O', 'ARG1', 'V', 'O', 'ARG2', 'O']}


In [86]:
id2label

{0: '',
 1: 'ARG0',
 2: 'ARG1',
 3: 'ARG1-DSP',
 4: 'ARG2',
 5: 'ARG3',
 6: 'ARG4',
 7: 'ARG5',
 8: 'ARGA',
 9: 'ARGM-ADJ',
 10: 'ARGM-ADV',
 11: 'ARGM-CAU',
 12: 'ARGM-COM',
 13: 'ARGM-CXN',
 14: 'ARGM-DIR',
 15: 'ARGM-DIS',
 16: 'ARGM-EXT',
 17: 'ARGM-GOL',
 18: 'ARGM-LOC',
 19: 'ARGM-LVB',
 20: 'ARGM-MNR',
 21: 'ARGM-MOD',
 22: 'ARGM-NEG',
 23: 'ARGM-PRD',
 24: 'ARGM-PRP',
 25: 'ARGM-PRR',
 26: 'ARGM-TMP',
 27: 'C-ARG0',
 28: 'C-ARG1',
 29: 'C-ARG1-DSP',
 30: 'C-ARG2',
 31: 'C-ARG3',
 32: 'C-ARGM-CXN',
 33: 'C-ARGM-LOC',
 34: 'C-V',
 35: 'O',
 36: 'R-ARG0',
 37: 'R-ARG1',
 38: 'R-ARG2',
 39: 'R-ARGM-ADJ',
 40: 'R-ARGM-ADV',
 41: 'R-ARGM-DIR',
 42: 'R-ARGM-LOC',
 43: 'R-ARGM-MNR',
 44: 'R-ARGM-TMP',
 45: 'V'}

In [40]:
from datasets import Dataset

In [52]:
ds = Dataset.from_list(dataset['instances'])

In [49]:
ds

Dataset({
    features: ['tokens', 'predicate_index', 'labels'],
    num_rows: 5338
})

In [50]:
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/5338 [00:00<?, ? examples/s]

TypeError: Provided `function` which is applied to all elements of table returns a variable of type <class 'list'>. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.