In [None]:
# Versions pinned to the ones recorded in this notebook's last install log, so a
# fresh runtime reproduces the same environment. `load_metric` (used further down)
# is deprecated and no longer shipped by newer `datasets` releases.
# `%pip` installs into the environment of the running kernel.
%pip install datasets==2.7.1 transformers==4.25.1 seqeval==1.2.2


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 5.3 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 57.3 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 50.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 23.5 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 38.5 MB/s 
[?2

In [None]:
from pathlib import Path
from argparse import Namespace
from typing import Union, List
from fastprogress import progress_bar
from typing_extensions import TypedDict
import torch
from datasets import Dataset, DatasetDict, load_metric
import numpy as np
from transformers import DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer



In [None]:
# Colab only: mount Google Drive at /content/drive; the data and result paths
# configured below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Input data and result folders on the mounted Google Drive.
data_dir = Path("/content/drive/MyDrive/NLP 2022/nombank_train_dev_test/percentage")
data_dir2 = Path("/content/drive/MyDrive/NLP 2022/Results")

# Generic run options kept in one namespace.
args = Namespace(batch_size=16, num_workers=4)

# The three data splits all live in the same input folder.
train_data_path, valid_data_path, test_data_path = (
    data_dir / split_file for split_file in ('train.data', 'dev.data', 'test.data')
)

# File the test-set predictions are written to.
outputFile = data_dir2 / 'BertBasePBPD'

In [None]:
# Placeholder words in the data files and the literal text they stand for.
SYMBOL_DICT = {"COMMA": ","}

# Role labels; a label's position in this list is its class id.
LABEL_LIST = [
    "NONE",
    "PRED",
    "ARG1",
    "SUPPORT",
]

# Accepted part-of-speech tags; a tag's position in this list is its id.
POS_LIST = [
    "CC", "CD", "DT", "FW", "IN",
    "JJ", "JJR", "JJS",
    "LS", "MD",
    "NN", "NNS", "NNP", "NNPS",
    "PDT", "POS", "PRP", "PRP$",
    "RB", "RBR", "RBS",
    "SYM", "TO", "UH",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
    "WDT", "WP", "WP$", "WRB",
    "PU", "EX", "RP",
]

# Punctuation-like POS tags that are all collapsed to the single "PU" tag.
# (Despite the name, parse_input applies this mapping to the POS column.)
BIO_TAG_CONVERSION_DICT = dict.fromkeys(
    (".", ",", "COMMA", "$", ":", "(", ")", "``", "''", "#"),
    "PU",
)

# Accepted BIO chunk tags; a tag's position in this list is its id.
BIO_TAG_LIST = [
    "O",
    "B-NP", "I-NP",
    "B-VP", "I-VP",
    "B-PP", "I-PP",
    "B-ADJP", "I-ADJP",
    "B-ADVP", "I-ADVP",
    "B-SBAR", "I-SBAR",
    "B-PRT", "I-PRT",
    "B-CONJP", "I-CONJP",
    "B-UCP", "I-UCP",
]


In [None]:
class Word(TypedDict):
    """A single token of a parsed sentence together with its annotations."""

    # Token text (placeholders such as "COMMA" already replaced by parse_input).
    word: str
    # Part-of-speech tag, one of POS_LIST after normalisation.
    pos: str
    # BIO chunk tag, one of BIO_TAG_LIST after normalisation.
    biotag: str
    # Role label, or None when the labels were dropped while parsing.
    label: Union[str, None]
    # Token position minus the position of the sentence's PRED token.
    distance: int

def parse_input(input_file: Union[str, Path], drop_label = False) -> List[Union[List[Word], None]]:
    """
    Parses the input file and returns a list of lists of words.

    Every data line holds at least five tab-separated columns: word, POS tag,
    BIO chunk tag, token position and a sentence key (presumably the sentence
    index -- confirm against the data format), optionally followed by a role
    label.  Any line with fewer than five columns ends the current sentence.

    Unknown POS tags, BIO tags and labels are reported with a warning and
    replaced by "PU", "O" and "NONE" respectively.  Each word's ``distance``
    is its token position minus the position of the PRED token that shares
    its sentence key.

    Args:
        input_file: Path of the tab-separated data file.
        drop_label: When true, every word's ``label`` is set to ``None``
            (the PRED rows are still used to compute distances).

    Returns:
        The non-empty sentences of the file, each as a list of ``Word``.

    Raises:
        ValueError: If a word belongs to a sentence key for which the file
            contains no PRED row, or a numeric column is not an integer.
    """
    # The file is read once; both passes below work on the same lines.
    with open(input_file, "r") as fp:
        lines = fp.readlines()

    print("Parsing input file lines...")

    # Pass 1: position of the predicate for each sentence key.  Keying by the
    # sentence column (instead of appending to a positional list) keeps the
    # lookup correct even if a sentence has no PRED row or more than one;
    # the first PRED of a sentence wins.
    pred_position = {}
    for line in lines:
        word_info = line.strip().split("\t")
        if len(word_info) >= 6 and word_info[5].strip() == "PRED":
            pred_position.setdefault(int(word_info[4]), int(word_info[3].strip()))

    # Pass 2: build the sentences.
    sentences: List[Union[List[Word], None]] = []
    last_sentence: List[Word] = []
    for line_no, line in enumerate(progress_bar(lines), start=1):
        word_info = line.strip().split("\t")

        if len(word_info) < 5:
            # Separator line: close the sentence collected so far.
            if len(last_sentence) > 0:
                sentences.append(last_sentence)
            last_sentence = []
            continue

        word_str = word_info[0].strip()
        word_str = SYMBOL_DICT.get(word_str, word_str)

        pos = word_info[1].strip()
        pos = BIO_TAG_CONVERSION_DICT.get(pos, pos)
        if pos not in POS_LIST:
            print(f"Warning: invalid POS on line {line_no} \"{pos}\", treated as PU.")
            pos = "PU"

        biotag = word_info[2].strip()
        if biotag not in BIO_TAG_LIST:
            print(f"Warning: invalid bio tag on line {line_no} \"{biotag}\", treated as O.")
            biotag = "O"

        label = word_info[5].strip() if len(word_info) >= 6 else "NONE"
        if label not in LABEL_LIST:
            print(f"Warning: invalid label on line {line_no} \"{label}\", treated as NONE.")
            label = "NONE"

        sentence_key = int(word_info[4])
        if sentence_key not in pred_position:
            raise ValueError(
                f"No PRED token found for sentence {sentence_key} (line {line_no})."
            )
        distance = int(word_info[3].strip()) - pred_position[sentence_key]

        if drop_label:
            label = None
        last_sentence.append(
            Word(word=word_str, pos=pos, biotag=biotag, label=label, distance=distance)
        )

    # The file may not end with a separator line.
    if len(last_sentence) > 0:
        sentences.append(last_sentence)
    return sentences


In [None]:
# Load the pretrained uncased BERT tokenizer and a token-classification model
# built on the same checkpoint, with one output class per entry of LABEL_LIST.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(LABEL_LIST))


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
def build_dataset_from_sentences(sentences, drop_label = False):
    """
    Turn parsed sentences into a column-oriented ``Dataset``.

    Every sentence contributes one row.  POS tags, BIO tags and (unless
    ``drop_label`` is set) role labels are stored as their positions in
    POS_LIST, BIO_TAG_LIST and LABEL_LIST; words and distances are stored
    unchanged.

    Args:
        sentences: Iterable of sentences, each a list of ``Word`` dicts.
        drop_label: When true, no "partitive_roles" column is produced and the
            words' ``label`` entries are not read.

    Returns:
        A ``Dataset`` with the columns "tokens", optionally "partitive_roles",
        "pos_tags", "bio_tags" and "distance_tags".
    """
    with_labels = not drop_label

    # Insertion order fixes the column order of the resulting dataset.
    columns = {"tokens": []}
    if with_labels:
        columns["partitive_roles"] = []
    columns["pos_tags"] = []
    columns["bio_tags"] = []
    columns["distance_tags"] = []

    for words in sentences:
        columns["tokens"].append([entry['word'] for entry in words])
        if with_labels:
            columns["partitive_roles"].append(
                [LABEL_LIST.index(entry['label']) for entry in words]
            )
        columns["pos_tags"].append([POS_LIST.index(entry['pos']) for entry in words])
        columns["bio_tags"].append([BIO_TAG_LIST.index(entry['biotag']) for entry in words])
        columns["distance_tags"].append([entry['distance'] for entry in words])

    return Dataset.from_dict(columns)

In [None]:
# Parse the three splits. The test split is parsed with drop_label=True, so its
# words carry label=None.
train_sentences = parse_input(train_data_path)
valid_sentences = parse_input(valid_data_path)
test_sentences = parse_input(test_data_path, drop_label=True)

Parsing input file lines...


[3, 21, 11, 22, 13, 16, 8, 3, 9, 6, 9, 6, 12, 5, 14, 7, 18, 10, 4, 8, 9, 28, 34, 26, 18, 14, 16, 5, 18, 6, 19, 16, 22, 9, 15, 10, 21, 36, 38, 25, 5, 27, 8, 20, 14, 10, 14, 13, 8, 16, 16, 26, 29, 15, 15, 14, 9, 8, 39, 12, 31, 18, 28, 4, 9, 11, 3, 2, 27, 5, 8, 15, 13, 7, 21, 4, 7, 32, 33, 12, 3, 17, 10, 18, 12, 13, 3, 9, 3, 16, 13, 3, 14, 4, 9, 40, 18, 9, 36, 19, 8, 11, 21, 34, 2, 3, 10, 7, 2, 7, 9, 7, 10, 14, 13, 8, 20, 13, 3, 16, 25, 16, 6, 8, 3, 16, 10, 15, 27, 17, 6, 19, 27, 11, 21, 30, 15, 25, 52, 23, 15, 16, 12, 26, 26, 7, 9, 3, 7, 12, 18, 3, 15, 4, 3, 27, 23, 15, 11, 9, 8, 5, 17, 3, 6, 6, 4, 20, 13, 5, 11, 21, 33, 14, 17, 10, 21, 2, 27, 18, 15, 18, 17, 6, 28, 4, 3, 14, 17, 12, 33, 20, 3, 36, 17, 3, 23, 2, 9, 8, 12, 18, 17, 2, 26, 22, 11, 16, 4, 8, 27, 11, 10, 10, 8, 8, 19, 6, 5, 6, 13, 32, 7, 11, 22, 12, 20, 8, 13, 5, 24, 27, 15, 6, 17, 10, 25, 4, 21, 7, 8, 10, 3, 3, 28, 25, 22, 25, 5, 16, 5, 4, 6, 13, 14, 4, 5, 16, 6, 12, 9, 19, 6, 14, 23, 24, 7, 25, 8, 10, 23, 5, 8, 9, 22, 2, 19

Parsing input file lines...


[6, 16, 19, 18, 7, 4, 15, 6, 5, 15, 11, 19, 7, 21, 24, 26, 9, 8, 30, 10, 8, 6, 9, 31, 20, 34, 5, 8, 11, 23, 17, 6, 7, 37, 17, 22, 9, 17, 6, 7, 12, 17, 21, 17, 16, 25, 5, 14, 15, 10, 13, 46, 23, 15, 5, 26, 33, 14, 32, 9, 16, 5, 15, 10, 11, 22, 4, 18, 7, 8, 8, 12, 5, 9, 11, 22, 16, 24, 3, 13, 5, 22, 9]
Parsing input file lines...


Parsing input file lines...


[31, 25, 4, 13, 16, 7, 9, 8, 11, 22, 25, 15, 15, 7, 17, 6, 9, 24, 8, 19, 9, 17, 14, 7, 8, 20, 10, 16, 7, 20, 25, 26, 12, 13, 16, 28, 4, 25, 27, 39, 20, 30, 39, 14, 8, 8, 9, 20, 26, 13, 16, 19, 14, 32, 38, 10, 3, 11, 7, 9, 8, 11, 22, 7, 6, 15, 15, 33, 7, 26, 16, 19, 23, 26, 14, 19, 10, 13, 15, 2, 39, 17, 20, 24, 12, 5, 20, 8, 9, 4, 12, 11, 22, 26, 7, 7, 13, 17, 24, 29, 31, 2, 18, 37, 26, 18, 12, 11, 16, 4, 14, 18, 18, 12, 4, 2, 5, 8, 10, 21, 15, 12, 30, 13, 6, 25, 19, 12, 3, 9, 11, 21, 15, 25, 22, 12, 28, 13, 10, 14, 19, 4, 17, 8, 7, 24, 15, 13, 13, 28]
Parsing input file lines...


In [None]:
# When true, every word piece of a word carries the word's label; otherwise only
# the first piece does and the remaining pieces are masked with -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and align word labels to word pieces.

    Args:
        examples: Batch with a "tokens" column (one list of words per sentence)
            and optionally a "partitive_roles" column holding one label id per
            word.

    Returns:
        The tokenizer output.  If the batch has "partitive_roles", a "labels"
        entry is added with one id per word piece: -100 for positions without
        a word id (special tokens), the word's label for its first piece, and
        either the word's label or -100 for the following pieces depending on
        ``label_all_tokens``.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Unlabelled batches are returned as tokenized, without a "labels" entry.
    if "partitive_roles" not in examples:
        return tokenized_inputs

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["partitive_roles"]):
        piece_labels = []
        last_word_id = None
        for word_id in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_id is None:
                # No originating word: masked with -100.
                piece_labels.append(-100)
            elif word_id == last_word_id and not label_all_tokens:
                # Continuation piece while only first pieces are labelled.
                piece_labels.append(-100)
            else:
                piece_labels.append(word_labels[word_id])
            last_word_id = word_id
        aligned_labels.append(piece_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


In [None]:
# Build one Dataset per split; the test split is built without a
# "partitive_roles" column because its labels were dropped while parsing.
train_raw_dataset = build_dataset_from_sentences(train_sentences)
valid_raw_dataset = build_dataset_from_sentences(valid_sentences)
test_raw_dataset = build_dataset_from_sentences(test_sentences, drop_label=True)

# Bundle the splits under the keys "train", "valid" and "test".
raw_datasets = DatasetDict(train=train_raw_dataset, valid=valid_raw_dataset, test=test_raw_dataset)

In [None]:
# Tokenize every split in batches; splits with a "partitive_roles" column also
# receive the aligned "labels" column.
tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)


  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
def compute_metrics(p):
    """
    Compute overall precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``; the predictions are arg-maxed over
            axis 2 to obtain one class id per position, and the labels hold
            the reference ids with -100 marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the corresponding "overall_*" entries of the metric result.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        kept_predictions = []
        kept_labels = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            # Positions labelled -100 (ignored index) are left out of both lists.
            if reference_id == -100:
                continue
            kept_predictions.append(LABEL_LIST[predicted_id])
            kept_labels.append(LABEL_LIST[reference_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


     

In [None]:
# Batch size used for both training and evaluation.
# NOTE(review): duplicates args.batch_size from the configuration cell, which is
# not referenced anywhere else in the visible notebook.
batch_size=16

# Training configuration; "bert_partitive_roles" is the first positional
# argument (the output directory -- confirm against the TrainingArguments docs).
train_args = TrainingArguments(
    "bert_partitive_roles",
    # Evaluate once per epoch (matches the per-epoch rows in the recorded training log).
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01
)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metric object used by compute_metrics; must be defined before training starts.
metric = load_metric("seqeval")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Wire model, training configuration, train/validation splits, collator,
# tokenizer and metric function into a single Trainer.
trainer = Trainer(
    model,
    train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: partitive_roles, tokens, distance_tags, pos_tags, bio_tags. If partitive_roles, tokens, distance_tags, pos_tags, bio_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2192
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1370
  Number of trainable parameters = 108894724


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.086754,0.771536,0.820717,0.795367,0.969325
2,No log,0.07601,0.808853,0.800797,0.804805,0.970475
3,No log,0.074345,0.815041,0.798805,0.806841,0.970475
4,0.026000,0.072158,0.77634,0.836653,0.805369,0.970092
5,0.026000,0.076466,0.79096,0.836653,0.813166,0.970092
6,0.026000,0.076949,0.827801,0.794821,0.810976,0.971242
7,0.026000,0.085568,0.781022,0.85259,0.815238,0.970475
8,0.026700,0.084705,0.780357,0.870518,0.822976,0.970859
9,0.026700,0.087658,0.796935,0.828685,0.8125,0.970859
10,0.026700,0.087104,0.799618,0.834661,0.816764,0.971242


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: partitive_roles, tokens, distance_tags, pos_tags, bio_tags. If partitive_roles, tokens, distance_tags, pos_tags, bio_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: partitive_roles, tokens, distance_tags, pos_tags, bio_tags. If partitive_roles, tokens, distance_tags, pos_tags, bio_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored:

TrainOutput(global_step=1370, training_loss=0.02583851013740484, metrics={'train_runtime': 306.7273, 'train_samples_per_second': 71.464, 'train_steps_per_second': 4.467, 'total_flos': 678926574234624.0, 'train_loss': 0.02583851013740484, 'epoch': 10.0})

In [None]:
# Run the fine-tuned model on the test split; test_results.predictions is
# indexed per example and arg-maxed per position in the next cell.
test_results = trainer.predict(tokenized_datasets['test'])


The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: pos_tags, tokens, distance_tags, bio_tags. If pos_tags, tokens, distance_tags, bio_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 150
  Batch size = 16


In [None]:
# Map the word-piece predictions on the test set back to one label per word.
# Only ARG1 predictions are kept; every other word gets None.
ARG1_LABEL_ID = LABEL_LIST.index("ARG1")

out = []

for i, sentence in enumerate(tokenized_datasets['test']):
    # Re-tokenize with the same settings as during preprocessing to recover
    # which word every word piece belongs to.
    tokenized_input = tokenizer(sentence["tokens"], truncation=True, is_split_into_words=True)

    # Predicted class id for every position of this example.
    label_ids = np.argmax(test_results.predictions[i], axis=-1)

    # Position of the first word piece of every word.
    word_id_to_label_idx = {}
    for j, word_id in enumerate(tokenized_input.word_ids()):
        if word_id is not None:
            word_id_to_label_idx.setdefault(word_id, j)

    labelings = []
    for j, token in enumerate(sentence["tokens"]):
        # A word without any word piece (cut off by truncation, or tokenized
        # to nothing) has no prediction and is written without a label
        # instead of raising KeyError.
        label_idx = word_id_to_label_idx.get(j)
        is_arg1 = label_idx is not None and label_ids[label_idx] == ARG1_LABEL_ID
        labelings.append((token, LABEL_LIST[ARG1_LABEL_ID] if is_arg1 else None))
    out.append(labelings)

In [None]:
# Write one word per line, followed by a tab and its label when it has one;
# an empty line terminates every sentence.
with open(outputFile, 'w') as f:
  for sentence_labelings in out:
    rows = [
      f"{token}\t{label}\n" if label else f"{token}\n"
      for token, label in sentence_labelings
    ]
    f.writelines(rows)
    f.write("\n")

In [None]:

def test_sentence_string(s: str):
  """
  Run the model on a single raw sentence string.

  Args:
    s: The sentence text; it is tokenized by the tokenizer itself.

  Returns:
    The model's output for the sentence (batch of size one).
  """
  # The model needs tensors, not Python lists, and they must live on the same
  # device as the model (the Trainer may have moved it to the GPU).
  tokenized_input = tokenizer(s, truncation=True, return_tensors="pt").to(model.device)
  # Inference only: no gradients needed.
  # NOTE(review): assumes the model is already in eval mode -- confirm.
  with torch.no_grad():
    # The encoding is unpacked so input_ids, attention_mask, ... are passed
    # as keyword arguments instead of the whole mapping being taken as input_ids.
    return model(**tokenized_input)
