In [1]:
# import os
# import sys
# DIR_PREFIX = "/home/user/commits/NER"
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
# import pytorch_lightning as pl


from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    AutoTokenizer,
    RobertaTokenizerFast,
    T5ForTokenClassification,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

label2id = {'O': 0, 'B-ALG': 1, 'I-ALG': 2, 'B-APP': 3, 'I-APP': 4, 'B-CB': 5, 'I-CB': 6, 'B-CLA': 7, 'I-CLA': 8, 'B-DEV': 9, 'I-DEV': 10, 'B-DS': 11, 'I-DS': 12, 'B-DT': 13, 'I-DT': 14, 'B-FN': 15, 'I-FN': 16, 'B-FT': 17, 'I-FT': 18, 'B-FUN': 19, 'I-FUN': 20, 'B-HXT': 21, 'I-HXT': 22, 'B-LAN': 23, 'I-LAN': 24, 'B-LIB': 25, 'I-LIB': 26, 'B-OS': 27, 'I-OS': 28, 'B-UIE': 29, 'I-UIE': 30, 'B-UN': 31, 'I-UN': 32, 'B-VAL': 33, 'I-VAL': 34, 'B-VAR': 35, 'I-VAR': 36, 'B-VER': 37, 'I-VER': 38, 'B-WEB': 39, 'I-WEB': 40}
id2label = {label2id[x]: x for x in label2id}
labels = ['Algorithm', 'Application', 'Class', 'Code_Block', 'Data_Structure', 'Data_Type', 'Device', 'File_Name', 'File_Type', 'Function', 'HTML_XML_Tag', 'Language', 'Library', 'Operating_System', 'User_Interface_Element', 'User_Name', 'Value', 'Variable', 'Version', 'Website']
labels_short = ['ALG', 'APP', 'CB', 'CLA', 'DEV', 'DS', 'DT', 'FN', 'FT', 'FUN', 'HXT', 'LAN', 'LIB', 'OS', 'UIE', 'UN', 'VAL', 'VAR', 'VER', 'WEB']
short2long = {'ALG': 'Algorithm', 'APP': 'Application', 'CLA': 'Class', 'CB': 'Code_Block', 'DS': 'Data_Structure', 'DT': 'Data_Type', 'DEV': 'Device', 'FN': 'File_Name', 'FT': 'File_Type', 'FUN': 'Function', 'HXT': 'HTML_XML_Tag', 'LAN': 'Language', 'LIB': 'Library', 'OS': 'Operating_System', 'UIE': 'User_Interface_Element', 'UN': 'User_Name', 'VAL': 'Value', 'VAR': 'Variable', 'VER': 'Version', 'WEB': 'Website'}


model_name = "microsoft/codebert-base"
model_checkpoint_path = f"checkpoints/{model_name}-token-clf"
model_checkpoint_path

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abuboba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'checkpoints/microsoft/codebert-base-token-clf'

In [3]:
from datasets import load_dataset

path = "data/StackOverflow/json/"

dataset = load_dataset('json', data_files=os.path.join(path, 'data_train.json'))
dataset["test"] = load_dataset('json', data_files=os.path.join(path, 'data_test.json'))["train"]
dataset["validation"] = load_dataset('json', data_files=os.path.join(path, 'data_dev.json'))["train"]


In [4]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

In [5]:
example = dataset["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>', 'ĠIf', 'ĠI', 'Ġwould', 'Ġhave', 'Ġ2', 'Ġtables', '</s>']

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/9263 [00:00<?, ? examples/s]

Map:   0%|          | 0/3108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2936 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'spans', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9263
    })
    test: Dataset({
        features: ['tokens', 'spans', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3108
    })
    validation: Dataset({
        features: ['tokens', 'spans', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2936
    })
})

In [9]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")
label_list = list(label2id.keys())
labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p, full=False):
    predictions, labels = p
    if full is False:
        predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    if full:
        return results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

In [11]:
from transformers import TrainingArguments, Trainer, AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id, device_map='cuda'
)
model.model_parallel = False

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
training_args = TrainingArguments(
    output_dir=model_checkpoint_path,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2316,0.21384,0.625459,0.526533,0.571748,0.94517
2,0.154,0.205194,0.623863,0.600721,0.612073,0.948706
3,0.1007,0.225418,0.648283,0.617723,0.632634,0.951428
4,0.0638,0.234571,0.639178,0.61695,0.627867,0.950243
5,0.0392,0.279422,0.641972,0.613859,0.627601,0.94976
6,0.0296,0.293174,0.649907,0.632664,0.64117,0.951055
7,0.0181,0.328587,0.638994,0.608707,0.623483,0.949386
8,0.0117,0.345616,0.654541,0.625708,0.6398,0.951253
9,0.0077,0.358896,0.655294,0.62339,0.638944,0.951055
10,0.006,0.363278,0.652384,0.627512,0.639706,0.950967


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=11580, training_loss=0.07284522908956893, metrics={'train_runtime': 1812.45, 'train_samples_per_second': 51.108, 'train_steps_per_second': 6.389, 'total_flos': 1874042445546462.0, 'train_loss': 0.07284522908956893, 'epoch': 10.0})

In [13]:
pred = []


for item in tokenized_dataset["test"]:
    a = tokenizer(item["tokens"],truncation=True, padding=True,is_split_into_words=True, return_tensors="pt").to("cuda")
    predictions = np.argmax(model(**a).logits.cpu().detach(), axis=2)
    pred.extend(predictions)
    # print(compute_metrics((model(**a).logits.cpu().detach(), tokenized_dataset["test"]["labels"]), full=True))


In [14]:
dct = compute_metrics((pred, tokenized_dataset["test"]["labels"]), True)

In [15]:
for x in dct:
    print(x, "---", dct[x], sep="\t")

ALG	---	{'precision': 0.7, 'recall': 0.4375, 'f1': 0.5384615384615384, 'number': 16}
APP	---	{'precision': 0.6378504672897196, 'recall': 0.6707616707616708, 'f1': 0.6538922155688622, 'number': 407}
CB	---	{'precision': 0.47619047619047616, 'recall': 0.4304635761589404, 'f1': 0.45217391304347826, 'number': 302}
CLA	---	{'precision': 0.6316916488222698, 'recall': 0.5784313725490197, 'f1': 0.6038894575230297, 'number': 510}
DEV	---	{'precision': 0.6101694915254238, 'recall': 0.6792452830188679, 'f1': 0.6428571428571429, 'number': 53}
DS	---	{'precision': 0.8430493273542601, 'recall': 0.7611336032388664, 'f1': 0.7999999999999999, 'number': 247}
DT	---	{'precision': 0.7459016393442623, 'recall': 0.8198198198198198, 'f1': 0.7811158798283261, 'number': 111}
FN	---	{'precision': 0.8282208588957055, 'recall': 0.8282208588957055, 'f1': 0.8282208588957055, 'number': 163}
FT	---	{'precision': 0.8645833333333334, 'recall': 0.6434108527131783, 'f1': 0.7377777777777779, 'number': 129}
FUN	---	{'preci

In [17]:
with open("log_codebert.txt", "wt") as f:
    f.write(f"{trainer.state.log_history}")