In [1]:
# !pip install --upgrade pip
# !pip install transformers==4.38.1
# !pip install sentencepiece datasets seqeval evaluate
# !pip install accelerate -U

In [2]:
# Print the installed transformers version (the commented-out install cell pins 4.38.1).
import transformers
print(transformers.__version__)

4.38.1


In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
# import pytorch_lightning as pl


from transformers import (
    AdamW,
    MT5ForConditionalGeneration,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    AutoTokenizer,
    PreTrainedTokenizerFast,
    T5ForTokenClassification,
    RobertaTokenizerFast,
    AutoModelForTokenClassification,
    get_linear_schedule_with_warmup
)

# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    generator; when CUDA is available, all CUDA devices are seeded as well.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Fix the seed once, before any data or model work.
set_seed(42)

# Short entity-type codes. Their order here fixes the tag ids assigned below.
labels_short = [
    'ALG', 'APP', 'CB', 'CLA', 'DEV', 'DS', 'DT', 'FN', 'FT', 'FUN',
    'HXT', 'LAN', 'LIB', 'OS', 'UIE', 'UN', 'VAL', 'VAR', 'VER', 'WEB',
]

# BIO tag -> integer id: 'O' is 0, then each code contributes a
# consecutive (B-<code>, I-<code>) pair: 1/2, 3/4, ..., 39/40.
label2id = {'O': 0}
label2id.update(
    (f'{prefix}-{code}', 2 * position + offset)
    for position, code in enumerate(labels_short)
    for offset, prefix in enumerate('BI', start=1)
)

# Inverse mapping: integer id -> BIO tag.
id2label = {tag_id: tag for tag, tag_id in label2id.items()}

# Short code -> full entity-type name.
short2long = {
    'ALG': 'Algorithm',
    'APP': 'Application',
    'CLA': 'Class',
    'CB': 'Code_Block',
    'DS': 'Data_Structure',
    'DT': 'Data_Type',
    'DEV': 'Device',
    'FN': 'File_Name',
    'FT': 'File_Type',
    'FUN': 'Function',
    'HXT': 'HTML_XML_Tag',
    'LAN': 'Language',
    'LIB': 'Library',
    'OS': 'Operating_System',
    'UIE': 'User_Interface_Element',
    'UN': 'User_Name',
    'VAL': 'Value',
    'VAR': 'Variable',
    'VER': 'Version',
    'WEB': 'Website',
}

# Full entity-type names, in the same order as the values of short2long.
labels = list(short2long.values())


# Identifier of the pretrained checkpoint that is fine-tuned below.
model_name = "Salesforce/codet5-small"
# Output directory for training checkpoints, named after the part of
# `model_name` that follows the "/".
model_checkpoint_path = f"checkpoints/{model_name.split('/')[1]}-token-clf"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abuboba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
from datasets import load_dataset

# Directory holding the train/test/dev JSON files. It is assembled from
# separate components so the path contains no backslash escape sequences
# and resolves with the correct separator on every operating system.
DATA_DIR = os.path.join("data", "StackOverflow", "json")

# Each file is loaded on its own (the loader exposes a lone file as the
# "train" split) and then stored under its real split name.
dataset = load_dataset('json', data_files=os.path.join(DATA_DIR, 'data_train.json'))
dataset["test"] = load_dataset('json', data_files=os.path.join(DATA_DIR, 'data_test.json'))["train"]
dataset["validation"] = load_dataset('json', data_files=os.path.join(DATA_DIR, 'data_dev.json'))["train"]


In [5]:
# Load the tokenizer stored with the `model_name` checkpoint through the fast RoBERTa tokenizer class.
# NOTE(review): add_prefix_space=True is presumably needed because the inputs are passed
# pre-split into words (is_split_into_words=True in the cells below) — confirm in the tokenizer docs.
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

In [6]:
# Sanity check: tokenize the first training example, which is already split
# into words, and display the resulting sub-word tokens.
example = dataset["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['<s>', 'ĠIf', 'ĠI', 'Ġwould', 'Ġhave', 'Ġ2', 'Ġtables', '</s>']

In [7]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Each sub-token is mapped back to the word it came from. The first
    sub-token of a word receives that word's tag id; special tokens and the
    remaining sub-tokens of a word receive -100.

    Args:
        examples: Batch with a "tokens" column (one list of words per
            sentence) and a "ner_tags" column (one tag id per word).

    Returns:
        The tokenizer output with an added "labels" entry holding one
        aligned list of tag ids per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        sentence_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            # A tag is emitted only where a new word starts; special tokens
            # (word is None) and word continuations are masked with -100.
            starts_new_word = word is not None and word != last_word
            sentence_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [8]:
# Run tokenization and label alignment over every split, in batches;
# this adds the tokenizer outputs and the `labels` column to each split.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

In [9]:
# Display the splits with their columns and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'spans', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9263
    })
    test: Dataset({
        features: ['tokens', 'spans', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3108
    })
    validation: Dataset({
        features: ['tokens', 'spans', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2936
    })
})

In [10]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below to assemble batches; built from the same tokenizer.
# NOTE(review): presumably pads input_ids and labels to a common length within each batch — confirm in the docs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
import numpy as np
import evaluate

# seqeval metric used by compute_metrics to score predicted tag sequences.
seqeval = evaluate.load("seqeval")
# Tag strings ordered by id: label2id assigns ids 0..N-1 in insertion order,
# so label_list[i] is the tag whose id is i.
label_list = list(label2id.keys())
# Tag strings of the first training example, kept for inspection. They are
# bound to their own name so the module-level `labels` list of entity-type
# names is not overwritten.
example_labels = [label_list[i] for i in example["ner_tags"]]


def compute_metrics(p, full=False):
    """Score predictions against reference tags with seqeval.

    Args:
        p: Pair ``(predictions, labels)``. When ``full`` is False the
            predictions are per-tag scores and are reduced with an argmax
            over axis 2; otherwise they are used as tag ids directly.
        full: When truthy, return the complete seqeval result; otherwise
            return only the overall precision, recall, F1 and accuracy.

    Returns:
        A dict of metrics, as described for ``full``.
    """
    predictions, labels = p
    if full is False:
        predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predictions, labels):
        # Positions labelled -100 are excluded from scoring.
        scored = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in scored])
        true_labels.append([label_list[reference_id] for _, reference_id in scored])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    if full:
        return results

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [12]:
from transformers import TrainingArguments, Trainer

# Load the pretrained checkpoint with a token-classification head sized to the
# tag set, passing both tag<->id mappings and requesting placement on the GPU.
# The load log below reports the classifier weights as newly initialized, so
# the model must be fine-tuned before its predictions are meaningful.
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id,  device_map='cuda'
)
# NOTE(review): presumably stops the Trainer from treating the model as
# model-parallel after loading with a device_map — confirm.
model.model_parallel = False

  return self.fget.__get__(instance, owner)()
Some weights of T5ForTokenClassification were not initialized from the model checkpoint at Salesforce/codet5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, which is required for best-checkpoint tracking.
training_args = TrainingArguments(
    output_dir=model_checkpoint_path,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Validation F1 does not improve monotonically, so reload the checkpoint
    # with the best validation F1 when training ends rather than keeping
    # whatever the last epoch produced. The best checkpoint is retained on
    # disk in addition to the most recent one.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# The Trainer trains on the train split and, after every epoch, evaluates on
# the validation split using compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4789,0.308113,0.496231,0.428649,0.459971,0.930132
2,0.3119,0.247481,0.545784,0.530385,0.537975,0.938817
3,0.2435,0.236467,0.59442,0.549105,0.570864,0.942627
4,0.192,0.230405,0.582905,0.584645,0.583774,0.944221
5,0.1563,0.240362,0.607783,0.588985,0.598236,0.945746
6,0.1407,0.243613,0.602377,0.604992,0.603682,0.945884
7,0.124,0.244652,0.600651,0.600651,0.600651,0.945445
8,0.1054,0.25603,0.623286,0.604178,0.613583,0.947409
9,0.0965,0.25683,0.617092,0.603364,0.610151,0.947247
10,0.0881,0.261044,0.613399,0.606077,0.609716,0.946993


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory checkpoints/codet5-small-token-clf\checkpoint-11580 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=11580, training_loss=0.223681714456522, metrics={'train_runtime': 326.8664, 'train_samples_per_second': 283.388, 'train_steps_per_second': 35.427, 'total_flos': 422182886887416.0, 'train_loss': 0.223681714456522, 'epoch': 10.0})

In [None]:
# Predicted tag ids for the test split: one list of ints per sentence.
pred = []

# model = AutoModelForTokenClassification.from_pretrained(
#     "codet5-base-token-clf-low", num_labels=len(label2id), id2label=id2label, label2id=label2id, device_map='cuda:0'
# )

# Inference only: switch off dropout and skip building the autograd graph,
# which would otherwise waste memory on every forward pass.
model.eval()
with torch.no_grad():
    for item in tokenized_dataset["test"]:
        # Sentences are re-tokenized one at a time with the same settings used
        # to build the `labels` column, so predictions and labels line up.
        # Inputs are moved to whichever device the model lives on.
        encoded = tokenizer(
            item["tokens"],
            truncation=True,
            padding=True,
            is_split_into_words=True,
            return_tensors="pt",
        ).to(model.device)
        # Highest-scoring tag id per token; shape (1, sequence_length).
        predictions = model(**encoded).logits.argmax(dim=-1)
        pred.extend(predictions.cpu().tolist())


In [None]:
# Full seqeval report on the test split; `pred` already holds tag ids, hence
# full=True. One tab-separated line is printed per entry of the report.
dct = compute_metrics((pred, tokenized_dataset["test"]["labels"]), True)
for metric_name, metric_value in dct.items():
    print(metric_name, "---", metric_value, sep="\t")

In [None]:
# Persist the Trainer's log history as its plain string representation.
with open("codet5-small.txt", "w") as log_file:
    log_file.write(str(trainer.state.log_history))

In [None]:
# Save the fine-tuned weights and config. `from_pt` is an option for loading
# checkpoints, not for saving them, so it is not passed here.
model.save_pretrained("t5small")
# Store the tokenizer alongside so the directory can be loaded on its own.
tokenizer.save_pretrained("t5small")