In [1]:
import os
# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so that no credential has to be stored in the notebook itself; when the
# variable is unset the value falls back to the empty string.
TOKEN = os.environ.get("HF_TOKEN", "")
# Turn off the Weights & Biases integration for this run.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from datasets import load_dataset
import torch
from peft import get_peft_model, LoraConfig, TaskType
from transformers.models.llama.modeling_llama import *
from transformers.modeling_outputs import TokenClassifierOutput

# Directory that contains the JSON files of every split.
path = "so"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The training file is loaded first; the object returned for it is then used
# as the container for the remaining splits. Each additional file is loaded
# on its own (where it shows up under the key "train") and attached under the
# split name used by the rest of the notebook.
dataset = load_dataset(
    'json',
    data_files=os.path.join(path, 'data_train.json'),
    download_mode='force_redownload',
)
for _split_name, _file_name in (
    ("test", 'data_test.json'),
    ("validation", 'data_dev.json'),
    ("gh", 'data_gh.json'),
):
    dataset[_split_name] = load_dataset(
        'json',
        data_files=os.path.join(path, _file_name),
        download_mode='force_redownload',
    )["train"]

Generating train split:   0%|          | 0/9263 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/3108 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/2936 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/8023 [00:00<?, ? examples/s]

In [3]:
label2id = {'O': 0, 'B-ALG': 1, 'I-ALG': 2, 'B-APP': 3, 'I-APP': 4, 'B-CB': 5, 'I-CB': 6, 'B-CLA': 7, 'I-CLA': 8, 'B-DEV': 9, 'I-DEV': 10, 'B-DS': 11, 'I-DS': 12, 'B-DT': 13, 'I-DT': 14, 'B-FN': 15, 'I-FN': 16, 'B-FT': 17, 'I-FT': 18, 'B-FUN': 19, 'I-FUN': 20, 'B-HXT': 21, 'I-HXT': 22, 'B-LAN': 23, 'I-LAN': 24, 'B-LIB': 25, 'I-LIB': 26, 'B-OS': 27, 'I-OS': 28, 'B-UIE': 29, 'I-UIE': 30, 'B-UN': 31, 'I-UN': 32, 'B-VAL': 33, 'I-VAL': 34, 'B-VAR': 35, 'I-VAR': 36, 'B-VER': 37, 'I-VER': 38, 'B-WEB': 39, 'I-WEB': 40}
id2label = {label2id[x]: x for x in label2id}

epochs = 10
batch_size = 8
learning_rate = 1e-4
max_length = 64
lora_r = 12



_CONFIG_FOR_DOC = "LlamaConfig"


class LlamaForTokenClassification(LlamaPreTrainedModel):
    """Llama backbone (`LlamaModel`) with a linear token-classification head.

    The hidden state of every position goes through dropout and a single
    `nn.Linear(hidden_size, num_labels)` layer, producing one logit vector
    per token.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        # Honour an explicit `classifier_dropout` on the config when there is
        # one; otherwise keep the rate of 0.1 this head has always used.
        classifier_dropout = getattr(config, "classifier_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token-embedding module of the wrapped `LlamaModel`."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token-embedding module of the wrapped `LlamaModel`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. Positions labelled `-100` do not contribute to the loss (this is the
            default `ignore_index` of `CrossEntropyLoss`).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # First element of the backbone output: the per-token hidden states.
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # Flatten every dimension but the class one so that each token is
            # scored as an independent classification example.
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            # NOTE(review): the `[2:]` slice assumes that index 1 of the
            # backbone tuple is the key/value cache - confirm for the
            # `use_cache=False` case.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [4]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from transformers import AutoTokenizer, LlamaForCausalLM

# Checkpoint to fine-tune ("meta-llama/Llama-2-7b-hf" is the alternative that
# was tried with the same code).
base_model = "codellama/CodeLlama-7b-hf"

# Load the backbone together with a freshly initialised classification head
# sized to the tag set, then move it to the device selected earlier in the
# notebook (instead of a hard-coded "cuda", which fails on CPU-only hosts).
model = LlamaForTokenClassification.from_pretrained(
    base_model, num_labels=len(label2id), id2label=id2label, label2id=label2id, token=TOKEN
).to(device)
tokenizer = AutoTokenizer.from_pretrained(base_model, token=TOKEN)

# Wrap the model with LoRA adapters configured for token classification.
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForTokenClassification were not initialized from the model checkpoint at codellama/CodeLlama-7b-hf and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import DataCollatorForTokenClassification

# Upper bound on the tokenized sequence length (used together with truncation).
max_length = 64
# Pad on the left-hand side of each sequence.
tokenizer.padding_side = "left"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and project the word-level tags onto tokens.

    `examples["tokens"]` holds the word lists and `examples["ner_tags"]` the
    matching per-word tag ids. Only the first sub-token of every word keeps
    the word's tag; all other positions (positions without a word id and the
    continuation pieces of a word) are labelled -100.

    Returns the tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding='longest',
        max_length=max_length,
        truncation=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # Word index of every token in this row (None where there is none).
        token_word_ids = list(encoded.word_ids(batch_index=row))
        # Pair every token with the word index of the token before it; the
        # first token has no predecessor, hence the leading None.
        preceding = [None] + token_word_ids[:-1]
        aligned_labels.append([
            word_tags[current] if current is not None and current != before else -100
            for before, current in zip(preceding, token_word_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded


# Collator handed to the Trainer to assemble batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# Tokenize every split in batched mode and attach the aligned labels.
tokenized_ds = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/9263 [00:00<?, ? examples/s]

Map:   0%|          | 0/3108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2936 [00:00<?, ? examples/s]

Map:   0%|          | 0/8023 [00:00<?, ? examples/s]

In [11]:
import numpy as np
import evaluate


# Tag strings ordered by insertion into label2id, so label_list[i] is the tag
# stored at position i of that mapping.
label_list = [*label2id]
# Metric object used by compute_metrics.
seqeval = evaluate.load("seqeval")

def compute_metrics(p, full=False):
    """Score predictions against gold labels with seqeval.

    Args:
        p: pair `(predictions, labels)`. When `full` is exactly `False`
            the predictions are treated as scores and reduced with an argmax
            over axis 2; otherwise they are used as label ids directly.
        full: when truthy, return the complete seqeval result; otherwise
            return only the overall precision / recall / f1 / accuracy.

    Positions whose gold label is -100 are dropped from both sequences before
    scoring.
    """
    predictions, labels = p
    if full is False:
        predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predictions, labels):
        # Keep only the positions that carry a real gold label.
        scored = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in scored])
        true_labels.append([label_list[gold_id] for _, gold_id in scored])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    if not full:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    return results

In [7]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters of this training run.
learning_rate = 1e-4
batch_size = 8
epochs = 10



training_args = TrainingArguments(
    # Where checkpoints are written; a single checkpoint is kept.
    output_dir="codellama-with-mask",
    save_strategy="epoch",
    save_total_limit=1,
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate on the validation split once per epoch.
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2363,0.232199,0.545981,0.488157,0.515452,0.937443
2,0.1706,0.217917,0.58392,0.541791,0.562068,0.943755
3,0.1222,0.242778,0.588108,0.519739,0.551814,0.940785
4,0.0847,0.259627,0.559051,0.532262,0.545328,0.940321
5,0.0509,0.292055,0.563873,0.531173,0.547035,0.940344
6,0.0301,0.312592,0.523301,0.547237,0.535001,0.937095
7,0.0185,0.352507,0.542373,0.531446,0.536854,0.938534
8,0.0106,0.377543,0.548146,0.519194,0.533277,0.938464
9,0.0091,0.389155,0.544782,0.521644,0.532962,0.938441
10,0.0073,0.399247,0.53954,0.523822,0.531565,0.938372


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=11580, training_loss=0.0833618023646722, metrics={'train_runtime': 11933.8665, 'train_samples_per_second': 7.762, 'train_steps_per_second': 0.97, 'total_flos': 2.3059619972289792e+17, 'train_loss': 0.0833618023646722, 'epoch': 10.0})

In [8]:
# Persist the Trainer's log history as its plain string representation.
with open("codellama.txt", "w") as f:
    print(trainer.state.log_history, end="", file=f)

In [9]:
# Run the trained model over the validation, test and "gh" splits (in that
# order); a, b and c hold the respective prediction outputs.
a, b, c = (
    trainer.predict(tokenized_ds[_split])
    for _split in ("validation", "test", "gh")
)

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
import pickle
from pprint import pprint

def get_dct(x, text):
    """Compute the full seqeval report for one split and pickle it.

    Args:
        x: object exposing a `predictions` array; it is reduced with an
            argmax over axis 2 to obtain label ids.
        text: name of the split; selects the gold labels from
            `tokenized_ds` and is embedded in the output file name.

    Returns:
        The report returned by `compute_metrics`, which is also written to
        `codellama_<text>.pickle` in the working directory.
    """
    predicted_ids = np.argmax(x.predictions, axis=2)
    gold_labels = tokenized_ds[text]["labels"]
    report = compute_metrics((predicted_ids, gold_labels), full=True)

    with open(f'codellama_{text}.pickle', 'wb') as report_file:
        pickle.dump(report, report_file)

    return report

# Build, store and pretty-print the report of every evaluated split.
for _prediction_output, _split_name in ((a, "validation"), (b, "test"), (c, "gh")):
    pprint(get_dct(_prediction_output, _split_name))

{'ALG': {'f1': 0.0, 'number': 9, 'precision': 0.0, 'recall': 0.0},
 'APP': {'f1': 0.5393939393939394,
         'number': 480,
         'precision': 0.5235294117647059,
         'recall': 0.55625},
 'CB': {'f1': 0.3436293436293436,
        'number': 244,
        'precision': 0.3248175182481752,
        'recall': 0.36475409836065575},
 'CLA': {'f1': 0.4993662864385298,
         'number': 406,
         'precision': 0.5143603133159269,
         'recall': 0.4852216748768473},
 'DEV': {'f1': 0.5461254612546126,
         'number': 149,
         'precision': 0.6065573770491803,
         'recall': 0.4966442953020134},
 'DS': {'f1': 0.7048710601719198,
        'number': 177,
        'precision': 0.7151162790697675,
        'recall': 0.6949152542372882},
 'DT': {'f1': 0.7389558232931727,
        'number': 134,
        'precision': 0.8,
        'recall': 0.6865671641791045},
 'FN': {'f1': 0.3619047619047619,
        'number': 134,
        'precision': 0.5,
        'recall': 0.2835820895522388},
 '