In [1]:
import transformers, tokenizers, torch, datasets
from datasets import load_dataset
import os
import glob
import pickle
import pandas as pd

In [2]:
print('transformers={}'.format(transformers.__version__))
print('tokenizers={}'.format(tokenizers.__version__))
print('torch={}'.format(torch.__version__))
print('datasets={}'.format(datasets.__version__))

transformers=4.35.2
tokenizers=0.15.0
torch=2.1.1+cu121
datasets=2.14.5


In [3]:
current_directory = os.getcwd()

In [4]:
def defect_percent(df):
    return df['label'].sum()/df.shape[0]

In [5]:
datatest_versions = ['base','full_change','special_tokens']
models_names = ['bilstm','codebert','codereviewer','javabert','codet5p']
fine_tuning_techniques = ['full','partial','lora']

In [6]:
include_message=True
include_metrics=False
use_lora = False
ft_technique = fine_tuning_techniques[1]
lsg_attention = True
copy_embedding_from_model=True
print_for_debug = True
max_commit_code_length = 1024
vocab_size = 32000 # 16000, 32000, 52000, for custom tokenizer in BiLSTM model only.
partial_trained_encoders = 3
model_name = models_names[1]
dataset_version_name = datatest_versions[0]

In [7]:
create_lsg_model = False

### load dataset

In [8]:
from datasets import load_dataset
multiple_files = True
data_name = 'apache_jit'
prefix = '<java> '
columns_to_remove = ['id','msg','code','metrics']

In [9]:
train_dataset = load_dataset("csv",data_files=f'datasets/{data_name}/train_{dataset_version_name}_shuffled.csv', streaming=True,split="train")
valid_dataset = load_dataset("csv",data_files=f'datasets/{data_name}/valid_{dataset_version_name}_balanced.csv',split="train")
test_dataset = load_dataset("csv",data_files=f'datasets/{data_name}/test_{dataset_version_name}.csv',split="train")
train_df = pd.read_csv(f'datasets/{data_name}/train_{dataset_version_name}_shuffled.csv')
valid_df = pd.read_csv(f'datasets/{data_name}/valid_{dataset_version_name}_balanced.csv')
test_df = pd.read_csv(f'datasets/{data_name}/test_{dataset_version_name}.csv')
train_length = train_df.shape[0]
valid_length = valid_df.shape[0]
test_length  = test_df.shape[0]

In [10]:
valid_df.head()

Unnamed: 0,id,label,msg,code,metrics
0,edebb108d0d0477efba81e55b07339755739dd39,0,[FLINK-11721][network] Remove IOMode from Netw...,[['<del> import org.apache.flink.runtime.io.di...,"la:0.000, ld:21.000, nf:6.000, nd:5.000, ns:2...."
1,5fa84c28fc1bfc62fa2e1165e3407fc81b3d09a9,0,[FLINK-8935][tests] Implement MiniClusterClien...,[['<del> throw new UnsupportedOperationExcepti...,"la:17.000, ld:1.000, nf:2.000, nd:2.000, ns:2...."
2,70952fe92d5be7f7c9407783867d524544cd9fec,1,IGNITE-9870 GridDhtPartitionsFullMessage#prepa...,"[['<add> import java.util.zip.Deflater;', '<de...","la:483.000, ld:245.000, nf:10.000, nd:7.000, n..."
3,ca5d8afee8321e0dff063e8404538b132e979739,0,[FLINK-7192] [java] Activate checkstyle flink-...,"[['<del> import java.util.ArrayList;', '<del> ...","la:1179.000, ld:1085.000, nf:20.000, nd:1.000,..."
4,75469a3b602c26ea81d6fc0a409d39d321195ea4,1,MINOR: Replacing for with foreach loop in stre...,[['<del> for (int i = 0; i < expectedKeys.leng...,"la:227.000, ld:226.000, nf:17.000, nd:1.000, n..."


In [11]:
if model_name == 'bilstm':
    model_checkpoint = 'microsoft/codereviewer'
elif model_name == 'codebert':
    model_checkpoint = 'microsoft/codebert-base'
elif model_name == 'javabert':
    model_checkpoint = 'CAUKiel/JavaBERT'
elif model_name == 'codereviewer':
    model_checkpoint = 'microsoft/codereviewer'
else:
    model_checkpoint = 'Salesforce/codet5p-220m'
if lsg_attention:
    if os.path.exists("models/{}_lsg_{}".format(model_name,max_commit_code_length)):
        model_checkpoint = "models/{}_lsg_{}".format(model_name,max_commit_code_length)
    else:
        create_lsg_model = True
model_name_suffix = model_name + '_{}{}{}{}{}_{}'.format(max_commit_code_length,'_msg' if include_message else '','_mtc' if include_metrics else '', '_lsg' if lsg_attention else '', '_lora' if use_lora else '',dataset_version_name) 

### Tokenize the train and test data

In [12]:
if model_name == 'javabert':
    cls_token = '[CLS]'
    sep_token = '[SEP]'
    msg_token = '<msg>'
    metrics_token = '[CLS]'
    code_change_token = '[CLS]'
else:
    cls_token = '<s>'
    sep_token = '</s>'
    msg_token = '<msg>'
    metrics_token = '<s>'
    code_change_token = '<s>'
if dataset_version_name == 'special_tokens':
    added_token = '<added>'
    removed_token = '<removed>'
else:
    added_token = '<add>'
    removed_token = '<del>'
prefix = ''

In [13]:
def add_tokens_to_tokenizer(tokenizer):
    if dataset_version_name == 'special_tokens':
        tokenizer.add_special_tokens({'additional_special_tokens':[added_token, removed_token,'<STR>','<NUM>']})
    if include_message:
        tokenizer.add_special_tokens({'additional_special_tokens':[cls_token, sep_token,'<pad>', '<unk>',added_token, removed_token,msg_token]})
    else:
        tokenizer.add_special_tokens({'additional_special_tokens':[cls_token, sep_token,'<pad>', '<unk>',added_token, removed_token]})

In [14]:
if create_lsg_model:
    from lsg_converter import LSGConverter
    from transformers import AutoModelForSequenceClassification
    converter = LSGConverter(max_sequence_length=max_commit_code_length)
    if model_name == 'javabert':
        architecture = 'BertForSequenceClassification'
    elif model_name == 'codebert':
        architecture = 'RobertaForSequenceClassification'
    else:
        print('Error! LSG Attention not supported for T5 models at the moment. (CodeReviewer and CodeT5+)')
        exit()
    model, tokenizer = converter.convert_from_pretrained(model_checkpoint,dropout=0.2,hidden_dropout_prob=0.2,num_labels=2,architecture=architecture)
    add_tokens_to_tokenizer(tokenizer)
    model.resize_token_embeddings(len(tokenizer))
    save_path = 'models/{}_lsg_{}'.format(model_name,max_commit_code_length)
    model_checkpoint = save_path
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing LSGRobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing LSGRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LSGRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LSGRobertaForSequenceClassification were not initialized from the model checkpoint at microso

In [15]:
from transformers import AutoTokenizer
from bi_lstm import BiLSTM
if model_name == 'bilstm':
    if copy_embedding_from_model:
        tokenizer = AutoTokenizer.from_pretrained( model_checkpoint)
    else:
        from transformers import RobertaTokenizerFast
        tokenizer_name = '{}_{}_bpe'.format(data_name,vocab_size)
        tokenizer = RobertaTokenizerFast.from_pretrained('./BPE_tokenizer/{}'.format(tokenizer_name),max_len=max_commit_code_length)
else:
    tokenizer = AutoTokenizer.from_pretrained( model_checkpoint)
add_tokens_to_tokenizer(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
if print_for_debug:
    print(len(tokenizer))

50268


In [17]:
def remove_empty(seq):
    return list(filter(lambda s: s != None and s != '',seq))


def join_commit_codes_sep(commit,commit_start=' <NFILE> ',file_sep=' <NFILE> ',line_sep=' <NLINE> '):
    if type(commit) == str:
        commit = eval(commit)
    #return commit_start + file_sep.join(remove_empty([line_sep.join([correct_token(line.split(' ')[0]) +' ' + ' '.join(line.split(' ')[1:]) for line in file]) for file in commit]))
    return commit_start + file_sep.join(remove_empty([line_sep.join(file) for file in commit]))

def join_commit_codes(commit):
    return join_commit_codes_sep(commit,prefix ,f' {sep_token} ','\n')
    #return join_commit_codes_sep(commit,cls_token+' ',f' {sep_token} ',' ')
def join_file_lines(file):
    if type(file) != list:
        file = eval(file)
    return prefix +  '\n'.join(file)

def empty_join_commit_codes(commit):
    return join_commit_codes_sep(commit,'','')
def join_lines(lines,commit_start=' <NFILE> ',line_sep=' <NLINE> '):
    return commit_start + line_sep.join(lines)
def flatten(l):
    return [item for sublist in l for item in sublist]
def join_commit_msg_and_code(msg,code):
    if msg is None:
        msg = ''
    if code is None:
        code = ''
    return msg_token + ' ' + msg.split('\n')[0] + ' ' + code_change_token + ' ' + join_commit_codes_sep(code ,'',f' {sep_token} ','\n')
def join_commit_msg_metrics_code(msg,mtc,code):
    if msg is None:
        msg = ''
    if code is None:
        code = ''
    if mtc is None:
        mtc = ''
    return msg_token + ' ' + msg.split('\n')[0] + '\n' + metrics_token + ' ' + mtc + '\n' + code_change_token + ' ' + join_commit_codes_sep(code ,'',f' {sep_token} ','\n')

In [18]:
def encode(batch):
    if multiple_files:
        if include_message:
            if include_metrics:
                inputs = tokenizer(list(map(join_commit_msg_metrics_code,batch['msg'],batch['metrics'],batch['code'])),truncation="longest_first",max_length=max_commit_code_length)
            else:
                inputs = tokenizer(list(map(join_commit_msg_and_code,batch['msg'],batch['code'])),truncation="longest_first",max_length=max_commit_code_length)
        elif include_metrics:
            inputs = tokenizer(list(map(join_commit_msg_and_code,batch['metrics'],batch['code'])),truncation="longest_first",max_length=max_commit_code_length)
        else:
            inputs = tokenizer(list(map(join_commit_codes,batch['code'])),truncation="longest_first",max_length=max_commit_code_length)
    else:    
        inputs = tokenizer(list(map(join_file_lines,batch['code'])),truncation="longest_first",max_length=max_commit_code_length)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    return batch

In [19]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,max_length=max_commit_code_length,padding='longest')

### Model classes and initialization


In [20]:
device = 'cuda'

In [21]:
import torch
import torch.nn as nn
from transformers import AutoModel,AutoModelForSequenceClassification, AutoConfig
if model_name == 'bilstm':
    model = BiLSTM(len(tokenizer),embed_size=768,hidden_size=64,lstm_layers=4,dropout=0.2,padding_id=tokenizer.pad_token_id)
    if copy_embedding_from_model:
        copy_from_model = AutoModel.from_pretrained(model_checkpoint)
        with torch.no_grad():
            model.embedding.weight.copy_(copy_from_model.encoder.embed_tokens.weight)
            model.embedding.require_grad = False
else:
    config = AutoConfig.from_pretrained(model_checkpoint)
    config.hidden_dropout_prob = 0.2
    config.dropout = 0.2
    config.num_labels=2
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,config=config)

    #change embedding layer size to match tokenizer vocabulary size (Because we added new tokens to the tokenizer):
    model.resize_token_embeddings(len(tokenizer))

In [24]:
model_checkpoint

'codebert_lsg_1024'

In [23]:
if print_for_debug:
    print(model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50268, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [25]:
if ft_technique == 'lora':
    from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
elif ft_technique == 'partial':
    # freeze encoder layers except the last 2 layers
    modules = []
    trained_encoder_layers = partial_trained_encoders
    if model_name == 'codebert':
        modules = [model.roberta.embeddings, *model.roberta.encoder.layer[:-trained_encoder_layers]]
    elif model_name == 'javabert':
        modules = [model.bert.embeddings, *model.bert.encoder.layer[:-trained_encoder_layers]]
    elif model_name == 'codereviewer' or model_name == 'codet5p':
        modules = [model.shared, *model.encoder.block[:-trained_encoder_layers]]
    for module in modules:
        for param in module.parameters():
            param.requires_grad = False

In [26]:
def param_count(model,trainable_only=True):
    return sum([p.numel()for p in model.parameters() if p.requires_grad or not trainable_only])
if print_for_debug:
    print(param_count(model))
    print(param_count(model,False))

21855746
125042690


In [27]:
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
def roc_auc(preds,target):
    roc_auc_score(target, preds)

def compute_metrics(p):
    _predictions = p.predictions
    _labels = p.label_ids
    _predictions = np.argmax(_predictions, axis=-1)
    vals = {}
    vals['accuracy'] = accuracy_score(_labels, _predictions)
    vals['f1'] = f1_score(_labels, _predictions)
    vals['precision'] = precision_score(_labels, _predictions)
    vals['recall'] = recall_score(_labels, _predictions)
    vals['matthews_correlation'] = matthews_corrcoef(_labels, _predictions)
    vals['auc'] = roc_auc_score(_labels, _predictions.reshape(-1,1))
    return  vals

In [29]:
from transformers.optimization import AdamW
from transformers import Trainer,get_linear_schedule_with_warmup
from math import ceil
init_lr,head_lr = 5e-4 ,1e-4
adam_eps = 1e-6
weight_decay = 0.01
epochs=10
batch_size = 16
gradient_accumulation_steps = 4
batch_steps = int(train_length/(batch_size*gradient_accumulation_steps))
#rem_steps = train_length%(batch_size*gradient_accumulation_steps)
rem_steps = ceil((train_length%(batch_size*gradient_accumulation_steps)) / batch_size)
train_steps = (epochs) * (batch_steps + rem_steps)
warmpup_factor = 0.25
warmpup_steps = int(train_steps*warmpup_factor)
optim = 'adafactor'

In [30]:
train_dataset = train_dataset.map(encode, batch_size=batch_size,batched=True, remove_columns=columns_to_remove)
valid_dataset = valid_dataset.map(encode, batch_size=batch_size,batched=True, remove_columns=columns_to_remove)

Map:   0%|          | 0/6940 [00:00<?, ? examples/s]

In [31]:
if print_for_debug:
    for v in range(3):
        print(valid_dataset['input_ids'][v])
        print(valid_dataset['label'][v])
        #print(valid_dataset['attention_mask'][v])
        print(tokenizer.decode(valid_dataset['input_ids'][v]))

[0, 50266, 646, 7613, 23617, 12, 21598, 2146, 46386, 34728, 742, 27336, 38, 3765, 4636, 31, 3658, 46291, 1437, 0, 1437, 50267, 6595, 31118, 4, 48530, 4, 4825, 4291, 4, 49600, 4, 1020, 4, 47340, 4, 118, 16187, 6988, 4, 100, 3765, 260, 6988, 131, 50118, 50267, 6595, 31118, 4, 48530, 4, 4825, 4291, 4, 49600, 4, 1020, 4, 47340, 4, 118, 16187, 6988, 4, 100, 3765, 260, 6988, 4, 100, 3765, 4636, 131, 50118, 50267, 940, 507, 38, 3765, 260, 6988, 4, 100, 3765, 4636, 6814, 100, 3765, 4636, 131, 50118, 50267, 38, 3765, 260, 6988, 4, 100, 3765, 4636, 4, 21134, 6905, 6, 50118, 50267, 38, 3765, 4636, 6814, 100, 3765, 4636, 6, 50118, 50267, 42, 4, 43234, 100, 3765, 4636, 5457, 6814, 100, 3765, 4636, 131, 50118, 50267, 285, 38, 3765, 4636, 120, 48398, 100, 3765, 4636, 43048, 25522, 50118, 50267, 671, 6814, 100, 3765, 4636, 131, 50118, 50267, 35524, 50118, 1437, 2, 1437, 50267, 1546, 46291, 49602, 4, 1020, 47062, 49196, 50118, 1437, 2, 1437, 50267, 6595, 31118, 4, 48530, 4, 4825, 4291, 4, 49600, 4, 90,

In [32]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir = './models/{}/{}'.format(data_name,model_name_suffix),
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    per_device_eval_batch_size= batch_size,
    save_total_limit = 2,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model ='auc',
    evaluation_strategy = "epoch",
    eval_steps=4,
    disable_tqdm = False,
    warmup_steps=warmpup_steps,
    logging_steps = 4,
    remove_unused_columns=False,
    report_to="wandb",
    fp16 = False,
    logging_dir= './models/{}/{}/logs/'.format(data_name,model_name_suffix),
    dataloader_num_workers = 0,
    max_steps=train_steps)

In [33]:
from transformers.optimization import Adafactor

if optim == 'adamw':
    opt = torch.optim.AdamW(model.parameters(),lr=init_lr,betas=(0.9, 0.999), eps=adam_eps, weight_decay=weight_decay)
elif optim == 'adafactor':
    opt = Adafactor(model.parameters(), lr=init_lr, relative_step=False, warmup_init=False)
scheduling_types = ['warmpup_anneal', 'warmup','constant']
scheduling_type = scheduling_types[1]

if scheduling_type == scheduling_types[0]:
    lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=opt,
        max_lr=init_lr,
        pct_start=training_args.warmup_steps / training_args.max_steps,
        anneal_strategy="linear",
        total_steps=training_args.max_steps
    )
elif scheduling_type == scheduling_types[1]:
    lr_scheduler = transformers.get_constant_schedule_with_warmup(opt,training_args.warmup_steps)
else:
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(opt,lambda epoch: init_lr)

In [34]:
import wandb
# Login with your authentication key

wandb.login()
training_hyper_params = {
    "model_name": model_name_suffix,
    "optimizer": optim,
    "base_lr": init_lr,
    "weight_decay": weight_decay,
    "warmpup_factor":warmpup_factor,
    "warmpup_steps": warmpup_steps,
    "batch_size": batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "seq_len": max_commit_code_length,
    "epochs": epochs,
    "include_commit_msg":include_message,
    "trained_encoder_layers":trained_encoder_layers if not use_lora else -1,
    "dataset_version":dataset_version_name
}
wandb.init(project=data_name,name='{}/{}'.format(data_name,model_name_suffix),config=training_hyper_params)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myaseralosh[0m ([33mjit_defect[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [35]:
#adam_opt = AdamW(model.parameters(),lr=5e-5,betas=[0.9,0.999],weight_decay=0.01)
trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset.with_format("torch"),
        eval_dataset =valid_dataset.with_format("torch"),
        data_collator=data_collator,
        tokenizer=tokenizer,
        optimizers = (opt,lr_scheduler)
    )

In [36]:
print("One Epoch total steps: {}".format(train_steps / epochs))
train_res = trainer.train()

One Epoch total steps: 703.0


  0%|          | 0/7030 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.7254, 'learning_rate': 1.1383039271485487e-06, 'epoch': 0.0}
{'loss': 0.7015, 'learning_rate': 2.2766078542970975e-06, 'epoch': 0.0}
{'loss': 0.7041, 'learning_rate': 3.414911781445646e-06, 'epoch': 0.0}
{'loss': 0.6959, 'learning_rate': 4.553215708594195e-06, 'epoch': 0.0}
{'loss': 0.6945, 'learning_rate': 5.6915196357427435e-06, 'epoch': 0.0}
{'loss': 0.7155, 'learning_rate': 6.829823562891292e-06, 'epoch': 0.0}
{'loss': 0.7147, 'learning_rate': 7.96812749003984e-06, 'epoch': 0.0}
{'loss': 0.7102, 'learning_rate': 9.10643141718839e-06, 'epoch': 0.0}
{'loss': 0.7168, 'learning_rate': 1.024473534433694e-05, 'epoch': 0.01}
{'loss': 0.7011, 'learning_rate': 1.1383039271485487e-05, 'epoch': 0.01}
{'loss': 0.7114, 'learning_rate': 1.2521343198634035e-05, 'epoch': 0.01}
{'loss': 0.7101, 'learning_rate': 1.3659647125782584e-05, 'epoch': 0.01}
{'loss': 0.7114, 'learning_rate': 1.4797951052931132e-05, 'epoch': 0.01}
{'loss': 0.7173, 'learning_rate': 1.593625498007968e-05, 'epoch': 0

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.6067043542861938, 'eval_accuracy': 0.6786743515850144, 'eval_f1': 0.7078092243186582, 'eval_precision': 0.6489668428640077, 'eval_recall': 0.7783861671469741, 'eval_matthews_correlation': 0.36467376976184873, 'eval_auc': 0.6786743515850144, 'eval_runtime': 141.3914, 'eval_samples_per_second': 49.084, 'eval_steps_per_second': 3.069, 'epoch': 0.1}
{'loss': 0.5465, 'learning_rate': 0.00020034149117814456, 'epoch': 1.0}
{'loss': 0.5472, 'learning_rate': 0.00020147979510529312, 'epoch': 1.0}
{'loss': 0.6219, 'learning_rate': 0.00020261809903244165, 'epoch': 1.0}
{'loss': 0.5944, 'learning_rate': 0.0002037564029595902, 'epoch': 1.0}
{'loss': 0.5817, 'learning_rate': 0.00020489470688673874, 'epoch': 1.0}
{'loss': 0.6119, 'learning_rate': 0.0002060330108138873, 'epoch': 1.0}
{'loss': 0.5674, 'learning_rate': 0.00020717131474103586, 'epoch': 1.0}
{'loss': 0.5711, 'learning_rate': 0.0002083096186681844, 'epoch': 1.0}
{'loss': 0.5735, 'learning_rate': 0.00020944792259533295, 'epoc

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.5697055459022522, 'eval_accuracy': 0.7054755043227666, 'eval_f1': 0.7426340972047344, 'eval_precision': 0.6594364937388193, 'eval_recall': 0.8498559077809799, 'eval_matthews_correlation': 0.4292359123529265, 'eval_auc': 0.7054755043227666, 'eval_runtime': 141.1519, 'eval_samples_per_second': 49.167, 'eval_steps_per_second': 3.075, 'epoch': 1.1}
{'loss': 0.4916, 'learning_rate': 0.0003995446784291406, 'epoch': 2.0}
{'loss': 0.4805, 'learning_rate': 0.0004006829823562891, 'epoch': 2.0}
{'loss': 0.6, 'learning_rate': 0.00040182128628343767, 'epoch': 2.0}
{'loss': 0.5626, 'learning_rate': 0.00040295959021058623, 'epoch': 2.0}
{'loss': 0.5761, 'learning_rate': 0.0004040978941377348, 'epoch': 2.0}
{'loss': 0.5667, 'learning_rate': 0.0004052361980648833, 'epoch': 2.0}
{'loss': 0.5279, 'learning_rate': 0.0004063745019920319, 'epoch': 2.0}
{'loss': 0.5554, 'learning_rate': 0.0004075128059191804, 'epoch': 2.0}
{'loss': 0.5589, 'learning_rate': 0.000408651109846329, 'epoch': 2.0}


  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.5322141051292419, 'eval_accuracy': 0.7474063400576368, 'eval_f1': 0.7853031230863441, 'eval_precision': 0.6828541001064963, 'eval_recall': 0.9239193083573487, 'eval_matthews_correlation': 0.5288643881925791, 'eval_auc': 0.747406340057637, 'eval_runtime': 141.308, 'eval_samples_per_second': 49.113, 'eval_steps_per_second': 3.071, 'epoch': 2.1}
{'loss': 0.4637, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.4136, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.5086, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.5002, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.4887, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.5516, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.5058, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.5064, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.4648, 'learning_rate': 0.0005, 'epoch': 3.0}
{'loss': 0.5588, 'learning_rate': 0.0005, 'epoch': 3.01}
{'loss': 0.515, 'learning_rate': 0.0005, 'epoch': 3.01}
{'loss': 0.4942, 'lear

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.48480120301246643, 'eval_accuracy': 0.7753602305475504, 'eval_f1': 0.7852912821925354, 'eval_precision': 0.7520443154840412, 'eval_recall': 0.8216138328530259, 'eval_matthews_correlation': 0.5530921116959865, 'eval_auc': 0.7753602305475504, 'eval_runtime': 141.5785, 'eval_samples_per_second': 49.019, 'eval_steps_per_second': 3.065, 'epoch': 3.1}
{'loss': 0.4211, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4402, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4913, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4781, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4771, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4836, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4612, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.5225, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4511, 'learning_rate': 0.0005, 'epoch': 4.0}
{'loss': 0.4824, 'learning_rate': 0.0005, 'epoch': 4.01}
{'loss': 0.542, 'learning_rate': 0.0005, 'epoch': 4.01}
{'loss': 0.4615, 'l

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.5316388607025146, 'eval_accuracy': 0.7625360230547551, 'eval_f1': 0.7929127921588338, 'eval_precision': 0.7029857397504456, 'eval_recall': 0.909221902017291, 'eval_matthews_correlation': 0.5492394398958639, 'eval_auc': 0.7625360230547551, 'eval_runtime': 141.9362, 'eval_samples_per_second': 48.895, 'eval_steps_per_second': 3.058, 'epoch': 4.1}
{'loss': 0.4044, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4443, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4109, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.5003, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4526, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4592, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4923, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4944, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4553, 'learning_rate': 0.0005, 'epoch': 5.0}
{'loss': 0.4719, 'learning_rate': 0.0005, 'epoch': 5.01}
{'loss': 0.5096, 'learning_rate': 0.0005, 'epoch': 5.01}
{'loss': 0.4453, 'le

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.49957406520843506, 'eval_accuracy': 0.7789625360230548, 'eval_f1': 0.7974650118827569, 'eval_precision': 0.7358674463937622, 'eval_recall': 0.8703170028818443, 'eval_matthews_correlation': 0.5674773913705552, 'eval_auc': 0.7789625360230548, 'eval_runtime': 141.9701, 'eval_samples_per_second': 48.884, 'eval_steps_per_second': 3.057, 'epoch': 5.1}
{'loss': 0.4625, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.3707, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.5367, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.4057, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.4995, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.5017, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.4819, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.4559, 'learning_rate': 0.0005, 'epoch': 6.0}
{'loss': 0.4745, 'learning_rate': 0.0005, 'epoch': 6.01}
{'loss': 0.4844, 'learning_rate': 0.0005, 'epoch': 6.01}
{'loss': 0.4555, 'learning_rate': 0.0005, 'epoch': 6.01}
{'loss': 0.5169, 

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.49980756640434265, 'eval_accuracy': 0.7757925072046109, 'eval_f1': 0.7949393779652083, 'eval_precision': 0.7323943661971831, 'eval_recall': 0.869164265129683, 'eval_matthews_correlation': 0.5614618568129566, 'eval_auc': 0.775792507204611, 'eval_runtime': 141.6751, 'eval_samples_per_second': 48.985, 'eval_steps_per_second': 3.063, 'epoch': 6.1}
{'loss': 0.381, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.373, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.4834, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.4768, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.46, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.5106, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.468, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.4704, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.4452, 'learning_rate': 0.0005, 'epoch': 7.0}
{'loss': 0.5195, 'learning_rate': 0.0005, 'epoch': 7.01}
{'loss': 0.4939, 'learning_rate': 0.0005, 'epoch': 7.01}
{'loss': 0.4467, 'learnin

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.49713948369026184, 'eval_accuracy': 0.7799711815561959, 'eval_f1': 0.7931735067045915, 'eval_precision': 0.7482749808331204, 'eval_recall': 0.8438040345821326, 'eval_matthews_correlation': 0.5645620301901673, 'eval_auc': 0.779971181556196, 'eval_runtime': 140.979, 'eval_samples_per_second': 49.227, 'eval_steps_per_second': 3.078, 'epoch': 7.1}
{'loss': 0.4081, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.3421, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.4798, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.4348, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.4128, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.501, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.469, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.4531, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.4305, 'learning_rate': 0.0005, 'epoch': 8.0}
{'loss': 0.5501, 'learning_rate': 0.0005, 'epoch': 8.01}
{'loss': 0.4448, 'learning_rate': 0.0005, 'epoch': 8.01}
{'loss': 0.4456, 'lear

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.5449955463409424, 'eval_accuracy': 0.7662824207492795, 'eval_f1': 0.7967418546365914, 'eval_precision': 0.7048780487804878, 'eval_recall': 0.9161383285302593, 'eval_matthews_correlation': 0.5582266411024245, 'eval_auc': 0.7662824207492795, 'eval_runtime': 141.2731, 'eval_samples_per_second': 49.125, 'eval_steps_per_second': 3.072, 'epoch': 8.1}
{'loss': 0.3858, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.384, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4305, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4533, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4538, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4599, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4382, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4686, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4201, 'learning_rate': 0.0005, 'epoch': 9.0}
{'loss': 0.4818, 'learning_rate': 0.0005, 'epoch': 9.01}
{'loss': 0.453, 'learning_rate': 0.0005, 'epoch': 9.01}
{'loss': 0.4293, 'lea

  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.5484501719474792, 'eval_accuracy': 0.7755043227665706, 'eval_f1': 0.8015792154865004, 'eval_precision': 0.7181652213601095, 'eval_recall': 0.9069164265129683, 'eval_matthews_correlation': 0.5710858915916953, 'eval_auc': 0.7755043227665707, 'eval_runtime': 140.8109, 'eval_samples_per_second': 49.286, 'eval_steps_per_second': 3.082, 'epoch': 9.1}
{'loss': 0.3467, 'learning_rate': 0.0005, 'epoch': 10.0}
{'loss': 0.4065, 'learning_rate': 0.0005, 'epoch': 10.0}
{'loss': 0.3958, 'learning_rate': 0.0005, 'epoch': 10.0}
{'loss': 0.4631, 'learning_rate': 0.0005, 'epoch': 10.0}
{'loss': 0.4223, 'learning_rate': 0.0005, 'epoch': 10.0}
{'loss': 0.4253, 'learning_rate': 0.0005, 'epoch': 10.0}


  0%|          | 0/434 [00:00<?, ?it/s]

{'eval_loss': 0.5414459705352783, 'eval_accuracy': 0.77492795389049, 'eval_f1': 0.8001535312180142, 'eval_precision': 0.7195121951219512, 'eval_recall': 0.9011527377521614, 'eval_matthews_correlation': 0.5682618796241644, 'eval_auc': 0.7749279538904899, 'eval_runtime': 140.5753, 'eval_samples_per_second': 49.369, 'eval_steps_per_second': 3.087, 'epoch': 10.0}
{'train_runtime': 24742.4251, 'train_samples_per_second': 18.184, 'train_steps_per_second': 0.284, 'train_loss': 0.4874733863989286, 'epoch': 10.0}


In [None]:
if print_for_debug:
    print(trainer.state.best_model_checkpoint)

## Evaluate and Predict

In [36]:
testing=True
if testing:
    test_dataset = test_dataset.map(encode,load_from_cache_file=False, batch_size=batch_size,batched=True, remove_columns=columns_to_remove)  
    trainer.model = AutoModelForSequenceClassification.from_pretrained(trainer.state.best_model_checkpoint).to('cuda') #
    test_res = trainer.evaluate(eval_dataset=test_dataset.with_format("torch"))
    print(test_res)

Map:   0%|          | 0/7526 [00:00<?, ? examples/s]

  0%|          | 0/941 [00:00<?, ?it/s]

{'eval_loss': 0.5461655855178833, 'eval_accuracy': 0.7499335636460271, 'eval_f1': 0.5639481000926784, 'eval_precision': 0.42433751743375175, 'eval_recall': 0.8404696132596685, 'eval_matthews_correlation': 0.4617012156429259, 'eval_auc': 0.7844171034379949, 'eval_runtime': 75.1986, 'eval_samples_per_second': 100.082, 'eval_steps_per_second': 12.514, 'epoch': 10.01}


In [40]:
wandb.finish()

0,1
eval/accuracy,▁▅▇▇████▇█▇▆
eval/auc,▁▅▇▇█▇▇█▇█▇█
eval/f1,▅▆█████████▁
eval/loss,█▄▁▁▂▂▂▁▃▂▄▄
eval/matthews_correlation,▁▅█████████▅
eval/precision,▅▇▇▇▇▇▇█▇█▇▁
eval/recall,▆▁██▇█▆▅█▅█▅
eval/runtime,▁▁▁▁▁▁▁▁▁▁▁█
eval/samples_per_second,▇██▇███▇▇▇█▁
eval/steps_per_second,▇██▇███▇▇▇█▁

0,1
eval/accuracy,0.74993
eval/auc,0.78442
eval/f1,0.56395
eval/loss,0.54617
eval/matthews_correlation,0.4617
eval/precision,0.42434
eval/recall,0.84047
eval/runtime,75.1986
eval/samples_per_second,100.082
eval/steps_per_second,12.514
