In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import clear_output

In [2]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import T5Config
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
from transformers import set_seed

In [3]:
from datetime import datetime
import argparse
import os
import sys

sys.path.append("./hf_transformers/")

In [4]:
import torch

from data_reader import GetDataAsPython
from prepare_data import create_data
from prepare_data import create_dataset
from prepare_data import extract_warning_types
from utils import boolean_string
from utils import get_current_time

In [34]:
import torch

from data_reader import GetDataAsPython
from prepare_data import create_data
from prepare_data import create_dataset
from prepare_data import extract_warning_types
from utils import boolean_string
from utils import get_current_time
import csv

In [6]:
storage_directory = './storage/'

In [7]:
import random

In [8]:
exec_number = random.randint(0, 1000)
exec_number

985

In [31]:
repo = '/data/all/data/svgdotjs/svg.js'
repo

'/data/all/data/svgdotjs/svg.js'

In [32]:
sample_percent = 0.3
sample_percent

0.3

In [35]:
name='tuned'
name

'tuned'

In [36]:
# Read and prepare data
data = GetDataAsPython(f"{storage_directory}/data_and_models/data/data_autofix_tracking_repo_specific_final.json")
data_eslint = GetDataAsPython(f"{storage_directory}/data_and_models/data/data_autofix_tracking_eslint_final.json")
data += data_eslint

In [37]:
len(data)

104804

In [38]:
all_warning_types = extract_warning_types(data)

In [39]:
(train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels, train_info, val_info, test_info, ) =\
    create_data(data, all_warning_types, include_warning=True, design='repo-based-included', select_repo=repo)

splitting by : repo-based-included
train size: 105
val size: 7
test size: 37


In [40]:
load_model = f'./{storage_directory}/checkpoint-37375'
tokenizer = T5Tokenizer.from_pretrained(load_model)


In [41]:
len(train_inputs)

105

In [42]:
# Create dataset required by pytorch
samples = int(sample_percent * len(train_inputs))
print(f'{len(train_inputs)} {samples} {sample_percent}')
train_dataset = create_dataset(train_inputs[:samples], train_labels[:samples], tokenizer, pad_truncate=True, max_length=128)
val_dataset = create_dataset(val_inputs[:samples], val_labels[:samples], tokenizer, pad_truncate=True)

105 31 0.3


In [61]:
now = datetime.now()
test_result_directory = f'{storage_directory}/fine-tune-result'
full_name = f'{name}_{exec_number}_{repo.rsplit("/", 1)[1][-20:]}_{sample_percent}'
model_directory = f'{storage_directory}/tmp/{full_name}'
model_directory

'./storage//tmp/tuned_985_svg.js_0.3'

In [62]:
# Define an objective function to be minimized.
def objective(trial):
    
    clear_output()
    # Invoke suggest methods of a Trial object to generate hyperparameters.
    lr = trial.suggest_float('lr', 1e-6, 1e-2, log=True)
    wd = trial.suggest_float('wd', 0, 0.9)
    ws = trial.suggest_float('ws', 0, 2000)
    
    model = T5ForConditionalGeneration.from_pretrained(load_model)
    
    model.resize_token_embeddings(len(tokenizer))
    
    training_args = Seq2SeqTrainingArguments(
        output_dir=model_directory,
        num_train_epochs=15,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=ws,
        weight_decay=wd,
        logging_dir=model_directory,
        logging_steps=100,
        do_eval=True,
        evaluation_strategy="epoch",
        learning_rate=lr,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=1,
        eval_accumulation_steps=1,  # set this lower, if testing or validation crashes
        disable_tqdm=False,
        predict_with_generate=True,  # never set this to false.
        seed=42,  # default value
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=[torch.optim.Adam(params=model.parameters(), lr=lr), None],
        tokenizer=tokenizer,
    )
    
    trainer.train()
    
    return trainer.evaluate()['eval_loss']  # An objective value linked with the Trial object.

In [63]:
import optuna
study = optuna.create_study() 
study.optimize(objective, n_trials=100) 

[32m[I 2021-11-16 12:43:26,671][0m A new study created in memory with name: no-name-9e828f08-dd0b-45bb-b186-29a461b82c15[0m


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,0.062873,0.0237,295.713
2,No log,0.062862,0.0234,298.527
3,No log,0.062844,0.0228,307.388
4,No log,0.062816,0.0233,300.818
5,No log,0.062781,0.0241,290.588
6,No log,0.062738,0.0231,303.58
7,No log,0.062687,0.0229,305.392
8,No log,0.062627,0.0234,298.779
9,No log,0.062558,0.0225,311.523
10,No log,0.062483,0.0227,307.807


[32m[I 2021-11-16 12:43:48,563][0m Trial 0 finished with value: 0.0619756281375885 and parameters: {'lr': 2.2647922891592956e-05, 'wd': 0.373997251974964, 'ws': 1116.428316311118}. Best is trial 0 with value: 0.0619756281375885.[0m


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,0.062875,0.0239,293.033
2,No log,0.062874,0.0224,311.834
3,No log,0.062873,0.0232,301.566
4,No log,0.062872,0.0228,306.986
5,No log,0.062871,0.022,317.757
6,No log,0.062869,0.0229,306.316
7,No log,0.062866,0.0228,307.246
8,No log,0.062863,0.0227,307.974
9,No log,0.06286,0.0224,311.871
10,No log,0.062856,0.0393,178.13


[32m[I 2021-11-16 12:44:09,589][0m Trial 1 finished with value: 0.06282969564199448 and parameters: {'lr': 1.9204962126438614e-06, 'wd': 0.5472255947178908, 'ws': 1688.974805502949}. Best is trial 0 with value: 0.0619756281375885.[0m


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,0.062836,0.0247,283.659
2,No log,0.062616,0.0223,313.522
3,No log,0.062234,0.0237,294.807
4,No log,0.061688,0.0231,302.922
5,No log,0.060988,0.0227,308.651
6,No log,0.060147,0.0229,306.278
7,No log,0.059163,0.0227,307.881
8,No log,0.058039,0.0231,303.317
9,No log,0.0568,0.0241,290.177
10,No log,0.055522,0.0248,281.791


[32m[I 2021-11-16 12:44:30,930][0m Trial 2 finished with value: 0.04834530130028725 and parameters: {'lr': 0.00044404090754716056, 'wd': 0.5818183811812224, 'ws': 1069.8045009257423}. Best is trial 2 with value: 0.04834530130028725.[0m


In [64]:
lr = study.best_params['lr']
wd = study.best_params['wd']
ws = study.best_params['ws']
lr, wd, ws

(0.00044404090754716056, 0.5818183811812224, 1069.8045009257423)

In [65]:
tokenizer = T5Tokenizer.from_pretrained(load_model)
model = T5ForConditionalGeneration.from_pretrained(load_model)
model.resize_token_embeddings(len(tokenizer))
# model.eval()

Embedding(32104, 512)

In [66]:
training_args = Seq2SeqTrainingArguments(
    output_dir=model_directory,
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=ws,
    weight_decay=wd,
    logging_dir=model_directory,
    logging_steps=100,
    do_eval=True,
    evaluation_strategy="epoch",
    learning_rate=lr,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=1,
    eval_accumulation_steps=1,  # set this lower, if testing or validation crashes
    disable_tqdm=False,
    predict_with_generate=True,  # never set this to false.
    seed=42,  # default value
)

In [67]:
from sklearn.metrics import accuracy_score
import numpy as np
def compute_metrics(p):
    target_max_length = 256
    predictions, labels = p.predictions, p.label_ids
    
    predictions = np.pad(predictions, ((0, 0), (0, target_max_length - predictions.shape[1])), mode="constant")
    predictions = np.delete(predictions, 0, axis=1)
    predictions = np.insert(predictions, target_max_length - 1, 0, axis=1)

    

    labels = np.array(labels)
    labels = np.pad(labels, ((0, 0), (0, target_max_length - labels.shape[1])), mode="constant")
    labels = np.delete(labels, 0, axis=1)
    labels = np.insert(labels, target_max_length - 1, 0, axis=1)
    

    correct_counter = np.sum(np.all(np.equal(labels, predictions), axis=1))
    return {'acc': int(correct_counter)}

In [68]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=[torch.optim.Adam(params=model.parameters(), lr=lr), None],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [69]:
trainer.train()

Epoch,Training Loss,Validation Loss,Acc,Runtime,Samples Per Second
1,No log,0.062836,0,0.1997,35.061
2,No log,0.062616,0,0.197,35.526
3,No log,0.062234,0,0.1926,36.349
4,No log,0.061688,0,0.197,35.525
5,No log,0.060988,0,0.1951,35.872
6,No log,0.060147,0,0.1958,35.745
7,No log,0.059163,0,0.1943,36.02
8,No log,0.058039,0,0.1923,36.405
9,No log,0.0568,0,0.1924,36.382
10,No log,0.055522,0,0.1916,36.531


TrainOutput(global_step=60, training_loss=0.10599960486094157, metrics={'train_runtime': 43.5513, 'train_samples_per_second': 1.378, 'total_flos': 25991996405760, 'epoch': 30.0})

In [73]:
trainer.evaluate()['eval_loss']

0.03424501046538353

In [77]:
best_model_dir = f'{model_directory}/best'
trainer.save_model(best_model_dir)

In [78]:
!python hf_transformers/tfix_testing.py --load-model $best_model_dir -bs 16 --model-name t5-small -d repo-based-included -r $repo


start time:  12:57:11
['no-invalid-this', 'no-throw-literal', 'no-new-wrappers', 'guard-for-in', 'no-new-object', 'comma-style', 'prefer-spread', 'no-caller', 'no-extra-bind', 'no-array-constructor', 'prefer-rest-params', 'generator-star-spacing', 'no-this-before-super', 'no-extend-native', 'no-undef', 'no-useless-escape', 'no-dupe-keys', 'no-console', 'no-constant-condition', 'no-duplicate-case', 'no-empty', 'no-extra-semi', 'no-redeclare', 'no-cond-assign', 'no-extra-boolean-cast', 'no-fallthrough', 'no-unreachable', 'valid-typeof', 'no-unsafe-finally', 'no-unused-vars', 'no-debugger', 'no-unsafe-negation', 'no-case-declarations', 'no-self-assign', 'no-process-exit', 'no-inner-declarations', 'for-direction', 'no-compare-neg-zero', 'no-sparse-arrays', 'no-func-assign', 'no-const-assign', 'no-global-assign', 'use-isnan', 'no-unused-labels', 'require-yield', 'getter-return', 'no-dupe-class-members', 'no-ex-assign', 'constructor-super', 'no-new-symbol', 'no-empty-pattern', 'no-class-assi