In [None]:
!pip install transformers

In [None]:
import json
import glob
import pandas as pd
import torch
import random 
import numpy as np
import tqdm 

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback

In [None]:
# Tokenizer for the `microsoft/codebert-base` model id.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
# Two-label sequence classifier read from a local checkpoint directory. The
# training log further down reports saving to this same path, so the directory
# has to exist before this cell is run.
model = AutoModelForSequenceClassification.from_pretrained("results/checkpoint-6184", num_labels=2)

In [None]:
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into classification metrics.

    Args:
        eval_pred: two-item sequence ``(logits, labels)``; the predicted class
            of every row is the arg-max over the last axis of ``logits``.

    Returns:
        dict with keys ``"f1"``, ``"acc"``, ``"prec"`` and ``"recall"``.
        F1, precision and recall are support-weighted averages over classes.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the numeric result of the default ("warn" scores an
    # undefined term as 0) but stops scikit-learn from emitting an
    # UndefinedMetricWarning whenever a class is never predicted.
    weighted = {"average": 'weighted', "zero_division": 0}
    return {
        "f1": f1_score(labels, predictions, **weighted),
        "acc": accuracy_score(labels, predictions),
        "prec": precision_score(labels, predictions, **weighted),
        "recall": recall_score(labels, predictions, **weighted),
    }

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over tokenizer encodings and their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable collection with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label collection defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, in the mapping's own order ...
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # ... followed by the label, coerced to int before becoming a tensor.
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

In [None]:
# Build `dataset`: issue_number -> {"body", "title", "commits"}, keeping only
# issues whose commits reference at least one Python source file.
dataset = {}
with open('drive/MyDrive/bug_issues.jsonl', 'r') as issues_file:
  for line in issues_file:
    if not line.strip():
      # Tolerate blank lines (e.g. a trailing newline at end of file).
      continue
    json_data = json.loads(line)
    # Distinct ".py" paths over all commits of this issue.
    # The last element of each commit entry is iterated as a collection of
    # paths -- presumably the files the commit touched; confirm against the
    # JSONL schema.
    py_sources = set()
    for commit in json_data["commits"]:
      for source in commit[-1]:
        # Suffix test: a substring test would also accept ".pyc", ".pyi", ...
        if source.endswith(".py"):
          py_sources.add(source)
    if py_sources:
      dataset[json_data['issue_number']] = {
        "body": json_data["body"],
        "title": json_data["title"],
        "commits": list(py_sources),
      }

In [None]:
# Issue-level 90/10 split. Seeding the global RNG right before the shuffle makes
# the split (and any later `random` draws on a Restart & Run All) repeatable.
# NOTE(review): the checkpoint loaded earlier was presumably trained on a
# different shuffle, so some issues in `test_issues` may have been seen during
# its training -- confirm before reporting numbers.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
dataset_keys = list(dataset.keys())
random.shuffle(dataset_keys)
split_at = int(len(dataset_keys) * 0.9)
train_issues = dataset_keys[:split_at]
test_issues = dataset_keys[split_at:]

In [None]:
# Pool of Python sources in the checkout, used for negative sampling.
# Paths must *end* in ".py": a substring test would also admit ".pyc", ".pyi"
# or directories containing ".py", which the later `open(...).read()` cannot
# handle. Sorted because glob order is not guaranteed, so seeded sampling from
# this list would otherwise not be repeatable.
all_py_files = sorted(
    path
    for path in glob.glob("zulip-main/**", recursive=True)
    if path.endswith(".py")
)

In [None]:
test_issues

In [None]:
train_pairs = []
train_labels = []
test_pairs = {}
test_labels = {}

# Training-pair construction, disabled in this state of the notebook (only the
# held-out issues are materialised below). Kept as a record of how the
# positive/negative training pairs were formed.
# pb = tqdm.tqdm_notebook(range(len(train_issues)))
# for issue in train_issues:
#   text = dataset[issue]["title"]+dataset[issue]["body"]
#   for f in dataset[issue]["commits"]:
#     try:
#       code = open(f'zulip-main/{f}', 'r').read()
#       train_pairs.append(f'{text} </s> {code}')
#       train_labels.append(1)
#     except:
#       continue 
#   count = 0
#   while count!=2:
#     s = random.sample(all_py_files, 1)[0]
#     if s not in dataset[issue]['commits']:
#       code = open(f'{s}', 'r').read()
#       train_pairs.append(f'{text} </s> {code}')
#       train_labels.append(0)
#       count+=1 

# Number of randomly drawn non-fix files paired with every held-out issue.
NEGATIVES_PER_ISSUE = 5

# For every held-out issue build "<title+body> </s> <file contents>" strings:
# label 1 for files touched by the issue's commits, label 0 for random others.
for issue in test_issues:
  # `or ""` keeps the concatenation working if a record's body is None.
  text = dataset[issue]["title"]+(dataset[issue]["body"] or "")
  test_pairs[issue] = []
  test_labels[issue] = []

  # Positives. Commit paths are repo-relative, so they are prefixed with the
  # checkout directory; the prefixed form is also what the negative filter
  # below has to compare against.
  positive_paths = set()
  for f in dataset[issue]["commits"]:
    path = f'zulip-main/{f}'
    positive_paths.add(path)
    try:
      with open(path, 'r') as source_file:
        code = source_file.read()
    except (OSError, UnicodeDecodeError):
      # File is missing from (or unreadable in) this checkout: skip it.
      continue
    test_pairs[issue].append(f'{text} </s> {code}')
    test_labels[issue].append(1)

  # Negatives. Entries of `all_py_files` carry the "zulip-main/" prefix, so
  # they are filtered against `positive_paths` (not the repo-relative commit
  # paths, which would never match and could label a fix file as 0).
  candidates = [p for p in all_py_files if p not in positive_paths]
  negatives_added = 0
  # A full random permutation gives distinct negatives and terminates even if
  # fewer than NEGATIVES_PER_ISSUE readable candidates exist.
  for s in random.sample(candidates, len(candidates)):
    if negatives_added == NEGATIVES_PER_ISSUE:
      break
    try:
      with open(s, 'r') as source_file:
        code = source_file.read()
    except (OSError, UnicodeDecodeError):
      continue
    test_pairs[issue].append(f'{text} </s> {code}')
    test_labels[issue].append(0)
    negatives_added += 1

In [None]:
# Pair count per held-out issue, listed in `test_issues` order.
lens = list(map(len, (test_pairs[issue_id] for issue_id in test_issues)))

In [None]:
lens

In [None]:
test_pairs.keys()

dict_keys([20264, 6507, 13477, 17111, 7197, 6320, 3626, 4757, 11214, 7195, 5177, 16586, 22817, 5190, 3660, 3939, 2727, 3592, 1276, 10131, 7406, 5209, 17408, 6978, 9792, 13340, 14770, 16793, 19588, 10947, 12878, 18305, 4733, 9240, 16066, 12152, 19371, 2039, 729, 5389, 10379, 14111, 8959, 432, 2150, 3974, 20759, 20595, 11290, 2465, 1212, 16164, 12150, 7387, 9834, 5947, 18795, 1553, 3210, 4084, 6845, 17922, 8000, 784, 16850, 10991, 9913, 13533, 4000, 9866, 13060, 320, 2052, 1300, 7021, 16284, 12132, 10783, 3448, 20980, 17102, 1861, 19838, 10639, 15836, 19287, 9430, 9057, 10509, 5655, 8145, 20017, 17435, 13959, 15951, 12323, 499, 4742, 7441, 6959, 15307, 4557, 1084, 2308, 6896, 6985, 5184, 18599, 275, 13854, 6720, 2038, 11824, 13082, 23276, 22284, 11018, 4750, 5544, 4580, 12056, 19468, 13583, 9678, 18493, 7617, 11063, 7396, 2195, 878, 2232, 8582])

In [None]:
# Spot-check the labels of one held-out issue: the pair-building loop appends
# the positives (1) first and the sampled negatives (0) after them.
# NOTE(review): the issue id is hard-coded; it is only a valid key when the
# random split happens to place it in `test_issues`.
test_labels[7197]

[1, 1, 1, 1, 0, 0, 0, 0, 0]

In [None]:
def get_dataset(texts, labels):
    """Tokenize `texts` and bundle the result with `labels`.

    The module-level `tokenizer` is called with truncation and padding switched
    on, and its output is wrapped together with `labels` in a `CustomDataset`.

    NOTE(review): the inputs built in this notebook place the file contents
    after the issue text; with truncation enabled a long issue text may
    presumably leave little or none of the code in the encoded sequence --
    worth confirming against the tokenizer's maximum length.
    """
    return CustomDataset(tokenizer(texts, padding=True, truncation=True), labels)

In [None]:
#train_dataset = get_dataset(train_pairs, train_labels)
# Issue whose pairs are encoded for inspection. 7197 is only present when the
# random split put it into the held-out set, so fall back to the first held-out
# issue instead of failing with a KeyError.
eval_issue = 7197 if 7197 in test_pairs else test_issues[0]
test_dataset = get_dataset(test_pairs[eval_issue], test_labels[eval_issue])

In [None]:
# Hyper-parameters shared by the Trainer instances below.
training_args = TrainingArguments(
    output_dir='results/',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=1e-4,
    logging_dir='logs/',
    logging_steps=200,
    # Save and evaluate once per epoch; the best checkpoint by eval_loss is
    # reloaded when training ends.
    save_strategy='epoch',
    save_total_limit=3,
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    metric_for_best_model='eval_loss',
    load_best_model_at_end=True,
    # Mixed precision needs a CUDA device; use full precision on CPU-only
    # runtimes instead of failing at construction time.
    fp16=cuda.is_available(),
    group_by_length=True,
)

In [None]:
# Evaluation-side trainer, used further down for `predict` on the held-out
# pairs. No `train_dataset` is attached: handing it the evaluation data would
# let a training run fit the very examples it is later scored on.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Using cuda_amp half precision backend


In [None]:
# Trainer wired to a separate training split, evaluated on `test_dataset`.
# NOTE(review): the only assignment of `train_dataset` in this notebook is
# commented out (in the cell that builds `test_dataset`), so on a fresh kernel
# this cell raises NameError until that line and the training-pair loop are
# restored.
ntrainer = Trainer(
    model=model,                         
    args=training_args,                 
    train_dataset=train_dataset,         
    eval_dataset=test_dataset,             
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

Using cuda_amp half precision backend


In [None]:
# Fine-tune only when `trainer` holds a training set that is distinct from its
# evaluation set; fitting on the evaluation data would invalidate every metric
# computed afterwards.
if trainer.train_dataset is not None and trainer.train_dataset is not trainer.eval_dataset:
    train_output = trainer.train()
else:
    train_output = None
    print("Skipping trainer.train(): no separate training set is attached to `trainer`.")
train_output

***** Running training *****
  Num examples = 6184
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 7730
  Number of trainable parameters = 124647170


Epoch,Training Loss,Validation Loss,F1,Acc
1,0.5173,0.725797,0.717863,0.739007
2,0.5177,0.754823,0.72491,0.744681
3,0.5358,0.821227,0.725075,0.730496
4,0.5116,0.733055,0.741186,0.748936


***** Running Evaluation *****
  Num examples = 705
  Batch size = 8
Saving model checkpoint to results/checkpoint-1546
Configuration saved in results/checkpoint-1546/config.json
Model weights saved in results/checkpoint-1546/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 705
  Batch size = 8
Saving model checkpoint to results/checkpoint-3092
Configuration saved in results/checkpoint-3092/config.json
Model weights saved in results/checkpoint-3092/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 705
  Batch size = 8
Saving model checkpoint to results/checkpoint-4638
Configuration saved in results/checkpoint-4638/config.json
Model weights saved in results/checkpoint-4638/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 705
  Batch size = 8
Saving model checkpoint to results/checkpoint-6184
Configuration saved in results/checkpoint-6184/config.json
Model weights saved in results/checkpoint-6184/pytorch_model.bin
Deleting older checkpoin

TrainOutput(global_step=6184, training_loss=0.5065170216529521, metrics={'train_runtime': 1367.1274, 'train_samples_per_second': 22.617, 'train_steps_per_second': 5.654, 'total_flos': 6508315065384960.0, 'train_loss': 0.5065170216529521, 'epoch': 4.0})

In [None]:
# Run inference on `test_dataset`. Later cells read `out.predictions` (one row
# of two raw scores per example) and `out.label_ids` (the gold labels).
out = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 9
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Copy the raw prediction scores into a torch tensor so torch ops (softmax in a
# later cell) can be applied to them.
preds = torch.tensor(out.predictions)

In [None]:
preds

tensor([[-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166],
        [-0.1245, -0.5166]], dtype=torch.float16)

In [None]:
import numpy as np

In [None]:
# Per-class probabilities for every example. The scores are float16 (see the
# dtype of the tensor printed above); they are cast to float32 first because
# half-precision softmax is not implemented on CPU in some PyTorch releases and
# loses precision where it is.
torch.nn.functional.softmax(preds.float(), dim=1)

In [None]:
len(out.label_ids)

705

In [None]:
# Score the predicted classes (arg-max over the two scores of each row) against
# the gold labels returned by `predict`. Comparing the gold labels with
# themselves, or with the per-issue `test_labels` dict, says nothing about the
# model and yields a trivial F1 of 1.0.
predicted_classes = np.argmax(out.predictions, axis=-1)
acc = accuracy_score(out.label_ids, predicted_classes)
f1 = f1_score(out.label_ids, predicted_classes, average='macro')

In [None]:
out

PredictionOutput(predictions=array([[-2.396 ,  2.396 ],
       [-2.398 ,  2.393 ],
       [-2.398 ,  2.396 ],
       ...,
       [-0.634 ,  0.403 ],
       [-1.453 ,  0.9067],
       [ 1.188 , -1.82  ]], dtype=float16), label_ids=array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 

In [None]:
f1

1.0

In [None]:
!cp -r results/checkpoint-6184 drive/MyDrive/