In [None]:
# Install Hugging Face Transformers into the runtime. The version is not pinned;
# the recorded output of this cell shows transformers-4.6.1 being downloaded.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 13.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 59.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 66.7MB/s 
Installing collec

In [None]:
import json
import torch
import random
import math
import numpy as np
import gc
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding

# Directory holding the '<smell>.texts' / '<smell>.labels' JSON files (see read_data).
data_directory = '/content/drive/MyDrive/datasets/'
# Path templates; the '{}' placeholder is filled with a per-run name via str.format
# when the TrainingArguments are built.
log_directory = '/content/drive/MyDrive/{}_fulldata_otherdata/logs'
output_directory = '/content/drive/MyDrive/{}_fulldata_otherdata/output/'
# Checkpoint identifier used for both the tokenizer and the classification model.
pretrained_model = 'microsoft/codebert-base'
#pretrained_model = 'microsoft/graphcodebert-base'
# Per-device batch size used for both training and evaluation.
batch_size = 16

class SmellDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-sample encodings with their labels.

    `encodings` is any mapping exposing `.items()` whose values can be indexed
    by sample position; `labels` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is driven by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one tensor per encoding field, then attach the label last.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class Method:
    """Plain record holding a method's source text and an associated count."""

    def __init__(self, text, count):
        self.text, self.count = text, count

def model_init():
    """Return a fresh sequence-classification model placed on the CUDA device.

    The weights are loaded from the module-level `pretrained_model` checkpoint.
    The function takes no arguments so it can be handed to `Trainer(model_init=...)`.
    """
    # nn.Module.cuda() moves the parameters and returns the module itself.
    return AutoModelForSequenceClassification.from_pretrained(pretrained_model).cuda()

def compute_metrics(p):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    `p` unpacks into (predictions, labels); the predicted class of each sample
    is the argmax over axis 1 of the predictions.
    Returns a dict keyed by "accuracy", "precision", "recall" and "f1".
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(y_true=references, y_pred=predicted) for name, scorer in scorers}

def hyperparam_objective(metrics):
    """Objective for the hyperparameter search: the evaluation loss."""
    eval_loss = metrics['eval_loss']
    return eval_loss

def find_hyperparams(train_data, val_data, tokenizer, smell):
    """Run a hyperparameter search for one smell and return the search result.

    Args:
        train_data: dataset used for training during each trial.
        val_data: dataset evaluated every 250 steps.
        tokenizer: tokenizer handed to the Trainer.
        smell: smell name; suffixed with '_hpx' to name the output/log directories.

    Returns:
        Whatever `Trainer.hyperparameter_search` returns for the run that
        minimises the evaluation loss.

    NOTE(review): `hp_space` is read from module scope but is not defined in the
    visible part of this notebook — confirm it exists before calling this.
    """
    run_name = smell + '_hpx'

    search_args = TrainingArguments(
        output_dir=output_directory.format(run_name),
        logging_dir=log_directory.format(run_name),
        evaluation_strategy='steps',
        eval_steps=250,
        num_train_epochs=2,
        warmup_steps=500,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
    )

    search_trainer = Trainer(
        model_init=model_init,
        args=search_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    best_run = search_trainer.hyperparameter_search(
        hp_space=hp_space,
        compute_objective=hyperparam_objective,
        direction='minimize',
    )
    return best_run

def read_data(smell):
    """Load the JSON-encoded texts and labels stored for `smell`.

    Reads `<data_directory><smell>.texts` and `<data_directory><smell>.labels`
    and returns the two decoded objects as a `(texts, labels)` tuple.
    """
    base_path = data_directory + smell

    with open(base_path + '.texts', 'r') as handle:
        texts = json.load(handle)

    with open(base_path + '.labels', 'r') as handle:
        labels = json.load(handle)

    return texts, labels

def split_data(texts, labels, smell, validation_size=0.3, max_size=60000):
    """Build training and validation splits for one code smell.

    Positive samples (label == 1) are kept as-is. Negative samples are filtered
    or bucketed depending on `smell`:

    * 'Empty catch clause': negatives are separated into those that contain a
      try/catch block and those that do not; equal numbers of both are kept.
    * 'Complex Method': negatives are ranked by a keyword count
      (if/else/for/while/case/switch/try/catch). Training negatives are drawn
      from the highest- and lowest-ranked methods, validation negatives from
      the ones in between.
    * 'Multifaceted Abstraction': only negatives with 5 to 15 (inclusive)
      occurrences of 'public'/'private'/'protected' are kept.
    * anything else: all negatives are kept.

    The training set holds as many negatives as positives. The validation set
    holds negatives in proportion to the positive/negative ratio observed in
    the input (filtered-out negatives still count towards that ratio).

    Args:
        texts: iterable of source-code strings.
        labels: iterable of labels aligned with `texts`; 1 marks a positive.
        smell: name of the smell, selects the negative-sampling strategy.
        validation_size: base validation fraction; it is multiplied by the
            positive/negative ratio to get the fraction of positives held out.
        max_size: cap used when deciding how many positives to keep.

    Returns:
        (texts_training, texts_validation, labels_training, labels_validation),
        each shuffled consistently with its counterpart.

    Raises:
        ZeroDivisionError: if the input contains no negative samples.
        ValueError: from random.sample / train_test_split when a bucket holds
            fewer samples than requested.
    """
    positive_texts = []
    negative_texts = []
    negative_texts_with_block = []
    positive_labels = []
    negative_labels = []
    negative_labels_with_block = []
    bad_methods = []
    negative_countx = 0

    # Substrings (whitespace already stripped) counted for 'Complex Method'.
    complexity_tokens = ('if(', 'else', 'for(', 'while(', 'case', 'switch(', 'try{', '}catch{')

    for text, label in zip(texts, labels):
        if label == 1:
            positive_texts.append(text)
            positive_labels.append(label)
        else:
            # Remove all whitespace so the substring checks ignore formatting.
            stripped_text = ''.join(text.split())
            if smell == 'Empty catch clause':
                if 'try{' in stripped_text and ('}catch(' in stripped_text or '}catch{' in stripped_text):
                    negative_texts_with_block.append(text)
                    negative_labels_with_block.append(label)
                else:
                    # Fixed: the text used to be appended to negative_labels,
                    # leaving negative_texts empty and the labels polluted.
                    negative_texts.append(text)
                    negative_labels.append(label)
            elif smell == 'Complex Method':
                count = sum(stripped_text.count(token) for token in complexity_tokens)
                bad_methods.append(Method(text, count))
                negative_texts.append(text)
                negative_labels.append(label)
            elif smell == 'Multifaceted Abstraction':
                if 5 <= (stripped_text.count('public') + stripped_text.count('private') + stripped_text.count('protected')) <= 15:
                    negative_texts.append(text)
                    negative_labels.append(label)
            else:
                negative_texts.append(text)
                negative_labels.append(label)

            # Counts every negative seen, including ones filtered out above.
            negative_countx += 1

    print('positive examples pre-trim: {}'.format(len(positive_labels)))
    print('negative examples pre-trim: {}'.format(len(negative_labels)))

    if smell == 'Empty catch clause':
        print('negative examples with catch block pre-trim: {}'.format(len(negative_labels_with_block)))
        #ensure roughly 50/50 split
        negative_texts = random.sample(negative_texts, len(negative_texts_with_block)) + negative_texts_with_block
        negative_labels = negative_labels[:len(negative_texts_with_block)] + negative_labels_with_block
    elif smell == 'Complex Method':
        # Rank negatives from most to least keyword-heavy.
        sorted_method = sorted(bad_methods, key=lambda x: x.count, reverse=True)
        # The len(positives) lowest-ranked methods.
        sorted_methodx = list(reversed(sorted_method))[:len(positive_texts)]
        # Everything between the two extremes (Method objects, not strings).
        negative_texts = sorted_method[len(positive_texts):-1 * len(positive_texts)]
        negative_labels = negative_labels[:len(negative_texts)]
        # The len(positives) highest-ranked methods.
        sorted_method = sorted_method[:len(positive_texts)]

    positive_negative_ratio = len(positive_labels) / negative_countx
    positive_validation_size = validation_size * positive_negative_ratio
    positive_count = min(math.ceil(max_size / (1 - positive_validation_size)), len(positive_texts))

    # All positive labels are 1, so slicing the labels stays aligned with the
    # randomly sampled texts.
    positive_texts_training, positive_texts_validation, positive_labels_training, positive_labels_validation = train_test_split(
        random.sample(positive_texts, positive_count),
        positive_labels[:positive_count],
        test_size=positive_validation_size)

    # Training negatives match the positives 1:1; validation negatives are
    # scaled up by the inverse positive/negative ratio.
    negative_count = len(positive_texts_training) + int(len(positive_labels_validation) * math.pow(positive_negative_ratio, -1))
    print(negative_count)

    negative_validation_count = negative_count - len(positive_texts_training)

    if smell == 'Complex Method':
        # Train on the extremes, validate on the middle of the ranking.
        negative_texts_training = random.sample([t.text for t in (sorted_method + sorted_methodx)], len(positive_texts_training))
        negative_texts_validation = random.sample([t.text for t in negative_texts], negative_validation_count)
        negative_labels_training = [0] * len(positive_texts_training)
        negative_labels_validation = [0] * negative_validation_count
    else:
        # An integer test_size is an absolute number of validation samples.
        negative_texts_training, negative_texts_validation, negative_labels_training, negative_labels_validation = train_test_split(
            random.sample(negative_texts, negative_count),
            negative_labels[:negative_count],
            test_size=negative_validation_count)

    print('positive training examples post-trim: {}'.format(len(positive_labels_training)))
    print('positive validation examples post-trim: {}'.format(len(positive_labels_validation)))
    print('negative training examples post-trim: {}'.format(len(negative_labels_training)))
    print('negative validation examples post-trim: {}'.format(len(negative_labels_validation)))

    print(type(negative_texts_training[0]))
    print(type(negative_texts_validation[0]))

    texts_training, labels_training = shuffle(positive_texts_training + negative_texts_training, positive_labels_training + negative_labels_training)
    texts_validation, labels_validation = shuffle(positive_texts_validation + negative_texts_validation, positive_labels_validation + negative_labels_validation)
    return texts_training, texts_validation, labels_training, labels_validation

# Smells to train a classifier for; each name is also the base name of its
# '<smell>.texts' / '<smell>.labels' data files.
smells = [
    'Empty catch clause',
    'Complex Method', 
    'Multifaceted Abstraction',
    'Magic Number'
]

# One tokenizer, loaded from the same checkpoint as the model, is shared by all runs.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)

# Train, evaluate and save one model per smell.
for smell in smells:
    print('Reading data for {}...'.format(smell))
    texts, labels = read_data(smell)

    print('Generating splits...')

    train_texts, validation_texts, train_labels, validation_labels = split_data(texts, labels, smell)

    # Drop references to the raw data as soon as the splits exist.
    del texts
    del labels

    print('Tokenizing training data...')

    # Truncation only; no padding is requested here.
    # NOTE(review): batches are presumably padded by the Trainer's collator
    # because a tokenizer is passed to it — confirm.
    train_encodings = tokenizer(train_texts, truncation=True)
    del train_texts

    print('Tokenizing validation data...')

    validation_encodings = tokenizer(validation_texts, truncation=True)
    del validation_texts

    print('Preparing to train...')

    # Reclaim the memory of the deleted text lists before training starts.
    gc.collect()

    train_dataset = SmellDataset(train_encodings, train_labels)
    validation_dataset = SmellDataset(validation_encodings, validation_labels)

    # Evaluate once per epoch and restore the checkpoint with the best 'f1'
    # (the key produced by compute_metrics) when training ends.
    # NOTE(review): pretrained_model contains a '/', so the formatted output
    # and logging paths gain an extra directory level — confirm this is intended.
    args = TrainingArguments(
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        output_dir=output_directory.format(smell + pretrained_model),
        logging_dir=log_directory.format(smell + pretrained_model),
        learning_rate=5e-6,
        warmup_steps=500,
        num_train_epochs=5
    )

    # model_init gives each smell a freshly loaded model; early stopping is
    # configured with a patience of 2 evaluations.
    trainer = Trainer(
        model_init=model_init,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()

    # Report the metrics on the validation set.
    print(trainer.evaluate())

    # Persist the model to args.output_dir.
    trainer.save_model()

Reading data for Multifaceted Abstraction...
Generating splits...
positive examples pre-trim: 10485
negative examples pre-trim: 368828
13682
positive training examples post-trim: 10449
positive validation examples post-trim: 36
negative training examples post-trim: 10449
negative validation examples post-trim: 3233
<class 'str'>
<class 'str'>
Tokenizing training data...
Tokenizing validation data...
Preparing to train...


Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2179,0.121207,0.961456,0.203947,0.861111,0.329787
2,0.1458,0.10713,0.96788,0.244444,0.916667,0.385965
3,0.1202,0.095687,0.975222,0.297297,0.916667,0.44898
4,0.1025,0.077219,0.981646,0.366667,0.916667,0.52381
5,0.0863,0.131954,0.972775,0.277311,0.916667,0.425806


{'eval_loss': 0.07721885293722153, 'eval_accuracy': 0.9816457632303457, 'eval_precision': 0.36666666666666664, 'eval_recall': 0.9166666666666666, 'eval_f1': 0.5238095238095238, 'eval_runtime': 34.5456, 'eval_samples_per_second': 94.629, 'epoch': 5.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 931402240}


In [None]:
# Capture the output lines of `nvidia-smi` via the IPython shell escape.
gpu_info = !nvidia-smi
# Join the captured lines back into a single printable string.
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the output is treated as "no GPU available".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sat May 22 21:13:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces