In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_json('resources/embold_train.json')
df['combined'] = df['title']+'. '+df['body']
df.head()

Unnamed: 0,title,body,label,combined
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1,y-zoom piano roll. a y-zoom on the piano roll ...
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0,buggy behavior in selection. ! screenshot from...
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1,"auto update feature. hi,\r \r great job so far..."
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1,filter out noisy endpoints in logs. i think we...
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0,enable pid on / pid off alarm actions for ardu...


In [3]:
df_bug = df[df['label']==0]
df_feature = df[df['label']==1]
df_question = df[df['label']==2]

In [19]:
print('Number of datapoints with label as Bug :',df_bug)
print('Number of datapoints with label as Feature :',df_feature)
print('Number of datapoints with label as Question :',df_question)

Number of datapoints with label as Bug :                                                     title  \
1                             buggy behavior in selection   
4       enable pid on / pid off alarm actions for ardu...   
5                           script stopped adding video's   
9       en la org ull-esit-pl-1617 people info /nico/ ...   
15                                 filter floating points   
...                                                   ...   
149994         copy module fails with json and with_items   
149996  decoder displays some neurovault images incorr...   
149997    parser should return an error, not an exception   
149998  errorexception  array to string conversion on ...   
149999                   ignore headings in code sections   

                                                     body  label  \
1       ! screenshot from 2016-02-23 21 27 40  https:/...      0   
4       expected behavior\r alarm actions pid on and p...      0   
5       a recent chang

In [20]:
label_counts = df.label.value_counts().sort_index()
label_counts

label
0    66827
1    69106
2    14067
Name: count, dtype: int64

In [4]:
import nltk
import re
import string

#### clean data

In [5]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    return text

In [6]:
from string import punctuation
from nltk.corpus import stopwords

#### remove stop word


In [7]:
def punctuation_stopwords_removal(git_text):
    # filters charecter-by-charecter : ['h', 'e', 'e', 'l', 'o', 'o', ' ', 'm', 'y', ' ', 'n', 'a', 'm', 'e', ' ', 'i', 's', ' ', 'p', 'u', 'r', 'v', 'a']
    remove_punctuation = [ch for ch in git_text if ch not in punctuation]
    # convert them back to sentences and split into words
    remove_punctuation = "".join(remove_punctuation).split()
    filtered_git_text = [word.lower() for word in remove_punctuation if word.lower() not in stopwords.words('english')]
    return filtered_git_text

In [8]:
from collections import Counter
import plotly.express as px

In [9]:
def plot_most_common_words(df_category, category):
    df_category['combined'] = df_category['combined'].apply(lambda x: x.replace("\\r", ""))
    df_category['combined'] = df_category['combined'].apply(lambda x: clean_text(x))
    
    df_category["combined"] = df_category["combined"].apply(punctuation_stopwords_removal)
    
    word_list = []
    
    for i, j in df_category.iterrows():
        for word in j['combined']:
            word_list.append(word)
        
    count_dict = Counter(word_list)
    most_common_words_df = pd.DataFrame(count_dict.most_common(20), columns=['word', 'count'])
    fig = px.histogram(most_common_words_df,
                       x='word', 
                       y='count',
                       title='Most common terms used while refering to a GitHub {}'.format(category),
                       color_discrete_sequence=['#843B62'] )
    fig.show()

####
label 0: Bug
label 1: Feature
label 2: Question

In [14]:
df['combined'] = df['combined'].apply(lambda x: x.replace("\\r", ""))
df['combined'] = df['combined'].apply(lambda x: clean_text(x))
df.head()

Unnamed: 0,title,body,label,combined
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,1,yzoom piano roll a yzoom on the piano roll wou...
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,0,buggy behavior in selection screenshot from ...
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",1,auto update feature hi great job so far saenz...
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,1,filter out noisy endpoints in logs i think we ...
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,0,enable pid on pid off alarm actions for expe...


In [15]:
df.drop(['title', 'body'], axis=1, inplace=True)
df.head()
df_test_for_example = df.copy()

In [16]:
import torch
import torch.nn as nn

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

In [17]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1cedb0612d0>

In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#### loading our BERT model 

In [21]:
BERT_PATH = "bert-base-uncased"

In [22]:
BERT_UNCASED = 'bert-base-uncased/bert-base-uncased'

#### loading the pre-trained BertTokenizer

In [24]:
# tokenizer = BertTokenizer.from_pretrained(BERT_UNCASED)
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

In [25]:
# some basic operations to understand how BERT converts a sentence into tokens and then into IDs
sample_body = 'script stopped adding videos saenzramiro abc xyz'
tokens = tokenizer.tokenize(sample_body)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_body}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

 Sentence: script stopped adding videos saenzramiro abc xyz
   Tokens: ['script', 'stopped', 'adding', 'videos', 'sa', '##en', '##z', '##ram', '##iro', 'abc', 'x', '##y', '##z']
Token IDs: [5896, 3030, 5815, 6876, 7842, 2368, 2480, 6444, 9711, 5925, 1060, 2100, 2480]


In [26]:
# using encode_plus to add special tokens : [CLS]:101, [SEP]:102, [PAD]:0
encodings = tokenizer.encode_plus(
            sample_body,
            max_length=32,
            add_special_tokens=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
)

encodings.keys()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


dict_keys(['input_ids', 'attention_mask'])

In [27]:
print('Input IDs : {}'.format(encodings['input_ids'][0]))
print('\nAttention Mask : {}'.format(encodings['attention_mask'][0]))

Input IDs : tensor([ 101, 5896, 3030, 5815, 6876, 7842, 2368, 2480, 6444, 9711, 5925, 1060,
        2100, 2480,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

Attention Mask : tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


In [28]:
MAX_LENGTH = 512

In [29]:
class GitHubCommitMessages(Dataset):
    
    def __init__(self, commit_message, label, tokenizer, max_len):
        self.commit_message = commit_message
        self.label = label
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.commit_message)
    
    def __getitem__(self, item):
        commit_message = str(self.commit_message[item])
        label = self.label[item]
        
        encoding = self.tokenizer.encode_plus(
        commit_message,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt')
        return {
        'commit_message': commit_message,
         'input_ids': encoding['input_ids'],
         'attention_mask': encoding['attention_mask'],
         'label': torch.tensor(label, dtype=torch.long)
          }

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
df.head()

Unnamed: 0,label,combined
0,1,yzoom piano roll a yzoom on the piano roll wou...
1,0,buggy behavior in selection screenshot from ...
2,1,auto update feature hi great job so far saenz...
3,1,filter out noisy endpoints in logs i think we ...
4,0,enable pid on pid off alarm actions for expe...


In [32]:
df = df[:2000]

In [33]:
df.shape

(2000, 2)

In [6]:
training_data, testing_data = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED
)

testing_data, validation_data = train_test_split(
    testing_data,
    test_size=0.5,
    random_state=RANDOM_SEED
)

NameError: name 'train_test_split' is not defined

In [8]:
training_data.shape, testing_data.shape, validation_data.shape

NameError: name 'training_data' is not defined

In [3]:
def create_data_loader(data, tokenizer, max_len, batch_size):
    
    ds = GitHubCommitMessages(commit_message=data.combined.to_numpy(),
    label=data.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len)
    
    return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4)


BATCH_SIZE = 16
train_data_loader = create_data_loader(training_data, tokenizer, MAX_LENGTH, BATCH_SIZE)
testing_data_loader = create_data_loader(testing_data, tokenizer, MAX_LENGTH, BATCH_SIZE)
val_data_loader = create_data_loader(validation_data, tokenizer, MAX_LENGTH, BATCH_SIZE)

NameError: name 'training_data' is not defined

In [1]:
df = next(iter(train_data_loader))
df.keys()

NameError: name 'train_data_loader' is not defined

In [None]:
df['input_ids'].squeeze().shape, df['attention_mask'].squeeze().shape, df['label'].shape

In [None]:
print('commit_message  : ', df['commit_message'][0])
print('input_ids : ', df['input_ids'].squeeze()[0])
print('attention_mask : ', df['attention_mask'].squeeze()[0])
print('label : ', df['label'][0])

In [None]:
bert_model = BertModel.from_pretrained(BERT_UNCASED)

In [None]:
last_hidden_state, pooled_output = bert_model(
  input_ids=encodings['input_ids'],
  attention_mask=encodings['attention_mask']
)

In [None]:
last_hidden_state.shape, pooled_output.shape

In [None]:
class BugPredictor(nn.Module):
    
    def __init__(self, n_classes):
        super(BugPredictor, self).__init__()
        self.bert_model = BertModel.from_pretrained(BERT_UNCASED)
        self.dropout = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert_model.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert_model(
        input_ids=input_ids,
        attention_mask = attention_mask
        )
        output = self.dropout(pooled_output)
        return self.out(output)

In [None]:
"""
label 0: Bug
label 1: Feature
label 2: Question
"""
class_names = [0, 1, 2]
bug_predictor_model = BugPredictor(len(class_names))
bug_predictor_model = bug_predictor_model.to(device)

In [None]:
EPOCHS = 5

optimizer = AdamW(bug_predictor_model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0
    all_predictions = []
    all_targets = []

    for d in data_loader:
        input_ids = d['input_ids'].squeeze().to(device)
        attention_mask = d['attention_mask'].squeeze().to(device)
        targets = d['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        all_predictions.extend(preds.detach().cpu().numpy())
        all_targets.extend(targets.detach().cpu().numpy())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = correct_predictions.double() / n_examples
    f1 = f1_score(all_targets, all_predictions, average='weighted')
    precision = precision_score(all_targets, all_predictions, average='weighted')
    recall = recall_score(all_targets, all_predictions, average='weighted')
    mean_loss = np.mean(losses)

    return accuracy, f1, precision, recall, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].squeeze().to(device)
            attention_mask = d['attention_mask'].squeeze().to(device)
            targets = d['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            all_predictions.extend(preds.detach().cpu().numpy())
            all_targets.extend(targets.detach().cpu().numpy())
    
    confusion = confusion_matrix(all_targets, all_predictions)
    accuracy = correct_predictions.double() / n_examples
    f1 = f1_score(all_targets, all_predictions, average='weighted')
    precision = precision_score(all_targets, all_predictions, average='weighted')
    recall = recall_score(all_targets, all_predictions, average='weighted')
    mean_loss = np.mean(losses)

    return accuracy, f1, precision, recall, mean_loss, confusion

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(confusion, classes):
    plt.figure(figsize=(8, 6))
    plt.imshow(confusion, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = confusion.max() / 2
    for i in range(confusion.shape[0]):
        for j in range(confusion.shape[1]):
            plt.text(j, i, format(confusion[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if confusion[i, j] > thresh else "black")

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

In [None]:
%%time
from collections import defaultdict

history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
    print('EPOCH {}/{}'.format(epoch+1,EPOCHS))
    print('-' * 10)
    
    train_acc, train_f1, train_p, train_recall, train_loss = train_model(bug_predictor_model, train_data_loader, loss_fn, optimizer, device, scheduler, len(training_data))
    print(f'Train loss: {train_loss:.4f} F1: {train_f1:.4f}, precision: {train_p:.4f}, recall: {train_recall:.4f}, accuracy: {train_acc:.4f}')
    
    val_acc, val_f1, val_p, val_recall, val_loss, _ = eval_model(bug_predictor_model, val_data_loader, loss_fn, device, len(validation_data))
    print(f'Validation loss: {val_loss:.4f} F1: {val_f1:.4f}, precision: {val_p:.4f}, recall: {val_recall:.4f}, accuracy: {val_acc:.4f}')
    
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    
    if val_acc > best_accuracy:
        print('Saving the best model ...')
        torch.save(bug_predictor_model.state_dict(), 'best_model.bin')
        best_accuracy = val_acc

In [None]:
_, _, _, _, _, confusion_matrix = eval_model(bug_predictor_model, val_data_loader, loss_fn, device, len(validation_data))
class_names = ['bug', 'feature', 'question']
plot_confusion_matrix(confusion_matrix, class_names)

In [None]:
def predict_git_category(sample_message, model):
    encoded_message = tokenizer.encode_plus(sample_bug_message, max_length=MAX_LENGTH, add_special_tokens=True, return_token_type_ids=False, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt')
    input_ids = encoded_message['input_ids'].to(device)
    attention_mask = encoded_message['attention_mask'].to(device)
    
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    _, prediction_idx = torch.max(output, dim=1)
        
    return class_names[prediction_idx]

In [None]:
sample_bug_message = "Script stopped adding video's. A recent change in the youtube layout broke the script. Probably caused by element names being altered."
print('Sample bug message : ', sample_bug_message)
print('Predicted GitHub Category : ', predict_git_category(sample_bug_message, bug_predictor_model))

In [None]:
sample_message = "add buttons to switch months when viewing salaah times have  next month  and  previous month  buttons in  masjidvue"
print('Sample bug message : ', sample_message)
print('Predicted GitHub Category : ', predict_git_category(sample_message, bug_predictor_model))