Step 3: Modeling

Generate the train, validation, and test torchtext datasets by cleaning the text and retaining only English words.
Implement a Manhattan-distance-based LSTM (MaLSTM) model that scores bug–patch similarity.
Train and evaluate the model.

Step 4: Evaluation

For a test bug, create a dataset that pairs it with every patch. Calculate the MaLSTM score for each pair. Find the rank of the matching patch.

In [None]:
# Use torchtext to construct training, validation, and test dataset 
import torch
from torchtext import data
import spacy


import enchant
# US-English dictionary used by the tokenizer to drop non-English words.
en_dict = enchant.Dict("en_US")

# Load the small English pipeline once. The bare 'en' shortcut link is not
# available in spaCy 3+, and loading a second pipeline only adds memory and
# start-up cost, so `spacy_en` is kept as an alias of the same object.
spacy_nlp = spacy.load('en_core_web_sm')
spacy_en = spacy_nlp

# Tokenize and filter non-English words and stop words
def tokenizer(text):
    """Return the spaCy tokens of `text` that are English, non-stop words.

    Whitespace-separated words rejected by the `en_dict` spell checker are
    dropped first. The surviving words are re-joined and run through
    `spacy_nlp`; stop-word tokens and single-space tokens are discarded.
    """
    kept_words = [word for word in text.split() if en_dict.check(word)]
    # Layout: one leading space, then each kept word followed by a space.
    english_text = " " + "".join(f"{word} " for word in kept_words)
    doc = spacy_nlp(english_text)
    return [token.text for token in doc if not token.is_stop and token.text != " "]


# Both text columns are tokenized with `tokenizer` and lower-cased.
BUGTITLE = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
)
PATCH = data.Field(
    sequential=True,
    tokenize=tokenizer,
    lower=True,
)
# Labels are produced with dtype torch.float.
LABEL = data.LabelField(dtype=torch.float)


# JSON key -> (attribute name on each example, Field that processes it).
fields = {
    'bugtitle': ('b', BUGTITLE),
    'patch': ('p', PATCH),
    'label': ('l', LABEL),
}




# Load the three JSON splits from the current directory, mapping their keys through `fields`.
train_data, valid_data, test_data = data.TabularDataset.splits(
                    path = './',
                    train = 'bugpatchlabel_tr.json', 
                    validation = 'bugpatchlabel_val.json', 
                    test = 'bugpatchlabel_tst.json',
                    format = 'json',
                    fields = fields
)


# Sanity check: split sizes, plus the first parsed example of the train and validation sets.
print(f'Number of training examples: {len(train_data)}')
print(vars(train_data[0]))
print(f'Number of validation examples: {len(valid_data)}')
print(vars(valid_data[0]))
print(f'Number of test examples: {len(test_data)}')



In [None]:
# Build the bug-title vocabulary from the training titles (at most 25,000 entries)
# and attach GloVe 6B 100-d word embeddings. The GloVe download is about 862 MB,
# so the first run takes a while.
BUGTITLE.build_vocab(train_data.b, max_size=25000, min_freq=1, vectors="glove.6B.100d")
print(len(BUGTITLE.vocab))

In [None]:
# The Siamese model looks up bug titles and patches in ONE embedding table, which is
# sized and initialised from PATCH.vocab. Both fields therefore have to map tokens to
# the same ids: build a single vocabulary over the training titles and patches, then
# share it with BUGTITLE.
PATCH.build_vocab(train_data.b, train_data.p, max_size=25000, min_freq=1, vectors="glove.6B.100d")
BUGTITLE.vocab = PATCH.vocab
print(len(PATCH.vocab))

In [None]:
# Build the label vocabulary from the training split.
# NOTE(review): with a vocabulary in play, the numeric target a label is mapped to
# presumably follows vocabulary order rather than the raw 0/1 value — confirm that
# label 1 really becomes target 1.0 before trusting the loss and accuracy figures.
LABEL.build_vocab(train_data)

In [None]:
# Iterate over training, validation, and test data in batches
BATCH_SIZE = 64

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print(device)
# One iterator per split; the sort key is the number of tokens in the bug title.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size= BATCH_SIZE,
    sort_key=lambda x: len(x.b),
    sort_within_batch=False,
    device=device)

# Pull one batch from the train and validation iterators to inspect its attribute names.
a = next(iter(train_iterator)); vars(a).keys()
b = next(iter(valid_iterator)); vars(b).keys()

In [None]:


import torch.nn as nn
from torch.autograd import Variable


class SiameseLSTM(nn.Module):
    """Siamese (weight-sharing) LSTM encoder that scores the similarity of two sequences.

    Both inputs go through the same embedding layer and the same LSTM. The
    final hidden state of the top LSTM layer is the sequence encoding, and the
    score is exp(-Manhattan distance) between the two encodings: at most 1,
    and exactly 1 when the encodings are equal.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: accepted for interface compatibility; not used.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, to the
            encodings and (when n_layers > 1) between LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout is meaningless for a single layer and only
        # triggers a PyTorch warning, so it is switched off in that case.
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)

    def _encode(self, x):
        """Encode token ids x = [sent len, batch size] as [batch size, hid dim * num directions]."""
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(x))

        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Top layer: final forward state is hidden[-2], final backward state is hidden[-1].
            final = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            final = hidden[-1, :, :]

        return self.dropout(final)

    def forward(self, x1, x2):
        """Return one similarity score per batch column.

        Args:
            x1: token-id tensor of shape [sent len, batch size].
            x2: token-id tensor of shape [sent len, batch size]; its sentence
                length may differ from x1's.

        Returns:
            Tensor of shape [batch size] holding exp(-L1 distance) between the
            two encodings. The encodings stay available afterwards as
            `self.encoding1` and `self.encoding2`.
        """
        # The batch axis is kept even for a batch of one, so the reduction
        # over dim=1 below is always valid.
        self.encoding1 = self._encode(x1)
        self.encoding2 = self._encode(x2)

        # Negative exponent of the Manhattan distance between the encodings.
        manhattan = torch.sum(torch.abs(self.encoding1 - self.encoding2), dim=1)
        return torch.exp(-manhattan)



In [None]:
# Model parameters
INPUT_DIM = len(PATCH.vocab)  # embedding rows = size of the patch vocabulary
EMBEDDING_DIM = 100  # matches the 100-d GloVe vectors attached to the vocabulary
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # passed to the constructor; SiameseLSTM does not use it
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
CLIP = 1.25  # gradient-norm ceiling applied in train()


model = SiameseLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

# Initialise the embedding layer with the pretrained vectors of the patch vocabulary.
# NOTE(review): the model has a single embedding table, so bug-title ids must come
# from this same vocabulary for their lookups to be meaningful — confirm.
pretrained_embeddings = PATCH.vocab.vectors

print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:


# Train routine 

import torch.optim as optim
from torch.nn.utils import clip_grad_norm_

# Adam over all model parameters, with the optimizer's default hyper-parameters.
optimizer = optim.Adam(model.parameters())

# Mean squared error between the similarity score and the label.
criterion = nn.MSELoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

def binary_accuracy(preds, y):
    """Return the fraction of entries of `preds` that equal `y` after rounding.

    The scores in `preds` are rounded with `torch.round` before the comparison.
    The result is a tensor: the number of matches divided by `len(preds)`.
    """
    hits = torch.round(preds).eq(y).float()  # 1.0 where the rounded score equals the label
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the model scores (batch.b, batch.p) against batch.l, the
    gradients are clipped to a norm of CLIP, and the optimizer takes a step.

    Returns:
        (loss, accuracy), each the sum of the per-batch values divided by the
        number of batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        scores = model(batch.b, batch.p)
        batch_loss = criterion(scores, batch.l)
        batch_acc = binary_accuracy(scores, batch.l)

        batch_loss.backward()
        clip_grad_norm_(model.parameters(), CLIP)  # cap the gradient norm before stepping
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches



In [None]:
# Evaluation routine
def evaluate(model, iterator, criterion, printpreds=False):
    """Score `model` on `iterator` in eval mode, with gradient tracking disabled.

    Args:
        model: called as model(batch.b, batch.p).
        iterator: yields batches exposing `b`, `p` and `l`.
        criterion: loss called as criterion(predictions, batch.l).
        printpreds: when equal to True, print the rounded predictions of every batch.

    Returns:
        (loss, accuracy), each the sum of the per-batch values divided by the
        number of batches.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.b, batch.p)
            if printpreds == True:
                print(torch.round(scores))
            loss_total += criterion(scores, batch.l).item()
            acc_total += binary_accuracy(scores, batch.l).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
# Train and evaluate model
import time

N_EPOCHS = 5

# Lowest validation loss seen so far; any first epoch beats +inf.
best_valid_loss = float('inf')

start = time.time()

for epoch in range(N_EPOCHS):

    # One pass over the training data, then a pass over the validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    
    # Checkpoint only when the validation loss improves, so model.pt holds the best epoch.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
        #torch.save(model, 'model.pt')

end = time.time()

# Wall-clock duration of the whole training run, in minutes.
print(f'Time to compute: {(end - start)/60} min')

In [None]:
# Restore the weights of the best checkpoint written by the training loop.
# `map_location` remaps the saved tensors onto the current `device`, so a checkpoint
# written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load('model.pt', map_location=device))

# evaluate() switches the model to eval mode before scoring the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
### Evaluate model by determining rank of a test bug ###

# Obtain index of testbug in the original bugpatch dataset
import json

# bugpatch.json maps each bug to its patch. Keys and values are kept as two parallel
# lists so that a bug and its patch share the same index. The file is read as UTF-8,
# like the other text files handled in this notebook, rather than with the locale default.
with open("bugpatch.json", "r", encoding="utf-8") as fp:
    bpdata = json.load(fp)

bug = list(bpdata)  # List of all bugs
patch = list(bpdata.values())  # List of patches

# One integer per line: indexes into `bug` of the bugs held out for testing.
with open('testindex.txt', 'r', encoding="utf-8") as fp: # testindex.txt in Step 2: datagen.py
    testbug = [int(line) for line in fp]  # List of test bug indexes

print(bug[testbug[0]])

In [None]:
# Pair the first test bug with every patch and write the pairs out as JSON lines
bugidx = testbug[0]
print(bugidx)  # Note: Bug index is the same as the matching patch index
print(bug[bugidx])

# Every pair carries the label 0.
d = [{"bugtitle": bug[bugidx], "patch": p, "label": 0} for p in patch]
print(len(d))

with open('bugpatchlabel_eval.json', 'w+', encoding="utf-8") as fp:
    fp.writelines(json.dumps(ele) + '\n' for ele in d)

In [None]:
# Data and iterator
eval_data = data.TabularDataset(
    path='./bugpatchlabel_eval.json',
    format='json',
    fields=fields,
)

# The scores are matched back to patches by position, so the examples must come out
# in file order. A plain Iterator with shuffling and sorting switched off keeps that
# order; an iterator in its default training mode does not.
eval_iterator = data.Iterator(
    eval_data,
    batch_size=BATCH_SIZE,
    train=False,
    sort=False,
    shuffle=False,
    device=device,
)

In [None]:
import numpy as np
# Collect the model's score for every (bug, patch) pair, batch by batch. The array
# starts with a placeholder 0, so the score of the i-th scored pair is predictions[i + 1].
score_chunks = [np.array([0])]
with torch.no_grad():
    for batch in eval_iterator:
        preds = model(batch.b, batch.p)
        score_chunks.append(np.ravel(preds.cpu().data.numpy()))
predictions = np.concatenate(score_chunks)

In [None]:
print(len(predictions))
# ranks[k] is the ascending position of predictions[k] (0 = lowest score).
order = predictions.argsort()
ranks = order.argsort()
# predictions[0] is the placeholder seeded before scoring, so the pair built from
# patch i sits at index i + 1. Look up the patch that matches the test bug
# (same index as the bug) instead of a hard-coded position.
print(ranks[bugidx + 1])