In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
#
import pickle
import sys
from pathlib import Path
PREPROCESSING_DIR = Path.cwd() / 'PreProcessingData'
sys.path.append(str(PREPROCESSING_DIR))
#

In [2]:
# 1. Define the filename saved by run_all_preprocessing.py
FILENAME = Path.cwd() / 'PreProcessingData' / 'all_datasets.pkl'

# 2. Open the file in binary read mode ('rb') and load the content
with open(FILENAME, 'rb') as f:
    data = pickle.load(f)

# --- Access the data ---
# all_processed_data is a dictionary where keys are dataset names (e.g., 'eclipse')
print(f"Successfully loaded {len(data)} datasets:")
print(list(data.keys()))

Successfully loaded 5 datasets:
['aspectj', 'eclipse', 'swt', 'tomcat', 'birt']


### Remove Unusable Report

In [3]:
def remove_report(name):
    rpkey = list(data[name]['report'].keys())
    for id in rpkey:
        count = 0
        for f in data[name]['report'][id].fixed_files:
            if f in data[name]['source'].keys():
                count += 1
                break
        if count == 0:
            del data[name]['report'][id]        

In [4]:
remove_report('aspectj')
remove_report('eclipse')
remove_report('swt')
remove_report('tomcat')
remove_report('birt')

In [5]:
print(len(data['aspectj']['report']), len(data['eclipse']['report']), len(data['swt']['report']), len(data['tomcat']['report']), len(data['birt']['report']))

522 4986 3321 1012 4175


In [6]:
print(len(data['aspectj']['source']), len(data['eclipse']['source']), len(data['swt']['source']), len(data['tomcat']['source']), len(data['birt']['source']))

6910 6165 2176 1794 9697


# I. Temporal split

In [7]:
def sort_reports_by_time(data):
    reports_dict = data['report']
    # 1. Convert the dictionary items (bug_id: BugReport object) into a list of tuples
    report_items = list(reports_dict.items())
    # 2. Sort the list by the 'report_time' attribute of the BugReport object
    #    We use the index [1] to access the BugReport object in the tuple (bug_id, BugReport)
    #    and then access its .report_time attribute.
    sorted_list = sorted(report_items,
                        key=lambda item: item[1].report_time)
    # 3. Store the chronologically sorted list
    print(f"Dataset '{data.keys}' reports sorted: {len(sorted_list)} total reports.")
    return sorted_list

In [8]:
temp_sort_aspectj = sort_reports_by_time(data['aspectj'])
temp_sort_eclipse = sort_reports_by_time(data['eclipse'])
temp_sort_swt = sort_reports_by_time(data['swt'])
temp_sort_tomcat = sort_reports_by_time(data['tomcat'])
temp_sort_birt = sort_reports_by_time(data['birt'])

Dataset '<built-in method keys of dict object at 0x000001CB560B4900>' reports sorted: 522 total reports.
Dataset '<built-in method keys of dict object at 0x000001CB71B06240>' reports sorted: 4986 total reports.
Dataset '<built-in method keys of dict object at 0x000001CBAEAD2200>' reports sorted: 3321 total reports.
Dataset '<built-in method keys of dict object at 0x000001CBD3817B00>' reports sorted: 1012 total reports.
Dataset '<built-in method keys of dict object at 0x000001CB95643F00>' reports sorted: 4175 total reports.


# II. Split to 10-Folds

In [9]:
def k_fold(sorted_reports, k):
    fold_size = len(sorted_reports) // k
    folds = []
    for i in range(k):
        start_index = i * fold_size
        if i == k - 1:  # Last fold takes the remainder
            end_index = len(sorted_reports)
        else:
            end_index = (i + 1) * fold_size
        folds.append(sorted_reports[start_index:end_index])
    return folds

In [10]:
aspectj_folds = k_fold(temp_sort_aspectj, 3)
swt_folds = k_fold(temp_sort_swt, 10)
tomcat_folds = k_fold(temp_sort_tomcat, 10)


In [None]:
eclipse_folds = k_fold(temp_sort_eclipse, 10)
birt_folds = k_fold(temp_sort_birt, 10)

# III. Text Extracting

In [11]:
def get_report_texts(report, content_type='stemmed'):
    summary_tokens = report.pos_tagged_summary.get(content_type, [])
    desc_tokens = report.pos_tagged_description.get(content_type, [])
        
    # Join tokens with a space to form a single document string
    return " ".join(summary_tokens + desc_tokens)

def get_source_text(source_file, content_type: str = 'stemmed') -> str:
    """Concatenates all stemmed source code features into a single string."""
    # Assuming SourceFile has pos_tagged and stemmed attributes for these components:
    features = []
    # Note: Using .get(content_type, []) to safely handle potentially missing keys
    features.extend(source_file.comments.get(content_type, []))
    features.extend(source_file.class_names.get(content_type, []))
    features.extend(source_file.attributes.get(content_type, []))
    features.extend(source_file.method_names.get(content_type, []))
    features.extend(source_file.variables.get(content_type, []))
    features.extend(source_file.file_name.get(content_type, []))
    return " ".join(features)


# IV. Pairing

In [12]:
def rs_pair(report_fold, source_files):
    all_pair = []
    for report in report_fold:
        for source in source_files.values():
            pair = {'report' : report, 'source' : source}
            all_pair.append(pair)
    return all_pair


In [13]:
aspectj_pair = [rs_pair(fold, data['aspectj']['source']) for fold in aspectj_folds]
swt_pair = [rs_pair(fold, data['swt']['source']) for fold in swt_folds]
tomcat_pair = [rs_pair(fold, data['tomcat']['source']) for fold in tomcat_folds]


In [None]:
eclipse_pair = [rs_pair(fold, data['eclipse']['source']) for fold in eclipse_folds]
birt_pair = [rs_pair(fold, data['birt']['source']) for fold in birt_folds]

In [14]:
for fold in aspectj_pair:
    for pair in fold:
        for path in data['aspectj']['source']:
            if(data['aspectj']['source'][path] == pair['source']):
                pair['path'] = path
                break
        if pair['path'] in pair['report'][1].fixed_files:
            pair['label'] = 1

for fold in swt_pair:
    for pair in fold:
        for path in data['swt']['source']:
            if(data['swt']['source'][path] == pair['source']):
                pair['path'] = path
                break
        if pair['path'] in pair['report'][1].fixed_files:
            pair['label'] = 1

for fold in tomcat_pair:
    for pair in fold:
        for path in data['tomcat']['source']:
            if(data['tomcat']['source'][path] == pair['source']):
                pair['path'] = path
                break
        if pair['path'] in pair['report'][1].fixed_files:
            pair['label'] = 1



In [None]:
for fold in eclipse_pair:
    for pair in fold:
        for path in data['eclipse']['source']:
            if(data['eclipse']['source'][path] == pair['source']):
                pair['path'] = path
                break
        if pair['path'] in pair['report'][1].fixed_files:
            pair['label'] = 1

for fold in birt_pair:
    for pair in fold:
        for path in data['birt']['source']:
            if(data['birt']['source'][path] == pair['source']):
                pair['path'] = path
                break
        if pair['path'] in pair['report'][1].fixed_files:
            pair['label'] = 1

In [15]:
for fold in aspectj_pair:
    for pair in fold:
        if 'label' not in pair:
            pair['label'] = 0

for fold in swt_pair:
    for pair in fold:
        if 'label' not in pair:
            pair['label'] = 0
            
for fold in tomcat_pair:
    for pair in fold:
        if 'label' not in pair:
            pair['label'] = 0


In [None]:
for fold in eclipse_pair:
    for pair in fold:
        if 'label' not in pair:
            pair['label'] = 0

for fold in birt_pair:
    for pair in fold:
        if 'label' not in pair:
            pair['label'] = 0

# V. Feature Extraction

### 1. Lexical Similarity

In [16]:
def rs_lexical_vector(fold):
    # --- 1. PREPARE CORPUS FOR VECTORIZATION ---
    source_corpus = []
    report_corpus = []
    #Report corpus
    report_corpus.append(get_report_texts(fold[0]['report'][1]))
    rflag = fold[0]['report'][0]
    for i in range(1, len(fold)):
        if(fold[i]['report'][0] != rflag):
            rflag = fold[i]['report'][0]
            report_corpus.append(get_report_texts(fold[i]['report'][1]))
    #Source corpus
    rflag = fold[0]['report'][0]
    count = 1
    for i in range(1, len(fold)):
        if(fold[i]['report'][0] != rflag):
            break
        else:
            count += 1
    source_corpus = [get_source_text(fold[i]['source']) for i in range(count)]

    corpus = source_corpus + report_corpus    
    # Fit a TfidfVectorizer
    vectorizer = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'\S+',
        min_df=10,
        max_features=5000
    )

    tfidf = vectorizer.fit(corpus)
    source_tfidf_matrix = tfidf.transform(source_corpus)
    report_tfidf_matrix = tfidf.transform(report_corpus)
    
    for i in range(len(fold)//count):
        for j in range(count):
            lexical_index = {'source_vector' : j,
                              'report_vector' : i}
            fold[i*count + j]['lexical'] = lexical_index

    return [report_tfidf_matrix, source_tfidf_matrix]

In [17]:
aspecj_lexical_matrix = []
for fold in aspectj_pair:
    aspecj_lexical_matrix.append(rs_lexical_vector(fold))
#
swt_lexical_matrix = []
for fold in swt_pair:
    swt_lexical_matrix.append(rs_lexical_vector(fold))
#
tomcat_lexical_matrix = []
for fold in tomcat_pair:
    tomcat_lexical_matrix.append(rs_lexical_vector(fold))


In [None]:
eclipse_lexical_matrix = []
for fold in eclipse_pair:
    eclipse_lexical_matrix.append(rs_lexical_vector(fold))
#
birt_lexical_matrix = []
for fold in birt_pair:
     birt_lexical_matrix.append(rs_lexical_vector(fold))

In [18]:
def lexical_similarity(report_vector, source_vector):
        sim = cosine_similarity(report_vector.reshape(1, -1),
                                source_vector.reshape(1, -1))[0][0]
        return sim

In [19]:
for i in range(len(aspectj_pair)): #== len(aspectj_lexical_matrix)
    for pair in aspectj_pair[i]:
        sim = lexical_similarity(aspecj_lexical_matrix[i][0].getrow(pair['lexical']['report_vector']),
                                 aspecj_lexical_matrix[i][1].getrow(pair['lexical']['source_vector']))
        pair['lexical']['Lsim'] = sim

for i in range(len(swt_pair)): #== len(swt_lexical_matrix)
    for pair in swt_pair[i]:
        sim = lexical_similarity(swt_lexical_matrix[i][0].getrow(pair['lexical']['report_vector']),
                                 swt_lexical_matrix[i][1].getrow(pair['lexical']['source_vector']))
        pair['lexical']['Lsim'] = sim

for i in range(len(tomcat_pair)): #== len(tomcat_lexical_matrix)
    for pair in tomcat_pair[i]:
        sim = lexical_similarity(tomcat_lexical_matrix[i][0].getrow(pair['lexical']['report_vector']),
                                 tomcat_lexical_matrix[i][1].getrow(pair['lexical']['source_vector']))
        pair['lexical']['Lsim'] = sim


In [None]:
for i in range(len(eclipse_pair)): #== len(eclipse_lexical_matrix)
    for pair in eclipse_pair[i]:
        sim = lexical_similarity(eclipse_lexical_matrix[i][0].getrow(pair['lexical']['report_vector']),
                                 eclipse_lexical_matrix[i][1].getrow(pair['lexical']['source_vector']))
        pair['lexical']['Lsim'] = sim

for i in range(len(birt_pair)): #== len(birt_lexical_matrix)
    for pair in birt_pair[i]:
        sim = lexical_similarity(birt_lexical_matrix[i][0].getrow(pair['lexical']['report_vector']),
                                 birt_lexical_matrix[i][1].getrow(pair['lexical']['source_vector']))
        pair['lexical']['Lsim'] = sim

### 2. Semantic Similarity

In [20]:
# glove_utils.py excerpt
import os
# !!! IMPORTANT: YOU MUST ADJUST THIS PATH IF YOUR FILE IS ELSEWHERE !!!
GLOVE_FILE_PATH = Path.cwd() / 'glove' / 'glove.42B.300d.txt'
GLOVE_VECTOR_SIZE = 300
class GloVeModel:
    """
    A class to load and manage the GloVe word embeddings.
    It provides fast lookup for embeddings.
    """
    def __init__(self, path=GLOVE_FILE_PATH, vector_size=GLOVE_VECTOR_SIZE):
        self.path = path
        self.vector_size = vector_size
        self.embedding_dict = {}
        self.is_loaded = False
        self.zero_vector = np.zeros(self.vector_size, dtype=np.float32)
        
    def load(self):
        """Loads the GloVe embeddings from the file path into a dictionary."""
        if self.is_loaded:
            print("GloVe model already loaded.")
            return

        print(f"Loading GloVe embeddings from: {self.path} (Size: {self.vector_size}D)...")
        try:
            with open(self.path, 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.split()
                    word = parts[0]
                    vector = np.array(parts[1:], dtype=np.float32)
                    
                    if len(vector) == self.vector_size:
                        self.embedding_dict[word] = vector

            self.is_loaded = True
            print(f"Successfully loaded {len(self.embedding_dict)} words.")
        except FileNotFoundError:
            print(f"ERROR: GloVe file not found at {self.path}.")
            print("Please check the GLOVE_FILE_PATH and ensure the file exists before running.")
        except Exception as e:
            print(f"An error occurred during GloVe loading: {e}")

    def __getitem__(self, word: str) -> np.ndarray:
        """Returns the embedding vector for a word or a zero vector if not found."""
        return self.embedding_dict.get(word, self.zero_vector)
    
    def __contains__(self, word: str) -> bool:
        """Checks if a word is in the vocabulary."""
        return word in self.embedding_dict

    def get_vector_size(self) -> int:
        """Returns the dimension size of the vectors."""
        return self.vector_size

In [21]:
def get_avg_glove_vector(corpus, model):
    """
    Computes the average GloVe vector for a given document text.
    """        
    # Tokenize the text (assuming words are separated by spaces in the corpus strings)
    tokens = corpus.split()
    
    # Filter out words not in the GloVe vocabulary and get their vectors
    vectors = [model[word] for word in tokens if word in model]
    
    if not vectors:
        # If the document is empty or contains no words in the vocabulary, return a zero vector
        return model.zero_vector
    
    # Calculate the mean (average) of all word vectors
    # Stack the vectors vertically and compute the mean along axis 0
    return np.mean(np.vstack(vectors), axis=0)

In [22]:
glove = GloVeModel()
glove.load()

Loading GloVe embeddings from: d:\Lab\3. Deep Learning\Challenge Task NLP 1 - Bug Localization\glove\glove.42B.300d.txt (Size: 300D)...
Successfully loaded 1917494 words.


In [23]:
def rs_glove_vector(fold, glove_model):
        # --- 1. PREPARE CORPUS FOR VECTORIZATION ---
    source_corpus = []
    report_corpus = []
    #Report corpus
    report_corpus.append(get_report_texts(fold[0]['report'][1]))
    rflag = fold[0]['report'][0]
    for i in range(1, len(fold)):
        if(fold[i]['report'][0] != rflag):
            rflag = fold[i]['report'][0]
            report_corpus.append(get_report_texts(fold[i]['report'][1]))
    #Source corpus
    rflag = fold[0]['report'][0]
    count = 1
    for i in range(1, len(fold)):
        if(fold[i]['report'][0] != rflag):
            break
        else:
            count += 1
    source_corpus = [get_source_text(fold[i]['source']) for i in range(count)]

    report_glove_matrix = []
    source_glove_matrix = []
    for doc in report_corpus:
        avg_vector = get_avg_glove_vector(doc, glove_model)
        report_glove_matrix.append(avg_vector)
    for doc in source_corpus:
        avg_vector = get_avg_glove_vector(doc, glove_model)
        source_glove_matrix.append(avg_vector)
    
    return [report_glove_matrix, source_glove_matrix]

In [24]:
aspectj_semantic_matrix = []
for fold in aspectj_pair:
    aspectj_semantic_matrix.append(rs_glove_vector(fold, glove))

swt_semantic_matrix = []
for fold in swt_pair:
    swt_semantic_matrix.append(rs_glove_vector(fold, glove))

tomcat_semantic_matrix = []
for fold in tomcat_pair:
    tomcat_semantic_matrix.append(rs_glove_vector(fold, glove))
   

In [None]:
eclipse_semantic_matrix = []
for fold in eclipse_pair:
    eclipse_semantic_matrix.append(rs_glove_vector(fold, glove))

birt_semantic_matrix = []
for fold in birt_pair:
    birt_semantic_matrix.append(rs_glove_vector(fold, glove))

In [25]:
def semantic_similarity(report_vector, source_vector):
    sim = cosine_similarity(report_vector.reshape(1, -1),
                            source_vector.reshape(1, -1))[0][0]
    return sim

In [26]:
for i in range(len(aspectj_pair)): #== len(aspectj_lexical_matrix)
    for j in range(len(aspectj_semantic_matrix[i][0])):
        for k in range(len(aspectj_semantic_matrix[i][1])):
            sim = semantic_similarity(aspectj_semantic_matrix[i][0][j], aspectj_semantic_matrix[i][1][k])
            aspectj_pair[i][j*len(aspectj_semantic_matrix[i][1])+k]['Ssim'] = sim

for i in range(len(swt_pair)): #== len(swt_lexical_matrix)
    for j in range(len(swt_semantic_matrix[i][0])):
        for k in range(len(swt_semantic_matrix[i][1])):
            sim = semantic_similarity(swt_semantic_matrix[i][0][j], swt_semantic_matrix[i][1][k])
            swt_pair[i][j*len(swt_semantic_matrix[i][1])+k]['Ssim'] = sim

for i in range(len(tomcat_pair)): #== len(tomcat_lexical_matrix)
    for j in range(len(tomcat_semantic_matrix[i][0])):
        for k in range(len(tomcat_semantic_matrix[i][1])):
            sim = semantic_similarity(tomcat_semantic_matrix[i][0][j], tomcat_semantic_matrix[i][1][k])
            tomcat_pair[i][j*len(tomcat_semantic_matrix[i][1])+k]['Ssim'] = sim


In [None]:
for i in range(len(eclipse_pair)): 
    for j in range(len(eclipse_semantic_matrix[i][0])):
        for k in range(len(eclipse_semantic_matrix[i][1])):
            sim = semantic_similarity(eclipse_semantic_matrix[i][0][j], eclipse_semantic_matrix[i][1][k])
            eclipse_pair[i][j*len(eclipse_semantic_matrix[i][1])+k]['Ssim'] = sim

for i in range(len(birt_pair)): #== len(birt_lexical_matrix)
    for j in range(len(birt_semantic_matrix[i][0])):
        for k in range(len(birt_semantic_matrix[i][1])):
            sim = semantic_similarity(birt_semantic_matrix[i][0][j], birt_semantic_matrix[i][1][k])
            birt_pair[i][j*len(birt_semantic_matrix[i][1])+k]['Ssim'] = sim

### Similar Bug Reports

In [27]:
def rs_BugReportSim(fold, name):

    bug_change = []

    for i in range (len(data[name]['source'])):
        for j in range (i, len(fold), len(data[name]['source'])):
            if fold[j]['label'] == 1:
                bug_change.append(fold[j])
            else:
                fold[j]['R'] = 0
        if len(bug_change) > 0:   
            R = sum(a['lexical']['Lsim'] for a in bug_change)/len(bug_change)
            for a in bug_change:
                a['R'] = R
        bug_change = []
    return

In [28]:
for fold in aspectj_pair:
    rs_BugReportSim(fold, 'aspectj')

for fold in swt_pair:
    rs_BugReportSim(fold, 'swt')

for fold in tomcat_pair:
    rs_BugReportSim(fold, 'tomcat')


In [34]:
aspectj_pair[0][58]

{'report': ('11280', <preprocessing.BugReport at 0x218e47174c0>),
 'source': <preprocessing.SourceFile at 0x218cb12aa40>,
 'path': 'ajde\\src\\org\\aspectj\\ajde\\ui\\swing\\BrowserStructureViewToolPanel.java',
 'label': 0,
 'lexical': {'source_vector': 58,
  'report_vector': 0,
  'Lsim': np.float64(0.15625950259973578)},
 'Ssim': np.float32(0.7631036),
 'R': 0}

In [None]:
for fold in eclipse_pair:
    rs_BugReportSim(fold, 'eclipse')

for fold in birt_pair:
    rs_BugReportSim(fold, 'birt')

### Code Change History

In [29]:
u = 30
def rs_CodeChangeHistory(fold, name):
    for i in range(len(data[name]['source'])):
        fold[i]['H'] = 0.01

    for i in range(len(data[name]['source']), len(fold)):
        for j in range(i, -1, -len(data[name]['source'])):
            t_dif = (fold[i]['report'][1].report_time - fold[j]['report'][1].report_time).days
            if t_dif > u:
                break
            if fold[j]['label'] == 1:
                if t_dif <= u:
                    if t_dif == 0:
                        t_dif +=1
                    H = 1 / t_dif
                    fold[i]['H'] = H
                    break
            fold[i]['H'] = 0.01
    return

In [30]:
for fold in aspectj_pair:
    rs_CodeChangeHistory(fold, 'aspectj')

for fold in swt_pair:
    rs_CodeChangeHistory(fold, "swt")

for fold in tomcat_pair:
    rs_CodeChangeHistory(fold, 'tomcat')



In [None]:
for fold in eclipse_pair:
    rs_CodeChangeHistory(fold, 'eclipse')

for fold in birt_pair:
    rs_CodeChangeHistory(fold, 'birt')

### Bug Fixing Frequency

In [31]:
def rs_BugFixFreg(fold, name):
    for i in range(len(data[name]['source'])):
        fold[i]['F'] = 0
    for i in range(len(data[name]['source']), len(fold)):
        count = 0
        for j in range(i, -1, -len(data[name]['source'])):
            if fold[j]['label'] == 1:
                count = count +1
        
        fold[i]['F'] = count
    return

In [32]:
for fold in aspectj_pair:
    rs_BugFixFreg(fold, 'aspectj')

for fold in swt_pair:
    rs_BugFixFreg(fold, 'swt')

for fold in tomcat_pair:
    rs_BugFixFreg(fold, 'tomcat')

In [None]:
for fold in eclipse_pair:
    rs_BugFixFreg(fold, 'eclipse')

for fold in birt_pair:
    rs_BugFixFreg(fold, 'birt')

### CNS

# VI. Feature Scaling

In [69]:
def MinMax(fold):
    a = np.array([pair['lexical']['Lsim'] for pair in fold]).reshape(-1,1)
    b = np.array([pair['Ssim'] for pair in fold]).reshape(-1, 1)
    c = np.array([pair['R'] for pair in fold]).reshape(-1, 1)
    d = np.array([pair['H'] for pair in fold]).reshape(-1, 1)
    e = np.array([pair['F'] for pair in fold]).reshape(-1, 1)
    scaler = MinMaxScaler()
    a = scaler.fit_transform(a)
    b = scaler.fit_transform(b)
    c = scaler.fit_transform(c)
    d = scaler.fit_transform(d)
    e = scaler.fit_transform(e)
    for i in range(len(fold)):
        ma = a[i].item()
        mb = b[i].item()
        mc = c[i].item()
        md = d[i].item()
        me = e[i].item()
        fold[i]['scaled_vector'] = [ma, mb, mc, md, me]       
    return

In [70]:
for fold in aspectj_pair:
    MinMax(fold)

for fold in swt_pair:
    MinMax(fold)

for fold in tomcat_pair:
    MinMax(fold)

In [None]:
for fold in eclipse_pair:
    MinMax(fold)

for fold in birt_pair:
    MinMax(fold)

# VII. Model

Bootstrapping

In [38]:
import random
import time
random.seed(time.time())
batch_size = 128
def batching(fold):
    neg = []
    pos = []
    for pair in fold:
        if pair['label'] == 0:
            neg.append(pair)
        else:
            pos.append(pair)

    k = len(neg)//8

    X = []
    size1 = len(neg) // k
    remainder1 = len(neg)%k
    start = 0
    end = 0
    for i in range(k):
        end = start + size1 + (1 if i < remainder1 else 0)
        X.append(neg[start:end])
        start = end
        

    pos1 = pos.copy()
    random.shuffle(pos1)
    size2 = batch_size - size1
    start = 0
    end = 0
    remainder2 = len(pos1) %k
    for j in range(k):
        end = start + size2 + (1 if j < remainder2 else 0)
        X[j].extend(pos1[start:end])
        start = end
    return X

Model

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction = 'mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
    def forward(self, inputs, targets):
        """
        inputs: predicted probabilities (after sigmoid), shape (batch, 1)
        targets: ground truth labels (0 or 1), shape (batch, 1)
        """
        # Clip to avoid log(0)
        eps = 1e-7
        inputs = torch.clamp(inputs, eps, 1.0 - eps)

        # Compute cross entropy
        ce_loss = - (targets * torch.log(inputs) + (1 - targets) * torch.log(1 - inputs))

        # Compute pt (probability of the true class)
        pt = torch.where(targets == 1, inputs, 1 - inputs)

        # Apply focal loss modulation
        focal_weight = (self.alpha * targets + (1 - self.alpha) * (1 - targets)) * (1 - pt) ** self.gamma
        loss = focal_weight * ce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss
    

# Define the NN model
class BugLocNN(nn.Module):
    def __init__(self, input_dim):
        super(BugLocNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 300)
        self.fc2 = nn.Linear(300, 150)
        self.fc3 = nn.Linear(150, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [None]:
import torch
import torch.optim as optim

def train_model(train_fold, epochs):
    # Build batches from training fold
    batches = batching(train_fold)

    # Model, optimizer, loss
    model = BugLocNN(input_dim=5)  # 5-dim vectors
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = FocalLoss(alpha=0.25, gamma=2.0)

    # Training loop
    for epoch in range(epochs):
        epoch_loss = 0.0
        for batch in batches:
            batch_X = torch.tensor([pair['scaled_vector'] for pair in batch])
            batch_y = torch.tensor([pair['label'] for pair in batch]).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(batches):.10f}")

    return model

def test_model(model, test_fold):
    with torch.no_grad():
        # Convert test fold into tensors
        X_test = torch.tensor([pair['scaled_vector'] for pair in test_fold], dtype=torch.float32)

        # Get predictions
        preds = model(X_test)                      # probabilities

    return preds.tolist()


In [71]:
batch = batching(aspectj_pair[0])

In [72]:
type(batch[0][0]['scaled_vector'][4])

float

In [None]:
#Aspectj
for i in range(len(aspectj_pair)):
    if i == len(aspectj_pair) - 1:
        break
    model = train_model(aspectj_pair[i], 30)
    pred = test_model(model, aspectj_pair[i+1])
    for j in range (len(aspectj_pair[i+1])):
        aspectj_pair[i+1][j]['pred'] = pred[j]

Epoch 1/30, Loss: 0.0001047072
Epoch 2/30, Loss: 0.0000167460
Epoch 3/30, Loss: 0.0000164482
Epoch 4/30, Loss: 0.0000162323
Epoch 5/30, Loss: 0.0000160140
Epoch 6/30, Loss: 0.0000157875
Epoch 7/30, Loss: 0.0000155548
Epoch 8/30, Loss: 0.0000153173
Epoch 9/30, Loss: 0.0000150766
Epoch 10/30, Loss: 0.0000148328
Epoch 11/30, Loss: 0.0000145855
Epoch 12/30, Loss: 0.0000143344
Epoch 13/30, Loss: 0.0000140793
Epoch 14/30, Loss: 0.0000138201
Epoch 15/30, Loss: 0.0000135566
Epoch 16/30, Loss: 0.0000132883
Epoch 17/30, Loss: 0.0000130146
Epoch 18/30, Loss: 0.0000127357
Epoch 19/30, Loss: 0.0000124515
Epoch 20/30, Loss: 0.0000121606
Epoch 21/30, Loss: 0.0000118627
Epoch 22/30, Loss: 0.0000115574
Epoch 23/30, Loss: 0.0000112449
Epoch 24/30, Loss: 0.0000109250
Epoch 25/30, Loss: 0.0000105976
Epoch 26/30, Loss: 0.0000102621
Epoch 27/30, Loss: 0.0000099181
Epoch 28/30, Loss: 0.0000095653
Epoch 29/30, Loss: 0.0000092029
Epoch 30/30, Loss: 0.0000088320
Epoch 1/30, Loss: 0.0001010613
Epoch 2/30, Loss: 

In [None]:
#SWT
for i in range(len(swt_pair)):
    if i == len(swt_pair) - 1:
        break
    model = train_model(swt_pair[i], 15)
    pred = test_model(model, swt_pair[i+1])
    for j in range (len(swt_pair[i+1])):
        swt_pair[i+1][j]['pred'] = pred[j]

Epoch 1/15, Loss: 0.0001811172
Epoch 2/15, Loss: 0.0000394441
Epoch 3/15, Loss: 0.0000389755
Epoch 4/15, Loss: 0.0000383912
Epoch 5/15, Loss: 0.0000377332
Epoch 6/15, Loss: 0.0000370409
Epoch 7/15, Loss: 0.0000363323
Epoch 8/15, Loss: 0.0000356125
Epoch 9/15, Loss: 0.0000348829
Epoch 10/15, Loss: 0.0000341440
Epoch 11/15, Loss: 0.0000333959
Epoch 12/15, Loss: 0.0000326401
Epoch 13/15, Loss: 0.0000318733
Epoch 14/15, Loss: 0.0000310907
Epoch 15/15, Loss: 0.0000302906
Epoch 1/15, Loss: 0.0001346292
Epoch 2/15, Loss: 0.0000352337
Epoch 3/15, Loss: 0.0000346691
Epoch 4/15, Loss: 0.0000340130
Epoch 5/15, Loss: 0.0000332890
Epoch 6/15, Loss: 0.0000325278
Epoch 7/15, Loss: 0.0000317441
Epoch 8/15, Loss: 0.0000309442
Epoch 9/15, Loss: 0.0000301289
Epoch 10/15, Loss: 0.0000292974
Epoch 11/15, Loss: 0.0000284484
Epoch 12/15, Loss: 0.0000275816
Epoch 13/15, Loss: 0.0000266955
Epoch 14/15, Loss: 0.0000257907
Epoch 15/15, Loss: 0.0000248681
Epoch 1/15, Loss: 0.0001704041
Epoch 2/15, Loss: 0.0000374

In [137]:
for i in range(len(tomcat_pair)):
    if i == len(tomcat_pair) - 1:
        break
    model = train_model(tomcat_pair[i], 10)
    pred = test_model(model, tomcat_pair[i+1])
    for j in range (len(tomcat_pair[i+1])):
        tomcat_pair[i+1][j]['pred'] = pred[j]

Epoch 1/10, Loss: 0.0005820078
Epoch 2/10, Loss: 0.0000474056
Epoch 3/10, Loss: 0.0000429258
Epoch 4/10, Loss: 0.0000414368
Epoch 5/10, Loss: 0.0000406791
Epoch 6/10, Loss: 0.0000401763
Epoch 7/10, Loss: 0.0000397780
Epoch 8/10, Loss: 0.0000394263
Epoch 9/10, Loss: 0.0000390963
Epoch 10/10, Loss: 0.0000387759
Epoch 1/10, Loss: 0.0006225382
Epoch 2/10, Loss: 0.0000582961
Epoch 3/10, Loss: 0.0000543935
Epoch 4/10, Loss: 0.0000531039
Epoch 5/10, Loss: 0.0000524197
Epoch 6/10, Loss: 0.0000519333
Epoch 7/10, Loss: 0.0000515221
Epoch 8/10, Loss: 0.0000511407
Epoch 9/10, Loss: 0.0000507700
Epoch 10/10, Loss: 0.0000504020
Epoch 1/10, Loss: 0.0006695734
Epoch 2/10, Loss: 0.0000540170
Epoch 3/10, Loss: 0.0000495139
Epoch 4/10, Loss: 0.0000480346
Epoch 5/10, Loss: 0.0000472733
Epoch 6/10, Loss: 0.0000467554
Epoch 7/10, Loss: 0.0000463345
Epoch 8/10, Loss: 0.0000459550
Epoch 9/10, Loss: 0.0000455940
Epoch 10/10, Loss: 0.0000452406
Epoch 1/10, Loss: 0.0004513016
Epoch 2/10, Loss: 0.0000535641
Epoch

# VIII. Evaluation Metrics

Top-k

In [140]:
from sklearn.metrics import top_k_accuracy_score
def TopK(set, name):
    n = {1: 0, 5: 0, 10 : 0, 15:0}
    n1 = 0
    n5 = 0
    n10 = 0
    n15 = 0
    step = len(data[name]['source'])
    count = 0
    for i in range(1, len(set)):
        p = len(set[i])
        for j in range(0, p, step):
            count +=1
            y = [pair for pair in set[i][j:j+step]]
            y.sort(key = lambda x: x['pred'], reverse = True)
            
            #top1
            # Top-1
            if any(pair['label'] == 1 for pair in y[:1]):
                n1 += 1

            # Top-5
            if any(pair['label'] == 1 for pair in y[:5]):
                n5 += 1

            # Top-10
            if any(pair['label'] == 1 for pair in y[:10]):
                n10 += 1

            # Top-15
            if any(pair['label'] == 1 for pair in y[:15]):
                n15 += 1
    n[1] = n1/count
    n[5] = n5/count
    n[10] = n10/count
    n[15] = n15/count
    return n

MMR

In [132]:
def MRR(folds, name):
    step = len(data[name]['source'])   # number of candidate files per bug report
    count = 0
    rr_sum = 0.0                  # sum of reciprocal ranks

    for fold in folds[1:]:  # skip fold 0 if it's training
        for j in range(0, len(fold), step):
            count += 1
            y = fold[j:j+step]

            # sort candidates by prediction score descending
            y.sort(key=lambda x: x['pred'], reverse=True)

            # find the rank of the first buggy file (label == 1)
            rank = None
            for idx, pair in enumerate(y, start=1):
                if pair['label'] == 1:   # dataset ground truth
                    rank = idx
                    break

            # reciprocal rank contribution
            if rank is not None:
                rr_sum += 1.0 / rank
            else:
                rr_sum += 0.0  # no buggy file found

    # mean reciprocal rank
    mrr = rr_sum / count if count > 0 else 0.0
    return mrr


MAP

In [143]:
def MAP(folds, name):
    step = len(data[name]['source'])   # number of candidate files per bug report
    count = 0
    ap_sum = 0.0

    for fold in folds[1:]:  # skip fold 0 if it's training
        pairs = fold   # convert OrderedDict to list
        for j in range(0, len(pairs), step):
            count += 1
            y = pairs[j:j+step]

            # sort by prediction score descending
            y.sort(key=lambda x: x['pred'], reverse=True)

            # total relevant files for this bug report
            R = sum(pair['label'] == 1 for pair in y)
            if R == 0:
                continue  # skip if no relevant files

            precisions = []
            relevant_found = 0
            for idx, pair in enumerate(y, start=1):
                if pair['label'] == 1:
                    relevant_found += 1
                    precisions.append(relevant_found / idx)

            # average precision for this bug report
            ap_sum += sum(precisions) / R

    # mean average precision
    map_score = ap_sum / count if count > 0 else 0.0
    return map_score

In [114]:
type(aspectj_pair[1])

list

In [134]:
mrr_swt = MRR(swt_pair, 'swt')
mrr_swt


0.4349969345603154

Compute

In [148]:
topk_aspectj = TopK(aspectj_pair, 'aspectj')
topk_swt = TopK(swt_pair, 'swt')
topk_tomcat = TopK(tomcat_pair, 'tomcat')

mrr_aspectj = MRR(aspectj_pair, 'aspectj')
mrr_swt = MRR(swt_pair, 'swt')
mrr_tomcat = MRR(tomcat_pair, 'tomcat')



In [161]:
mrr_tomcat

0.008038184618329334

In [162]:
topk_tomcat

{1: 0.0010976948408342481,
 5: 0.0010976948408342481,
 10: 0.0010976948408342481,
 15: 0.003293084522502744}

In [159]:
map_aspectj = MAP(aspectj_pair, 'aspectj')
map_swt = MAP(swt_pair, 'swt')
map_tomcat = MAP(tomcat_pair, 'tomcat')

In [163]:
map_tomcat

0.008038184618329334