## Import Libraries 

In [46]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\achop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\achop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Preprocessing (stopwords + stemming)

In [47]:
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # lowercase
    text = text.lower()
    # keep letters/digits/underscores + spaces
    text = re.sub(r"[^a-z0-9_]+", " ", text)
    tokens = word_tokenize(text)
    tokens = [
        stemmer.stem(t)
        for t in tokens
        if t not in stop_words and len(t) > 1
    ]
    return " ".join(tokens)


### Load AspectJ_Dataset dataset

In [48]:
bug_file = "CIS580_Assignment_BugLocalization\\AspectJ_Dataset.txt"   # path relative to your notebook

bugs_df = pd.read_csv(bug_file, sep="\t", encoding="utf-8")
print(bugs_df.columns)      # just to verify
bugs_df.head()


Index(['id', 'bug_id', 'summary', 'description', 'report_time',
       'report_timestamp', 'status', 'commit', 'commit_timestamp', 'files',
       'Unnamed: 10'],
      dtype='object')


Unnamed: 0,id,bug_id,summary,description,report_time,report_timestamp,status,commit,commit_timestamp,files,Unnamed: 10
0,1,423257,Bug 423257 LTW - java.lang.VerifyError: Bad re...,,2013-12-04 19:43:22,1386200000,resolved fixed,dd88d21,1386350000,org.aspectj.ajdt.core/src/org/aspectj/ajdt/int...,52:tests/src/org/aspectj/systemtest/incrementa...
1,2,420210,Bug 420210 Support additional message insert k...,It would be good to be able to insert the encl...,2013-10-23 16:00:16,1382560000,resolved fixed,9319e34,1382560000,org.aspectj.matcher/src/org/aspectj/weaver/Che...,63:org.aspectj.matcher/src/org/aspectj/weaver/...
2,3,419279,Bug 419279 ajc option to change -Xlint level p...,The -Xlintfile option is not a great fit for c...,2013-10-11 15:20:08,1381520000,resolved fixed,b2cd5fa,1382560000,org.aspectj.ajdt.core/src/org/aspectj/ajdt/int...,6:org.aspectj.ajdt.core/src/org/aspectj/ajdt/i...
3,4,415266,Bug 415266 LTW not working when JMX is enabled,When I enable JMX remote management on a JVM a...,2013-08-16 17:51:29,1376690000,resolved fixed,9e992d6,1380650000,loadtime/src/org/aspectj/weaver/loadtime/Aj.java,5:loadtime/src/org/aspectj/weaver/loadtime/Aj....
4,5,418129,Bug 418129 Can't introduce annotation onto int...,,2013-09-26 14:31:09,1380220000,resolved fixed,2393bef,1380650000,tests/bugs174/pr418129/Target.java tests/bugs1...,21:weaver/src/org/aspectj/weaver/bcel/BcelClas...


### Combine text field and preprocess

In [49]:
import nltk
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\achop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [50]:
bugs_df["text"] = (
    bugs_df["summary"].fillna("") + " " +
    bugs_df["description"].fillna("")
)

bugs_df["clean_text"] = bugs_df["text"].apply(preprocess_text)

len(bugs_df)


593

## Preprocess source files

In [51]:
source_root = "CIS580_Assignment_BugLocalization\\sourceFile_aspectj\\org.aspectj"

file_paths = []
file_texts = []

for root, _, files in os.walk(source_root):
    for fname in files:
        if not fname.endswith(".java"):
            continue
        full_path = os.path.join(root, fname)
        try:
            with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
                raw = f.read()
        except Exception as e:
            print("Skipping file due to error:", full_path, e)
            continue

        file_paths.append(full_path)
        file_texts.append(preprocess_text(raw))

print("Total source files:", len(file_paths))


Total source files: 2394


## Build TF-IDF Vector on all bug text and source text

In [52]:
all_docs = list(bugs_df["clean_text"]) + file_texts

vectorizer = TfidfVectorizer(
    max_df=0.9,      # ignore extremely common tokens
    min_df=2,        # ignore extremely rare tokens
    sublinear_tf=True
)

tfidf_matrix = vectorizer.fit_transform(all_docs)

n_bugs = len(bugs_df)
bug_matrix  = tfidf_matrix[:n_bugs, :]
file_matrix = tfidf_matrix[n_bugs:, :]

bug_matrix.shape, file_matrix.shape


((593, 18850), (2394, 18850))

## Compute Cosine Similarity based on TF-IDF vectors

In [53]:
index_to_file = {idx: path for idx, path in enumerate(file_paths)}
bug_rankings = {}   # bug_id -> {file_path: rank}
for bug_idx, row in bugs_df.iterrows():
    print(f"Processing bug {bug_idx+1}/{len(bugs_df)}")
    bug_id = row["bug_id"]

    bug_vec = bug_matrix[bug_idx]              # (1, vocab)
    sims = cosine_similarity(bug_vec, file_matrix)[0]   # 1D array

    sorted_idx = np.argsort(-sims)             # high â†’ low

    ranking_dict = {}
    for rank, file_idx in enumerate(sorted_idx, start=1):
        ranking_dict[index_to_file[file_idx]] = rank

    bug_rankings[bug_id] = ranking_dict


Processing bug 1/593
Processing bug 2/593
Processing bug 3/593
Processing bug 4/593
Processing bug 5/593
Processing bug 6/593
Processing bug 7/593
Processing bug 8/593
Processing bug 9/593
Processing bug 10/593
Processing bug 11/593
Processing bug 12/593
Processing bug 13/593
Processing bug 14/593
Processing bug 15/593
Processing bug 16/593
Processing bug 17/593
Processing bug 18/593
Processing bug 19/593
Processing bug 20/593
Processing bug 21/593
Processing bug 22/593
Processing bug 23/593
Processing bug 24/593
Processing bug 25/593
Processing bug 26/593
Processing bug 27/593
Processing bug 28/593
Processing bug 29/593
Processing bug 30/593
Processing bug 31/593
Processing bug 32/593
Processing bug 33/593
Processing bug 34/593
Processing bug 35/593
Processing bug 36/593
Processing bug 37/593
Processing bug 38/593
Processing bug 39/593
Processing bug 40/593
Processing bug 41/593
Processing bug 42/593
Processing bug 43/593
Processing bug 44/593
Processing bug 45/593
Processing bug 46/5

In [54]:
rows_for_csv = []

normalized_paths = [p.replace("\\", "/") for p in file_paths]

for _, row in bugs_df.iterrows():
    bug_id = row["bug_id"]
    gt_raw = str(row["files"])

    gt_list = [g.strip() for g in gt_raw.split() if g.strip()]

    ranks_dict = bug_rankings[bug_id]
    best_rank = None

    for gt in gt_list:
        gt_norm = gt.replace("\\", "/")

        # Extract only the filename
        print("GT norm:", gt_norm)
        filename = gt_norm.split("/")[-1]  # e.g. AjState.java
        print("Filename to match:", filename)
        # Now match using filename only (because actual files are renamed)
        for file_path_norm, rank in ranks_dict.items():

            fp = file_path_norm.replace("\\", "/")

            # This is the CORRECT matching rule
            if filename in fp:
                if best_rank is None or rank < best_rank:
                    best_rank = rank

    if best_rank is None:
        best_rank = len(file_paths) + 1

    rows_for_csv.append({"BugID": bug_id, "Rank": best_rank})

result_df = pd.DataFrame(rows_for_csv)
result_df.to_csv("bug_localization_ranks.csv", index=False)
result_df.head()


GT norm: org.aspectj.ajdt.core/src/org/aspectj/ajdt/internal/core/builder/AjState.java
Filename to match: AjState.java
GT norm: tests/bugs175/pr423257/AspectX.java
Filename to match: AspectX.java
GT norm: tests/bugs175/pr423257/Test.java
Filename to match: Test.java
GT norm: tests/src/org/aspectj/systemtest/AllTests17.java
Filename to match: AllTests17.java
GT norm: tests/src/org/aspectj/systemtest/ajc175/Ajc175Tests.java
Filename to match: Ajc175Tests.java
GT norm: tests/src/org/aspectj/systemtest/ajc175/AllTestsAspectJ175.java
Filename to match: AllTestsAspectJ175.java
GT norm: tests/src/org/aspectj/systemtest/incremental/tools/MultiProjectIncrementalTests.java
Filename to match: MultiProjectIncrementalTests.java
GT norm: weaver/src/org/aspectj/weaver/bcel/asm/StackMapAdder.java
Filename to match: StackMapAdder.java
GT norm: org.aspectj.matcher/src/org/aspectj/weaver/Checker.java
Filename to match: Checker.java
GT norm: tests/bugs174/extra_inserts/Code.java
Filename to match: Code.ja

Unnamed: 0,BugID,Rank
0,423257,5
1,420210,1
2,419279,3
3,415266,67
4,418129,3


In [55]:
TOP_N = 10   # change to required value

rows_for_csv = []

# Normalize file paths first
normalized_paths = [p.replace("\\", "/") for p in file_paths]

for _, row in bugs_df.iterrows():
    bug_id = row["bug_id"]

    # sorted ranked list of source files (high â†’ low similarity)
    ranks_dict = bug_rankings[bug_id]

    # Sort by rank (1 is best)
    sorted_files = sorted(ranks_dict.items(), key=lambda x: x[1])

    # Take top-N file paths
    top_n_files = [path for path, rank in sorted_files[:TOP_N]]

    # Prepare row dict
    entry = {"BugID": bug_id}

    for i, f in enumerate(top_n_files, start=1):
        entry[f"File{i}"] = f

    rows_for_csv.append(entry)

# Make DataFrame + Save
output_df = pd.DataFrame(rows_for_csv)
output_df.to_csv("bug_localization_topN.csv", index=False)

output_df.head()


Unnamed: 0,BugID,File1,File2,File3,File4,File5,File6,File7,File8,File9,File10
0,423257,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...
1,420210,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...
2,419279,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...
3,415266,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...
4,418129,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...,CIS580_Assignment_BugLocalization\sourceFile_a...


In [56]:
(result_df["Rank"] == 1).sum()


np.int64(170)

In [57]:
result_df["Rank"].describe()


count     593.000000
mean       95.227656
std       243.581214
min         1.000000
25%         1.000000
50%         9.000000
75%        63.000000
max      1801.000000
Name: Rank, dtype: float64