In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from triagerx.dataset.text_processor import TextProcessor

from tqdm import tqdm
tqdm.pandas()

In [59]:
# Filter stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text: str) -> str:
    text = str(text)  # In case, there is nan or something else
    cleaned_text = text.strip()

    cleaned_text = re.sub(r"(https?|ftp):\/\/[^\s/$.?#].[^\s]*", "", cleaned_text)
    cleaned_text = re.sub(r"0x[\da-fA-F]+", "<hex>", cleaned_text)
    cleaned_text = re.sub(r"\b[0-9a-fA-F]{16}\b", "<hex>", cleaned_text)
    cleaned_text = re.sub(
        r"\b\d{2}:\d{2}:\d{2}:\d{4,} GMT\b",
        "",
        cleaned_text,
    )
    cleaned_text = re.sub(
        r"\b\d{2}:\d{2}:\d{2}(\.\d{2,3})?\b",
        "",
        cleaned_text,
    )
    cleaned_text = re.sub(
        r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\b",
        "",
        cleaned_text,
    )
    cleaned_text = re.sub(r"```", "", cleaned_text)
    cleaned_text = re.sub(r"-{3,}", "", cleaned_text)
    cleaned_text = re.sub(r"[\*#=+\-]{3,}", "", cleaned_text)

    cleaned_text = re.sub(r"(\r?\n)+", "\n", cleaned_text)
    cleaned_text = re.sub(r"(?![\r\n])\s+", " ", cleaned_text)
    cleaned_text = cleaned_text.strip()

    tokens = word_tokenize(cleaned_text)
    tokens = [stemmer.stem(token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

In [68]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/data/openj9/openj9_08122024.csv"

df = pd.read_csv(dataset_path)
df = df.rename(columns={"assignees": "owner", "issue_body": "description"})

In [69]:
df.head()

Unnamed: 0,issue_number,issue_title,description,issue_url,issue_state,creator,labels,owner,component
0,2,Build instructions link in the README.md point...,The `Build instructions` link in the `README.m...,https://github.com/eclipse-openj9/openj9/issues/2,closed,aarongraham9,,gireeshpunathil,
1,3,FAQ link in the README is broken,FAQ link in the README leads to: http://www.ec...,https://github.com/eclipse-openj9/openj9/issues/3,closed,dorrab,,mpirvu,
2,5,Link to DockerFile on build instruction page i...,Link for DockerFile on [build instruction page...,https://github.com/eclipse-openj9/openj9/issues/5,closed,r30shah,,r30shah,
3,11,HOWTO Request: Managing changes across depende...,"Like all projects, OpenJ9 builds on the should...",https://github.com/eclipse-openj9/openj9/issue...,open,mgaudet,question,hzongaro,
4,13,Compilation Output is too Verbose,The output when compiling the OpenJ9 source co...,https://github.com/eclipse-openj9/openj9/issue...,closed,rservant,"enhancement, comp:build",hzongaro,comp:build


In [70]:
df["text"] = df.apply(lambda row: f"{row['issue_title']}\n{row['description']}", axis=1)

df = df.sort_values(by="issue_number")
df = df[df["owner"].notna()]
test_size = 0.1

num_issues = len(df)
print(f"Total number of issues after processing: {num_issues}")

df = df.sort_values(by="issue_number")

df_train, df_test = train_test_split(df, test_size=test_size, shuffle=False)

sample_threshold = 20
developers = df_train["owner"].value_counts()
filtered_developers = developers.index[developers >= sample_threshold]
df_train = df_train[df_train["owner"].isin(filtered_developers)]

train_owners = set(df_train["owner"])
test_owners = set(df_test["owner"])

unwanted = list(test_owners - train_owners)

df_test = df_test[~df_test["owner"].isin(unwanted)]

print(f"Training data: {len(df_train)}, Validation data: {len(df_test)}")
print(f"Number of developers: {len(df_train.owner.unique())}")

print(f"Train dataset size: {len(df_train)}")
print(f"Test dataset size: {len(df_test)}")

Total number of issues after processing: 4595
Training data: 3417, Validation data: 386
Number of developers: 50
Train dataset size: 3417
Test dataset size: 386


In [76]:
from collections import Counter
import numpy as np

def compute_smoothed_unigram_models(term_frequencies, mu=0.1):
    """ Compute smoothed unigram probabilities for all reports """
    K = len(term_frequencies)  # Total number of reports
    all_terms = set(term for tf in term_frequencies for term in tf)
    
    # Compute global term frequencies across all reports
    global_term_frequencies = Counter()
    for tf in term_frequencies:
        global_term_frequencies.update(tf)

    smoothed_models = []
    for k, tf_k in enumerate(term_frequencies):
        total_words_k = sum(tf_k.values())

        smoothed_model = {}
        for term in all_terms:
            tf_term_k = tf_k.get(term, 0)
            global_tf_term = global_term_frequencies[term]

            term_prob = (1 - mu) * (tf_term_k / total_words_k if total_words_k > 0 else 0) + \
                        mu * (global_tf_term / sum(global_term_frequencies.values()))
            
            smoothed_model[term] = term_prob

        smoothed_models.append(smoothed_model)

    return smoothed_models


In [80]:
def compute_term_frequencies(reports):
    """ Compute term frequency for each report """
    term_frequencies = []
    for report in reports:
        term_frequencies.append(Counter(report))
    return term_frequencies

In [85]:
all_bug_reports = [preprocess_text(text) for text in df_train.text.tolist()]

In [87]:
def create_report_counts(reports):
    """Creates word count dictionaries from tokenized reports."""
    all_reports_counts = []
    for report in reports:
        counts = Counter(report)
        all_reports_counts.append(counts)
    return all_reports_counts

In [89]:
all_reports_counts = create_report_counts(all_bug_reports)

In [91]:
vocabulary = set([word for report in all_bug_reports for word in report])

In [96]:
import numpy as np
from collections import Counter

def precompute_collection_probabilities(all_reports_counts, vocabulary):
    """Precomputes and stores collection-wide word probabilities."""
    collection_word_counts = {}
    for report_counts in all_reports_counts:
        for word, count in report_counts.items():
            collection_word_counts[word] = collection_word_counts.get(word, 0) + count

    total_words_in_collection = sum(collection_word_counts.values())

    collection_probabilities = {}
    for word, count in collection_word_counts.items():
      collection_probabilities[word] = count / total_words_in_collection if total_words_in_collection > 0 else 0

    return collection_probabilities

def smoothed_unigram(word, report_k_counts, collection_probabilities, vocabulary, mu):
    """Calculates smoothed unigram probability using precomputed values."""

    report_k_total_words = sum(report_k_counts.values())
    report_k_prob = report_k_counts.get(word, 0) / report_k_total_words if report_k_total_words > 0 else 0

    collection_prob = collection_probabilities.get(word, 0)  # Retrieve precomputed value

    return (1 - mu) * report_k_prob + mu * collection_prob


def kl_divergence(report_q_probs, report_k_probs):
    """Calculates the KL-divergence between two probability distributions."""
    kl = 0.0
    for word, prob_q in report_q_probs.items():
        prob_k = report_k_probs.get(word, 0)  # Handle words not in report_k
        if prob_q > 0 and prob_k > 0:  # Avoid log(0) errors and division by 0
          kl += prob_q * np.log(prob_q / prob_k)
        elif prob_q > 0: # only add if prob_q is not 0
          kl += prob_q * np.log(prob_q / 1e-10) # a small value to avoid log(0)
    return kl


In [97]:
collection_probabilities = precompute_collection_probabilities(all_reports_counts, vocabulary)

In [101]:
all_test_reports = [preprocess_text(text) for text in df_test.text.tolist()]

In [117]:
def get_kl_scores(query_report):
    query_report_counts = Counter(query_report)
    mu = 0.1
    kl_scores = []

    query_report_probs = {}
    for word in vocabulary:
      query_report_probs[word] = smoothed_unigram(word, query_report_counts, collection_probabilities, vocabulary, mu)

    for i, report_k_counts in enumerate(all_reports_counts):
        report_k_probs = {}
        for word in vocabulary:
          report_k_probs[word] = smoothed_unigram(word, report_k_counts, collection_probabilities, vocabulary, mu)

        kl = kl_divergence(query_report_probs, report_k_probs)
        kl_scores.append(kl)

    return kl_scores

In [175]:
import os
import json

def get_contribution_data(
        issue_number: int
    ):
        """
        Retrieves the contribution data for a given issue number.

        Args:
            issue_number (int): The issue number.

        Returns:
            Dict[str, List[Tuple[str, str]]]: A dictionary containing contribution data.
        """
        contributions = {}
        issue_file = f"{issue_number}.json"
        last_assignment = None

        with open(os.path.join("/home/mdafifal.mamun/notebooks/triagerX/data/openj9/issue_data", issue_file), "r") as file:
            issue = json.load(file)
            assignees = issue.get("assignees", [])
            assignee_logins = (
                [(assignee["login"], None) for assignee in assignees]
                if assignees
                else []
            )
            contributions["direct_assignment"] = assignee_logins
            timeline = issue.get("timeline_data", [])
            pull_requests, commits, discussion = [], [], []

            for timeline_event in timeline:
                event = timeline_event.get("event")
                created_at = timeline_event.get("created_at")
                actor = timeline_event.get("actor", {})

                if not actor:
                    continue

                actor = actor.get("login")

                if event == "cross-referenced" and timeline_event["source"].get(
                    "issue", {}
                ).get("pull_request"):
                    pull_requests.append((actor, created_at))
                    last_assignment = actor
                elif event == "referenced" and timeline_event.get("commit_url"):
                    commits.append((actor, created_at))
                    last_assignment = actor
                elif event == "commented":
                    discussion.append((actor, created_at))

            contributions["pull_request"] = pull_requests
            contributions["commits"] = commits
            contributions["discussion"] = discussion
            contributions["last_assignment"] = (
                [(last_assignment, None)] if last_assignment else []
            )

        return contributions

In [178]:
from collections import defaultdict

def construct_mdn(reports, developers, df_train):
    """Constructs the Multi-Developer Network (MDN) using commit and discussion data."""

    network = defaultdict(lambda: {"commits": defaultdict(int), "comments": defaultdict(int)})

    for report_id in reports:
        report_data = get_contribution_data(report_id)  # Get report data from DataFrame
        owner = df_train[df_train["issue_number"] == report_id].iloc[0]["owner"] # Get the assignee/owner

        for commit_info in report_data["commits"]:
            committer = commit_info[0]
            if committer != owner and committer in developers: # Only add if the committer is not the owner
                network[committer]["commits"][owner] += 1  # Increment commit count

        for comment_info in report_data["discussion"]:
            commenter = comment_info[0]
            if commenter != owner and commenter in developers:  # Only add if commenter is not the owner
                network[commenter]["comments"][owner] += 1 # Increment comment count

    return network


def normalize_edge_weights(network):
    """Normalizes edge weights (commits and comments) in the MDN."""

    # 1. Calculate Maximum Commit and Comment Counts Globally
    max_commit = 0  # Initialize globally
    max_comment = 0  # Initialize globally

    for dev1, interactions in network.items():
        for dev2, commits in interactions["commits"].items():
            max_commit = max(max_commit, commits)  # Find global max commit
        for dev2, comments in interactions["comments"].items():
            max_comment = max(max_comment, comments)  # Find global max comment

    for dev1, interactions in network.items():
        for dev2 in interactions["commits"]:
            network[dev1]["commits"][dev2] = network[dev1]["commits"][dev2] / max_commit + 0.00001

        for dev2 in interactions["comments"]:
            network[dev1]["comments"][dev2] = network[dev1]["comments"][dev2] / max_comment + 0.00001

    return network



def calculate_developer_rank(developer_name, network):
    """Calculates the ranking score for a developer."""

    interactions = network.get(developer_name, {"commits": {}, "comments": {}})
    rank = 0
    for dev2, commit_weight in interactions["commits"].items():
        comment_weight = interactions["comments"].get(dev2, 0)
        rank += commit_weight + comment_weight

    return rank

def rank_developers(network):
    """Ranks developers based on their ranking scores."""

    developer_ranks = [(dev, calculate_developer_rank(dev, network)) for dev in network]
    developer_ranks.sort(key=lambda x: x[1], reverse=True)
    return developer_ranks


In [None]:
ranks = []

for bug_id in range(5):
    print("Predicting for bug_id:", bug_id)
    query_report = all_test_reports[bug_id]
    kl_scores = get_kl_scores(query_report)
    kl_scores = dict(zip(df_train.issue_number.tolist(), kl_scores))
    top_similar_issues = sorted(kl_scores.items(), key=lambda x: x[1])[:20]

    # Check if component is not null
    query_component = df_test.iloc[bug_id]["component"]
    similar_component_issues = None
    if not pd.isnull(query_component):
        similar_component_issues = df_train[df_train["component"] == query_component]

    developer_set = set()
    similar_issue_set = set()

    for issue_number, _ in top_similar_issues:
        contributions = get_contribution_data(issue_number)
        similar_issue_set.add(issue_number)
        for key, value in contributions.items():
            for contributor, _ in value:
                developer_set.add(contributor)

    developer_set2 = set()
    # Iterate top similar issues by similar_component_issues
    if similar_component_issues is not None:
        for issue_number in similar_component_issues.issue_number.tolist():
            similar_issue_set.add(issue_number)
            contributions = get_contribution_data(issue_number)
            for key, value in contributions.items():
                for contributor, _ in value:
                    developer_set2.add(contributor)
                    developer_set = developer_set.intersection(developer_set2) 

    MDN = construct_mdn(similar_issue_set, developer_set, df_train)
    normalized_MDN = normalize_edge_weights(MDN)

    ranked_developers = rank_developers(normalized_MDN)
    ranks.append(ranked_developers)

Predicting for bug_id: 0
{'keithc-ca', 'ghost', 'andrewcraik', 'Thihup', 'tajila', 'R-Santhir', 'nbhuiyan', 'fjeremic', 'thallium', 'knn-k', 'AdamBrousseau', 'zl-wang', 'fengxue-IS', 'jdmpapin', 'Bromarv', 'cathyzhyi', 'hangshao0', 'edmathew234', 'Spencer-Comin', '0xdaryl', 'apaj', 'shingarov', 'JasonFengJ9', 'mpirvu', 'janvrany', 'DingliZhang', 'mstoodle', 'acrowthe', 'renfeiw', 'karianna', 'harryyu1994', 'babsingh', 'dsouzai', 'smlambert', 'llxia', 'VermaSh', 'ChengJin01', 'PascalSchumacher', 'alistair23', 'DanHeidinga', 'gacholio', 'PTamis', 'pshipton', 'mleipe', 'sxa', 'AlenBadel', 'vijaysun-omr'}
Predicting for bug_id: 1
{'pdbain-ibm'}
Predicting for bug_id: 2
{'pshipton'}
Predicting for bug_id: 3
{'dsouzai', 'keithc-ca', 'tajila', '0xdaryl', 'jdmpapin', 'JasonFengJ9', 'llxia', 'mpirvu', 'a7ehuo', 'ymanton', 'pshipton', 'mingweiarthurli', 'LongyuZhang', 'hzongaro', 'babsingh'}
Predicting for bug_id: 4
set()


In [225]:
# Compute topk accuracy
topk = 20

for k in range(1, topk + 1):
    predicted_devs = []
    for rank in ranks:
        predicted_devs.append([dev.lower() for dev, _ in rank[:k]])

    owners = df_test.owner.tolist()[:5]
    topk_accuracy = sum([1 for owner, rank in zip(owners, predicted_devs) if owner in rank]) / len(owners)
    print(f"Top-{k} accuracy: {topk_accuracy}")


Top-1 accuracy: 0.2
Top-2 accuracy: 0.2
Top-3 accuracy: 0.2
Top-4 accuracy: 0.2
Top-5 accuracy: 0.4
Top-6 accuracy: 0.4
Top-7 accuracy: 0.4
Top-8 accuracy: 0.4
Top-9 accuracy: 0.4
Top-10 accuracy: 0.4
Top-11 accuracy: 0.4
Top-12 accuracy: 0.4
Top-13 accuracy: 0.4
Top-14 accuracy: 0.4
Top-15 accuracy: 0.4
Top-16 accuracy: 0.4
Top-17 accuracy: 0.4
Top-18 accuracy: 0.4
Top-19 accuracy: 0.4
Top-20 accuracy: 0.4


0.2