In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from loguru import logger
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, PreTrainedTokenizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

tqdm.pandas()

  _torch_pytree._register_pytree_node(


# Dataset

In [2]:
def component_split(x):
    x_split = str(x).split(",")

    for s in x_split:
        if "comp:" in s.lower():
            return s.strip()
    return None

In [3]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/openj9_topic_all_issues.csv"

df = pd.read_csv(dataset_path)
print(len(df))
df = df.rename(columns={"assignees": "owner", "issue_body": "description"})
# df = df[df["owner"].notna()]

def clean_data(df):
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', regex=True)
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    return df
    
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["labels"].notna()]
    print(f"All issues: {len(df)}")
    df = df[~df["issue_url"].str.contains("/pull/")]
    print(f"Excluding pull: {len(df)}")
    df["component"] = df["labels"].apply(component_split)
    
    df["text"] = df.progress_apply(
            lambda x: "Title: "
            + str(x["issue_title"])
            # + "\nIssue Labels: "
            # + str(x["labels"])
            + "\nIssue Topic: "
            + str(x["topic_label"])
            + "\nDescription: "
            + str(x["description"]),
            axis=1,
        )
    
    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    # df["owner_id"] = pd.factorize(df["assignees"])[0]

    return df

df = prepare_dataframe(df)
df = clean_data(df)
df = df.sort_values(by="issue_number")

num_issues = len(df)

print(f"Total number of issues: {num_issues}")

18278
All issues: 16342
Excluding pull: 6990


100%|██████████| 6990/6990 [00:00<00:00, 85160.94it/s]


Total number of issues: 6990


In [4]:
df["topic_hot"] = pd.get_dummies(df["topic_id"]).values.tolist()

In [5]:
components = set()

In [6]:
for val in df["component"].values:
    if val is None:
        continue
    
    split = val.split(",")
    
    for s in split:
        components.add(s.strip())

In [7]:
components

{'comp:build',
 'comp:crypto',
 'comp:doc',
 'comp:fips',
 'comp:gc',
 'comp:infra',
 'comp:jclextensions',
 'comp:jit',
 'comp:jit:aot',
 'comp:jitserver',
 'comp:jvmti',
 'comp:openssl',
 'comp:port',
 'comp:test',
 'comp:vm'}

In [8]:
component_values = df["component"].value_counts()
filtered_components = component_values.index[component_values >= 20]

df = df[df["component"].isin(filtered_components)]

In [9]:
def split_by_component(source_df, train_size=0.8):
    grouped = source_df.groupby('component')

    # Initialize two empty lists to store the split datasets
    dataset_1 = []
    dataset_2 = []

    # Iterate over each group
    for _, group_df in grouped:
        # Split the group into two halves
        first_idx = int(len(group_df) * train_size)
        group_half_1 = group_df.iloc[:first_idx]
        group_half_2 = group_df.iloc[first_idx:]
        
        # Append each half to the respective dataset
        dataset_1.append(group_half_1)
        dataset_2.append(group_half_2)

    return pd.concat(dataset_1, ignore_index=True), pd.concat(dataset_2, ignore_index=True)

In [10]:
df = df.sort_values(by="issue_number")

In [11]:
components = ["comp:vm", "comp:jvmti", "comp:jclextensions", "comp:test", "comp:build", "comp:gc"]
filtered_df = df[df["component"].isin(components)]

# Splitting parition by size
total_data = len(filtered_df)
train_size = int(total_data*0.9)
test_size = total_data - train_size
df_train = filtered_df[:train_size]
df_test = filtered_df[train_size:]


print(len(df_train), len(df_test))

2655 296


In [12]:
assert set(df_train.component.unique()) == set(df_test.component.unique())

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Generate component ids
label2idx = {label: idx for idx, label in enumerate(list(df_train["component"].unique()))}
df_train["component_id"] = [label2idx[component] for component in df_train["component"].values]
df_test["component_id"] = [label2idx[component] for component in df_test["component"].values]

df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42, shuffle=True)

print("Dataset size", len(df_train), len(df_val), len(df_test))

Dataset size 2124 531 296


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["component_id"] = [label2idx[component] for component in df_train["component"].values]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["component_id"] = [label2idx[component] for component in df_test["component"].values]


In [15]:
class TriageDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        feature: str = "text",
        target: str = "component_id",
    ):
        logger.debug("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = [label for label in df[target]]
        # self.embedding_model = SentenceTransformer("BAAI/bge-small-en")
        logger.debug("Tokenizing texts...")
        self.texts = [
            (row[feature], self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            ), torch.tensor(row.topic_hot))
            for _, row in df.iterrows()
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [16]:
class LBTPClassifierTopic(nn.Module):
    def __init__(
        self, output_size, topic_size, unfrozen_layers=4, embed_size=1024, dropout=0.1
    ) -> None:
        super().__init__()
        model_name = "microsoft/deberta-large"
        self.base_model = AutoModel.from_pretrained(
            model_name, output_hidden_states=True
        )
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Freeze embedding layers
        for p in self.base_model.embeddings.parameters():
            p.requires_grad = False

        # Freeze encoder layers till last {unfrozen_layers} layers
        for i in range(0, self.base_model.config.num_hidden_layers - unfrozen_layers):
            for p in self.base_model.encoder.layer[i].parameters():
                p.requires_grad = False

        filter_sizes = [3, 4, 5, 6]
        self._num_filters = 256
        self._max_tokens = 512
        self._embed_size = embed_size
        self.unfrozen_layers = unfrozen_layers
        self.conv_blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        nn.Sequential(
                            nn.Conv2d(1, self._num_filters, (K, embed_size)),
                            nn.BatchNorm2d(self._num_filters),
                            nn.ReLU(),
                            nn.Flatten(),
                            nn.MaxPool1d(self._max_tokens - (K - 1)),
                            nn.Flatten(start_dim=1),
                        )
                        for K in filter_sizes
                    ]
                )
                for _ in range(unfrozen_layers)
            ]
        )

        self.classifiers = nn.ModuleList(
            [
                nn.Linear(
                    len(filter_sizes) * self._num_filters + topic_size, output_size
                )
                for _ in range(unfrozen_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, tok_type, topic_id):
        outputs = []

        base_out = self.base_model(input_ids=input_ids, token_type_ids=tok_type, attention_mask=attention_mask)
        # pooler_out = base_out.last_hidden_state.squeeze(0)
        hidden_states = base_out.hidden_states[-self.unfrozen_layers :]

        for i in range(self.unfrozen_layers):
            batch_size, sequence_length, hidden_size = hidden_states[i].size()
            x = [
                conv(hidden_states[i].view(batch_size, 1, sequence_length, hidden_size))
                for conv in self.conv_blocks[i]
            ]
            # Concatanating outputs of the conv block of different filter sizes
            x = torch.cat(x, dim=1)
            x = self.dropout(x)
            x = torch.cat([x, topic_id], dim=1)
            x = self.classifiers[i](x)

            outputs.append(x)

        return outputs

    def tokenizer(self) -> AutoTokenizer:
        return self._tokenizer


In [17]:
class CombineLoss(nn.Module):
    def __init__(self, weights = None) -> None:
        super().__init__()
        self._ce = nn.CrossEntropyLoss(weight=weights)
    def forward(
        self,
        prediction,
        labels
    ) -> torch.Tensor:
        loss = 0

        for i in range(len(prediction)):
            loss += self._ce(prediction[i], labels)
            # print(loss)

        return loss

# Training

In [18]:
num_classes = len(df_train["component"].unique())
print(num_classes)

6


In [19]:
class_counts = np.bincount(df_train["component_id"])
num_samples = sum(class_counts)
labels = df_train["component_id"].to_list() # corresponding labels of samples

class_weights = [num_samples/class_counts[i] for i in range(len(class_counts))]
weights = [class_weights[labels[i]] for i in range(int(num_samples))]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))
# weights_load_location = f"/work/disa_lab/projects/triagerx/models/deberta_component_prediction.pt"

# Define hyperparameters
learning_rate = 1e-5
epochs = 25
batch_size = 10

model = LBTPClassifierTopic(len(df_train.component_id.unique()), topic_size=20, unfrozen_layers=4, dropout=0.2)
criterion = CombineLoss(weights=None)
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.001)
scheduler = ReduceLROnPlateau(optimizer, "min", patience=2, factor=0.1, threshold=1e-10)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [20]:
# Load best checkpoint
weights_load_location = f"/work/disa_lab/projects/triagerx/models/deberta_component_prediction_chrono_10class.pt"
model.load_state_dict(torch.load(weights_load_location))

<All keys matched successfully>

In [21]:
# Prepare torch dataset from train and validation splits
test = TriageDataset(df_test, model.tokenizer())

[32m2024-04-21 19:13:11.791[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-04-21 19:13:11.793[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_loss = float("inf")

if torch.cuda.is_available():
    logger.debug(f"Selected compute device: {device}")
    model = model.cuda()
    criterion = criterion.cuda()

[32m2024-04-21 19:13:12.514[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mSelected compute device: cuda[0m


In [23]:
test_dataloader = DataLoader(test, batch_size=batch_size)

In [24]:
total_acc_val = 0
total_loss_val = 0
correct_top_k = 0
correct_top_k_wo_sim = 0

all_preds = []
all_labels = []
topk_preds = []

device="cuda"

model = model.cuda()

model.eval()

with torch.no_grad():

    for val_input, val_label in test_dataloader:
        val_label = val_label.to(device)
        mask = val_input[1]["attention_mask"].squeeze(1).to(device)
        input_id = val_input[1]["input_ids"].squeeze(1).to(device)
        tok_type = val_input[1]["token_type_ids"].squeeze(1).to(device)
        repr = val_input[2].to(device)

        output = model(input_id, mask, tok_type, repr)

        output = torch.sum(torch.stack(output), 0)

        #wo similarity
        _, top_k_wo_sim = output.topk(3, 1, True, True)

        topk_preds.append(top_k_wo_sim.cpu().numpy())
        top_k_wo_sim = top_k_wo_sim.t()
        

        correct_top_k_wo_sim += (
            top_k_wo_sim.eq(
                val_label.view(1, -1).expand_as(top_k_wo_sim)
            )
            .sum()
            .item()
        )


        all_preds.append(output.argmax(dim=1).cpu().numpy())
        all_labels.append(val_label.cpu().numpy())

In [25]:
topk_preds = np.concatenate(topk_preds, axis=0)

In [26]:
all_preds_cat = np.concatenate(all_preds)
all_labels_cat = np.concatenate(all_labels)

accuracy = np.mean(all_preds_cat == all_labels_cat)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    all_labels_cat, all_preds_cat, average="macro"
)

In [27]:
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1_score}")

Accuracy: 0.75, Precision: 0.5888617891563539, Recall: 0.536684369082686, F1: 0.5378976175501237


In [28]:
print(f"Correct Prediction without Similarity: {correct_top_k_wo_sim}, {correct_top_k_wo_sim / len(df_test)}")
# print(f"Correct Prediction with Similarity: {correct_top_k}, {correct_top_k / len(y_df)}")

Correct Prediction without Similarity: 292, 0.9864864864864865


In [29]:
idx2label = {idx: label for idx, label in enumerate(label2idx)}

In [30]:
# Load embeddings for all train data
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
all_embeddings = similarity_model.encode(df_train.text.to_list(), batch_size=15)

  _torch_pytree._register_pytree_node(


In [31]:
def get_top_k_similar_issues(issues, k=5, threshold=0.5):
    test_embed = similarity_model.encode(issues)
    cos = util.cos_sim(test_embed, all_embeddings)
    topk_values, topk_indices = torch.topk(cos, k=k)
    topk_values = topk_values.cpu().numpy()[0]
    topk_indices = topk_indices.cpu().numpy()[0]
    
    similar_issues = []
    
    for idx, sim_score in zip(topk_indices, topk_values):
        if sim_score >= threshold:
            similar_issues.append([idx, sim_score])

    return similar_issues

In [32]:
similarity_predictions = [
    get_top_k_similar_issues(issue.text, k=3)
    for _, issue in df_test.iterrows() 
]

In [33]:
import pathlib
import os
import json

In [34]:
issues_dir = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/issue_data"
all_issues = os.listdir(issues_dir)

In [59]:
vm_users = ['pshipton', 'keithc-ca', 'gacholio', 'tajila', 'babsingh', 'JasonFengJ9', 'fengxue-IS', 'hangshao0', 'theresa.t.mammarella', 'ChengJin01', 'singh264', 'thallium', 'ThanHenderson']
jvmti_users = ['gacholio', 'tajila', 'babsingh', 'fengxue-IS']
jclextensions_users = ['JasonFengJ9', 'pshipton', 'keithc-ca']
test_users = ['LongyuZhang', 'annaibm', 'sophiaxu0424', 'KapilPowar', 'llxia']
build_users = ['adambrousseau', 'mahdipub']
gc_users = ['dmitripivkine', 'amicic', 'kangyining', 'LinHu2016']

# Putting them in dictionaries
components = {
    'comp:vm': vm_users,
    'comp:jvmti': jvmti_users,
    'comp:jclextensions': jclextensions_users,
    'comp:test': test_users,
    'comp:build': build_users,
    'comp:gc': gc_users
}

expected_users = [user for user_list in components.values() for user in user_list]

In [69]:
def get_contribution_data(issue_number):
    contributions = {}
    issue_file = f"{issue_number}.json"
    
    if issue_file in all_issues:
        with open(os.path.join(issues_dir, issue_file), "r") as file:
            issue = json.load(file)      

            assignees = issue["assignees"]
            assignee_logins = (
                [assignee["login"] for assignee in assignees] if len(assignees) > 0 else []
            )

            contributions["direct_assignment"] = assignee_logins

            timeline = issue["timeline_data"]
            pull_requests = []
            commits = []
            discussion = []

            for timeline_event in timeline:
                event = timeline_event["event"]

                if event == "cross-referenced" and timeline_event["source"]["issue"].get("pull_request", None):
                    actor = timeline_event["actor"]["login"]
                    pull_requests.append(actor)

                if event == "referenced" and timeline_event["commit_url"]:
                    actor = timeline_event["actor"]["login"]
                    commits.append(actor)

                if event == "commented":
                    actor = timeline_event["actor"]["login"]
                    discussion.append(actor)
            
            contributions["direct_assignment"] = assignee_logins
            contributions["pull_request"] = pull_requests
            contributions["commits"] = commits
            contributions["discussion"] = discussion

    return contributions
         


def get_historical_contributors(similar_issues, predicted_component_ids):
    user_contribution_counts = {}
    base_points = 1

    for issue_index, sim_score in similar_issues:
        print(base_points)
        issue = df_train.iloc[issue_index]

        print(f"Issue label: {label2idx[issue.component]} -- Predicted: {predicted_component_ids}")

        if label2idx[issue.component] not in predicted_component_ids:
            print(f"Skipping issue as label id {label2idx[issue.component]} did not match with any of {predicted_component_ids}")
            continue

        issue_number = issue.issue_number
        contributors = get_contribution_data(issue_number)

        for _, users in contributors.items():
            for user in users:
                if user not in expected_users:
                    print(f"Skipping: {user}")
                    continue            
                
                if user in components[issue.component]:
                    user_contribution_counts[user] = user_contribution_counts.get(user, 0) + base_points * 1.25
                else:
                    user_contribution_counts[user] = user_contribution_counts.get(user, 0) + base_points
        
        base_points /= 2.0
    
    user_contribution_counts = sorted(user_contribution_counts.items(), key=lambda x: x[1], reverse=True)
    print(user_contribution_counts)

In [72]:
test_idx = 176
historical_data = similarity_predictions[test_idx]
predicted_labels = topk_preds[test_idx]
print(f"Real label: {df_test.iloc[test_idx].component}")
print(f"Predicted labels: {[idx2label[pred] for pred in predicted_labels]}")
print(historical_data)
print("Issue number:", df_test.iloc[test_idx].issue_number)
print("Component:", df_test.iloc[test_idx].component)
get_historical_contributors(historical_data, predicted_labels)
# print(df_train.iloc[historical_data])

# print(df_test.iloc[test_idx])
# print(all_labels_cat[test_idx])
# print(all_preds_cat[test_idx])

Real label: comp:jvmti
Predicted labels: ['comp:vm', 'comp:gc', 'comp:jvmti']
[[255, 0.71805286], [2073, 0.69125384], [2081, 0.6891853]]
Issue number: 17520
Component: comp:jvmti
1
Issue label: 2 -- Predicted: [2 3 5]
0.5
Issue label: 2 -- Predicted: [2 3 5]
Skipping: dipak-bagadiya
0.25
Issue label: 2 -- Predicted: [2 3 5]
[('babsingh', 16.875), ('pshipton', 3.125), ('fengxue-IS', 1.5625), ('gacholio', 1.25), ('thallium', 0.3125), ('tajila', 0.3125)]


In [43]:
np.mean(all_preds_cat == all_labels_cat)

0.75

In [100]:
similar_issues = get_top_k_similar_issues(df_test.iloc[0].text)

In [101]:
similar_issues

[[1220, 0.94563663],
 [1899, 0.9409791],
 [2083, 0.91344625],
 [1610, 0.9121801],
 [544, 0.9099889]]

In [78]:
df_train.iloc[1610]

Unnamed: 0.1                                                14391
Unnamed: 0                                                     90
issue_number                                                14396
issue_url       https://github.com/eclipse-openj9/openj9/issue...
issue_title     testSoftMxDisclaimMemory_GC_3_FAILED : Segment...
description     Failure link\r\n------------\r\n\r\nFrom an in...
issue_state                                                closed
creator                                               JasonFengJ9
comments        <comment><user>dmitripivkine</user><body>This ...
owner                                                         NaN
labels                      comp:gc, test failure, blocker, jdk18
topic_id                                                       18
topic_label                               Java Crashes and Errors
component                                                 comp:gc
text            Title: testSoftMxDisclaimMemory_GC_3_FAILED : ...
topic_hot 

In [76]:
df_test.iloc[0]

Unnamed: 0.1                                                16586
Unnamed: 0                                                     95
issue_number                                                16593
issue_url       https://github.com/eclipse-openj9/openj9/issue...
issue_title     JDK19 serviceability_jvmti_j9_1_FAILED Segment...
description     Failure link\r\n------------\r\n\r\nFrom [an i...
issue_state                                                closed
creator                                               JasonFengJ9
comments        <comment><user>pshipton</user><body>@JasonFeng...
owner                                                    babsingh
labels                 comp:vm, comp:gc, test failure, os:windows
topic_id                                                       18
topic_label                               Java Crashes and Errors
component                                                 comp:vm
text            Title: JDK19 serviceability_jvmti_j9_1_FAILED ...
topic_hot 

In [30]:
df_test_filtered = df_test[df_test["owner"].notna()]

In [63]:
df_val_filtered = df_val[df_val["owner"].notna()]

In [64]:
len(df_val_filtered)

135

In [62]:
len(df_test_filtered)

101

In [65]:
df_test_filtered.component.value_counts()

component
comp:vm           46
comp:jit          26
comp:test          8
comp:build         6
comp:jitserver     5
comp:infra         5
comp:gc            3
comp:doc           2
Name: count, dtype: int64

In [82]:
df_test_filtered.iloc[0]["issue_number"]

11139