In [1]:
import re
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from loguru import logger
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, PreTrainedTokenizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

  _torch_pytree._register_pytree_node(


# Load Test Data

In [36]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/openj9_processed.csv"
df = pd.read_csv(dataset_path)

In [39]:
df.iloc[1121]

issue_number                                                 3185
issue_title     JTReg test fail : java/lang/invoke/condy/Condy...
issue_body      Sub-test failure at https://github.com/ibmrunt...
issue_url       https://github.com/eclipse-openj9/openj9/issue...
issue_state                                                closed
creator                                                 ben-walsh
labels                                               test failure
assignees                                               smlambert
component                                                     NaN
Name: 1121, dtype: object

### Filter Developers

In [3]:
vm_users = ['pshipton', 'keithc-ca', 'gacholio', 'tajila', 'babsingh', 'JasonFengJ9', 'fengxue-IS', 'hangshao0', 'theresa.t.mammarella', 'ChengJin01', 'singh264', 'thallium', 'ThanHenderson']
jvmti_users = ['gacholio', 'tajila', 'babsingh', 'fengxue-IS']
jclextensions_users = ['JasonFengJ9', 'pshipton', 'keithc-ca']
test_users = ['LongyuZhang', 'annaibm', 'sophiaxu0424', 'KapilPowar', 'llxia']
build_users = ['adambrousseau', 'mahdipub']
gc_users = ['dmitripivkine', 'amicic', 'kangyining', 'LinHu2016']

# Putting them in dictionaries
components = {
    'comp:vm': vm_users,
    'comp:jvmti': jvmti_users,
    'comp:jclextensions': jclextensions_users,
    'comp:test': test_users,
    'comp:build': build_users,
    'comp:gc': gc_users
}

expected_users = [user for user_list in components.values() for user in user_list]

In [32]:
df = df[df["owner"].isin(expected_users)]
print("Total issues after developer filtering:", len(df))

KeyError: 'owner'

# Load Models

In [5]:
from triagerx.model.lbt_p_deberta import LBTPDeberta
from triagerx.model.lbtp_bilstm import LBTPBiLSTM
from triagerx.loss.loss_functions import *

In [6]:
len(df_test.component_id.unique())

6

In [7]:
lbl2idx = {'ChengJin01': 0,
 'JasonFengJ9': 1,
 'LinHu2016': 2,
 'LongyuZhang': 3,
 'amicic': 4,
 'babsingh': 5,
 'dmitripivkine': 6,
 'fengxue-IS': 7,
 'gacholio': 8,
 'hangshao0': 9,
 'keithc-ca': 10,
 'llxia': 11,
 'pshipton': 12,
 'tajila': 13,
 'thallium': 14}

### Prepare Tokenizer

In [8]:
import yaml

In [9]:
with open("/home/mdafifal.mamun/notebooks/triagerX/triagerx/config/config.yaml", "r") as file:
    configs = yaml.safe_load(file)

In [10]:
tokenizer_config = configs["tokenizer"]

In [11]:
special_tokens = {
    "hex": "[HEX]",
    "timestamp": "[TIMESTAMP]",
    "float": "[FLOAT_VALUE]",
    "param": "[PARAM_VALUE]"
}

tokenizer = AutoTokenizer.from_pretrained(tokenizer_config["model_name"])
special_tokens_dict = {"additional_special_tokens": list(special_tokens.values())}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [12]:
df_test = df_test[df_test.owner.isin(list(lbl2idx.keys()))]

In [13]:
df_test["owner_id"] = df_test["owner"].apply(lambda owner: lbl2idx[owner])

In [14]:
def get_trained_model(weight_path: str, num_classes, unfrozen_layers, dropout, base_model, tokenizer):
    model = LBTPDeberta(
        output_size=num_classes, 
        unfrozen_layers=unfrozen_layers, 
        dropout=dropout, 
        base_model=base_model
    )

    model.base_model.resize_token_embeddings(len(tokenizer))

    model.load_state_dict(torch.load(weight_path))

    return model

In [15]:
component_model_weights = "/work/disa_lab/projects/triagerx/models/LBTPDeberta_6_classes_CombinedLoss.pt"
developer_model_weights = "/home/mdafifal.mamun/notebooks/triagerX/LBTPDeberta_u5_cv9_classes15_CombinedLoss.pt"

In [16]:
component_model = get_trained_model(
    weight_path=component_model_weights,
    num_classes=len(df_test.component_id.unique()), 
    unfrozen_layers=5, 
    dropout=0.2, 
    base_model="microsoft/deberta-base",
    tokenizer=tokenizer
)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [17]:
developer_model = get_trained_model(
    weight_path=developer_model_weights,
    num_classes=15, 
    unfrozen_layers=5, 
    dropout=0.2, 
    base_model="microsoft/deberta-large",
    tokenizer=tokenizer
)

In [18]:
similarity_model = SentenceTransformer('all-mpnet-base-v2')
all_embeddings = similarity_model.encode(df_test.text.to_list(), batch_size=15)

In [19]:
tokenizer.tokenize("yesy")

['yes', 'y']

In [20]:
from triagerx.system.triagerx import TriagerX

In [21]:
trx = TriagerX(
    developer_prediction_model=developer_model,
    component_prediction_model=component_model,
    similarity_model=similarity_model,
    tokenizer=tokenizer,
    tokenizer_config=tokenizer_config,
    issues_path=""
)

In [28]:
df_test.iloc[11]

Unnamed: 0.1                                                 6944
Unnamed: 0                                                   6944
issue_number                                                17307
issue_title         [JDK20] GetStackTraceNotSuspendedStress Fails
description     > serviceability/jvmti/stress/StackTrace/NotSu...
issue_url       https://github.com/eclipse-openj9/openj9/issue...
issue_state                                                closed
creator                                                  babsingh
labels                                                    comp:gc
owner                                                    babsingh
component                                                 comp:gc
text            Title: [JDK20] GetStackTraceNotSuspendedStress...
topic_id                                                       19
topic_probs     [0.014829874038696289, 0.0693783089518547, 0.0...
topic_label                   JVMTI Serviceability Testing Issues
component_

In [29]:
test_idx = 11

recommendations = trx.get_recommendation(df_test.iloc[test_idx].text)
print(recommendations[0])
print(df_test.iloc[test_idx].owner_id)
print("=======================")
print(recommendations[1])
print(df_test.iloc[test_idx].component_id)

torch.return_types.topk(
values=tensor([[23.0869,  6.4170,  4.3018]]),
indices=tensor([[ 5,  1, 14]]))
5
torch.return_types.topk(
values=tensor([[ 4.0076, -1.1747, -2.0426]]),
indices=tensor([[2, 3, 1]]))
3


In [23]:
df_test.iloc[10].owner_id

10