# Import Modules

In [1]:
import re
from pprint import pprint
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from loguru import logger
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, PreTrainedTokenizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

tqdm.pandas()


# Prepare Dataset

In [2]:
def component_split(x):
    x_split = str(x).split(",")

    for s in x_split:
        if "comp:" in s.lower():
            return s.strip()
    return None

In [3]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/openj9_processed.csv"

raw_df = pd.read_csv(dataset_path)
print(len(raw_df))
raw_df = raw_df.rename(columns={"assignees": "owner", "issue_body": "description"})
# df = df[df["owner"].notna()]

7758


In [4]:
special_tokens = {
    "hex": "[HEX]",
    "timestamp": "[TIMESTAMP]",
    "numeric": "[NUMERIC]",
    "param": "[PARAM_VALUE]",
    "version": "[VERSION]",
    "ip": "[IP_ADDRESS]",
    "filepath": "[FILE_PATH]",
    "url": "[URL]"
}


def clean_issue_description(text):
    text = str(text)
    cleaned_text = text.strip()
    cleaned_text = re.sub(r'(https?|ftp):\/\/[^\s/$.?#].[^\s]*', special_tokens["url"], cleaned_text)
    cleaned_text = re.sub(r'0x[\da-fA-F]+', special_tokens["hex"], cleaned_text)
    cleaned_text = re.sub(r'\b[0-9a-fA-F]{16}\b', special_tokens["hex"], cleaned_text)
    cleaned_text = re.sub(r'\b.*/([^/]+)', rf"{special_tokens['filepath']}/\1", cleaned_text)
    cleaned_text = re.sub(r"\b([A-Za-z]:)?.*\\([^\\]+)", rf"{special_tokens['filepath']}/\2", cleaned_text)
    cleaned_text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', special_tokens["ip"], cleaned_text)
    cleaned_text = re.sub(r"(?<!\w)\d+\.\d+\.\d+(\.\d+)*(_\d+)?(-[a-zA-Z]+\d*)?(?!\w)", special_tokens["version"], cleaned_text)
    cleaned_text = re.sub(r'\b\d{2}:\d{2}:\d{2}:\d{4,} GMT\b', special_tokens["timestamp"], cleaned_text)
    cleaned_text = re.sub(r'\b\d{2}:\d{2}:\d{2}(\.\d{2,3})?\b', special_tokens["timestamp"], cleaned_text)
    cleaned_text = re.sub(r'\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z\b', special_tokens["timestamp"], cleaned_text)
    cleaned_text = re.sub(r'\b[-+]?\d*\.\d+([eE][-+]?\d+)?\b', special_tokens["numeric"], cleaned_text)
    cleaned_text = re.sub(r'\d{4,}\b', special_tokens["numeric"], cleaned_text)
    cleaned_text = re.sub(r'=\s*-?\d+', f'= {special_tokens["param"]}', cleaned_text)
    cleaned_text = re.sub(r'```', "", cleaned_text)
    cleaned_text = re.sub(r'-{3,}', "", cleaned_text)
    cleaned_text = re.sub(r'[\*#=+\-]{3,}', "", cleaned_text)
    
    for special_token in special_tokens.values():
        sp_token = special_token[1:-1]
        cleaned_text = re.sub(rf'\[{sp_token}\]\s*(\[{sp_token}\]\s*)+', f"{special_token}", cleaned_text)
        
    cleaned_text = re.sub(r'(\r?\n)+', "\n", cleaned_text)
    cleaned_text = re.sub(r'(?![\r\n])\s+', " ", cleaned_text)
    cleaned_text = cleaned_text.strip()
    

    return cleaned_text

def clean_data(df):
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', regex=True)
    df["text"] = df['text'].str.replace(" +", " ", regex=True)
    df["text"] = df["text"].apply(clean_issue_description)

    return df
    
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["labels"].notna()]
    print(f"All issues: {len(df)}")
    print(f"Excluding pull: {len(df)}")
    df = df[~df["issue_url"].str.contains("/pull/")]
    
    df["component"] = df["labels"].apply(component_split)
    
    df["text"] = df.progress_apply(
            lambda x: "Title: "
            + str(x["issue_title"])
            # + "\nIssue Labels: "
            # + str(x["labels"])
            # + "\nIssue Topic: "
            # + str(x["topic_label"])
            + "\nDescription: "
            + str(x["description"]),
            axis=1,
        )
    
    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    # df["owner_id"] = pd.factorize(df["assignees"])[0]

    return df

df = prepare_dataframe(raw_df)
df = clean_data(df)
df = df.sort_values(by="issue_number")

num_issues = len(df)

print(f"Total number of issues: {num_issues}")

All issues: 7348
Excluding pull: 7348


100%|██████████| 7348/7348 [00:00<00:00, 100232.03it/s]




Total number of issues: 7348


In [5]:
components = set()

In [6]:
for val in df["component"].values:
    if val is None:
        continue
    
    split = val.split(",")
    
    for s in split:
        components.add(s.strip())

In [7]:
component_values = df["component"].value_counts()
filtered_components = component_values.index[component_values >= 20]

df = df[df["component"].isin(filtered_components)]

In [8]:
df = df.sort_values(by="issue_number")

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
components = ["comp:vm", "comp:jvmti", "comp:jclextensions", "comp:test", "comp:build", "comp:gc"]
filtered_df = df[df["component"].isin(components)]

# Splitting parition by size
# total_data = len(filtered_df)
# train_size = int(total_data*0.9)
# test_size = total_data - train_size
# df_train = filtered_df[:train_size]
# df_test = filtered_df[train_size:]

df_train, df_test = train_test_split(filtered_df, test_size=0.2)
print(len(df_train), len(df_test))

2472 618


In [11]:
# df_train.to_csv("/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/component_training/df_train.csv")
# df_test.to_csv("/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/component_training/df_test.csv")

In [33]:
df_train = pd.read_csv("/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/component_training/df_train_summarized.csv")

In [34]:
df_test = pd.read_csv("/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/component_training/df_test.csv")

In [35]:
set(df_train.issue_number).intersection(set(df_test.issue_number))

set()

In [36]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,issue_number,issue_title,description,issue_url,issue_state,creator,labels,owner,component,text
0,6838,17078,[JDK20/FFI_Jtreg] Crash detected in StdLibTest,The crashed was detected in https://github.com...,https://github.com/eclipse-openj9/openj9/issue...,closed,ChengJin01,"comp:vm, project:panama, test failure, jdk20",ChengJin01,comp:vm,Title: [JDK20/FFI_Jtreg] Crash detected in Std...
1,6583,16503,cmdLineTester_criu_keepCheckpoint_2_FAILED org...,Failure link\r\n------------\r\n\r\nFrom [an i...,https://github.com/eclipse-openj9/openj9/issue...,open,JasonFengJ9,"comp:test, test failure, criu",,comp:test,Title: cmdLineTester_criu_keepCheckpoint_2_FAI...
2,2018,5822,"Java 13, deprecate -Xverify:none and -noverify",The following OpenJDK Java 13 change in the re...,https://github.com/eclipse-openj9/openj9/issue...,closed,pshipton,"comp:vm, doc:externals, jdk13",theresa-m,comp:vm,"Title: Java 13, deprecate -Xverify:none and -n..."
3,44,116,Travis PR builds broken,Travis pull request builds are broken. They ti...,https://github.com/eclipse-openj9/openj9/issue...,closed,dnakamura,comp:build,tajila,comp:build,Title: Travis PR builds broken\nDescription: T...
4,2563,7364,ppc64le Calendar.getInstance incorrect timezone,https://ci.eclipse.org/openj9/job/Test_openjdk...,https://github.com/eclipse-openj9/openj9/issue...,closed,pshipton,"comp:test, test failure",,comp:test,Title: ppc64le Calendar.getInstance incorrect ...


In [37]:
# df_train["description"] = df_train["description"].progress_apply(clean_issue_description)
df_train["text"] = df_train.progress_apply(
        lambda x: "Bug Title: "
        + str(x["issue_title"])
        # + "\nIssue Labels: "
        # + str(x["labels"])
        # + "\nIssue Topic: "
        # + str(x["topic_label"])
        # + "\nBug Summary: "
        # + str(x["summary"]),
        + "\nBug Description: "
        + str(x["description"]),
        axis=1,
    )

100%|██████████| 2472/2472 [00:00<00:00, 110620.19it/s]


In [38]:
# df_train["text"] = df_train["text"].progress_apply(clean_issue_description)

In [39]:
df_train.component.value_counts()

component
comp:vm               1433
comp:test              476
comp:build             316
comp:gc                187
comp:jclextensions      37
comp:jvmti              23
Name: count, dtype: int64

In [40]:
df_test.component.value_counts()

component
comp:vm               360
comp:test              99
comp:build             89
comp:gc                58
comp:jclextensions      7
comp:jvmti              5
Name: count, dtype: int64

In [41]:
assert set(df_train.component.unique()) == set(df_test.component.unique())

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
# Generate component ids
label2idx = {label: idx for idx, label in enumerate(sorted(list(df_train["component"].unique())))}
df_train["component_id"] = [label2idx[component] for component in df_train["component"].values]
df_test["component_id"] = [label2idx[component] for component in df_test["component"].values]

df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=77, shuffle=True)

In [44]:
len(df_train), len(df_val)

(1977, 495)

In [45]:
print("Dataset size", len(df_train), len(df_val), len(df_test))

Dataset size 1977 495 618


# Prepare PyTorch Dataset

In [46]:
len(df_train.component.unique())

6

In [47]:
class TriageDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        feature: str = "text",
        target: str = "component_id",
    ):
        logger.debug("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = [label for label in df[target]]
        # self.embedding_model = SentenceTransformer("BAAI/bge-small-en")
        logger.debug("Tokenizing texts...")
        self.texts = [
            (row[feature], self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            ))
            for _, row in df.iterrows()
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


# Training Parameters

In [48]:
assert set(df_test.component.unique()) == set(df_val.component.unique()) == set(df_train.component.unique())

In [49]:
num_classes = len(df_train["component"].unique())
print(num_classes)

6


In [50]:
from triagerx.loss.loss_functions import *
from triagerx.model.lbtp_bilstm import LBTPBiLSTM
from triagerx.model.lbt_p_deberta import LBTPDeberta

In [51]:
class_counts = np.bincount(df_train["component_id"])
num_samples = sum(class_counts)
labels = df_train["component_id"].to_list() # corresponding labels of samples

class_weights = [num_samples/class_counts[i] for i in range(len(class_counts))]
weights = [class_weights[labels[i]] for i in range(int(num_samples))]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))

# Define hyperparameters
learning_rate = 1e-5
epochs = 50
batch_size = 10
unfrozen_layers=5


model = LBTPDeberta(
    len(df_train.component_id.unique()), 
    unfrozen_layers=unfrozen_layers, 
    dropout=0.2, 
    base_model="microsoft/deberta-base"
)

criterion = CombinedLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.001)
# scheduler = ReduceLROnPlateau(optimizer, "min", patience=2, factor=0.1, threshold=1e-10)

run_name = f"comp_raw_data_{model.__class__.__name__}_u{unfrozen_layers}_{num_classes}_classes_{criterion.__class__.__name__}"
# weights_load_location = f"/work/disa_lab/projects/triagerx/models/deberta_component_prediction.pt"
weights_save_location = f"/work/disa_lab/projects/triagerx/models/{run_name}.pt"


# Load best checkpoint
# model.load_state_dict(torch.load(weights_load_location))

In [52]:
tokenizer = model.tokenizer()

In [53]:
# special_tokens_dict = {"additional_special_tokens": list(special_tokens.values())}
# num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# model.base_model.resize_token_embeddings(len(tokenizer))

In [54]:
# Prepare torch dataset from train and validation splits
train = TriageDataset(df_train, tokenizer)
val = TriageDataset(df_val, tokenizer)

[32m2024-05-12 22:02:13.348[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-05-12 22:02:13.350[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m
[32m2024-05-12 22:02:17.792[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-05-12 22:02:17.794[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m


In [33]:
import wandb
wandb.init(
    # Set the project where this run will be logged
    project="openj9", 
    # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
    name=run_name, 
    # Track hyperparameters and run metadata
    config={
    "learning_rate": learning_rate,
    "architecture": "Deberta-LBT-P",
    "dataset": "openj9",
    "epochs": epochs,
})

[34m[1mwandb[0m: Currently logged in as: [33mafifaniks[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111901166926449, max=1.0…

In [55]:
train_dataloader = DataLoader(
    dataset=train,
    batch_size=batch_size,
    shuffle=False if sampler else True,
    sampler=sampler,
)
val_dataloader = DataLoader(val, batch_size=batch_size)

In [56]:
total_steps = len(train_dataloader) * epochs
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [57]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_loss = float("inf")

if torch.cuda.is_available():
    logger.debug(f"Selected compute device: {device}")
    model = model.cuda()
    criterion = criterion.cuda()

[32m2024-05-12 22:02:25.223[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mSelected compute device: cuda[0m


In [37]:
def log_step(
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        train_data,
        validation_data,
        topk,
    ):
        log = f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                    | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                    | Val Loss: {total_loss_val / len(validation_data): .3f} \
                    | Val Accuracy: {total_acc_val / len(validation_data): .3f} \
                    | Top 3: {topk} \
                    | Precision: {precision: .3f} \
                    | Recall: {recall: .3f} \
                    | F1-score: {f1_score: .3f}"

        logger.info(log)
        wandb.log({
            "train_acc": total_acc_train / len(train_data), 
            "train_loss": total_loss_train / len(train_data),
            "val_acc": total_acc_val / len(validation_data),
            "val_loss": total_loss_val / len(validation_data),
            "top3_acc": topk,
            "precision": precision,
            "recall": recall,
            "f1-score": f1_score
        })

# Training Loop

In [58]:
model = model.cuda()

In [39]:
patience = 10
patience_counter = 0

for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader, desc="Training Steps"):
        # print(train_input)
        optimizer.zero_grad()

        train_label = train_label.to(device)
        mask = train_input[1]["attention_mask"].squeeze(1).to(device)
        input_id = train_input[1]["input_ids"].squeeze(1).to(device)
        tok_type = train_input[1]["token_type_ids"].squeeze(1).to(device)
        # repr = train_input[2].to(device)
        # print(tok_type.shape, input_id.shape, mask.shape)
        # print(repr.dtype, input_id.dtype, mask.dtype)

        output = model(input_id, mask, tok_type)

        batch_loss = criterion(output, train_label.long())
        total_loss_train += batch_loss.item()

        output = torch.sum(torch.stack(output), 0)
        acc = (output.argmax(dim=1) == train_label).sum().item()
        
        total_acc_train += acc

        batch_loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

    total_acc_val = 0
    total_loss_val = 0
    correct_top_k = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():

        for val_input, val_label in tqdm(val_dataloader, desc="Validation Steps"):
            val_label = val_label.to(device)
            mask = val_input[1]["attention_mask"].squeeze(1).to(device)
            input_id = val_input[1]["input_ids"].squeeze(1).to(device)
            tok_type = val_input[1]["token_type_ids"].squeeze(1).to(device)
            # repr = val_input[2].to(device)

            output = model(input_id, mask, tok_type)

            batch_loss = criterion(output, val_label.long())
            total_loss_val += batch_loss.item()

            output = torch.sum(torch.stack(output), 0)
            _, top_k_predictions = output.topk(3, 1, True, True)

            top_k_predictions = top_k_predictions.t()

            correct_top_k += (
                top_k_predictions.eq(
                    val_label.view(1, -1).expand_as(top_k_predictions)
                )
                .sum()
                .item()
            )

            acc = (output.argmax(dim=1) == val_label).sum().item()

            all_preds.append(output.argmax(dim=1).cpu().numpy())
            all_labels.append(val_label.cpu().numpy())

            total_acc_val += acc

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    precision, recall, f1_score, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="macro"
    )

    top10 = correct_top_k / len(df_val)

    log_step(
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        df_train,
        df_val,
        top10,
    )

    val_loss = total_loss_val / len(df_val)

    if val_loss < best_loss:
        patience_counter = 0
        logger.success("Found new best model. Saving weights...")
        torch.save(model.state_dict(), weights_save_location)
        best_loss = val_loss
    else:
        patience_counter += 1
        if patience_counter > patience:
            logger.info("Early stopping...")
            break

Training Steps: 100%|██████████| 198/198 [01:10<00:00,  2.80it/s]
Validation Steps: 100%|██████████| 50/50 [00:07<00:00,  6.96it/s]
[32m2024-05-12 17:46:38.447[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_step[0m:[36m23[0m - [1mEpochs: 1 | Train Loss:  1.274                     | Train Accuracy:  0.212                     | Val Loss:  1.037                     | Val Accuracy:  0.222                     | Top 3: 0.6303030303030303                     | Precision:  0.166                     | Recall:  0.220                     | F1-score:  0.138[0m
[32m2024-05-12 17:46:38.451[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m103[0m - [32m[1mFound new best model. Saving weights...[0m
Training Steps: 100%|██████████| 198/198 [01:11<00:00,  2.79it/s]
Validation Steps: 100%|██████████| 50/50 [00:07<00:00,  6.97it/s]
[32m2024-05-12 17:47:58.337[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_step[0m:[36m23[0m - [1mEpochs: 2 | Train Loss:  0.857 

In [40]:
wandb.finish()



VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
f1-score,▁▂▃▅▆▆▇▇▇▇▇▇▇▇▇█▇▇▇▇████████████
precision,▁▂▃▃▅▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇██▇█▇█▇▇▇███
recall,▁▄▄▅▆▇▇▇▇█▇███████▇█▇▇███████▇██
top3_acc,▁▄▅▇████████████████████████████
train_acc,▁▃▅▆▇▇▇█████████████████████████
train_loss,█▆▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▂▃▅▆▇▇▇▇▇▇▇▇▇█████▇████████████
val_loss,█▇▆▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
f1-score,0.65938
precision,0.7484
recall,0.64502
top3_acc,0.9798
train_acc,0.99949
train_loss,0.01301
val_acc,0.7899
val_loss,0.41259


# Evaluation

In [59]:
weights_load_location = weights_save_location

In [60]:
# Load best checkpoint
model.load_state_dict(torch.load(weights_load_location))

<All keys matched successfully>

In [61]:
weights_save_location

'/work/disa_lab/projects/triagerx/models/comp_raw_data_LBTPDeberta_u5_6_classes_CombinedLoss.pt'

In [62]:
# df_test["description"] = df_test["description"].progress_apply(clean_issue_description)
df_test["text"] = df_test.progress_apply(
        lambda x: "Bug Title: "
        + str(x["issue_title"])
        # + "\nIssue Labels: "
        # + str(x["labels"])
        # + "\nIssue Topic: "
        # + str(x["topic_label"])
        # + "\nBug Summary: "
        # + str(x["summary"]),
        + "\nBug Description: "
        + str(x["description"]),
        axis=1,
    )

100%|██████████| 618/618 [00:00<00:00, 63811.33it/s]


In [63]:
test_ds = TriageDataset(df_test, tokenizer)

[32m2024-05-12 22:02:44.118[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-05-12 22:02:44.120[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m


In [64]:
loader = DataLoader(test_ds, 30)

In [65]:
total_acc_val = 0
total_loss_val = 0
correct_top_k = 0
correct_top_k_wo_sim = 0

all_preds = []
all_labels = []
device="cuda"

model = model.cuda()

with torch.no_grad():

    for val_input, val_label in loader:
        val_label = val_label.to(device)
        mask = val_input[1]["attention_mask"].squeeze(1).to(device)
        input_id = val_input[1]["input_ids"].squeeze(1).to(device)
        tok_type = val_input[1]["token_type_ids"].squeeze(1).to(device)

        output = model(input_id, mask, tok_type)

        output = torch.sum(torch.stack(output), 0)

        #wo similarity
        _, top_k_wo_sim = output.topk(1, 1, True, True)

        top_k_wo_sim = top_k_wo_sim.t()

        correct_top_k_wo_sim += (
            top_k_wo_sim.eq(
                val_label.view(1, -1).expand_as(top_k_wo_sim)
            )
            .sum()
            .item()
        )

        all_preds.append(output.argmax(dim=1).cpu().numpy())
        all_labels.append(val_label.cpu().numpy())

In [66]:
print(f"Correct Prediction without Similarity: {correct_top_k_wo_sim}, {correct_top_k_wo_sim / len(df_test)}")
# print(f"Correct Prediction with Similarity: {correct_top_k}, {correct_top_k / len(y_df)}")

Correct Prediction without Similarity: 469, 0.7588996763754046


In [67]:
all_preds_np = np.concatenate(all_preds)
all_labels_np = np.concatenate(all_labels)

# Result Analysis

In [68]:
set(all_preds_np)

{0, 1, 2, 3, 4, 5}

In [69]:
print(classification_report(all_labels_np, all_preds_np))

              precision    recall  f1-score   support

           0       0.73      0.79      0.76        89
           1       0.69      0.53      0.60        58
           2       0.50      0.29      0.36         7
           3       1.00      0.20      0.33         5
           4       0.56      0.67      0.61        99
           5       0.84      0.83      0.84       360

    accuracy                           0.76       618
   macro avg       0.72      0.55      0.58       618
weighted avg       0.77      0.76      0.76       618



In [52]:
idx2label = {
    row["owner_id"]: row["owner"]
    for _, row in y_df.iterrows()
}

labels = y_df.owner_id.to_list()
labels = sorted(set(labels))
labels = [f"{idx}: {idx2label[idx]}" for idx in labels]

NameError: name 'y_df' is not defined

In [None]:
def get_topic_distribution(owner):
    print("Training topic distribution")
    print("=======================================")
    print(X_df[X_df.owner == owner].topic_label.value_counts())

    print("\n\nTesting topic distribution")
    print("=======================================")
    print(y_df[y_df.owner == owner].topic_label.value_counts())

In [None]:
X_df.topic_label.value_counts()

Chrome Tab and Window Behavior Issues     947
Build failures                            840
Chrome stability issues                   487
Layout Testing Issues                     400
Chrome crash reports                      391
Security and SSL issues                   372
Input and keyboard issues                 370
Webpage rendering regression issues       357
Chrome sync issues                        354
Shill WiFi configuration                  337
iOS File Issues                           321
Data Enhancement                          298
Touch and Scroll Issues                   273
DevTools Crashes                          260
GPU rendering issues                      235
Memory Leaks in WebCore and Blink         220
Performance testing issues in Chromium    197
WebRTC audio/video issues                 184
Bookmark issues                           174
Performance Regression in Blink            13
Name: topic_label, dtype: int64

In [None]:
get_topic_distribution("a...@chromium.org")

Training topic distribution
Chrome Tab and Window Behavior Issues     10
Webpage rendering regression issues        5
Memory Leaks in WebCore and Blink          4
Chrome stability issues                    4
DevTools Crashes                           3
Data Enhancement                           3
Input and keyboard issues                  3
Touch and Scroll Issues                    2
Bookmark issues                            1
Security and SSL issues                    1
Chrome sync issues                         1
Layout Testing Issues                      1
Build failures                             1
iOS File Issues                            1
Performance testing issues in Chromium     1
Name: topic_label, dtype: int64


Testing topic distribution
Chrome Tab and Window Behavior Issues     17
Webpage rendering regression issues        7
DevTools Crashes                           6
iOS File Issues                            6
Touch and Scroll Issues                    4
Input and k