# Import Modules

In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from loguru import logger
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, PreTrainedTokenizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix

tqdm.pandas()


  _torch_pytree._register_pytree_node(


# Prepare Dataset

In [2]:
def component_split(x):
    x_split = str(x).split(",")

    for s in x_split:
        if "comp:" in s.lower():
            return s.strip()
    return None

In [3]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/openj9/openj9_topic_all_issues.csv"

df = pd.read_csv(dataset_path)
print(len(df))
df = df.rename(columns={"assignees": "owner", "issue_body": "description"})
# df = df[df["owner"].notna()]

def clean_data(df):
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', regex=True)
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    return df
    
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["labels"].notna()]
    print(f"All issues: {len(df)}")
    df = df[~df["issue_url"].str.contains("/pull/")]
    print(f"Excluding pull: {len(df)}")
    df["component"] = df["labels"].apply(component_split)
    
    df["text"] = df.progress_apply(
            lambda x: "Title: "
            + str(x["issue_title"])
            # + "\nIssue Labels: "
            # + str(x["labels"])
            + "\nIssue Topic: "
            + str(x["topic_label"])
            + "\nDescription: "
            + str(x["description"]),
            axis=1,
        )
    
    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    # df["owner_id"] = pd.factorize(df["assignees"])[0]

    return df

df = prepare_dataframe(df)
df = clean_data(df)
df = df.sort_values(by="issue_number")

num_issues = len(df)

print(f"Total number of issues: {num_issues}")

18278
All issues: 16342
Excluding pull: 6990


100%|██████████| 6990/6990 [00:00<00:00, 84917.97it/s]


Total number of issues: 6990


In [4]:
df["topic_hot"] = pd.get_dummies(df["topic_id"]).values.tolist()

In [5]:
components = set()

In [6]:
for val in df["component"].values:
    if val is None:
        continue
    
    split = val.split(",")
    
    for s in split:
        components.add(s.strip())

In [8]:
component_values = df["component"].value_counts()
filtered_components = component_values.index[component_values >= 20]

df = df[df["component"].isin(filtered_components)]

In [10]:
def split_by_component(source_df, train_size=0.8):
    grouped = source_df.groupby('component')

    # Initialize two empty lists to store the split datasets
    dataset_1 = []
    dataset_2 = []

    # Iterate over each group
    for _, group_df in grouped:
        # Split the group into two halves
        first_idx = int(len(group_df) * train_size)
        group_half_1 = group_df.iloc[:first_idx]
        group_half_2 = group_df.iloc[first_idx:]
        
        # Append each half to the respective dataset
        dataset_1.append(group_half_1)
        dataset_2.append(group_half_2)

    return pd.concat(dataset_1, ignore_index=True), pd.concat(dataset_2, ignore_index=True)

In [9]:
df = df.sort_values(by="issue_number")

In [48]:
components = ["comp:vm", "comp:jvmti", "comp:jclextensions", "comp:test", "comp:build", "comp:gc"]
filtered_df = df[df["component"].isin(components)]

# Splitting parition by size
total_data = len(filtered_df)
train_size = int(total_data*0.9)
test_size = total_data - train_size
df_train = filtered_df[:train_size]
df_test = filtered_df[train_size:]

print(len(df_train), len(df_test))

2655 296


In [49]:
df_train.component.value_counts()

component
comp:vm               1509
comp:test              509
comp:build             391
comp:gc                201
comp:jclextensions      30
comp:jvmti              15
Name: count, dtype: int64

In [50]:
df_test.component.value_counts()

component
comp:vm               186
comp:test              46
comp:gc                33
comp:build             13
comp:jclextensions     11
comp:jvmti              7
Name: count, dtype: int64

In [51]:
assert set(df_train.component.unique()) == set(df_test.component.unique())

In [52]:
from sklearn.model_selection import train_test_split

In [53]:
# Generate component ids
label2idx = {label: idx for idx, label in enumerate(list(df_train["component"].unique()))}
df_train["component_id"] = [label2idx[component] for component in df_train["component"].values]
df_test["component_id"] = [label2idx[component] for component in df_test["component"].values]

df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42, shuffle=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["component_id"] = [label2idx[component] for component in df_train["component"].values]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["component_id"] = [label2idx[component] for component in df_test["component"].values]


In [54]:
print("Dataset size", len(df_train), len(df_val), len(df_test))

Dataset size 2124 531 296


# Prepare PyTorch Dataset

In [55]:
len(df_train.component.unique())

6

In [56]:
class TriageDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        feature: str = "text",
        target: str = "component_id",
    ):
        logger.debug("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = [label for label in df[target]]
        # self.embedding_model = SentenceTransformer("BAAI/bge-small-en")
        logger.debug("Tokenizing texts...")
        self.texts = [
            (row[feature], self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            ), torch.tensor(row.topic_hot))
            for _, row in df.iterrows()
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


# Transformer Model

In [57]:
class LBTPClassifierTopic(nn.Module):
    def __init__(
        self, output_size, topic_size, unfrozen_layers=4, embed_size=1024, dropout=0.1
    ) -> None:
        super().__init__()
        model_name = "microsoft/deberta-large"
        self.base_model = AutoModel.from_pretrained(
            model_name, output_hidden_states=True
        )
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Freeze embedding layers
        for p in self.base_model.embeddings.parameters():
            p.requires_grad = False

        # Freeze encoder layers till last {unfrozen_layers} layers
        for i in range(0, self.base_model.config.num_hidden_layers - unfrozen_layers):
            for p in self.base_model.encoder.layer[i].parameters():
                p.requires_grad = False

        filter_sizes = [3, 4, 5, 6]
        self._num_filters = 256
        self._max_tokens = 512
        self._embed_size = embed_size
        self.unfrozen_layers = unfrozen_layers
        self.conv_blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        nn.Sequential(
                            nn.Conv2d(1, self._num_filters, (K, embed_size)),
                            nn.BatchNorm2d(self._num_filters),
                            nn.ReLU(),
                            nn.Flatten(),
                            nn.MaxPool1d(self._max_tokens - (K - 1)),
                            nn.Flatten(start_dim=1),
                        )
                        for K in filter_sizes
                    ]
                )
                for _ in range(unfrozen_layers)
            ]
        )

        self.classifiers = nn.ModuleList(
            [
                nn.Linear(
                    len(filter_sizes) * self._num_filters + topic_size, output_size
                )
                for _ in range(unfrozen_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, tok_type, topic_id):
        outputs = []

        base_out = self.base_model(input_ids=input_ids, token_type_ids=tok_type, attention_mask=attention_mask)
        # pooler_out = base_out.last_hidden_state.squeeze(0)
        hidden_states = base_out.hidden_states[-self.unfrozen_layers :]

        for i in range(self.unfrozen_layers):
            batch_size, sequence_length, hidden_size = hidden_states[i].size()
            x = [
                conv(hidden_states[i].view(batch_size, 1, sequence_length, hidden_size))
                for conv in self.conv_blocks[i]
            ]
            # Concatanating outputs of the conv block of different filter sizes
            x = torch.cat(x, dim=1)
            x = self.dropout(x)
            x = torch.cat([x, topic_id], dim=1)
            x = self.classifiers[i](x)

            outputs.append(x)

        return outputs

    def tokenizer(self) -> AutoTokenizer:
        return self._tokenizer


# Loss Functions

In [58]:
class CombineLoss(nn.Module):
    def __init__(self, weights = None) -> None:
        super().__init__()
        self._ce = nn.CrossEntropyLoss(weight=weights)
    def forward(
        self,
        prediction,
        labels
    ) -> torch.Tensor:
        loss = 0

        for i in range(len(prediction)):
            loss += self._ce(prediction[i], labels)
            # print(loss)

        return loss

# Training Parameters

In [59]:
assert set(df_test.component.unique()) == set(df_val.component.unique()) == set(df_train.component.unique())

In [60]:
num_classes = len(df_train["component"].unique())
print(num_classes)

6


In [61]:
class_counts = np.bincount(df_train["component_id"])
num_samples = sum(class_counts)
labels = df_train["component_id"].to_list() # corresponding labels of samples

class_weights = [num_samples/class_counts[i] for i in range(len(class_counts))]
weights = [class_weights[labels[i]] for i in range(int(num_samples))]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))
# weights_load_location = f"/work/disa_lab/projects/triagerx/models/deberta_component_prediction.pt"
weights_save_location = f"/work/disa_lab/projects/triagerx/models/deberta_component_prediction_chrono_10class.pt"

# Define hyperparameters
learning_rate = 1e-5
epochs = 25
batch_size = 10

model = LBTPClassifierTopic(len(df_train.component_id.unique()), topic_size=20, unfrozen_layers=4, dropout=0.2)
# Load best checkpoint
# model.load_state_dict(torch.load(weights_load_location))
criterion = CombineLoss(weights=None)
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.001)
scheduler = ReduceLROnPlateau(optimizer, "min", patience=2, factor=0.1, threshold=1e-10)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [63]:
# Prepare torch dataset from train and validation splits
train = TriageDataset(df_train, model.tokenizer())
val = TriageDataset(df_val, model.tokenizer())

[32m2024-04-21 14:47:54.554[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-04-21 14:47:54.555[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m
[32m2024-04-21 14:47:58.575[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-04-21 14:47:58.578[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m


In [64]:
import wandb

In [65]:
wandb.init(
    # Set the project where this run will be logged
    project="openj9", 
    # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
    name=f"component_prediction_chrono_order_{num_classes}_classes", 
    # Track hyperparameters and run metadata
    config={
    "learning_rate": learning_rate,
    "architecture": "Deberta-LBT-P",
    "dataset": "openj9",
    "epochs": epochs,
})

[34m[1mwandb[0m: Currently logged in as: [33mafifaniks[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [66]:
train_dataloader = DataLoader(
    dataset=train,
    batch_size=batch_size,
    shuffle=False if sampler else True,
    sampler=sampler,
)
val_dataloader = DataLoader(val, batch_size=batch_size)

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_loss = float("inf")

if torch.cuda.is_available():
    logger.debug(f"Selected compute device: {device}")
    model = model.cuda()
    criterion = criterion.cuda()

[32m2024-04-21 14:48:17.786[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mSelected compute device: cuda[0m


In [68]:
def log_step(
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        train_data,
        validation_data,
        topk,
    ):
        log = f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                    | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                    | Val Loss: {total_loss_val / len(validation_data): .3f} \
                    | Val Accuracy: {total_acc_val / len(validation_data): .3f} \
                    | Top 3: {topk} \
                    | Precision: {precision: .3f} \
                    | Recall: {recall: .3f} \
                    | F1-score: {f1_score: .3f}"

        logger.info(log)
        wandb.log({
            "train_acc": total_acc_train / len(train_data), 
            "train_loss": total_loss_train / len(train_data),
            "val_acc": total_acc_val / len(validation_data),
            "val_loss": total_loss_val / len(validation_data),
            "top3_acc": topk,
            "precision": precision,
            "recall": recall,
            "f1-score": f1_score
        })

# Training Loop

In [69]:
model = model.cuda()

In [70]:
patience = 5
patience_counter = 0

for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader, desc="Training Steps"):
        # print(train_input)
        train_label = train_label.to(device)
        mask = train_input[1]["attention_mask"].squeeze(1).to(device)
        input_id = train_input[1]["input_ids"].squeeze(1).to(device)
        tok_type = train_input[1]["token_type_ids"].squeeze(1).to(device)
        repr = train_input[2].to(device)
        # print(tok_type.shape, input_id.shape, mask.shape)
        # print(repr.dtype, input_id.dtype, mask.dtype)

        output = model(input_id, mask, tok_type, repr)

        batch_loss = criterion(output, train_label.long())
        total_loss_train += batch_loss.item()

        output = torch.sum(torch.stack(output), 0)
        acc = (output.argmax(dim=1) == train_label).sum().item()
        
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()

    total_acc_val = 0
    total_loss_val = 0
    correct_top_k = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():

        for val_input, val_label in tqdm(val_dataloader, desc="Validation Steps"):
            val_label = val_label.to(device)
            mask = val_input[1]["attention_mask"].squeeze(1).to(device)
            input_id = val_input[1]["input_ids"].squeeze(1).to(device)
            tok_type = val_input[1]["token_type_ids"].squeeze(1).to(device)
            repr = val_input[2].to(device)

            output = model(input_id, mask, tok_type, repr)

            batch_loss = criterion(output, val_label.long())
            total_loss_val += batch_loss.item()

            output = torch.sum(torch.stack(output), 0)
            _, top_k_predictions = output.topk(3, 1, True, True)

            top_k_predictions = top_k_predictions.t()

            correct_top_k += (
                top_k_predictions.eq(
                    val_label.view(1, -1).expand_as(top_k_predictions)
                )
                .sum()
                .item()
            )

            acc = (output.argmax(dim=1) == val_label).sum().item()

            all_preds.append(output.argmax(dim=1).cpu().numpy())
            all_labels.append(val_label.cpu().numpy())

            total_acc_val += acc

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    precision, recall, f1_score, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="macro"
    )

    top10 = correct_top_k / len(df_val)

    log_step(
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        df_train,
        df_val,
        top10,
    )

    val_loss = total_loss_val / len(df_val)

    if scheduler:
        scheduler.step(val_loss)

    if val_loss < best_loss:
        patience_counter = 0
        logger.success("Found new best model. Saving weights...")
        torch.save(model.state_dict(), weights_save_location)
        best_loss = val_loss
    else:
        patience_counter += 1
        if patience_counter > patience:
            logger.info("Early stopping...")
            break

Training Steps: 100%|██████████| 213/213 [03:16<00:00,  1.09it/s]
Validation Steps: 100%|██████████| 54/54 [00:20<00:00,  2.63it/s]
[32m2024-04-21 14:52:00.777[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_step[0m:[36m23[0m - [1mEpochs: 1 | Train Loss:  0.509                     | Train Accuracy:  0.639                     | Val Loss:  0.515                     | Val Accuracy:  0.621                     | Top 3: 0.9416195856873822                     | Precision:  0.414                     | Recall:  0.536                     | F1-score:  0.452[0m
[32m2024-04-21 14:52:00.781[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m103[0m - [32m[1mFound new best model. Saving weights...[0m
Training Steps: 100%|██████████| 213/213 [03:15<00:00,  1.09it/s]
Validation Steps: 100%|██████████| 54/54 [00:20<00:00,  2.64it/s]
[32m2024-04-21 14:55:40.948[0m | [1mINFO    [0m | [36m__main__[0m:[36mlog_step[0m:[36m23[0m - [1mEpochs: 2 | Train Loss:  0.239 

In [71]:
wandb.finish()



VBox(children=(Label(value='0.005 MB of 0.086 MB uploaded\r'), FloatProgress(value=0.06106246480350695, max=1.…

0,1
f1-score,▁▄▄▅▆▅▇▆▇▇▇▆▇▆▇▅▇▇▆▇█▆
precision,▁▃▃▄▄▄▇▅▅▅█▄▅▅▅▅█▅▅▅█▅
recall,▁▇▇▆▇▅▇▄▇▇▆▆█▇▇▄▇▇▅▇█▅
top3_acc,▁▃▆▇▇▇▆▇▇▇▇▇▇█▇▇▇▇▇▇█▇
train_acc,▁▆▆▇▇▇████████████████
train_loss,█▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_acc,▁▃▅▅▅▆▇█▇▇▇▆▇▇▇██▇████
val_loss,█▆▄▃▃▂▂▁▁▁▂▂▂▂▂▁▁▁▁▁▁▁

0,1
f1-score,0.63073
precision,0.65289
recall,0.61956
top3_acc,0.9887
train_acc,0.9967
train_loss,0.01662
val_acc,0.77966
val_loss,0.30558


# Evaluation

In [72]:
# Load best checkpoint
model.load_state_dict(torch.load(weights_save_location))

<All keys matched successfully>

In [74]:
weights_save_location

'/work/disa_lab/projects/triagerx/models/deberta_component_prediction_chrono_10class.pt'

In [75]:
test_ds = TriageDataset(df_test, model.tokenizer())

[32m2024-04-21 16:15:05.351[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m9[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-04-21 16:15:05.352[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mTokenizing texts...[0m


In [77]:
loader = DataLoader(test_ds, 30)

In [None]:
# Load embeddings for all train data
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
all_embeddings = similarity_model.encode(X_df.issue_title.to_list(), batch_size=15)

  _torch_pytree._register_pytree_node(


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

NameError: name 'X_df' is not defined

In [2]:
def get_top_k_similar_devs(issues, k=5, threshold=0.7):
    test_embed = similarity_model.encode(issues)
    cos = util.cos_sim(test_embed, all_embeddings)
    topk_values, topk_indices = torch.topk(cos, k=k)
    
    similarities = []
    
    for idx, sim_score in zip(topk_indices, topk_values):
        sim_threshold = sim_score >= threshold
        filtered_idx = idx[sim_threshold].numpy()
        similarities.append(X_df.iloc[filtered_idx]["owner_id"].unique().tolist())

    return similarities

In [None]:
get_top_k_similar_issues()

In [None]:
total_acc_val = 0
total_loss_val = 0
correct_top_k = 0
correct_top_k_wo_sim = 0

all_preds = []
all_labels = []
device="cuda"

model = model.cuda()

with torch.no_grad():

    for val_input, val_label in loader:
        val_label = val_label.to(device)
        mask = val_input[1]["attention_mask"].squeeze(1).to(device)
        input_id = val_input[1]["input_ids"].squeeze(1).to(device)
        tok_type = val_input[1]["token_type_ids"].squeeze(1).to(device)
        repr = val_input[2].to(device)

        output = model(input_id, mask, tok_type, repr)



        output = torch.sum(torch.stack(output), 0)

        #wo similarity
        _, top_k_wo_sim = output.topk(3, 1, True, True)

        top_k_wo_sim = top_k_wo_sim.t()

        correct_top_k_wo_sim += (
            top_k_wo_sim.eq(
                val_label.view(1, -1).expand_as(top_k_wo_sim)
            )
            .sum()
            .item()
        )


        # with similarity
        # _, top_k_predictions = output.topk(10, 1, True, True)
        # similar_preds = get_top_k_similar_devs(val_input[0], threshold=0.65)

        # unique_preds = []

        # for top, sim in zip(top_k_predictions, similar_preds):
        #     # print(top, sim)
            
        #     copy_pred = top.cpu().numpy().tolist()
        #     top_preds = top.cpu().numpy().tolist()[:5]

        #     for s in sim:
        #         if s not in top_preds:
        #             top_preds.append(s)
            
        #     if len(top_preds) < 10:
        #         top_preds = top_preds + copy_pred[5:5 + 10 - len(top_preds)]
            
        #     unique_preds.append(top_preds)

        # unique_preds = torch.tensor(unique_preds).cuda()
        # top_k_predictions = unique_preds.t()

        # correct_top_k += (
        #     top_k_predictions.eq(
        #         val_label.view(1, -1).expand_as(top_k_predictions)
        #     )
        #     .sum()
        #     .item()
        # )

        # # break

        # acc = (output.argmax(dim=1) == val_label).sum().item()

        all_preds.append(output.argmax(dim=1).cpu().numpy())
        all_labels.append(val_label.cpu().numpy())

        # total_acc_val += acc

In [None]:
print(f"Correct Prediction without Similarity: {correct_top_k_wo_sim}, {correct_top_k_wo_sim / len(df_test)}")
# print(f"Correct Prediction with Similarity: {correct_top_k}, {correct_top_k / len(y_df)}")

Correct Prediction without Similarity: 493, 1.0


In [None]:
np.concatenate(all_preds)

array([1, 1, 2, 3, 5, 6, 1, 2, 1, 1, 2, 1, 3, 2, 2, 1, 1, 2, 2, 2, 2, 1,
       4, 2, 2, 2, 1, 1, 3, 1, 2, 7, 1, 2, 3, 1, 2, 6, 2, 1, 5, 2, 6, 6,
       2, 4, 1, 2, 1, 6, 2, 1, 1, 3, 1, 1, 6, 1, 2, 1, 1, 1, 1, 6, 1, 4,
       2, 4, 2, 1, 2, 2, 1, 1, 2, 1, 2, 6, 0, 4, 4, 1, 6, 2, 5, 6, 1, 2,
       1, 1, 2, 2, 2, 2, 1, 3, 2, 1, 6, 5, 3, 1, 2, 2, 1, 3, 2, 6, 2, 5,
       2, 2, 4, 2, 6, 2, 3, 5, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 5, 1, 6, 2,
       0, 1, 1, 1, 6, 6, 1, 2, 5, 1, 3, 1, 5, 3, 2, 2, 1, 2, 2, 1, 5, 1,
       6, 6, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 5, 2, 6, 1, 6,
       6, 6, 2, 1, 1, 0, 1, 1, 6, 1, 4, 6, 2, 2, 3, 1, 2, 1, 1, 2, 2, 6,
       1, 3, 4, 4, 1, 5, 2, 2, 1, 0, 4, 2, 1, 2, 2, 6, 1, 2, 5, 2, 1, 6,
       6, 4, 2, 5, 1, 1, 2, 2, 2, 4, 1, 2, 2, 2, 1, 1, 2, 1, 3, 2, 4, 3,
       0, 4, 1, 1, 1, 1, 3, 5, 1, 1, 2, 4, 1, 2, 1, 2, 3, 5, 2, 1, 1, 6,
       1, 6, 1, 1, 4, 2, 1, 1, 1, 4, 6, 6, 0, 2, 3, 3, 1, 2, 6, 6, 4, 3,
       1, 2, 1, 6, 2, 6, 1, 0, 6, 2, 2, 5, 2, 4, 3,

In [None]:
np.concatenate(all_labels)

array([1, 1, 2, 3, 5, 6, 1, 2, 1, 1, 2, 1, 3, 2, 2, 1, 1, 2, 2, 2, 2, 1,
       4, 2, 2, 2, 1, 1, 3, 1, 2, 7, 1, 2, 3, 1, 2, 6, 2, 1, 5, 2, 6, 6,
       2, 4, 1, 2, 1, 6, 2, 1, 1, 3, 1, 1, 6, 1, 2, 1, 1, 1, 1, 6, 1, 4,
       2, 4, 2, 1, 2, 2, 1, 1, 2, 1, 2, 6, 0, 4, 4, 1, 6, 2, 5, 6, 1, 2,
       1, 1, 2, 2, 2, 2, 1, 3, 2, 1, 6, 5, 3, 1, 2, 2, 1, 3, 2, 6, 2, 5,
       2, 2, 4, 2, 6, 2, 3, 5, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 5, 1, 6, 2,
       0, 1, 1, 1, 6, 6, 1, 2, 5, 1, 3, 1, 5, 3, 2, 2, 1, 2, 2, 1, 5, 1,
       6, 6, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 5, 2, 6, 1, 6,
       6, 6, 2, 1, 1, 0, 1, 1, 6, 1, 4, 6, 2, 2, 3, 1, 2, 1, 1, 2, 2, 6,
       1, 3, 4, 4, 1, 5, 2, 2, 1, 0, 4, 2, 1, 2, 2, 6, 1, 2, 5, 2, 1, 6,
       6, 4, 2, 5, 1, 1, 2, 2, 2, 4, 1, 2, 2, 2, 1, 1, 2, 1, 3, 2, 4, 3,
       0, 4, 1, 1, 1, 1, 3, 5, 1, 1, 2, 4, 1, 2, 1, 2, 3, 5, 2, 1, 1, 6,
       1, 6, 1, 1, 4, 2, 1, 1, 1, 4, 6, 6, 0, 2, 3, 3, 1, 2, 6, 6, 4, 3,
       1, 2, 1, 6, 2, 6, 1, 0, 6, 2, 2, 5, 2, 4, 3,

In [None]:
all_preds_np = np.concatenate(all_preds)
all_labels_np = np.concatenate(all_labels)

# Result Analysis

In [None]:
print(classification_report(all_labels_np, all_preds_np, target_names=labels))

ValueError: Number of classes, 32, does not match size of target_names, 2518. Try specifying the labels parameter

In [None]:
idx2label = {
    row["owner_id"]: row["owner"]
    for _, row in y_df.iterrows()
}

labels = y_df.owner_id.to_list()
labels = sorted(set(labels))
labels = [f"{idx}: {idx2label[idx]}" for idx in labels]

In [None]:
def get_topic_distribution(owner):
    print("Training topic distribution")
    print("=======================================")
    print(X_df[X_df.owner == owner].topic_label.value_counts())

    print("\n\nTesting topic distribution")
    print("=======================================")
    print(y_df[y_df.owner == owner].topic_label.value_counts())

In [None]:
X_df.topic_label.value_counts()

Chrome Tab and Window Behavior Issues     947
Build failures                            840
Chrome stability issues                   487
Layout Testing Issues                     400
Chrome crash reports                      391
Security and SSL issues                   372
Input and keyboard issues                 370
Webpage rendering regression issues       357
Chrome sync issues                        354
Shill WiFi configuration                  337
iOS File Issues                           321
Data Enhancement                          298
Touch and Scroll Issues                   273
DevTools Crashes                          260
GPU rendering issues                      235
Memory Leaks in WebCore and Blink         220
Performance testing issues in Chromium    197
WebRTC audio/video issues                 184
Bookmark issues                           174
Performance Regression in Blink            13
Name: topic_label, dtype: int64

In [None]:
get_topic_distribution("a...@chromium.org")

Training topic distribution
Chrome Tab and Window Behavior Issues     10
Webpage rendering regression issues        5
Memory Leaks in WebCore and Blink          4
Chrome stability issues                    4
DevTools Crashes                           3
Data Enhancement                           3
Input and keyboard issues                  3
Touch and Scroll Issues                    2
Bookmark issues                            1
Security and SSL issues                    1
Chrome sync issues                         1
Layout Testing Issues                      1
Build failures                             1
iOS File Issues                            1
Performance testing issues in Chromium     1
Name: topic_label, dtype: int64


Testing topic distribution
Chrome Tab and Window Behavior Issues     17
Webpage rendering regression issues        7
DevTools Crashes                           6
iOS File Issues                            6
Touch and Scroll Issues                    4
Input and k