In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler

from triagerx.dataset.processor import DatasetProcessor
from triagerx.model.lbt_p import LBTPClassifier
from triagerx.model.roberta_cnn import RobertaCNNClassifier
from triagerx.model.roberta_fcn import RobertaFCNClassifier
from triagerx.trainer.model_trainer import ModelTrainer
from triagerx.trainer.train_config import TrainConfig


In [2]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20.json"

In [3]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [4]:
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

In [5]:

dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20.json"

df = pd.read_json(dataset_path)
df = df[df["owner"].notna()]

def clean_data(df):
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    return df
    
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df["text"] = df.apply(
            lambda x: "Title: "
            + str(x["issue_title"])
            + "\nDescription: "
            + str(x["description"]),
            axis=1,
        )
    
    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    # df["owner_id"] = pd.factorize(df["assignees"])[0]

    return df

df = prepare_dataframe(df)
df = clean_data(df)

num_issues = len(df)

print(f"Total number of issues: {num_issues}")

  df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


Total number of issues: 109979


In [6]:
num_cv = 10
sample_threshold=20
samples_per_block = len(df) // num_cv + 1
print(f"Samples per block: {samples_per_block}")

block = 1
X_df = df[:samples_per_block*block]
y_df = df[samples_per_block*block : samples_per_block * (block+1)]


developers = X_df["owner"].value_counts()
filtered_developers = developers.index[developers >= sample_threshold]
X_df = X_df[X_df["owner"].isin(filtered_developers)]

train_owners = set(X_df["owner"])
test_owners = set(y_df["owner"])

unwanted = list(test_owners - train_owners)

y_df = y_df[~y_df["owner"].isin(unwanted)]

print(f"Training data: {len(X_df)}, Validation data: {len(y_df)}")

lbl2idx = {}

for idx, dev in enumerate(train_owners):
    lbl2idx[dev] = idx

Samples per block: 10998
Training data: 7030, Validation data: 6095


In [7]:
X_df["owner_id"] = X_df["owner"].apply(lambda owner: lbl2idx[owner])
y_df["owner_id"] = y_df["owner"].apply(lambda owner: lbl2idx[owner])

In [9]:
all_embeddings = similarity_model.encode(X_df.issue_title.to_list(), batch_size=15)

In [14]:
row = y_df.iloc[1:3]
row

Unnamed: 0,owner,issue_title,description,text,owner_id
11001,abarth@chromium.org,Mixed content warning can be removed,"\nWhen vising an HTTPS URL, if the page includ...",Title: Mixed content warning can be removed\nD...,10
11002,sleffler@chromium.org,no openvpn debugging from onc,\nThe current onc setups for openvpn don't sup...,Title: no openvpn debugging from onc\nDescript...,6


In [19]:
row.issue_title.to_list()



In [32]:
y_df.iloc[2]

owner                                      sleffler@chromium.org
issue_title                        no openvpn debugging from onc
description    \nThe current onc setups for openvpn don't sup...
text           Title: no openvpn debugging from onc\nDescript...
owner_id                                                       6
Name: 11002, dtype: object

In [28]:
test_embed = similarity_model.encode(row.issue_title.to_list())
cos = util.cos_sim(test_embed, all_embeddings)
topk = torch.topk(torch.tensor(cos), k=5)
print(topk)

# similar = X_df.iloc[topk.indices.numpy()][["issue_title", "owner"]]

torch.return_types.topk(
values=tensor([[0.6413, 0.4781, 0.4575, 0.4440, 0.4376],
        [0.5500, 0.5143, 0.5022, 0.4979, 0.4928]]),
indices=tensor([[2439, 5086, 4028, 4544, 5709],
        [4936, 1770, 6491,  198, 6055]]))


  topk = torch.topk(torch.tensor(cos), k=5)


In [34]:
y_df.iloc[2]


owner                                      sleffler@chromium.org
issue_title                        no openvpn debugging from onc
description    \nThe current onc setups for openvpn don't sup...
text           Title: no openvpn debugging from onc\nDescript...
owner_id                                                       6
Name: 11002, dtype: object

In [45]:
def get_top_k_similar_devs(issues, k=5):
    test_embed = similarity_model.encode(issues)
    cos = util.cos_sim(test_embed, all_embeddings)
    topk = torch.topk(torch.tensor(cos), k=k)
    
    similarities = []
    
    for it in topk.indices.numpy():
        similarities.append(X_df.iloc[it]["owner_id"].to_list())

    return similarities

In [46]:
get_top_k_similar_devs(row.issue_title.to_list())

  topk = torch.topk(torch.tensor(cos), k=k)


[[10, 96, 47, 25, 94], [118, 136, 34, 136, 6]]

In [38]:
class CombineLoss(nn.Module):
    def __init__(self, class_weights) -> None:
        super().__init__()
        self._ce = nn.CrossEntropyLoss(weight=class_weights)

    def forward(
        self,
        prediction,
        labels
    ) -> torch.Tensor:
        loss = 0

        for i in range(len(prediction)):
            loss += self._ce(prediction[i], labels)
            # print(loss)

        return loss

In [6]:
model = LBTPClassifier(
    output_size=len(X_df.owner_id.unique())
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
class_counts = np.bincount(X_df["owner_id"])
num_samples = sum(class_counts)
labels = X_df["owner_id"].to_list() #corresponding labels of samples

class_weights = [num_samples/class_counts[i] for i in range(len(class_counts))]
weights = [class_weights[labels[i]] for i in range(int(num_samples))]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))

In [8]:
from sklearn.utils.class_weight import compute_class_weight

In [24]:
class_weights = compute_class_weight('balanced', classes=X_df["owner_id"].unique(), y=X_df["owner_id"].to_numpy())

# Convert class weights to a tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

In [27]:
learning_rate = 1e-5
epochs = 50
batch_size = 15

In [29]:
sampler_name = sampler.__class__.__name__ if sampler else "None"
model_name = model.__class__.__name__

output_file = f"dt_lbtp_cv{block}_weighted_ce_{model_name}_20_{sampler_name}"
output_path = f"/home/mdafifal.mamun/notebooks/triagerX/output/{output_file}.pt"

wandb_config = {
        "project": "triagerx_dt_cv",
        "name": f"run_{output_file}",
        "config": {
        "learning_rate": learning_rate,
        "architecture": "Roberta-FCN",
        "dataset": "deeptriage",
        "epochs": epochs,
    }
}

criterion = CombineLoss(class_weights_tensor)
optimizer = Adam(model.parameters(), lr=learning_rate)
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.1, threshold=1e-8)

train_config = TrainConfig(
    optimizer=optimizer,
    criterion=criterion,
    train_dataset=X_df,
    validation_dataset=y_df,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
    output_file=output_path,
    sampler=sampler,
    scheduler=scheduler,
    wandb=wandb_config
)

In [47]:
import pandas as pd
import numpy as np
from loguru import logger
from torch.utils.data import Dataset

from transformers import PreTrainedTokenizer


class TriageDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
        feature: str = "text",
        target: str = "owner_id",
    ):
        logger.debug("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = [label for label in df[target]]
        logger.debug("Tokenizing texts...")
        self.texts = [
            [row.issue_title, self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )]
            for _, row in df.iterrows()
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [54]:
tt = torch.rand(2, 10)

In [63]:
sims = get_top_k_similar_devs(X_df.iloc[3:5].issue_title.to_list())

  topk = torch.topk(torch.tensor(cos), k=k)


In [77]:
sims

[[66, 66, 123, 124, 66], [106, 106, 106, 106, 115]]

In [78]:
indices

tensor([[0, 4, 8, 6, 3],
        [7, 8, 9, 5, 1]])

In [79]:
_, indices = tt.topk(5, 1, True, True)


out = []

torch.concat((indices, torch.tensor(sims)), dim=1)

tensor([[  0,   4,   8,   6,   3,  66,  66, 123, 124,  66],
        [  7,   8,   9,   5,   1, 106, 106, 106, 106, 115]])

In [None]:
import numpy as np
import torch
from loguru import logger
from sklearn.metrics import precision_recall_fscore_support, top_k_accuracy_score
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

import wandb
# from triagerx.dataset.triage_dataset import TriageDataset
from triagerx.trainer.train_config import TrainConfig


class ModelTrainer:
    def __init__(self, config: TrainConfig):
        self._config = config

    def _init_wandb(self):
        wandb.init(**self._config.wandb)

    def train(self, model: nn.Module):
        tokenizer = model.tokenizer()
        criterion = self._config.criterion
        optimizer = self._config.optimizer
        train_data = self._config.train_dataset
        validation_data = self._config.validation_dataset
        sampler = self._config.sampler

        train = TriageDataset(train_data, tokenizer)
        val = TriageDataset(validation_data, tokenizer)

        if self._config.wandb:
            logger.debug("Initializing wandb...")
            self._init_wandb()

        train_dataloader = DataLoader(
            dataset=train,
            batch_size=self._config.batch_size,
            shuffle=False if sampler else True,
            sampler=sampler,
        )
        val_dataloader = DataLoader(val, batch_size=self._config.batch_size)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        best_loss = float("inf")

        if torch.cuda.is_available():
            logger.debug(f"Selected compute device: {device}")
            model = model.cuda()
            criterion = criterion.cuda()

        for epoch_num in range(self._config.epochs):
            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                train_label = train_label.to(device)
                mask = train_input[1]["attention_mask"].to(device)
                input_id = train_input[1]["input_ids"].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()

                output = torch.sum(torch.stack(output), 0)

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0
            correct_top_k = 0

            all_preds = []
            all_labels = []

            with torch.no_grad():

                for val_input, val_label in val_dataloader:
                    val_label = val_label.to(device)
                    mask = val_input[1]["attention_mask"].to(device)
                    input_id = val_input[1]["input_ids"].squeeze(1).to(device)

                    base_texts = val_input[0]

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()

                    output = torch.sum(torch.stack(output), 0)
                    _, top_k_predictions = output.topk(5, 1, True, True)
                    sim_scores = get_top_k_similar_devs(base_texts)

                    top_k_predictions = torch.concat((top_k_predictions, torch.tensor(sim_scores)), dim=1)

                    top_k_predictions = top_k_predictions.t()

                    correct_top_k += (
                        top_k_predictions.eq(
                            val_label.view(1, -1).expand_as(top_k_predictions)
                        )
                        .sum()
                        .item()
                    )

                    acc = (output.argmax(dim=1) == val_label).sum().item()

                    all_preds.append(output.argmax(dim=1).cpu().numpy())
                    all_labels.append(val_label.cpu().numpy())

                    total_acc_val += acc

            all_preds = np.concatenate(all_preds)
            all_labels = np.concatenate(all_labels)

            precision, recall, f1_score, _ = precision_recall_fscore_support(
                all_labels, all_preds, average="macro"
            )

            top10 = correct_top_k / len(validation_data)

            self._log_step(
                epoch_num,
                total_acc_train,
                total_acc_val,
                total_loss_train,
                total_loss_val,
                precision,
                recall,
                f1_score,
                train_data,
                validation_data,
                top10,
            )

            val_loss = total_loss_val / len(validation_data)

            if self._config.scheduler:
                self._config.scheduler.step(val_loss)

            if val_loss < best_loss:
                logger.success("Found new best model. Saving weights...")
                torch.save(model.state_dict(), self._config.output_file)
                best_loss = val_loss

        if self._config.wandb:
            wandb.finish()

    def _log_step(
        self,
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        train_data,
        validation_data,
        topk,
    ):
        log = f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                    | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                    | Val Loss: {total_loss_val / len(validation_data): .3f} \
                    | Val Accuracy: {total_acc_val / len(validation_data): .3f} \
                    | Top 10: {topk} \
                    | Precision: {precision: .3f} \
                    | Recall: {recall: .3f} \
                    | F1-score: {f1_score: .3f}"

        logger.info(log)

        if self._config.wandb:
            wandb.log(
                {
                    "train_acc": total_acc_train / len(train_data),
                    "train_loss": total_loss_train / len(train_data),
                    "val_acc": total_acc_val / len(validation_data),
                    "val_loss": total_loss_val / len(validation_data),
                    "precision": precision,
                    "recall": recall,
                    "f1-score": f1_score,
                    "top10": topk,
                }
            )


In [30]:
trainer = ModelTrainer(train_config)
trainer.train(model=model)

[32m2024-02-16 13:01:08.856[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m17[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-02-16 13:01:08.858[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m20[0m - [34m[1mTokenizing texts...[0m
[32m2024-02-16 13:01:19.769[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m17[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-02-16 13:01:19.772[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m20[0m - [34m[1mTokenizing texts...[0m
[32m2024-02-16 13:01:28.379[0m | [34m[1mDEBUG   [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m33[0m - [34m[1mInitializing wandb...[0m
[34m[1mwandb[0m: Currently logged in as: [33mafifaniks[0m. Use [1m`wandb login --relogin`[0m to force relogin
 68%|██████▊   | 321/469 [02:56<01:21,  1.82

[32m2024-02-16 13:01:41.541[0m | [34m[1mDEBUG   [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m48[0m - [34m[1mSelected compute device: cuda[0m
  return F.conv2d(input, weight, bias, self.stride,
100%|██████████| 469/469 [04:18<00:00,  1.82it/s]
  _warn_prf(average, modifier, msg_start, len(result))
[32m2024-02-16 13:08:36.284[0m | [1mINFO    [0m | [36mtriagerx.trainer.model_trainer[0m:[36m_log_step[0m:[36m173[0m - [1mEpochs: 1 | Train Loss:  1.333                     | Train Accuracy:  0.031                     | Val Loss:  1.306                     | Val Accuracy:  0.041                     | Top 10: 0.19524200164068908                     | Precision:  0.032                     | Recall:  0.035                     | F1-score:  0.014[0m
[32m2024-02-16 13:08:36.287[0m | [32m[1mSUCCESS [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m143[0m - [32m[1mFound new best model. Saving weights...[0m
100%|██████████| 469/469 [04:1