In [16]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler

from triagerx.dataset.processor import DatasetProcessor
from triagerx.model.lbt_p import LBTPClassifier
from triagerx.model.roberta_cnn import RobertaCNNClassifier
from triagerx.model.roberta_fcn import RobertaFCNClassifier
from triagerx.trainer.model_trainer import ModelTrainer
from triagerx.trainer.train_config import TrainConfig


In [2]:
# Path of the JSON issue dump that is later loaded with pd.read_json.
# NOTE(review): this is an absolute, user-specific path, and the identical
# assignment is repeated at the top of the following cell.
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20.json"

In [3]:

# Read the issue dump and discard every record that has no owner.
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20.json"

df = pd.read_json(dataset_path).dropna(subset=["owner"])

def clean_data(df):
    """Strip URLs from the ``text`` column and squeeze repeated spaces.

    Args:
        df: DataFrame containing a string ``text`` column.

    Returns:
        A new DataFrame (same columns, same row order) in which every http(s)
        URL in ``text`` has been replaced by a single space and every run of
        spaces has been collapsed to one. The frame passed in is not modified.
    """
    # Raw string: the pattern contains ``\(`` and ``\)``, which are invalid
    # escape sequences in an ordinary string literal.
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

    cleaned_text = (
        df["text"]
        # regex=True has to be explicit: since pandas 2.0 the default is a
        # literal match, which would silently leave every URL in place.
        .str.replace(url_pattern, " ", regex=True)
        .str.replace(" +", " ", regex=True)
    )

    return df.assign(text=cleaned_text)
    
def prepare_dataframe(df: pd.DataFrame, min_length: int = 15) -> pd.DataFrame:
    """Build the ``text`` column from title and description, then drop short rows.

    The text of each row is
    ``"Title: " + str(issue_title) + "\\nDescription: " + str(description)``.

    Args:
        df: DataFrame with ``issue_title`` and ``description`` columns.
        min_length: Rows are kept only when their ``text`` is strictly longer
            than this many characters.
            NOTE(review): the fixed "Title: " / "\\nDescription: " scaffolding is
            already 21 characters long, so the default of 15 never removes a row.

    Returns:
        A new DataFrame with the ``text`` column added (or overwritten) and the
        short rows removed. The frame passed in is not modified.
    """
    # Column-wise concatenation instead of a row-wise apply: same strings,
    # without building a Series object for every row.
    text = (
        "Title: "
        + df["issue_title"].map(str)
        + "\nDescription: "
        + df["description"].map(str)
    )

    prepared = df.assign(text=text)
    return prepared[prepared["text"].str.len().gt(min_length)]

# Build the model input text, scrub it, then report how many issues remain.
df = clean_data(prepare_dataframe(df))

num_issues = df.shape[0]

print(f"Total number of issues: {num_issues}")

Total number of issues: 109979


In [8]:
# --- Fold configuration ------------------------------------------------------
num_cv = 10            # number of consecutive blocks the dataset is cut into
sample_threshold = 20  # minimum number of training issues a developer must own
samples_per_block = len(df) // num_cv + 1
print(f"Samples per block: {samples_per_block}")

# Fold index: rows [0, block * samples_per_block) are used for training and the
# next block of rows for validation.
# NOTE(review): despite the names, X_df / y_df are the training / validation
# frames, not features / labels.
block = 1
X_df = df[:samples_per_block*block]
y_df = df[samples_per_block*block : samples_per_block * (block+1)]

# Keep only developers that own at least `sample_threshold` training issues.
# .copy() detaches the result from `df`, so adding columns to it later does not
# raise SettingWithCopyWarning.
developers = X_df["owner"].value_counts()
filtered_developers = developers.index[developers >= sample_threshold]
X_df = X_df[X_df["owner"].isin(filtered_developers)].copy()

train_owners = set(X_df["owner"])
test_owners = set(y_df["owner"])

# Developers that appear only in the validation block have no class id, so
# their issues are removed from validation.
unwanted = list(test_owners - train_owners)

y_df = y_df[~y_df["owner"].isin(unwanted)].copy()

print(f"Training data: {len(X_df)}, Validation data: {len(y_df)}")

# Enumerate the owners in sorted order. Iterating the raw set would give a
# different order (and therefore different class ids) on every kernel restart
# because string hashing is randomised per process, which makes saved weights
# impossible to map back to developers.
# assumes all owner values are mutually comparable (e.g. all strings) — TODO confirm
lbl2idx = {dev: idx for idx, dev in enumerate(sorted(train_owners))}

Samples per block: 10998
Training data: 7030, Validation data: 6095


In [14]:
# Attach the integer class id of every issue's owner.
# `assign` builds a new frame instead of writing a column into what may be a
# slice of `df` (the source of SettingWithCopyWarning), and re-running the cell
# simply overwrites `owner_id`. An owner missing from lbl2idx still raises
# KeyError, exactly like a plain dictionary lookup.
X_df = X_df.assign(owner_id=X_df["owner"].apply(lbl2idx.__getitem__))
y_df = y_df.assign(owner_id=y_df["owner"].apply(lbl2idx.__getitem__))

In [17]:
class CombineLoss(nn.Module):
    """Cross-entropy loss summed over a collection of predictions.

    Every entry of ``prediction`` is scored against the same ``labels`` with
    ``nn.CrossEntropyLoss`` and the individual losses are added together.
    """

    def __init__(self) -> None:
        super().__init__()
        self._ce = nn.CrossEntropyLoss()

    def forward(self, prediction, labels) -> torch.Tensor:
        """Return the sum of the cross-entropy of each entry in ``prediction``.

        Args:
            prediction: Indexable collection of logits (a sequence of tensors,
                or a tensor whose first axis enumerates the entries).
            labels: Target class indices shared by all entries.

        Returns:
            The summed loss. For an empty ``prediction`` this is the plain
            integer ``0`` rather than a tensor.
        """
        return sum(self._ce(logits, labels) for logits in prediction)

In [18]:
# One output unit per distinct developer id present in the training frame.
model = LBTPClassifier(output_size=X_df["owner_id"].nunique())


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Class-balanced sampling: each training issue is drawn with a weight inversely
# proportional to how many issues its developer owns.
labels = X_df["owner_id"].to_list() #corresponding labels of samples
class_counts = np.bincount(X_df["owner_id"])
num_samples = class_counts.sum()

# Weight of a class = total number of samples / number of samples in that class.
class_weights = [num_samples / count for count in class_counts]
# Per-sample weight, looked up through each sample's class id.
weights = [class_weights[label] for label in labels[: int(num_samples)]]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))

In [20]:
# Training hyperparameters; they are handed to the optimizer and TrainConfig
# in the configuration cell further down.
learning_rate = 1e-5  # step size given to Adam
epochs = 50           # number of epochs passed to TrainConfig
batch_size = 15       # batch size passed to TrainConfig

In [23]:
# Names used to label the checkpoint file and the wandb run.
# `is not None` instead of a truthiness test: truthiness of a sampler object
# falls back to its length, which is not what "was a sampler configured?" means.
sampler_name = sampler.__class__.__name__ if sampler is not None else "None"
model_name = model.__class__.__name__

output_file = f"dt_lbtp_cv{block}_{model_name}_20_{sampler_name}"
output_path = f"/home/mdafifal.mamun/notebooks/triagerX/output/{output_file}.pt"

# Settings forwarded to wandb through TrainConfig.
wandb_config = {
    "project": "triagerx_dt_cv",
    "name": f"run_{output_file}",
    "config": {
        "learning_rate": learning_rate,
        # Taken from the model actually being trained; this was previously the
        # fixed string "Roberta-FCN", which mislabelled runs of other models.
        "architecture": model_name,
        "dataset": "deeptriage",
        "epochs": epochs,
    },
}

criterion = CombineLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)
# Cut the learning rate by 10x when the monitored value ("min" mode) has not
# improved for 10 scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.1, threshold=1e-8)

# Bundle everything the trainer needs into one configuration object.
train_config = TrainConfig(
    optimizer=optimizer,
    criterion=criterion,
    train_dataset=X_df,
    validation_dataset=y_df,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
    output_file=output_path,
    sampler=sampler,
    scheduler=scheduler,
    wandb=wandb_config
)

In [24]:
# Create the trainer from the assembled configuration and start training.
# The captured log of this cell shows the run tokenizing two datasets,
# initialising wandb, selecting the cuda device, and saving weights whenever a
# new best model is found.
trainer = ModelTrainer(train_config)
trainer.train(model=model)

[32m2024-01-29 14:23:32.948[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m17[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-01-29 14:23:32.951[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m20[0m - [34m[1mTokenizing texts...[0m
[32m2024-01-29 14:23:43.875[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m17[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-01-29 14:23:43.877[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m20[0m - [34m[1mTokenizing texts...[0m
[32m2024-01-29 14:23:52.499[0m | [34m[1mDEBUG   [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m33[0m - [34m[1mInitializing wandb...[0m
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


[32m2024-01-29 14:24:06.643[0m | [34m[1mDEBUG   [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m48[0m - [34m[1mSelected compute device: cuda[0m
  return F.conv2d(input, weight, bias, self.stride,
100%|██████████| 469/469 [04:19<00:00,  1.81it/s]
  _warn_prf(average, modifier, msg_start, len(result))
[32m2024-01-29 14:31:03.337[0m | [1mINFO    [0m | [36mtriagerx.trainer.model_trainer[0m:[36m_log_step[0m:[36m173[0m - [1mEpochs: 1 | Train Loss:  1.354                     | Train Accuracy:  0.039                     | Val Loss:  1.330                     | Val Accuracy:  0.047                     | Top 10: 0.2047579983593109                     | Precision:  0.029                     | Recall:  0.036                     | F1-score:  0.015[0m
[32m2024-01-29 14:31:03.339[0m | [32m[1mSUCCESS [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m143[0m - [32m[1mFound new best model. Saving weights...[0m
100%|██████████| 469/469 [04:19

KeyboardInterrupt: 