In [4]:
import numpy as np
import torch
from loguru import logger
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm

from triagerx.loss.loss_functions import CombinedLoss
from triagerx.trainer.train_config import TrainConfig
from triagerx.utils.early_stopping import EarlyStopping
import pandas as pd
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from triagerx.dataset.text_processor import TextProcessor
from triagerx.model.cnn_transformer import CNNTransformer
from triagerx.model.fcn_transformer import FCNTransformer

In [2]:
# NOTE(review): `tqdm` is already imported in the main imports cell; this
# re-import only keeps the cell runnable on its own.
from tqdm import tqdm
# Enable tqdm's pandas integration (adds `progress_apply` and friends to
# pandas objects) so long-running pandas operations can show a progress bar.
tqdm.pandas()

In [5]:
# --- Configuration ------------------------------------------------------------
# NOTE(review): absolute, user-specific path; consider a configurable DATA_DIR.
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/data/google_chromium/classifier_data_20.csv"
num_cv = 10  # number of equally sized, contiguous blocks (in file order)
block = 9  # blocks [0, block) are used for training, block `block` for testing
sample_threshold = 20  # minimum number of training issues a developer must own

# --- Load ---------------------------------------------------------------------
df = pd.read_csv(dataset_path)
df = df.rename(columns={"assignees": "owner", "issue_body": "description"})
df = df[df["owner"].notna()]  # issues without an owner cannot be used as labels

# --- Split by position ----------------------------------------------------------
samples_per_block = len(df) // num_cv

sliced_df = df[: samples_per_block * (block + 1)]
df_train = sliced_df[: samples_per_block * block]
df_test = sliced_df[samples_per_block * block : samples_per_block * (block + 1)]

# --- Keep only developers with enough training samples --------------------------
developers = df_train["owner"].value_counts()
filtered_developers = developers.index[developers >= sample_threshold]
# Explicit copy: a column is added below, and assigning onto a filtered slice
# of `df` would trigger pandas' SettingWithCopyWarning.
df_train = df_train[df_train["owner"].isin(filtered_developers)].copy()

train_owners = set(df_train["owner"])
test_owners = set(df_test["owner"])

# Test owners never seen in training have no label id, so drop their issues.
unwanted = list(test_owners - train_owners)

df_test = df_test[~df_test["owner"].isin(unwanted)].copy()

# --- Label encoding -------------------------------------------------------------
lbl2idx = {}
idx2lbl = {}

# Sort so the owner -> id assignment is deterministic across runs.
train_owners = sorted(train_owners)

for idx, dev in enumerate(train_owners):
    lbl2idx[dev] = idx
    idx2lbl[idx] = dev

# Every remaining owner is a key of `lbl2idx` by construction (train owners
# define the mapping, unseen test owners were removed above).
df_train["owner_id"] = df_train["owner"].map(lbl2idx)
df_test["owner_id"] = df_test["owner"].map(lbl2idx)
# NOTE(review): only df_test goes through TextProcessor in this cell; df_train
# is presumably prepared elsewhere -- confirm.
df_test = TextProcessor.prepare_dataframe(df_test, False, False, True, False, False)

print(f"Training data: {len(df_train)}, Validation data: {len(df_test)}")
print(f"Number of train developers: {len(df_train.owner.unique())}")
print(f"Number of test developers: {len(df_test.owner.unique())}")

100%|██████████| 10781/10781 [00:00<00:00, 1015992.80it/s]
[32m2024-10-25 12:37:00.799[0m | [1mINFO    [0m | [36mtriagerx.dataset.text_processor[0m:[36mprepare_dataframe[0m:[36m58[0m - [1mCleaning text...[0m
100%|██████████| 10781/10781 [00:01<00:00, 5584.60it/s]
[32m2024-10-25 12:37:02.735[0m | [1mINFO    [0m | [36mtriagerx.dataset.text_processor[0m:[36mprepare_dataframe[0m:[36m71[0m - [1mAdding description...[0m
100%|██████████| 10781/10781 [00:00<00:00, 115191.26it/s]

Training data: 98165, Validation data: 10781
Number of train developers: 986
Number of test developers: 857





In [6]:
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
import numpy as np

class TriageDataset(Dataset):
    """Torch dataset that pre-tokenizes one text column of a dataframe.

    Every text in ``df[feature]`` is tokenized once, at construction time, and
    the resulting encodings are kept in memory next to the row's label.

    Args:
        df: Source dataframe; must contain the ``feature`` and ``target`` columns.
        tokenizer: Tokenizer called on each text.
        feature: Name of the column holding the text to tokenize.
        target: Name of the column holding the label of each row.
        max_tokens: Length each text is padded/truncated to.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: RobertaTokenizer,
        feature: str = "text",
        target: str = "owner_id",
        max_tokens: int = 256,
    ):
        print("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = list(df[target])
        print("Tokenizing texts...")
        # Iterate the feature column directly: `DataFrame.iterrows()` would
        # build a full Series per row only to read a single value from it.
        # NOTE(review): with return_tensors="pt" each encoding presumably keeps
        # a leading dimension of size 1 -- confirm consumers squeeze it.
        self.texts = [
            self.tokenizer(
                text,
                padding="max_length",
                max_length=max_tokens,
                truncation=True,
                return_tensors="pt",
            )
            for text in df[feature]
        ]

    def classes(self):
        """Return the list of labels, one per sample, in dataframe order."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a numpy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the pre-computed tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y