In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler

from triagerx.dataset.processor import DatasetProcessor
from triagerx.model.lbt_p import LBTPClassifier
from triagerx.model.roberta_cnn import RobertaCNNClassifier
from triagerx.model.roberta_fcn import RobertaFCNClassifier
from triagerx.trainer.model_trainer import ModelTrainer
from triagerx.trainer.train_config import TrainConfig


In [2]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20.json"

In [2]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20_topics_kmeans.csv"

df = pd.read_csv(dataset_path)
df = df[df["owner"].notna()]

In [3]:
topic_id = 3

In [4]:
topic_df = df[df["topic_id"] == topic_id]
len(topic_df.owner.unique())

289

In [6]:
developers = topic_df["owner"].value_counts()
filtered_developers = developers.index[developers >= 5]
topic_df = topic_df[topic_df["owner"].isin(filtered_developers)]

In [7]:
len(topic_df.owner.value_counts())

90

In [8]:
len(topic_df)

1629

In [9]:
def clean_data(df):
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    return df
    
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df["text"] = df.apply(
            lambda x: "Title: "
            + str(x["issue_title"])
            # + "\nIssue Topic: "
            # + str(x["issue_label"])
            + "\nDescription: "
            + str(x["description"]),
            axis=1,
        )
    
    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    # df["owner_id"] = pd.factorize(df["assignees"])[0]

    return df

df = prepare_dataframe(topic_df)
df = clean_data(df)

num_issues = len(df)

print(f"Total number of issues: {num_issues}")

Total number of issues: 1629


  df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')


In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,owner,issue_title,description,topic_id,topic_label,text
38,38,erikwright@chromium.org,2% regression in chromium-rel-win7-dual/dromae...,Graphs:https://chromeperf.appspot.com/report?r...,3,Performance Regression in Blink,Title: 2% regression in chromium-rel-win7-dual...
87,87,erikwright@chromium.org,2% regression in linux-release/dromaeo.jslibmo...,Graphs:https://chromeperf.appspot.com/report?r...,3,Performance Regression in Blink,Title: 2% regression in linux-release/dromaeo....
105,105,erikwright@chromium.org,10% regression in linux-release/blink_perf/Par...,Graphs:https://chromeperf.appspot.com/report?r...,3,Performance Regression in Blink,Title: 10% regression in linux-release/blink_p...
128,128,erikwright@chromium.org,6% regression in chromium-rel-mac9/image_decod...,Graphs:https://chromeperf.appspot.com/report?r...,3,Performance Regression in Blink,Title: 6% regression in chromium-rel-mac9/imag...
151,151,robertphillips@chromium.org,6% regression in chromium-rel-mac8/moz/vm_resi...,The skia roll to r11489 regresses vm_resident_...,3,Performance Regression in Blink,Title: 6% regression in chromium-rel-mac8/moz/...


In [14]:
len(df[:int(len(df)*0.8)].owner.unique())

90

In [15]:
# num_cv = 10
# sample_threshold=20
# samples_per_block = len(df) // num_cv + 1
# print(f"Samples per block: {samples_per_block}")

# block = 1
X_df = df[:int(len(df)*0.8)]
y_df = df[int(len(df)*0.8):]


train_owners = set(X_df["owner"])
test_owners = set(y_df["owner"])

unwanted = list(test_owners - train_owners)

y_df = y_df[~y_df["owner"].isin(unwanted)]

print(f"Training data: {len(X_df)}, Validation data: {len(y_df)}")
print(f"Training dev: {len(X_df.owner.unique())}, Validation dev: {len(y_df.owner.unique())}")

lbl2idx = {}

for idx, dev in enumerate(train_owners):
    lbl2idx[dev] = idx

Training data: 1303, Validation data: 326
Training dev: 90, Validation dev: 67


In [16]:
X_df["owner_id"] = X_df["owner"].apply(lambda owner: lbl2idx[owner])
y_df["owner_id"] = y_df["owner"].apply(lambda owner: lbl2idx[owner])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df["owner_id"] = X_df["owner"].apply(lambda owner: lbl2idx[owner])


In [19]:
class CombineLoss(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self._ce = nn.CrossEntropyLoss()

    def forward(
        self,
        prediction,
        labels
    ) -> torch.Tensor:
        loss = 0

        for i in range(len(prediction)):
            loss += self._ce(prediction[i], labels)
            # print(loss)

        return loss

In [20]:
model = LBTPClassifier(
    output_size=len(X_df.owner_id.unique())
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
class_counts = np.bincount(X_df["owner_id"])
num_samples = sum(class_counts)
labels = X_df["owner_id"].to_list() #corresponding labels of samples

class_weights = [num_samples/class_counts[i] for i in range(len(class_counts))]
weights = [class_weights[labels[i]] for i in range(int(num_samples))]
sampler = WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))

In [22]:
learning_rate = 1e-5
epochs = 20
batch_size = 15

In [23]:
sampler_name = sampler.__class__.__name__ if sampler else "None"
model_name = model.__class__.__name__

output_file = f"dt_lbtp_kmeans_topic_{topic_id}"
output_path = f"/home/mdafifal.mamun/notebooks/triagerX/output/{output_file}.pt"

wandb_config = {
        "project": "triagerx_topic_model",
        "name": f"run_{output_file}",
        "config": {
        "learning_rate": learning_rate,
        "architecture": "Roberta-FCN",
        "dataset": "deeptriage",
        "epochs": epochs,
    }
}

criterion = CombineLoss()
optimizer = Adam(model.parameters(), lr=learning_rate)
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.1, threshold=1e-8)

train_config = TrainConfig(
    optimizer=optimizer,
    criterion=criterion,
    train_dataset=X_df,
    validation_dataset=y_df,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epochs=epochs,
    output_file=output_path,
    sampler=sampler,
    scheduler=scheduler,
    wandb=wandb_config
)

In [24]:
trainer = ModelTrainer(train_config)
trainer.train(model=model)

[32m2024-02-13 20:50:04.235[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m17[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-02-13 20:50:04.237[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m20[0m - [34m[1mTokenizing texts...[0m
[32m2024-02-13 20:50:04.830[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m17[0m - [34m[1mGenerating torch dataset...[0m
[32m2024-02-13 20:50:04.832[0m | [34m[1mDEBUG   [0m | [36mtriagerx.dataset.triage_dataset[0m:[36m__init__[0m:[36m20[0m - [34m[1mTokenizing texts...[0m
[32m2024-02-13 20:50:04.979[0m | [34m[1mDEBUG   [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m33[0m - [34m[1mInitializing wandb...[0m
[34m[1mwandb[0m: Currently logged in as: [33mafifaniks[0m. Use [1m`wandb login --relogin`[0m to force relogin


[32m2024-02-13 20:50:15.077[0m | [34m[1mDEBUG   [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m48[0m - [34m[1mSelected compute device: cuda[0m
  return F.conv2d(input, weight, bias, self.stride,
100%|██████████| 87/87 [00:46<00:00,  1.86it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[32m2024-02-13 20:51:10.623[0m | [1mINFO    [0m | [36mtriagerx.trainer.model_trainer[0m:[36m_log_step[0m:[36m173[0m - [1mEpochs: 1 | Train Loss:  1.197                     | Train Accuracy:  0.068                     | Val Loss:  1.226                     | Val Accuracy:  0.003                     | Top 10: 0.07668711656441718                     | Precision:  0.001                     | Recall:  0.014                     | F1-score:  0.001[0m
[32m2024-02-13 20:51:10.627[0m | [32m[1mSUCCESS [0m | [36mtriagerx.trainer.model_trainer[0m:[36mtrain[0m:[36m143[0m - [32m[1mFound new best model.

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
f1-score,▁▂▃▃▃▃▄▃▃▃▄▅▄▄▅▆▇▆▅█
precision,▁▁▂▂▂▂▄▃▃▃▄▅▅▄▅▅▆▆▄█
recall,▁▄▅▄▄▃▄▃▅▃▄▆▄▄▄▅▇▆▅█
top10,▁▂▄█▄▃▄▄▅▅▅▄▆▇▆▆▆▆▆▇
train_acc,▁▂▃▄▄▅▅▆▆▆▆▇▇▇▇▇████
train_loss,██▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂▁▁▁
val_acc,▁▂▂▇▃▃▄▂▃▃▄▄▅▅▅▆▅▆▅█
val_loss,█▆▅▁▄▆▄▄▄▃▃▅▁▁▂▁▃▃▄▃

0,1
f1-score,0.03928
precision,0.05007
recall,0.06526
top10,0.33129
train_acc,0.94781
train_loss,0.19971
val_acc,0.07975
val_loss,1.17337
