In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm

from transformers import RobertaModel, RobertaTokenizer

In [2]:
def load_dataframe(path: str) -> pd.DataFrame:
    return pd.read_csv(path)

In [3]:
def clean_data(df):
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ')
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    return df
    
def prepare_dataframe(df: pd.DataFrame, minimum_contribution = 300) -> pd.DataFrame:
    developers = df["owner"].value_counts()
    # filtered_developers = developers.index[developers >= minimum_contribution]
    # df = df[df["owner"].isin(filtered_developers)]

    df["text"] = df.apply(lambda x: str(x["issue_title"]) + "\n" + str(x["description"]), axis=1)
    df["owner_id"] = pd.factorize(df["owner"])[0]

    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    return df

In [6]:
data_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/classifier_data_20.csv"

df = load_dataframe(data_path)
df = prepare_dataframe(df)
df = clean_data(df)

In [8]:
print(df.iloc[2220]["text"])

possible data race in disk_cache::BackendImpl


In [6]:

print(len(df))

109979


In [7]:
len(df["owner"].value_counts())

1032

In [11]:
# df["owner"].value_counts().plot(kind="pie")

In [8]:
df

Unnamed: 0,owner,issue_title,description,text,owner_id
0,amit@chromium.org,"Scrolling with some scroll mice (touchpad, etc...",Product Version : <see about:version>URLs...,"Scrolling with some scroll mice (touchpad, etc...",0
1,jon@chromium.org,Proxy causes some or all network requests to fail,Product Version : 0.2.149.27 (1583)URLs (...,Proxy causes some or all network requests to f...,1
2,pfeldman@chromium.org,"Web inspector button ""dock to main window"" doe...",Product Version : chrome beta 1URLs (if a...,"Web inspector button ""dock to main window"" doe...",2
3,jon@chromium.org,Habari admin interface is not rendered correctly,Product Version : 0.2.149.27 (1583)URLs (...,Habari admin interface is not rendered correct...,1
4,pkasting@chromium.org,Maximize on second larger monitor not working,Product Version : 0.2.149.27URLs (if appl...,Maximize on second larger monitor not working\...,3
...,...,...,...,...,...
109974,navabi@chromium.org,Launch clank_qa recipes to the waterfall,We had git trouble,Launch clank_qa recipes to the waterfall\nWe h...,828
109975,bulach@chromium.org,data race in ThreadWatcherListTest,r255322 is culprithttp://build.chromium.org/p/...,data race in ThreadWatcherListTest\nr255322 is...,556
109976,pfeldman@chromium.org,window.console object should not be configurable,Recently sites have begun replacing window.con...,window.console object should not be configurab...,2
109977,ernstm@chromium.org,Windows GPU bots failing on multiple tests,All Windows GPU bots are failing a variety of ...,Windows GPU bots failing on multiple tests\nAl...,588


In [20]:
X = df[["issue_title", "description"]]
y = df[["owner"]]

In [22]:
X.head()

Unnamed: 0,issue_title,description
0,"Scrolling with some scroll mice (touchpad, etc...",Product Version : <see about:version>URLs...
1,Proxy causes some or all network requests to fail,Product Version : 0.2.149.27 (1583)URLs (...
2,"Web inspector button ""dock to main window"" doe...",Product Version : chrome beta 1URLs (if a...
3,Habari admin interface is not rendered correctly,Product Version : 0.2.149.27 (1583)URLs (...
4,Maximize on second larger monitor not working,Product Version : 0.2.149.27URLs (if appl...


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
np.random.seed(112)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)

In [28]:
y_test["owner"].value_counts()

owner
estade@chromium.org        412
sky@chromium.org           312
pfeldman@chromium.org      238
ananta@chromium.org        235
davidjames@chromium.org    228
                          ... 
peconn@chromium.org          7
dmazzoni...@gmail.com        7
sunandt@chromium.org         7
kenobi@chromium.org          7
drinkcat@chromium.org        7
Name: count, Length: 1032, dtype: int64

In [13]:
df_val["owner"].value_counts()

owner
estade@chromium.org         115
sky@chromium.org             98
pfeldman@chromium.org        77
ananta@chromium.org          73
tha...@chromium.org          70
                           ... 
aval...@chromium.org          1
dmu...@chromium.org           1
johnjbarton@chromium.org      1
jvoung@chromium.org           1
yuzo@chromium.org             1
Name: count, Length: 1008, dtype: int64

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, model_name):
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.labels = [label for label in df['owner_id']]
        self.texts = [self.tokenizer(text,
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [15]:
class LBTLikeClassifier(nn.Module):
    def __init__(self, model_name: str, output_size, num_filters=256, bert_layers=4, embed_size=768, dropout = 0.1) -> None:
        super().__init__()
        self.base_model = RobertaModel.from_pretrained(model_name, output_hidden_states=True)
        filter_sizes = [3, 4, 5, 6]
        self.num_filters = num_filters
        self.bert_layers = bert_layers
        self.convs1 = nn.ModuleList([nn.Conv2d(self.bert_layers, num_filters, (K, embed_size)) for K in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes)*num_filters + embed_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        x = self.base_model(input_ids, attention_mask = attention_mask)
        hidden_states = x["hidden_states"][-self.bert_layers:]
        pooler_output = x["pooler_output"]
        x = torch.stack(hidden_states, dim=1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)
        x = torch.cat([pooler_output, x], 1)
        x = self.dropout(x)
        logit = self.fc1(x)

        return self.relu(x)


In [16]:
class Classifier(nn.Module):
    def __init__(self, model_name: str, output_size, embed_size=768, dropout = 0.1) -> None:
        super().__init__()
        self.base_model = RobertaModel.from_pretrained(model_name, output_hidden_states=True)
        filter_sizes = [3, 4, 5, 6]
        num_filters = 256
        self.convs1 = nn.ModuleList([nn.Conv2d(4, num_filters, (K, embed_size)) for K in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes)*num_filters, output_size)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        x = self.base_model(input_ids, attention_mask = attention_mask)[2][-4:]
        x = torch.stack(x, dim=1)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  
        x = torch.cat(x, 1)
        x = self.dropout(x)
        logit = self.fc1(x)  

        return self.relu(logit)


In [17]:
def train(model, train_data, val_data, learning_rate, epochs):
    model_name = "roberta-base"
    train, val = Dataset(train_data, model_name), Dataset(val_data, model_name)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=5, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=5)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                # print(train_label)

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                # print(output, train_label)
                # print(output.shape, train_label.shape)

                batch_loss = criterion(output, train_label.view(-1))
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

EPOCHS = 40
num_classes = len(df["owner"].unique())
model = Classifier("roberta-base", num_classes)
LR = 1e-6

train(model, df_train, df_val, LR, EPOCHS)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|████                                                                                                                                            | 492/17597 [01:32<53:24,  5.34it/s]


KeyboardInterrupt: 