In [80]:
import pandas as pd
import numpy as np
from transformers import RobertaModel, RobertaConfig, RobertaTokenizer

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Load the teacher model (RoBERTa-large)
teacher_model = RobertaModel.from_pretrained('roberta-large', output_hidden_states=True)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
student_config = RobertaConfig.from_pretrained("roberta-large")
student_config.num_hidden_layers = 3

In [4]:
student_config.output_hidden_states = True

In [5]:
student_model = RobertaModel(student_config)

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

In [7]:
tokenizer.vocab_size

50265

# Load Data for Distillation

In [12]:
open_data = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/deep_data.csv"

In [13]:
df = pd.read_csv(open_data)[:1000]
print(len(df))
print(df.iloc[15].description)

1000
product version       allurls if applicable  not applicableother browsers testedadd ok or fail after other browsers where you have tested this issue     safari 3 fail    firefox 3 pass         ie 7 partial failwhat steps will reproduce the problem1 try to install an adblockingaddon2 failwhat is the expected resultthe expected result is an addonapi similiar to firefox extensionsallowing third parties to enhance the functionality of chromewhat happens insteadchrome offers next to no customizationoptionsplease provide any additional information below attach a screenshot ifpossiblenot applicable


In [14]:
def clean_distill_data(df):
    df["text"] = df.apply(
        lambda x: str(x["issue_title"]) + "\n" + str(x["description"]),
        axis=1,
    )
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', regex=True)
    df["text"] = df['text'].str.replace("[^A-Za-z0-9 ]+", " ", regex=True)
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    return df

In [15]:
df = clean_distill_data(df)

In [16]:
class DistillationDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer,
        feature: str,
    ):
        self.tokenizer = tokenizer
        self.texts = [
            self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            for _, row in df.iterrows()
        ]

    def __len__(self):
        return len(self.texts)

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)

        return batch_texts

In [17]:
dataset = DistillationDataset(df, tokenizer, "text")

In [18]:
distill_dataloader = DataLoader(dataset, shuffle=True, batch_size=10)

In [19]:
optimizer = torch.optim.Adam(student_model.parameters(), lr=5e-5)

In [20]:
num_epochs = 3
s = teacher_model.config.num_hidden_layers // student_model.config.num_hidden_layers


In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

Device: cuda


In [22]:
teacher_model = teacher_model.to(device)
student_model = student_model.to(device)
# optimizer = optimizer.to(device)

In [23]:
num_teacher_layers = teacher_model.config.num_hidden_layers
num_student_layers = student_model.config.num_hidden_layers
s = num_teacher_layers // num_student_layers

8

In [24]:
import torch.nn.functional as F

def pkd_loss(student_reps, teacher_reps, s):
    loss = 0
    for i in range(len(student_reps)):
        student_layer = student_reps[i]
        teacher_layer = teacher_reps[i * s]
        student_layer = F.normalize(student_layer, p=2, dim=-1)
        teacher_layer = F.normalize(teacher_layer, p=2, dim=-1)
        loss += F.mse_loss(student_layer, teacher_layer)
    
    return loss

In [25]:
for epoch in range(num_epochs):
    student_model.train()
    teacher_model.eval()

    # Wrap the dataloader with tqdm for progress bar
    progress_bar = tqdm(distill_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].squeeze(1).to(student_model.device)
        attention_mask = batch['attention_mask'].squeeze(1).to(student_model.device)

        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask)
            teacher_hidden_states = teacher_outputs.hidden_states

        student_outputs = student_model(input_ids, attention_mask=attention_mask)
        student_hidden_states = student_outputs.hidden_states
        # break

        loss = pkd_loss(student_hidden_states, teacher_hidden_states, s)
        loss.backward()
        optimizer.step()

        # Update progress bar with loss value
        progress_bar.set_postfix(loss=loss.item())

Epoch 1/3:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1/3: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s, loss=0.00216]
Epoch 2/3: 100%|██████████| 100/100 [00:34<00:00,  2.92it/s, loss=0.00185]
Epoch 3/3: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s, loss=0.00154]


In [26]:
!nvidia-smi

Mon May 20 23:01:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:CA:00.0 Off |                    0 |
| N/A   37C    P0              38W / 250W |   6269MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [34]:
del distill_dataloader

NameError: name 'distill_dataloader' is not defined

In [35]:
import gc

# Collect garbage
torch.cuda.empty_cache()

gc.collect()


!nvidia-smi

Mon May 20 23:04:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:CA:00.0 Off |                    0 |
| N/A   35C    P0              38W / 250W |   2837MiB / 40960MiB |     77%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Prepare Classifier Data

In [38]:
def clean_data(df):
    df["text"] = df.apply(
        lambda x: str(x["issue_title"]) + "\n" + str(x["description"]),
        axis=1,
    )
    df['text'] = df['text'].str.replace('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', regex=True)
    df["text"] = df['text'].str.replace("[^A-Za-z0-9 ]+", " ", regex=True)
    df["text"] = df['text'].str.replace(" +", " ", regex=True)

    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    return df

In [39]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/notebook/data/deeptriage/gc_20_topics_kmeans.csv"

df = pd.read_csv(dataset_path)
df = df[df["owner"].notna()]
df = clean_data(df)

In [41]:
print(f"Total number of issues: {len(df)}")

Total number of issues: 109972


In [45]:
num_cv = 10
# sample_threshold=20 # Threshold to filter developers
samples_per_block = len(df) // num_cv
print(f"Samples per block: {samples_per_block}")

block = 1
sliced_df = df[: samples_per_block * (block+1)]

Samples per block: 10997


In [47]:
# Train and Validation preparation

X_df = sliced_df[:samples_per_block*block]
y_df = sliced_df[samples_per_block*block : samples_per_block * (block+1)]

# developers = X_df["owner"].value_counts()
# filtered_developers = developers.index[developers >= sample_threshold]
# X_df = X_df[X_df["owner"].isin(filtered_developers)]

train_owners = set(X_df["owner"])
test_owners = set(y_df["owner"])

unwanted = list(test_owners - train_owners)

y_df = y_df[~y_df["owner"].isin(unwanted)]

print(f"Training data: {len(X_df)}, Validation data: {len(y_df)}")
print(f"Number of developers: {len(X_df.owner.unique())}")

Training data: 10997, Validation data: 10806
Number of developers: 803


In [48]:
# Label encode developers

lbl2idx = {}

train_owners = sorted(train_owners)

for idx, dev in enumerate(train_owners):
    lbl2idx[dev] = idx

X_df["owner_id"] = X_df["owner"].apply(lambda owner: lbl2idx[owner])
y_df["owner_id"] = y_df["owner"].apply(lambda owner: lbl2idx[owner])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df["owner_id"] = X_df["owner"].apply(lambda owner: lbl2idx[owner])


In [52]:
class TriageDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: RobertaTokenizer,
        feature: str = "text",
        target: str = "owner_id",
    ):
        print("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = [label for label in df[target]]
        print("Tokenizing texts...")
        self.texts = [self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            for _, row in df.iterrows()
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [53]:
student_model.config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 3,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.40.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [70]:
import torch
import torch.nn.functional as F
from torch import nn


class LBTPClassifier(nn.Module):
    def __init__(
        self, student_model, output_size, unfrozen_layers=1,
    ) -> None:
        super().__init__()
        self.base_model = student_model

        # Freeze embedding layers
        for p in self.base_model.embeddings.parameters():
            p.requires_grad = False

        # Freeze encoder layers till last {unfrozen_layers} layers
        for i in range(0, self.base_model.config.num_hidden_layers - unfrozen_layers):
            for p in self.base_model.encoder.layer[i].parameters():
                p.requires_grad = False

        filter_sizes = [3, 4, 5, 6]
        self._num_filters = 256
        self._max_tokens = 512
        self._embed_size = student_model.config.hidden_size
        self.unfrozen_layers = unfrozen_layers
        self.conv_blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        nn.Sequential(
                            nn.Conv2d(1, self._num_filters, (K, self._embed_size)),
                            nn.ReLU(),
                            nn.Flatten(),
                            nn.MaxPool1d(self._max_tokens - (K - 1)),
                            nn.Flatten(start_dim=1),
                        )
                        for K in filter_sizes
                    ]
                )
                for _ in range(unfrozen_layers)
            ]
        )

        self.classifiers = nn.ModuleList(
            [
                nn.Linear(
                    len(filter_sizes) * self._num_filters + self._embed_size, output_size
                )
                for _ in range(unfrozen_layers)
            ]
        )

        # self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        outputs = []

        base_out = self.base_model(input_ids, attention_mask=attention_mask)
        pooler_out = base_out.pooler_output.squeeze(0)
        hidden_states = base_out.hidden_states[-self.unfrozen_layers :]

        for i in range(self.unfrozen_layers):
            batch_size, sequence_length, hidden_size = hidden_states[i].size()
            x = [
                conv(hidden_states[i].view(batch_size, 1, sequence_length, hidden_size))
                for conv in self.conv_blocks[i]
            ]
            x = torch.cat(x, dim=1)
            x = torch.cat([pooler_out, x], dim=1)
            # x = self.dropout(x)
            x = self.classifiers[i](x)

            outputs.append(x)

        return outputs


In [71]:
class CombineLoss(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self._ce = nn.CrossEntropyLoss()

    def forward(
        self,
        prediction,
        labels
    ) -> torch.Tensor:
        loss = 0

        for i in range(len(prediction)):
            loss += self._ce(prediction[i], labels)
            # print(loss)

        return loss

In [72]:
model = LBTPClassifier(
    student_model,
    output_size=len(X_df.owner_id.unique())
)

In [73]:
learning_rate = 1e-5
epochs = 12
batch_size = 10

criterion = CombineLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [74]:
train = TriageDataset(X_df, tokenizer)
val = TriageDataset(y_df, tokenizer)

Generating torch dataset...
Tokenizing texts...
Generating torch dataset...
Tokenizing texts...


In [75]:
train_dataloader = DataLoader(
    dataset=train,
    batch_size=batch_size,
    shuffle=True
)
val_dataloader = DataLoader(val, batch_size=batch_size)

In [76]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [77]:
model = model.to(device)
criterion = criterion.to(device)

In [78]:
def log_step(
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        train_data,
        validation_data,
        topk,
    ):
        log = f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                    | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                    | Val Loss: {total_loss_val / len(validation_data): .3f} \
                    | Val Accuracy: {total_acc_val / len(validation_data): .3f} \
                    | Top 10: {topk} \
                    | Precision: {precision: .3f} \
                    | Recall: {recall: .3f} \
                    | F1-score: {f1_score: .3f}"

        print(log)

In [81]:
for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader, desc="Training Steps"):
        # print(train_input)
        train_label = train_label.to(device)
        mask = train_input["attention_mask"].squeeze(1).to(device)
        input_id = train_input["input_ids"].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, train_label.long())
        total_loss_train += batch_loss.item()

        output = torch.sum(torch.stack(output), 0)
        acc = (output.argmax(dim=1) == train_label).sum().item()
        
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()

    total_acc_val = 0
    total_loss_val = 0
    correct_top_k = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():

        for val_input, val_label in tqdm(val_dataloader, desc="Validation Steps"):
            val_label = val_label.to(device)            
            input_id = val_input["input_ids"].squeeze(1).to(device)
            mask = val_input["attention_mask"].squeeze(1).to(device)    

            output = model(input_id, mask)

            batch_loss = criterion(output, val_label.long())
            total_loss_val += batch_loss.item()

            output = torch.sum(torch.stack(output), 0)
            _, top_k_predictions = output.topk(10, 1, True, True)

            top_k_predictions = top_k_predictions.t()

            correct_top_k += (
                top_k_predictions.eq(
                    val_label.view(1, -1).expand_as(top_k_predictions)
                )
                .sum()
                .item()
            )

            acc = (output.argmax(dim=1) == val_label).sum().item()

            all_preds.append(output.argmax(dim=1).cpu().numpy())
            all_labels.append(val_label.cpu().numpy())

            total_acc_val += acc

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    precision, recall, f1_score, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="macro"
    )

    top10 = correct_top_k / len(y_df)

    log_step(
        epoch_num,
        total_acc_train,
        total_acc_val,
        total_loss_train,
        total_loss_val,
        precision,
        recall,
        f1_score,
        X_df,
        y_df,
        top10,
    )

    val_loss = total_loss_val / len(y_df)

    # if val_loss < best_loss:
    #     print("Found new best model. Saving weights...")
    #     torch.save(model.state_dict(), weights_save_location)
    #     best_loss = val_loss

Training Steps: 100%|██████████| 1100/1100 [01:12<00:00, 15.23it/s]
Validation Steps: 100%|██████████| 1081/1081 [00:42<00:00, 25.68it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epochs: 1 | Train Loss:  0.612                     | Train Accuracy:  0.026                     | Val Loss:  0.633                     | Val Accuracy:  0.006                     | Top 10: 0.08115861558393485                     | Precision:  0.000                     | Recall:  0.001                     | F1-score:  0.000


Training Steps:  18%|█▊        | 199/1100 [00:13<00:59, 15.06it/s]


KeyboardInterrupt: 