In [1]:
import pandas as pd
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig

In [2]:
dataset_path = "/home/mdafifal.mamun/notebooks/triagerX/data/deeptriage/google_chrome/classifier_data_20.csv"
df = pd.read_csv(dataset_path)
df = df[df["owner"].notna()]

num_cv = 10
block = 9
# sample_threshold=20 # Threshold to filter developers
samples_per_block = len(df) // num_cv

sliced_df = df[: samples_per_block * (block + 1)]
df_train = sliced_df[: samples_per_block * block]
df_test = sliced_df[samples_per_block * block : samples_per_block * (block + 1)]

sample_threshold = 20
developers = df_train["owner"].value_counts()
filtered_developers = developers.index[developers >= sample_threshold]
df_train = df_train[df_train["owner"].isin(filtered_developers)]

train_owners = set(df_train["owner"])
test_owners = set(df_test["owner"])

unwanted = list(test_owners - train_owners)

df_test = df_test[~df_test["owner"].isin(unwanted)]

lbl2idx = {}
idx2lbl = {}

train_owners = sorted(train_owners)

for idx, dev in enumerate(train_owners):
    lbl2idx[dev] = idx
    idx2lbl[idx] = dev

df_train["owner_id"] = df_train["owner"].apply(lambda owner: lbl2idx[owner])
df_test["owner_id"] = df_test["owner"].apply(lambda owner: lbl2idx[owner])

print(f"Training data: {len(df_train)}, Validation data: {len(df_test)}")
print(f"Number of train developers: {len(df_train.owner.unique())}")
print(f"Number of test developers: {len(df_test.owner.unique())}")

Training data: 98165, Validation data: 10781
Number of train developers: 986
Number of test developers: 857


In [3]:
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
import numpy as np

class TriageDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: RobertaTokenizer,
        feature: str = "text",
        target: str = "owner_id",
        max_tokens: int = 256,
    ):
        print("Generating torch dataset...")
        self.tokenizer = tokenizer
        self.labels = [label for label in df[target]]
        print("Tokenizing texts...")
        self.texts = [
            self.tokenizer(
                row[feature],
                padding="max_length",
                max_length=max_tokens,
                truncation=True,
                return_tensors="pt",
            )
            for _, row in df.iterrows()
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [4]:
import torch
import torch.nn as nn

device="cuda"

class LBTPClassifier(nn.Module):
    def __init__(
        self,
        embedding_model,
        output_size,
        unfrozen_layers=1,
        num_classifiers=3,
        max_tokens=256,
    ) -> None:
        super().__init__()
        self.base_model = embedding_model

        # Freeze embedding layers
        for p in self.base_model.embeddings.parameters():
            p.requires_grad = False

        # Freeze encoder layers till last {unfrozen_layers} layers
        for i in range(0, self.base_model.config.num_hidden_layers - unfrozen_layers):
            for p in self.base_model.encoder.layer[i].parameters():
                p.requires_grad = False

        filter_sizes = [3, 4, 5, 6]
        self._num_filters = 256
        self._max_tokens = max_tokens
        self._num_classifiers = num_classifiers
        self._embed_size = embedding_model.config.hidden_size
        self.unfrozen_layers = unfrozen_layers
        self.conv_blocks = nn.ModuleList(
            [
                nn.ModuleList(
                    [
                        nn.Sequential(
                            nn.Conv2d(1, self._num_filters, (K, self._embed_size)),
                            nn.ReLU(),
                            nn.Flatten(),
                            nn.MaxPool1d(self._max_tokens - (K - 1)),
                            nn.Flatten(start_dim=1),
                        )
                        for K in filter_sizes
                    ]
                )
                for _ in range(self._num_classifiers)
            ]
        )

        self.classifier_weights = nn.Parameter(torch.ones(self._num_classifiers))

        self.classifiers = nn.ModuleList(
            [
                nn.Linear(
                    len(filter_sizes) * self._num_filters + self._embed_size,
                    output_size,
                )
                for _ in range(self._num_classifiers)
            ]
        )

        # Dropout is ommitted as it is not mentioned in the LBTP paper
        # self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None):
        # input_ids = input["input_ids"].squeeze(1).to(device)
        # attention_mask = input["attention_mask"].squeeze(1).to(device)

        outputs = []

        base_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask, output_attentions=True)

        # print(base_out.attentions)
        
        pooler_out = base_out.pooler_output.squeeze(0)
        
        if len(pooler_out.shape) == 1:
            pooler_out = pooler_out.unsqueeze(0)
            
        hidden_states = base_out.hidden_states[-self._num_classifiers :]

        for i in range(self._num_classifiers):
            batch_size, sequence_length, hidden_size = hidden_states[i].size()
            x = [
                conv(hidden_states[i].view(batch_size, 1, sequence_length, hidden_size))
                for conv in self.conv_blocks[i]
            ]  
            x = torch.cat(x, dim=1)
            x = torch.cat([pooler_out, x], dim=1)
            x = self.classifier_weights[i] * self.classifiers[i](x)

            outputs.append(x)

        return outputs

In [5]:
print("Load pretrained embedding model")
model_config = RobertaConfig.from_pretrained("roberta-large")
model_config.num_hidden_layers = 3
model_config.output_hidden_states = True
embedding_model = RobertaModel(model_config)
embedding_model.load_state_dict(torch.load("/work/disa_lab/projects/triagerx/models/distillation/lbtp_gc_base.pt"))
print("Loaded weights from the saved state.")

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

model = LBTPClassifier(embedding_model, output_size=len(df_train.owner_id.unique()))
model.load_state_dict(torch.load("/work/disa_lab/projects/triagerx/models/lbtp_dt_gc/lbtp_gc_block9.pt"))

Load pretrained embedding model
Loaded weights from the saved state.


<All keys matched successfully>

In [35]:
def clean_data(df):
    df["text"] = df.apply(
        lambda x: str(x["issue_title"]) + "\n" + str(x["description"]),
        axis=1,
    )
    df["text"] = df["text"].str.replace(
        "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ",
        regex=True,
    )
    # df["text"] = df["text"].str.replace("[^A-Za-z0-9 ]+", " ", regex=True)
    df["text"] = df["text"].str.replace(" +", " ", regex=True)

    min_length = 15
    df = df[df["text"].str.len().gt(min_length)]

    return df

df_test = clean_data(df_test)
# dataset = TriageDataset(df_test, tokenizer)
# val_dataloader = DataLoader(dataset, batch_size=1, drop_last=True)

In [36]:
# Define model output
def model_output(inputs):
  model.eval()
  output = model(inputs)
  return torch.sum(torch.stack(output), 0)

# Define model input
model_input = model.base_model.embeddings

In [37]:
from captum.attr import LayerIntegratedGradients

lig = LayerIntegratedGradients(model_output, model_input)

In [38]:
def get_data(owner_id, sample_index):
    sample = df_test[df_test["owner_id"] == owner_id].iloc[sample_index]

    return sample.owner, sample.text

In [39]:
def construct_input_and_baseline(text):

    max_length = 254
    baseline_token_id = tokenizer.pad_token_id 
    sep_token_id = tokenizer.sep_token_id 
    cls_token_id = tokenizer.cls_token_id 

    text_ids = tokenizer.encode(text, max_length=max_length, padding="max_length", truncation=True, add_special_tokens=False)
   
    input_ids = [cls_token_id] + text_ids + [sep_token_id]
    token_list = tokenizer.convert_ids_to_tokens(input_ids)

    baseline_input_ids = [cls_token_id] + [baseline_token_id] * len(text_ids) + [sep_token_id]
    return torch.tensor([input_ids], device='cpu'), torch.tensor([baseline_input_ids], device='cpu'), token_list

owner, text = get_data(72, 1)
input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)

print(f'original text: {input_ids}')
print(f'baseline text: {baseline_input_ids}')

original text: tensor([[    0, 40025,  1295,  5209, 13561,  3260,   197,    28,  1373,   227,
         14050,     8,  3208, 50118, 16991,  2454,   500, 44827, 17357,  3548,
         48455,     8,  2454,   500, 44827, 42375,   486,  1542, 38304, 40025,
           877,   534, 46508,     8, 12760,  1397,    22, 35685,  1215,   113,
             7,  5368,     5,  5209, 13561,     4,   152,  3260, 41988,   197,
            28,  4460,     4,   170,   115,  1169,  1606,    10,  5448,     7,
            10,  8309,  6229,  2454,   500, 44827,  1380,    50,  2532,  5368,
             5, 13561,    15,     5,  1765,   526,     8,  1622,  1323,    24,
             7, 21384, 43929,     4, 29774,  8845,    15,     5,   464,    14,
          3639, 21384, 43929,  5574,    13,  3208,    35,  1437,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1, 

In [11]:
attributions, delta = lig.attribute(inputs=input_ids,
                                    baselines= baseline_input_ids,
                                    return_convergence_delta=True,
                                    internal_batch_size=1,
                                    target=72
                                    )
print(attributions.size())

torch.Size([1, 256, 1024])


In [13]:
def summarize_attributions(attributions):

    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    
    return attributions

attributions_sum = summarize_attributions(attributions)
print(attributions_sum.size())

torch.Size([256])


In [21]:
from captum.attr import visualization as viz

def interpret_text(text, true_class):
    input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)
    input_ids = input_ids.to(device)
    baseline_input_ids = baseline_input_ids.to(device)
    attributions, delta = lig.attribute(inputs= input_ids,
                                    baselines= baseline_input_ids,
                                    return_convergence_delta=True,
                                    internal_batch_size=1,
                                    target=true_class
                                    )
    attributions_sum = summarize_attributions(attributions)

    all_tokens= [token for token in all_tokens if token != tokenizer.pad_token]

    score_vis = viz.VisualizationDataRecord(
                        word_attributions = attributions_sum,
                        pred_prob = torch.max(model(input_ids)[0]),
                        pred_class = torch.argmax(model(input_ids)[0]).cpu().numpy(),
                        true_class = true_class,
                        attr_class = text,
                        attr_score = attributions_sum.sum(),       
                        raw_input_ids = all_tokens,
                        convergence_score = delta)

    viz.visualize_text([score_vis])


In [22]:
model = model.to(device)

In [40]:
owner_id = 65
test_data_index = 3

owner, text = get_data(owner_id, test_data_index)
print(text)
interpret_text(text, owner_id)

UtilityProcessHost, SupportsWeakPtr, and batch mode don't mix well
The relatively recent addition of batch mode to UtilityProcessHost means that clients which use batch mode are more likely to want to keep a weak ptr to it. This can be a problem due to SupportsWeakPtr when the child process goes away for a reason other than batch mode ending, if the client tries to be a good citizen and make sure it cleans up properly. Consider this example that's similar to some code I'm working on for the extensions sandboxed unpacker:class Foo : public UtilityProcessHostClient { void StartDoingStuff(); private: void Thing1Complete(); void Thing2Complete(); void CleanupIfNeeded(); base::WeakPtr<UtilityProcessHost> utility_host_;};void Foo:StartDoingStuff() { utility_host_ = UtilityProcessHost::Create(this, ...); utility_host_->StartBatchMode(); utility_host_->Send(new FooDoThing1Msg());} void Foo::Thing1Complete() { if (!utility_host_) { // handle error of losing utility host } utility_host_->Send(ne

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
65.0,424 (-6.95),"UtilityProcessHost, SupportsWeakPtr, and batch mode don't mix well The relatively recent addition of batch mode to UtilityProcessHost means that clients which use batch mode are more likely to want to keep a weak ptr to it. This can be a problem due to SupportsWeakPtr when the child process goes away for a reason other than batch mode ending, if the client tries to be a good citizen and make sure it cleans up properly. Consider this example that's similar to some code I'm working on for the extensions sandboxed unpacker:class Foo : public UtilityProcessHostClient { void StartDoingStuff(); private: void Thing1Complete(); void Thing2Complete(); void CleanupIfNeeded(); base::WeakPtr utility_host_;};void Foo:StartDoingStuff() { utility_host_ = UtilityProcessHost::Create(this, ...); utility_host_->StartBatchMode(); utility_host_->Send(new FooDoThing1Msg());} void Foo::Thing1Complete() { if (!utility_host_) { // handle error of losing utility host } utility_host_->Send(new FooDoThing2Msg());}void Foo::Thing2Complete() { CleanupIfNeeded();}void Foo::CleanupIfNeeded() { if (utility_host_) utility_host_->EndBatchMode();}Foo::~Foo() { CleanupIfNeeded();}Now if the utility child process gets killed before Foo is finished with Thing2, eg at shutdown, the UtilityProcessHost instance will be destructed, and in ~UtilityProcessHost it will call EndBatchMode itself, and then if it happens to have the last ref to a client Foo, then when it drops it Foo::CleanupIfNeeded will call EndBatchMode too because the weak ptr to the UtilityProcessHost hasn't been invalidated yet.",-7.44,"#s Ut ility Process Host , ĠSupports Weak Ptr , Ġand Ġbatch Ġmode Ġdon 't Ġmix Ġwell Ċ The Ġrelatively Ġrecent Ġaddition Ġof Ġbatch Ġmode Ġto ĠUtility Process Host Ġmeans Ġthat Ġclients Ġwhich Ġuse Ġbatch Ġmode Ġare Ġmore Ġlikely Ġto Ġwant Ġto Ġkeep Ġa Ġweak Ġptr Ġto Ġit . ĠThis Ġcan Ġbe Ġa Ġproblem Ġdue Ġto ĠSupports Weak Ptr Ġwhen Ġthe Ġchild Ġprocess Ġgoes Ġaway Ġfor Ġa Ġreason Ġother Ġthan Ġbatch Ġmode Ġending , Ġif Ġthe Ġclient Ġtries Ġto Ġbe Ġa Ġgood Ġcitizen Ġand Ġmake Ġsure Ġit Ġcleans Ġup Ġproperly . ĠConsider Ġthis Ġexample Ġthat 's Ġsimilar Ġto Ġsome Ġcode ĠI 'm Ġworking Ġon Ġfor Ġthe Ġextensions Ġsandbox ed Ġun pack er : class ĠFoo Ġ: Ġpublic ĠUtility Process Host Client Ġ{ Ġvoid ĠStart Do ing St uff (); Ġprivate : Ġvoid ĠThing 1 Complete (); Ġvoid ĠThing 2 Complete (); Ġvoid ĠClean up If Need ed (); Ġbase :: Weak Ptr < Ut ility Process Host > Ġutility _ host _ ; }; void ĠFoo : Start Do ing St uff () Ġ{ Ġutility _ host _ Ġ= ĠUtility Process Host :: Create ( this , Ġ... ); Ġutility _ host _ -> Start B atch Mode (); Ġutility _ host _ -> Send ( new ĠFoo Do T hing 1 Msg ()); } Ġvoid ĠFoo :: T hing 1 Complete () Ġ{ Ġif Ġ(! ut ility _ host _ ) Ġ{ Ġ// Ġhandle Ġerror Ġof Ġlosing Ġutility Ġhost Ġ} Ġutility _ host _ -> Send ( new ĠFoo Do T hing 2 Msg #/s"
,,,,
