In [1]:
import requests

In [2]:
download_url = "https://figshare.com/ndownloader/files/45989205"

try:
  response = requests.get(download_url, headers={'User-Agent': 'My Python Script'})
  if response.status_code == 200:
    filename = response.headers.get('content-disposition', None)
    if filename:
        filename = filename.split("filename=")[1].strip('"')
    else:
        filename = "figshare_download.data"

    with open(filename, 'wb') as f:
      f.write(response.content)

    print(f"Download complete! File saved as: {filename}")
  else:
    print(f"Error downloading file. Status code: {response.status_code}")
except Exception as ex:
  print(f"Erro no download: {str(ex)}")

Download complete! File saved as: eh-bug-study_replication-package.zip


In [3]:
!echo 'Aguarde...'
!echo 'Descompactando o arquivo eh-bug-study_replication-package.zip'
!unzip -q eh-bug-study_replication-package.zip
!echo 'Arquivo descompactado com sucesso'

Aguarde...
Descompactando o arquivo eh-bug-study_replication-package.zip
Arquivo descompactado com sucesso


In [4]:
import numpy as np
import pandas as pd
import json
from os import listdir
from os.path import basename, join

In [5]:
MY_PATH = '/content/eh-bug-study_replication-package'
NOTEBOOK_PATH = f'{MY_PATH}/notebooks/'

TEXTUAL_FIELDS_PATH = f'{MY_PATH}/data/raw/textual_fields/'
PROCESSED_DATA_PATH = f'{MY_PATH}/data/processed/'

In [6]:
file1 = NOTEBOOK_PATH + 'final_labeled_dataset_ehanalysis_col.csv'
labels = pd.read_csv(file1,sep=';')

In [7]:
BOTS_USERNAMES_JIRA = ['githubbot', 'genericqa', 'HadoopDev', 'hadoopqa', 'hudson', 'jiraposter@reviews.apache.org']

count = 0
keys = []
description = []
comments= []
summary = []

In [8]:
!echo 'Aguarde...'
!echo 'Descompactando os arquivo de dataset'
!unzip -q /content/eh-bug-study_replication-package/data/raw/textual_fields/EXTRACT_HERE.zip
!echo 'Arquivos descompactador com sucesso!'


Aguarde...
Descompactando os arquivo de dataset
Arquivos descompactador com sucesso!


In [9]:
from tqdm import tqdm

with tqdm(total=1, desc="Loading dataset", unit="s") as pbar:
  for textual_file in listdir('/content/dataset'):
      try:
        filename = f'/content/dataset/{textual_file}'
        f = open(filename)
        data = json.load(f)
        keys.append(basename(filename).split('.')[0])
        description.append(data['description'])
        summary.append(data['summary'])
        if len(data['comments']):
            comment_line = []
            for comm in data['comments']:
                if (comm['commenter_name'] not in BOTS_USERNAMES_JIRA) and (comm['commenter_id'] not in BOTS_USERNAMES_JIRA):
                    comment_line.append(comm['comment'])
            joined_comment = ' '.join(comment_line)
            if len(joined_comment) <= 512:
                comments.append(joined_comment)
            else:
                comments.append(joined_comment[:512])
        else:
            comments.append('')
        # comments.append(len(data['comments']))
        f.close()
        count+=1
      except Exception as ex:
        print(f"Erro: {ex}")
      pbar.update(1)

print(f"Loaded {count} files successfully")

Loading dataset: 4334s [00:00, 5416.18s/s]

Erro: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


Loading dataset: 10376s [00:02, 5005.80s/s]

Loaded 10375 files successfully





In [10]:
labels.columns

Index(['Project', 'Owner', 'Manager', 'Category', 'Key', 'Priority', 'Status',
       'Reporter', 'Assignee', 'Components', 'SummaryTopWords',
       'DescriptionTopWords', 'CommentsTopWords', 'CreationDate',
       'ResolutionDate', 'AffectsVersions', 'FixVersions', 'NoComments',
       'FirstCommentDate', 'LastCommentDate', 'NoWatchers', 'NoAttachments',
       'FirstAttachmentDate', 'LastAttachmentDate', 'NoAttachedPatches',
       'FirstAttachedPatchDate', 'LastAttachedPatchDate', 'InwardIssueLinks',
       'OutwardIssueLinks', 'HasMergeCommit', 'CommitsMessagesTopWords',
       'NoCommits', 'NoAuthors', 'NoCommitters', 'AuthorsFirstCommitDate',
       'AuthorsLastCommitDate', 'CommittersFirstCommitDate',
       'CommittersLastCommitDate', 'NonSrcAddFiles', 'NonSrcDelFiles',
       'NonSrcModFiles', 'NonSrcAddLines', 'NonSrcDelLines', 'SrcAddFiles',
       'SrcDelFiles', 'SrcModFiles', 'SrcAddLines', 'SrcDelLines',
       'TestAddFiles', 'TestDelFiles', 'TestModFiles', 'TestAddLine

In [11]:
data = np.column_stack((keys, summary, description, comments))
data_df = pd.DataFrame(data=data, columns=['Key', 'Summary', 'Description', 'Comments'])
pair_key_label = labels[['Key','EHCodeAnalysis', 'Type']].copy()
eh_bugs = pair_key_label[pair_key_label.Type==1].Key.values
non_eh_bugs = pair_key_label[pair_key_label.Type==0].Key.values

In [12]:
eh_code_reports = pair_key_label[pair_key_label.EHCodeAnalysis==1].Key.values
non_eh_code_reports = pair_key_label[pair_key_label.EHCodeAnalysis==0].Key.values

In [13]:
# print(len(eh_bugs), len(non_eh_bugs), pair_key_label.shape)

data_df['Type'] = -1
data_df.loc[data_df.Key.isin(eh_bugs), 'Type'] = 1
data_df.loc[data_df.Key.isin(non_eh_bugs), 'Type'] = 0

data_df['EHCodeAnalysis'] = -1
data_df.loc[data_df.Key.isin(eh_code_reports), 'EHCodeAnalysis'] = 1
data_df.loc[data_df.Key.isin(non_eh_code_reports), 'EHCodeAnalysis'] = 0

In [14]:
data_df.head()

Unnamed: 0,Key,Summary,Description,Comments,Type,EHCodeAnalysis
0,HADOOP-9782,Datanode daemon cannot be started on OS X,Datanode fails to start with the following exc...,Tests also broke on Mac. Seeing this msg in te...,0,0
1,YARN-8040,[UI2] New YARN UI webapp does not respect curr...,When ui2 is accessed behind proxy like knox/ng...,Hi [~leftnoteasy]. could u pls help to review ...,0,1
2,MAPREDUCE-5145,Change default max-attempts to be more than on...,We need to give the AM of MR jobs the chance t...,"Similar to YARN-542, I've drafted a patch, inc...",-1,-1
3,HADOOP-8534,Some tests leave a config file open causing fa...,Java xml parser keeps file locked after SAXExc...,Attaching a patch with the fix. +1. Would wrap...,-1,-1
4,HADOOP-5298,Unit test fails out on trunk org.apache.hadoop...,From: http://hudson.zones.apache.org/hudson/jo...,test log from build 760 5298_20090226.patch: c...,-1,-1


In [15]:
final_textual_labeled_data = data_df[data_df.Type!=-1].copy()

In [16]:
final_textual_labeled_data.to_pickle("final_full_text_data_comments_ehcoleanalysis_col.pkl")

In [17]:
! pip install -U accelerate
! pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/314.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/314.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [18]:
import torch
import pandas as pd
import numpy as np

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [19]:
model_name = 'distilbert-base-uncased'
device_name = 'cuda'
#device_name = 'cpu'
max_length = 512
cached_model_directory_name = 'distilbert-ehbugs'

In [20]:
# Data Loading and basic processing
# dataset = pd.read_pickle('data/full_text_comments_ehcoleanalysis_col.pkl')
dataset = pd.read_pickle('final_full_text_data_comments_ehcoleanalysis_col.pkl')

In [21]:
dataset.fillna('',inplace=True)
dataset['Textual_Type'] = 'EH-Bug'
dataset.loc[dataset['Type']==0,'Textual_Type'] = 'Non-EH-Bug'
dataset['SummaryDescriptionComments']= dataset.apply(lambda row: row['Summary'] + ' ' + row['Description'] + ' ' + row['Comments'],axis=1).values

In [24]:
dataset

Unnamed: 0,Key,Summary,Description,Comments,Type,EHCodeAnalysis,Textual_Type,SummaryDescriptionComments
0,HADOOP-9782,Datanode daemon cannot be started on OS X,Datanode fails to start with the following exc...,Tests also broke on Mac. Seeing this msg in te...,0,0,Non-EH-Bug,Datanode daemon cannot be started on OS X Data...
1,YARN-8040,[UI2] New YARN UI webapp does not respect curr...,When ui2 is accessed behind proxy like knox/ng...,Hi [~leftnoteasy]. could u pls help to review ...,0,1,Non-EH-Bug,[UI2] New YARN UI webapp does not respect curr...
5,HDFS-12963,Error log level in ShortCircuitRegistry#removeShm,{code:title=org.apache.hadoop.hdfs.server.data...,"+1 lgtm non-binding. hello, [~ajisakaa],can yo...",0,1,Non-EH-Bug,Error log level in ShortCircuitRegistry#remove...
6,HADOOP-14371,License error in TestLoadBalancingKMSClientPro...,License error in TestLoadBalancingKMSClientPro...,"Nice catch, [~xiaodong.hu]! +1 LGTM pending Je...",0,0,Non-EH-Bug,License error in TestLoadBalancingKMSClientPro...
8,HDFS-5047,Supress logging of full stack trace of quota a...,"This is a follow up to HDFS-4714, which made a...","+1 looks good to me. Committed to trunk, branc...",1,0,EH-Bug,Supress logging of full stack trace of quota a...
...,...,...,...,...,...,...,...,...
10369,HDFS-6180,dead node count / listing is very broken in JM...,After bringing up a 578 node cluster with 13 d...,"Also, if a live node is shutdown, the counter ...",0,0,Non-EH-Bug,dead node count / listing is very broken in JM...
10370,HADOOP-6498,IPC client bug may cause rpc call hang,I can reproduce some rpc call hang bug when c...,This bug is related to the ipc/Client.java. In...,1,0,EH-Bug,IPC client bug may cause rpc call hang I can ...
10371,HADOOP-12902,JavaDocs for SignerSecretProvider are out-of-d...,The Javadocs in {{AuthenticationFilter}} say:\...,Test failures are likely unrelated. Thanks [~g...,0,1,Non-EH-Bug,JavaDocs for SignerSecretProvider are out-of-d...
10372,YARN-6050,AMs can't be scheduled on racks or nodes,Yarn itself supports rack/node aware schedulin...,"Thanks [~rkanter], thanks will be very useful!...",0,0,Non-EH-Bug,AMs can't be scheduled on racks or nodes Yarn ...


In [25]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) # The model_name needs to match our pre-trained model.

In [26]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def cap_number(x):
    if x > 1:
      return 1
    elif x < 0:
      return 0
    else:
      return x

def compute_metrics(pred):
    labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    outputs = pred.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

In [27]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


In [28]:
training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=100,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='/content/results',          # output directory
    logging_dir='/content/logs',            # directory for storing logs
    logging_steps=150,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)



In [29]:
unique_labels = {'EH-Bug', 'Non-EH-Bug'}
label2id = {'Non-EH-Bug': 0, 'EH-Bug': 1}
id2label = {0: 'Non-EH-Bug', 1: 'EH-Bug'}


In [30]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = dataset['SummaryDescriptionComments'], dataset['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

In [31]:
X

0        Datanode daemon cannot be started on OS X Data...
1        [UI2] New YARN UI webapp does not respect curr...
5        Error log level in ShortCircuitRegistry#remove...
6        License error in TestLoadBalancingKMSClientPro...
8        Supress logging of full stack trace of quota a...
                               ...                        
10369    dead node count / listing is very broken in JM...
10370    IPC client  bug may cause rpc call hang I can ...
10371    JavaDocs for SignerSecretProvider are out-of-d...
10372    AMs can't be scheduled on racks or nodes Yarn ...
10373    CombineFileInputFormat node input split can be...
Name: SummaryDescriptionComments, Length: 7100, dtype: object

In [32]:
y

0        Non-EH-Bug
1        Non-EH-Bug
5        Non-EH-Bug
6        Non-EH-Bug
8            EH-Bug
            ...    
10369    Non-EH-Bug
10370        EH-Bug
10371    Non-EH-Bug
10372    Non-EH-Bug
10373    Non-EH-Bug
Name: Textual_Type, Length: 7100, dtype: object

In [33]:
skf.split(X, y)

<generator object _BaseKFold.split at 0x7e3ebb8cdd20>

In [34]:
print(f"Running in {device_name}")

Running in cuda


In [35]:
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i+1}: Train Size {len(train_index)} | Test Size {len(test_index)}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    X_train = [str(i) for i in X_train]
    X_test = [str(i) for i in X_test]

    unique_labels = set(label for label in y_train)
    label2id = {label: id for id, label in enumerate(unique_labels)}
    id2label = {id: label for label, id in label2id.items()}

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    test_encodings  = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    train_dataset = MyDataset(train_encodings, train_labels_encoded)
    test_dataset = MyDataset(test_encodings, test_labels_encoded)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device_name)
    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics
    )
    trainer.train()
    trainer.evaluate()
    predicted_results = trainer.predict(test_dataset)
    outputs = predicted_results.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)

    # roc_auc_score(test_labels_encoded, probas)
    folds[i] = {}
    folds[i]['pre'] = precision_score(test_labels_encoded, preds)
    folds[i]['rec'] = recall_score(test_labels_encoded, preds)
    folds[i]['acc'] = accuracy_score(test_labels_encoded, preds)
    folds[i]['auc'] = roc_auc_score(test_labels_encoded, probas)
    folds[i]['f1'] = f1_score(test_labels_encoded, preds)

    print(f"Fold {i+1}=> PRE: {folds[i]['pre']}; REC: {folds[i]['rec']}; ACC: {folds[i]['acc']}; F1S: {folds[i]['f1']}; AUC: {folds[i]['auc']}")



Fold 1: Train Size 5680 | Test Size 1420


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
150,0.1991,0.089756,0.882394
300,0.1007,0.123209,0.869718
450,0.088,0.077085,0.896479
600,0.0781,0.077696,0.893662
750,0.0674,0.078585,0.900704
900,0.0464,0.080096,0.897887
1050,0.0522,0.072561,0.908451


Fold 1=> PRE: 0.9365079365079365; REC: 0.9577922077922078; ACC: 0.9070422535211268; F1S: 0.9470304975922954; AUC: 0.8937594984802432
Fold 2: Train Size 5680 | Test Size 1420


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
150,0.1649,0.123081,0.857746
300,0.0924,0.08612,0.903521
450,0.0779,0.090973,0.892958
600,0.0727,0.075074,0.902113
750,0.0587,0.066132,0.914789
900,0.0461,0.078302,0.902113
1050,0.0394,0.075198,0.907746


Fold 2=> PRE: 0.9457050243111832; REC: 0.9472402597402597; ACC: 0.9070422535211268; F1S: 0.9464720194647203; AUC: 0.9029125794418349
Fold 3: Train Size 5680 | Test Size 1420


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
150,0.1526,0.101201,0.871127
300,0.0995,0.086433,0.880282
450,0.0763,0.082684,0.888732
600,0.0699,0.093133,0.878169
750,0.0643,0.086355,0.885915
900,0.0485,0.085007,0.893662
1050,0.0442,0.080723,0.903521


Fold 3=> PRE: 0.9208301306687163; REC: 0.9731925264012997; ACC: 0.9042253521126761; F1S: 0.9462875197472354; AUC: 0.8736562952647435
Fold 4: Train Size 5680 | Test Size 1420


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
150,0.1566,0.094925,0.86831
300,0.1015,0.078878,0.89507
450,0.0726,0.079964,0.9
600,0.0729,0.078314,0.898592
750,0.064,0.078409,0.912676
900,0.042,0.081221,0.90493
1050,0.0444,0.073526,0.907042


Fold 4=> PRE: 0.9311962470680218; REC: 0.9675060926076361; ACC: 0.9098591549295775; F1S: 0.9490039840637451; AUC: 0.8907779196162624
Fold 5: Train Size 5680 | Test Size 1420


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
150,0.1627,0.087875,0.901408
300,0.0942,0.097997,0.890845
450,0.0775,0.077234,0.906338
600,0.0806,0.081097,0.903521
750,0.0601,0.071867,0.910563
900,0.0479,0.073958,0.907042
1050,0.0406,0.073218,0.908451


Fold 5=> PRE: 0.936608557844691; REC: 0.9601949634443542; ACC: 0.9091549295774648; F1S: 0.9482551143200963; AUC: 0.8904770501033701
