<a href="https://colab.research.google.com/github/armandossrecife/mysentimentanalysis/blob/main/my_automatic_inspection_issues.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-requirements

## Dependencies

### Install dependencies

- datasets from Hugging Face
- transformers Hugging Face
- torch
- accelerate
- ntlk

In [1]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatib

In [2]:
!pip -q install transformers[torch]

In [3]:
!pip -q install accelerate -U

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/315.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!pip -q install nltk

### Import dependencies


- torch
- pandas
- numpy
- transformers
- sklearn
- datasets
- json
- string
- nltk

In [5]:
import torch
import pandas as pd
import numpy as np

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from datasets import load_dataset
import json

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from urllib.parse import urlparse

### NLTK Dependencies

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Support Functions

In [8]:
def truncate_string(text, max_length=100, add_ellipsis=True):
  if len(text) <= max_length:
    return text

  truncated_text = text[:max_length]

  if add_ellipsis:
    truncated_text += "..."

  return truncated_text

def to_lowercase(text):
  return text.lower()

def remove_hyperlinks(text):
  tokens = nltk.word_tokenize(text)
  filtered_tokens = [token for token in tokens if not urlparse(token).scheme]
  return ' '.join(filtered_tokens)

def remove_punctuation(text):
  translator = str.maketrans('', '', string.punctuation)
  return text.translate(translator)

def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return ' '.join(filtered_words)

def preprocess_text(text):
  text = to_lowercase(text)
  text = remove_hyperlinks(text)
  #text = remove_punctuation(text)
  text = remove_stopwords(text)
  return text

In [9]:
def transform_dataset(coluna_uteis, minhas_colunas, df_x_issues_inspected):
	df_x_issues_inspected = df_x_issues_inspected[coluna_uteis]
	df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)
	df_x_issues_inspected['Textual_Type'] = 'AI_Yes'
	df_x_issues_inspected.loc[df_x_issues_inspected['label']==0, 'Textual_Type'] = 'AI_No'
	df_x_issues_inspected['SummaryDescriptionComments']= df_x_issues_inspected.apply(lambda row: row['summary'] + ' ' + row['description'] + ' ' + row['comments'],axis=1).values
	df_x_issues_inspected['processed_text'] = df_x_issues_inspected['SummaryDescriptionComments'].apply(preprocess_text)
	df_x_issues_inspected = df_x_issues_inspected[minhas_colunas]

	return df_x_issues_inspected

def perform_tests_in_model(X,y, label2id):
  list_my_metrics = list()
  for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i+1}: Train Size {len(train_index)} | Test Size {len(test_index)}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    X_train = [str(i) for i in X_train]
    X_test = [str(i) for i in X_test]

    unique_labels = set(label for label in y_train)
    label2id = {label: id for id, label in enumerate(unique_labels)}
    id2label = {id: label for label, id in label2id.items()}
    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    test_encodings  = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    train_dataset = MyDataset(train_encodings, train_labels_encoded)
    test_dataset = MyDataset(test_encodings, test_labels_encoded)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device_name)
    trainer = Trainer(
	      model=model,
	      args=training_args,
	      train_dataset=train_dataset,
	      eval_dataset=test_dataset,
	      compute_metrics=compute_metrics
	    )
    trainer.train()
    trainer.evaluate()

    predicted_results = trainer.predict(test_dataset)
    outputs = predicted_results.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)

    # roc_auc_score(test_labels_encoded, probas)
    folds[i] = {}
    folds[i]['pre'] = precision_score(test_labels_encoded, preds)
    folds[i]['rec'] = recall_score(test_labels_encoded, preds)
    folds[i]['acc'] = accuracy_score(test_labels_encoded, preds)
    folds[i]['auc'] = roc_auc_score(test_labels_encoded, probas)
    folds[i]['f1'] = f1_score(test_labels_encoded, preds)

    print(f"Fold {i+1}=> PRE: {folds[i]['pre']}; REC: {folds[i]['rec']}; ACC: {folds[i]['acc']}; F1S: {folds[i]['f1']}; AUC: {folds[i]['auc']}")
    item_metric = {'Fold':i+1, 'PRE':folds[i]['pre'], 'REC':folds[i]['rec'], 'ACC':folds[i]['acc'], 'F1S':folds[i]['f1'], 'AUC':folds[i]['auc']}
    list_my_metrics.append(item_metric)

  return list_my_metrics

In [10]:
coluna_uteis = ['issue_key', 'summary', 'description', 'comments', 'architectural_impact']

minhas_colunas = ['issue_key', 'SummaryDescriptionComments', 'processed_text', 'Textual_Type']

# Set up the environment for the AI ​​model

## Initial Setup

In [11]:
model_name = 'distilbert-base-uncased'
device_name = 'cuda'
#device_name = 'cpu'
max_length = 512
cached_model_directory_name = 'distilbert-ehbugs'

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [13]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Support class to handle the dataset

In [14]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def cap_number(x):
    if x > 1:
      return 1
    elif x < 0:
      return 0
    else:
      return x

def compute_metrics(pred):
    labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    outputs = pred.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

## Cleaning directories

Cleaning directories related to results and **logs**

In [15]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Setup the arguments for training

In [16]:
training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=100,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='/content/results',   # output directory
    logging_dir='/content/logs',     # directory for storing logs
    logging_steps=150,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)



In [17]:
unique_labels = {'AI_Yes', 'AI_No'}
label2id = {'AI_No': 0, 'AI_Yes': 1}
id2label = {0: 'AI_No', 1: 'AI_Yes'}

# A) Testes de Inspeção de Issues do Cassandra

## My Hugging Face Dataset

https://huggingface.co/datasets/armandoufpi/cassandraissuesgroundtruth


In [18]:
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
df_treino = pd.read_json("hf://datasets/armandoufpi/cassandraissuesgroundtruth/" + splits["train"])
df_teste = pd.read_json("hf://datasets/armandoufpi/cassandraissuesgroundtruth/" + splits["test"])

In [19]:
dataset = pd.concat([df_treino, df_teste], axis=0)
df_cassandra_issues_inspected = dataset[['issue_key', 'summary', 'description', 'comments_text', 'architectural_impact']]
df_cassandra_issues_inspected.rename(columns={'comments_text': 'comments'}, inplace=True)

df_cassandra_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_cassandra_issues_inspected)
df_cassandra_issues_inspected.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cassandra_issues_inspected.rename(columns={'comments_text': 'comments'}, inplace=True)


Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot...",AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...,AI_No
2,CASSANDRA-5426,Redesign repair messages Many people have been...,redesign repair messages many people reporting...,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...,AI_Yes


## Create a StratifiedKFold

Stratified K-Fold cross-validator.

Provides train/test indices to split data in train/test sets.

This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.

In [20]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_cassandra_issues_inspected['processed_text'], df_cassandra_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

In [21]:
X

Unnamed: 0,processed_text
0,"encryptionoptions instantiated title says , ot..."
1,log writing many tombstones partition log writ...
2,redesign repair messages many people reporting...
3,system.peers.tokens empty node restart using 2...
4,sstablesinbounds might actually give sstables ...
...,...
21,duplicate rows returned clause repeated values...
22,examine shortening path length post-5202 cassa...
23,throw error auto_bootstrap : true bootstrappin...
24,support wrapped range queries want support sca...


In [22]:
y

Unnamed: 0,Textual_Type
0,AI_No
1,AI_No
2,AI_Yes
3,AI_No
4,AI_Yes
...,...
21,AI_No
22,AI_No
23,AI_No
24,AI_No


In [23]:
skf.split(X, y)

<generator object _BaseKFold.split at 0x7b87c6906500>

In [24]:
print(f"Running in {device_name}")

Running in cuda


## Train and Evaluate the Model

In [25]:
list_my_metrics_cassandra = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 180 | Test Size 46


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 1=> PRE: 0.0; REC: 0.0; ACC: 0.6739130434782609; F1S: 0.0; AUC: 0.8451612903225807
Fold 2: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 2=> PRE: 0.0; REC: 0.0; ACC: 0.6888888888888889; F1S: 0.0; AUC: 0.8018433179723502
Fold 3: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 1.0; REC: 0.06666666666666667; ACC: 0.6888888888888889; F1S: 0.125; AUC: 0.6888888888888889
Fold 4: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 4=> PRE: 0.0; REC: 0.0; ACC: 0.6666666666666666; F1S: 0.0; AUC: 0.6977777777777777
Fold 5: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.3333333333333333; REC: 0.06666666666666667; ACC: 0.6444444444444445; F1S: 0.1111111111111111; AUC: 0.54


## Cassandra issues results

In [26]:
df_my_metrics_cassandra_issues = pd.DataFrame(list_my_metrics_cassandra)
df_my_metrics_cassandra_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.0,0.0,0.673913,0.0,0.845161
1,2,0.0,0.0,0.688889,0.0,0.801843
2,3,1.0,0.066667,0.688889,0.125,0.688889
3,4,0.0,0.0,0.666667,0.0,0.697778
4,5,0.333333,0.066667,0.644444,0.111111,0.54


# B) Performing tests with Kafka issues

## Original dataset

In [27]:
!wget https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/kafka_issues_inspected.xlsx

--2024-08-26 22:01:03--  https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/kafka_issues_inspected.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147840 (144K) [application/octet-stream]
Saving to: ‘kafka_issues_inspected.xlsx’


2024-08-26 22:01:03 (77.6 MB/s) - ‘kafka_issues_inspected.xlsx’ saved [147840/147840]



In [28]:
df_kafka_issues_inspected = pd.read_excel('kafka_issues_inspected.xlsx')
df_kafka_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_kafka_issues_inspected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['Textual_Type'] = 'AI_Yes'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['SummaryDescrip

In [29]:
df_kafka_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,KAFKA-1253,Implement compression in new producer \n It...,implement compression new producer seems incre...,AI_Yes
1,KAFKA-1250,Add slf4j logging to new producer Currently t...,add slf4j logging new producer currently loggi...,AI_No
2,KAFKA-1498,new producer performance and bug improvements ...,new producer performance bug improvements seen...,AI_No
3,KAFKA-2313,javadoc fix for KafkaConsumer deserialization ...,javadoc fix kafkaconsumer deserialization kafk...,AI_No
4,KAFKA-2123,Make new consumer offset commit API use callba...,make new consumer offset commit api use callba...,AI_Yes


## Cleaning directories

In [30]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [31]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_kafka_issues_inspected['processed_text'], df_kafka_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

print(f"Running in {device_name}")

Running in cuda


## Performing tests in the model

In [32]:
list_my_metrics_kafka = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 1=> PRE: 0.0; REC: 0.0; ACC: 0.5833333333333334; F1S: 0.0; AUC: 0.7492063492063492
Fold 2: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.5454545454545454; REC: 0.4; ACC: 0.6111111111111112; F1S: 0.4615384615384615; AUC: 0.6
Fold 3: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.5; REC: 0.07142857142857142; ACC: 0.6111111111111112; F1S: 0.125; AUC: 0.6168831168831169
Fold 4: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.42857142857142855; REC: 0.21428571428571427; ACC: 0.5833333333333334; F1S: 0.2857142857142857; AUC: 0.5974025974025975
Fold 5: Train Size 144 | Test Size 35


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.0; REC: 0.0; ACC: 0.6; F1S: 0.0; AUC: 0.7482993197278911


  _warn_prf(average, modifier, msg_start, len(result))


## Kafka issues results

In [33]:
df_my_metrics_kafka_issues = pd.DataFrame(list_my_metrics_kafka)
df_my_metrics_kafka_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.0,0.0,0.583333,0.0,0.749206
1,2,0.545455,0.4,0.611111,0.461538,0.6
2,3,0.5,0.071429,0.611111,0.125,0.616883
3,4,0.428571,0.214286,0.583333,0.285714,0.597403
4,5,0.0,0.0,0.6,0.0,0.748299


# C) Testes de Inspeção de Issues do Hadoop

## Original Dataset

In [34]:
!wget https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/hadoop_issues_inspected.xlsx

--2024-08-26 22:03:34--  https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/hadoop_issues_inspected.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 137652 (134K) [application/octet-stream]
Saving to: ‘hadoop_issues_inspected.xlsx’


2024-08-26 22:03:34 (56.1 MB/s) - ‘hadoop_issues_inspected.xlsx’ saved [137652/137652]



In [35]:
df_hadoop_issues_inspected = pd.read_excel('hadoop_issues_inspected.xlsx')
df_hadoop_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_hadoop_issues_inspected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['Textual_Type'] = 'AI_Yes'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['SummaryDescrip

In [36]:
df_hadoop_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,HADOOP-6252,Provide method to determine if a deprecated ke...,provide method determine deprecated key set co...,AI_No
1,HADOOP-6184,Provide a configuration dump in json format. ...,provide configuration dump json format . confi...,AI_Yes
2,HADOOP-6165,Add metadata to Serializations The Serializat...,add metadata serializations serialization fram...,AI_Yes
3,HADOOP-6161,Add get/setEnum to Configuration It would be ...,add get/setenum configuration would useful con...,AI_No
4,HADOOP-6103,Configuration clone constructor does not clone...,configuration clone constructor clone members ...,AI_No


## Cleaning diretories

In [37]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [38]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_hadoop_issues_inspected['processed_text'], df_hadoop_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

<generator object _BaseKFold.split at 0x7b879526e730>

## Performing tests in the model


In [39]:
list_my_metrics = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 118 | Test Size 30


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 1=> PRE: 0.0; REC: 0.0; ACC: 0.6666666666666666; F1S: 0.0; AUC: 0.75
Fold 2: Train Size 118 | Test Size 30


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 2=> PRE: 0.0; REC: 0.0; ACC: 0.6666666666666666; F1S: 0.0; AUC: 0.61
Fold 3: Train Size 118 | Test Size 30


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


  _warn_prf(average, modifier, msg_start, len(result))


Fold 3=> PRE: 0.0; REC: 0.0; ACC: 0.6333333333333333; F1S: 0.0; AUC: 0.7799043062200958
Fold 4: Train Size 119 | Test Size 29


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.8; REC: 0.4; ACC: 0.7586206896551724; F1S: 0.5333333333333333; AUC: 0.731578947368421
Fold 5: Train Size 119 | Test Size 29


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.42857142857142855; REC: 0.3; ACC: 0.6206896551724138; F1S: 0.3529411764705882; AUC: 0.5526315789473685


## Hadoop issues results


In [40]:
df_my_metrics_hadoop_issues = pd.DataFrame(list_my_metrics)
df_my_metrics_hadoop_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.0,0.0,0.666667,0.0,0.75
1,2,0.0,0.0,0.666667,0.0,0.61
2,3,0.0,0.0,0.633333,0.0,0.779904
3,4,0.8,0.4,0.758621,0.533333,0.731579
4,5,0.428571,0.3,0.62069,0.352941,0.552632


# D) Testes de Inspeção de Issues do ActiveMQ

## Original Dataset

In [41]:
!wget https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/activemq_issues_inspected.xlsx

--2024-08-26 22:06:36--  https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/activemq_issues_inspected.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93147 (91K) [application/octet-stream]
Saving to: ‘activemq_issues_inspected.xlsx’


2024-08-26 22:06:36 (37.7 MB/s) - ‘activemq_issues_inspected.xlsx’ saved [93147/93147]



In [42]:
df_activemq_issues_inspected = pd.read_excel('activemq_issues_inspected.xlsx')
df_activemq_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_activemq_issues_inspected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)


In [43]:
df_activemq_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,AMQ-2149,Shared Filesystem Master Slave: missing messag...,shared filesystem master slave : missing messa...,AI_Yes
1,AMQ-2128,close() from MessageListener.onMessage() with ...,close ( ) messagelistener.onmessage ( ) auto d...,AI_No
2,AMQ-2106,Allow broker to evenly distribute message grou...,allow broker evenly distribute message groups ...,AI_Yes
3,AMQ-2103,Memory leak when marshaling ActiveMQTextMessag...,memory leak marshaling activemqtextmessage per...,AI_No
4,AMQ-2087,Redelivery after rollback does not seem to wor...,redelivery rollback seem work well test case r...,AI_No


## Cleaning diretories

In [44]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [45]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_activemq_issues_inspected['processed_text'], df_activemq_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

<generator object _BaseKFold.split at 0x7b879526ee30>

## Performing tests in model

In [46]:
list_my_metrics_activemq = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 105 | Test Size 27


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.42105263157894735; REC: 0.7272727272727273; ACC: 0.48148148148148145; F1S: 0.5333333333333333; AUC: 0.5056818181818181
Fold 2: Train Size 105 | Test Size 27


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.3888888888888889; REC: 0.6363636363636364; ACC: 0.4444444444444444; F1S: 0.4827586206896552; AUC: 0.40909090909090906
Fold 3: Train Size 106 | Test Size 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.3333333333333333; REC: 0.45454545454545453; ACC: 0.38461538461538464; F1S: 0.3846153846153846; AUC: 0.3939393939393939
Fold 4: Train Size 106 | Test Size 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.4166666666666667; REC: 0.45454545454545453; ACC: 0.5; F1S: 0.43478260869565216; AUC: 0.4727272727272728
Fold 5: Train Size 106 | Test Size 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.4166666666666667; REC: 0.45454545454545453; ACC: 0.5; F1S: 0.43478260869565216; AUC: 0.5636363636363636


## ActiveMQ issues results

In [47]:
df_my_metrics_activemq_issues = pd.DataFrame(list_my_metrics_activemq)
df_my_metrics_activemq_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.421053,0.727273,0.481481,0.533333,0.505682
1,2,0.388889,0.636364,0.444444,0.482759,0.409091
2,3,0.333333,0.454545,0.384615,0.384615,0.393939
3,4,0.416667,0.454545,0.5,0.434783,0.472727
4,5,0.416667,0.454545,0.5,0.434783,0.563636


# E) Performing tests with Kafka, Hadoop and ActiveMQ

## Dataset

In [48]:
dataset = pd.concat([df_treino, df_teste], axis=0)
df_cassandra_issues_inspected = dataset[['issue_key', 'summary', 'description', 'comments_text', 'architectural_impact']]
df_cassandra_issues_inspected.rename(columns={'comments_text': 'comments'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cassandra_issues_inspected.rename(columns={'comments_text': 'comments'}, inplace=True)


In [49]:
df_cassandra_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_cassandra_issues_inspected)
df_cassandra_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot...",AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...,AI_No
2,CASSANDRA-5426,Redesign repair messages Many people have been...,redesign repair messages many people reporting...,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...,AI_Yes


## Faz o merge dos datases

Cassandra, Kafka, Hadoop e ActiveMQ

In [50]:
df_all_issues_inspected = pd.concat([df_cassandra_issues_inspected,df_kafka_issues_inspected, df_hadoop_issues_inspected, df_activemq_issues_inspected], axis=0)
df_all_issues_inspected


Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot...",AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...,AI_No
2,CASSANDRA-5426,Redesign repair messages Many people have been...,redesign repair messages many people reporting...,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...,AI_Yes
...,...,...,...,...
127,AMQ-8413,Support different username and password for re...,support different username password remote bro...,AI_No
128,AMQ-9202,Reentrant locks should always be locked outsid...,reentrant locks always locked outside try bloc...,AI_No
129,AMQ-9157,Add a new advisory type for dispatched message...,add new advisory type dispatched messages use ...,AI_No
130,AMQ-9153,Fix Slow Consumer Advisory for Queue subscript...,fix slow consumer advisory queue subscriptions...,AI_No


## Cleaning diretories

In [51]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [52]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_all_issues_inspected['processed_text'], df_all_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

<generator object _BaseKFold.split at 0x7b879526f370>

## Performing tests in the model

In [53]:
list_my_metrics_all_repositories = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.4807692307692308; REC: 0.5; ACC: 0.6204379562043796; F1S: 0.49019607843137253; AUC: 0.632183908045977
Fold 2: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.7777777777777778; REC: 0.14; ACC: 0.6715328467153284; F1S: 0.23728813559322035; AUC: 0.6374712643678161
Fold 3: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.5625; REC: 0.18; ACC: 0.6496350364963503; F1S: 0.2727272727272727; AUC: 0.6271264367816092
Fold 4: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.5; REC: 0.6470588235294118; ACC: 0.6277372262773723; F1S: 0.5641025641025642; AUC: 0.6790925672594619
Fold 5: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.5; REC: 0.5686274509803921; ACC: 0.6277372262773723; F1S: 0.5321100917431193; AUC: 0.6657546739626082


## All issues merged results

In [54]:
df_my_metrics_all_issues = pd.DataFrame(list_my_metrics_all_repositories)
df_my_metrics_all_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.480769,0.5,0.620438,0.490196,0.632184
1,2,0.777778,0.14,0.671533,0.237288,0.637471
2,3,0.5625,0.18,0.649635,0.272727,0.627126
3,4,0.5,0.647059,0.627737,0.564103,0.679093
4,5,0.5,0.568627,0.627737,0.53211,0.665755


# Summary

## Cassandra Issues Results

In [55]:
df_my_metrics_cassandra_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.0,0.0,0.673913,0.0,0.845161
1,2,0.0,0.0,0.688889,0.0,0.801843
2,3,1.0,0.066667,0.688889,0.125,0.688889
3,4,0.0,0.0,0.666667,0.0,0.697778
4,5,0.333333,0.066667,0.644444,0.111111,0.54


## Kafka Issues Results

In [56]:
df_my_metrics_kafka_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.0,0.0,0.583333,0.0,0.749206
1,2,0.545455,0.4,0.611111,0.461538,0.6
2,3,0.5,0.071429,0.611111,0.125,0.616883
3,4,0.428571,0.214286,0.583333,0.285714,0.597403
4,5,0.0,0.0,0.6,0.0,0.748299


## Hadoop Issues Results


In [57]:
df_my_metrics_hadoop_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.0,0.0,0.666667,0.0,0.75
1,2,0.0,0.0,0.666667,0.0,0.61
2,3,0.0,0.0,0.633333,0.0,0.779904
3,4,0.8,0.4,0.758621,0.533333,0.731579
4,5,0.428571,0.3,0.62069,0.352941,0.552632


## ActiveMQ Issues Results

In [58]:
df_my_metrics_activemq_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.421053,0.727273,0.481481,0.533333,0.505682
1,2,0.388889,0.636364,0.444444,0.482759,0.409091
2,3,0.333333,0.454545,0.384615,0.384615,0.393939
3,4,0.416667,0.454545,0.5,0.434783,0.472727
4,5,0.416667,0.454545,0.5,0.434783,0.563636


## All Repositories Issues Resutls

In [59]:
df_my_metrics_all_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.480769,0.5,0.620438,0.490196,0.632184
1,2,0.777778,0.14,0.671533,0.237288,0.637471
2,3,0.5625,0.18,0.649635,0.272727,0.627126
3,4,0.5,0.647059,0.627737,0.564103,0.679093
4,5,0.5,0.568627,0.627737,0.53211,0.665755
