<a href="https://colab.research.google.com/github/armandossrecife/mysentimentanalysis/blob/main/my_automatic_inspection_issues.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A) Testes de Inspeção de Issues do Cassandra

## Dependencies

### Install dependencies

- datasets from Hugging Face
- transformers Hugging Face
- torch
- accelerate
- ntlk

In [1]:
!pip -q install datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/527.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m522.2/527.3 kB[0m [31m23.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

In [2]:
!pip -q install transformers[torch]

In [3]:
!pip -q install accelerate -U

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/315.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!pip -q install nltk

### Import dependencies


- torch
- pandas
- numpy
- transformers
- sklearn
- datasets
- json
- string
- nltk

In [5]:
import torch
import pandas as pd
import numpy as np

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from datasets import load_dataset
import json

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from urllib.parse import urlparse

### NLTK Dependencies

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Support Functions

In [8]:
def truncate_string(text, max_length=100, add_ellipsis=True):
  if len(text) <= max_length:
    return text

  truncated_text = text[:max_length]

  if add_ellipsis:
    truncated_text += "..."

  return truncated_text

def to_lowercase(text):
  return text.lower()

def remove_hyperlinks(text):
  tokens = nltk.word_tokenize(text)
  filtered_tokens = [token for token in tokens if not urlparse(token).scheme]
  return ' '.join(filtered_tokens)

def remove_punctuation(text):
  translator = str.maketrans('', '', string.punctuation)
  return text.translate(translator)

def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  words = text.split()
  filtered_words = [word for word in words if word not in stop_words]
  return ' '.join(filtered_words)

def preprocess_text(text):
  text = to_lowercase(text)
  text = remove_hyperlinks(text)
  #text = remove_punctuation(text)
  text = remove_stopwords(text)
  return text

In [9]:
coluna_uteis = ['issue_key', 'summary', 'description', 'comments', 'architectural_impact']

## My Hugging Face Dataset

https://huggingface.co/datasets/armandoufpi/cassandraissuesgroundtruth


In [10]:
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
df_treino = pd.read_json("hf://datasets/armandoufpi/cassandraissuesgroundtruth/" + splits["train"])
df_teste = pd.read_json("hf://datasets/armandoufpi/cassandraissuesgroundtruth/" + splits["test"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Carrega o dataset e faz os devidos processamentos (transformações)

In [11]:
dataset = pd.concat([df_treino, df_teste], axis=0)
dataset['Textual_Type'] = 'AI_Yes'
dataset.loc[dataset['label']==0, 'Textual_Type'] = 'AI_No'
dataset['SummaryDescriptionComments']= dataset.apply(lambda row: row['summary'] + ' ' + row['description'] + ' ' + row['comments_text'],axis=1).values
dataset['processed_text'] = dataset['SummaryDescriptionComments'].apply(preprocess_text)

In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226 entries, 0 to 25
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   issue_key                   226 non-null    object
 1   summary                     226 non-null    object
 2   issue_type                  226 non-null    object
 3   issue_status                226 non-null    object
 4   issue_priority              226 non-null    object
 5   description                 226 non-null    object
 6   comments                    226 non-null    object
 7   architectural_impact        226 non-null    object
 8   comments_text               226 non-null    object
 9   label                       226 non-null    int64 
 10  label_text                  226 non-null    object
 11  Textual_Type                226 non-null    object
 12  SummaryDescriptionComments  226 non-null    object
 13  processed_text              226 non-null    object
dtype

In [13]:
dataset.head()

Unnamed: 0,issue_key,summary,issue_type,issue_status,issue_priority,description,comments,architectural_impact,comments_text,label,label_text,Textual_Type,SummaryDescriptionComments,processed_text
0,CASSANDRA-3489,EncryptionOptions should be instantiated,Bug,Resolved,Low,"As the title says, otherwise you get an NPE wh...","['There\'s a bunch of ""if encryption options i...",NO,"There\'s a bunch of ""if encryption options is ...",0,negative,AI_No,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot..."
1,CASSANDRA-16780,Log when writing many tombstones to a partition,Improvement,Resolved,Normal,Log when writing many tombstones to a partitio...,['https://github.com/krummas/cassandra/commits...,NO,https://github.com/krummas/cassandra/commits/m...,0,negative,AI_No,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...
2,CASSANDRA-5426,Redesign repair messages,Improvement,Resolved,Low,Many people have been reporting 'repair hang' ...,['Work in progress is pushed to: https://githu...,YES,https://github.com/yukim/cassandra/commits/542...,1,positive,AI_Yes,Redesign repair messages Many people have been...,redesign repair messages many people reporting...
3,CASSANDRA-5121,system.peers.tokens is empty after node restart,Bug,Resolved,Low,Using a 2 nodes fresh cluster (127.0.0.1 & 127...,"['In StorageService.handleStateNormal, when we...",NO,removeEndpoint should be used instead\n [ju...,0,negative,AI_No,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,Bug,Resolved,Normal,Same problem as with CASSANDRA-11886 - if we t...,['https://github.com/krummas/cassandra/commits...,YES,https://github.com/krummas/cassandra/commits/m...,1,positive,AI_Yes,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...


### Atributos chaves

In [14]:
dataset[['issue_key', 'summary', 'description', 'comments_text', 'label', 'Textual_Type']]

Unnamed: 0,issue_key,summary,description,comments_text,label,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated,"As the title says, otherwise you get an NPE wh...","There\'s a bunch of ""if encryption options is ...",0,AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partition,Log when writing many tombstones to a partitio...,https://github.com/krummas/cassandra/commits/m...,0,AI_No
2,CASSANDRA-5426,Redesign repair messages,Many people have been reporting 'repair hang' ...,https://github.com/yukim/cassandra/commits/542...,1,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restart,Using a 2 nodes fresh cluster (127.0.0.1 & 127...,removeEndpoint should be used instead\n [ju...,0,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,Same problem as with CASSANDRA-11886 - if we t...,https://github.com/krummas/cassandra/commits/m...,1,AI_Yes
...,...,...,...,...,...,...
21,CASSANDRA-6706,Duplicate rows returned when in clause has rep...,If a value is repeated within an IN clause the...,"[""That is kind of the intended behavior. Is it...",0,AI_No
22,CASSANDRA-6962,examine shortening path length post-5202,From CASSANDRA-5202 discussion:\n\n{quote}\nDi...,feels pretty error prone. What about keeping t...,0,AI_No
23,CASSANDRA-6972,Throw an ERROR when auto_bootstrap: true and b...,Obviously when this condition exists the node ...,false in their seed configs.' 'Yes the right f...,0,AI_No
24,CASSANDRA-758,support wrapped range queries,we want to support scanning from KeyX to KeyA ...,add wrapped range support + test' '+1 Looks go...,0,AI_No


### Dataset transformado

Dataset que será usado para os testes do Apache Cassanda

In [15]:
minhas_colunas = ['issue_key', 'SummaryDescriptionComments', 'processed_text', 'Textual_Type']
dataset2 = dataset[minhas_colunas]
dataset2.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot...",AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...,AI_No
2,CASSANDRA-5426,Redesign repair messages Many people have been...,redesign repair messages many people reporting...,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...,AI_Yes


In [16]:
dataset2['processed_text'][0]

Unnamed: 0,processed_text
0,"encryptionoptions instantiated title says , ot..."
0,sstablesinbounds might actually give sstables ...


## Set up the environment for the AI ​​model

### Initial Setup

In [17]:
model_name = 'distilbert-base-uncased'
device_name = 'cuda'
#device_name = 'cpu'
max_length = 512
cached_model_directory_name = 'distilbert-ehbugs'

In [18]:
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [19]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

### Support class to handle the dataset

In [20]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def cap_number(x):
    if x > 1:
      return 1
    elif x < 0:
      return 0
    else:
      return x

def compute_metrics(pred):
    labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    outputs = pred.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)
    acc = accuracy_score(labels, preds)
    return {
      'accuracy': acc,
    }

### Cleaning directories

Cleaning directories related to results and **logs**

In [21]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Setup the arguments for training

In [22]:
training_args = TrainingArguments(
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=100,                # number of warmup steps for learning rate scheduler (set lower because of small dataset size)
    weight_decay=0.01,               # strength of weight decay
    output_dir='/content/results',   # output directory
    logging_dir='/content/logs',     # directory for storing logs
    logging_steps=150,               # number of steps to output logging (set lower because of small dataset size)
    evaluation_strategy='steps',     # evaluate during fine-tuning so that we can see progress
)



In [23]:
unique_labels = {'AI_Yes', 'AI_No'}
label2id = {'AI_No': 0, 'AI_Yes': 1}
id2label = {0: 'AI_No', 1: 'AI_Yes'}

## Create a StratifiedKFold

Stratified K-Fold cross-validator.

Provides train/test indices to split data in train/test sets.

This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.

In [24]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = dataset['processed_text'], dataset['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

In [25]:
X

Unnamed: 0,processed_text
0,"encryptionoptions instantiated title says , ot..."
1,log writing many tombstones partition log writ...
2,redesign repair messages many people reporting...
3,system.peers.tokens empty node restart using 2...
4,sstablesinbounds might actually give sstables ...
...,...
21,duplicate rows returned clause repeated values...
22,examine shortening path length post-5202 cassa...
23,throw error auto_bootstrap : true bootstrappin...
24,support wrapped range queries want support sca...


In [26]:
y

Unnamed: 0,Textual_Type
0,AI_No
1,AI_No
2,AI_Yes
3,AI_No
4,AI_Yes
...,...
21,AI_No
22,AI_No
23,AI_No
24,AI_No


In [27]:
skf.split(X, y)

<generator object _BaseKFold.split at 0x78a5258c7840>

In [28]:
print(f"Running in {device_name}")

Running in cuda


## Train and Evaluate the Model

In [29]:
list_my_metrics = list()

In [30]:
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i+1}: Train Size {len(train_index)} | Test Size {len(test_index)}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    X_train = [str(i) for i in X_train]
    X_test = [str(i) for i in X_test]

    unique_labels = set(label for label in y_train)
    label2id = {label: id for id, label in enumerate(unique_labels)}
    id2label = {id: label for label, id in label2id.items()}

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    test_encodings  = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    train_dataset = MyDataset(train_encodings, train_labels_encoded)
    test_dataset = MyDataset(test_encodings, test_labels_encoded)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device_name)
    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics
    )
    trainer.train()
    trainer.evaluate()
    predicted_results = trainer.predict(test_dataset)
    outputs = predicted_results.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)

    # roc_auc_score(test_labels_encoded, probas)
    folds[i] = {}
    folds[i]['pre'] = precision_score(test_labels_encoded, preds)
    folds[i]['rec'] = recall_score(test_labels_encoded, preds)
    folds[i]['acc'] = accuracy_score(test_labels_encoded, preds)
    folds[i]['auc'] = roc_auc_score(test_labels_encoded, probas)
    folds[i]['f1'] = f1_score(test_labels_encoded, preds)

    print(f"Fold {i+1}=> PRE: {folds[i]['pre']}; REC: {folds[i]['rec']}; ACC: {folds[i]['acc']}; F1S: {folds[i]['f1']}; AUC: {folds[i]['auc']}")

    item_metric = {'Fold':i+1, 'PRE':folds[i]['pre'], 'REC':folds[i]['rec'], 'ACC':folds[i]['acc'], 'F1S':folds[i]['f1'], 'AUC':folds[i]['auc']}
    list_my_metrics.append(item_metric)

Fold 1: Train Size 180 | Test Size 46


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.6739130434782609; REC: 1.0; ACC: 0.6739130434782609; F1S: 0.8051948051948052; AUC: 0.7698924731182796
Fold 2: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.6888888888888889; REC: 1.0; ACC: 0.6888888888888889; F1S: 0.8157894736842105; AUC: 0.7373271889400921
Fold 3: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.6666666666666666; REC: 1.0; ACC: 0.6666666666666666; F1S: 0.8; AUC: 0.6599999999999999
Fold 4: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.6818181818181818; REC: 1.0; ACC: 0.6888888888888889; F1S: 0.8108108108108109; AUC: 0.7333333333333334
Fold 5: Train Size 181 | Test Size 45


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.6590909090909091; REC: 0.9666666666666667; ACC: 0.6444444444444445; F1S: 0.7837837837837838; AUC: 0.6511111111111112


In [31]:
df_my_metrics = pd.DataFrame(list_my_metrics)
df_my_metrics

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.673913,1.0,0.673913,0.805195,0.769892
1,2,0.688889,1.0,0.688889,0.815789,0.737327
2,3,0.666667,1.0,0.666667,0.8,0.66
3,4,0.681818,1.0,0.688889,0.810811,0.733333
4,5,0.659091,0.966667,0.644444,0.783784,0.651111


# B) Performing tests with Kafka issues

## Original dataset

In [32]:
!wget https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/kafka_issues_inspected.xlsx

--2024-08-23 15:50:31--  https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/kafka_issues_inspected.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 147840 (144K) [application/octet-stream]
Saving to: ‘kafka_issues_inspected.xlsx’


2024-08-23 15:50:31 (9.50 MB/s) - ‘kafka_issues_inspected.xlsx’ saved [147840/147840]



In [33]:
df_kafka_issues_inspected = pd.read_excel('kafka_issues_inspected.xlsx')

## Column adjustments

In [34]:
coluna_uteis = ['issue_key', 'summary', 'description', 'comments', 'architectural_impact']

In [35]:
df_kafka_issues_inspected = df_kafka_issues_inspected[coluna_uteis]

## Dataset transformations

Transformações necessárias para ajustar o dataset para os testes

In [36]:
df_kafka_issues_inspected['label'] = df_kafka_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)

In [37]:
df_kafka_issues_inspected['Textual_Type'] = 'AI_Yes'
df_kafka_issues_inspected.loc[df_kafka_issues_inspected['label']==0, 'Textual_Type'] = 'AI_No'
df_kafka_issues_inspected['SummaryDescriptionComments']= df_kafka_issues_inspected.apply(lambda row: row['summary'] + ' ' + row['description'] + ' ' + row['comments'],axis=1).values
df_kafka_issues_inspected['processed_text'] = df_kafka_issues_inspected['SummaryDescriptionComments'].apply(preprocess_text)

## Selecting key columns

In [38]:
minhas_colunas = ['issue_key', 'SummaryDescriptionComments', 'processed_text', 'Textual_Type']
df_kafka_issues_inspected = df_kafka_issues_inspected[minhas_colunas]
df_kafka_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,KAFKA-1253,Implement compression in new producer \n It...,implement compression new producer seems incre...,AI_Yes
1,KAFKA-1250,Add slf4j logging to new producer Currently t...,add slf4j logging new producer currently loggi...,AI_No
2,KAFKA-1498,new producer performance and bug improvements ...,new producer performance bug improvements seen...,AI_No
3,KAFKA-2313,javadoc fix for KafkaConsumer deserialization ...,javadoc fix kafkaconsumer deserialization kafk...,AI_No
4,KAFKA-2123,Make new consumer offset commit API use callba...,make new consumer offset commit api use callba...,AI_Yes


## Cleaning directories

In [39]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [40]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_kafka_issues_inspected['processed_text'], df_kafka_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

In [41]:
skf.split(X, y)

print(f"Running in {device_name}")

Running in cuda


## Performing tests in the model

In [42]:
list_my_metrics = list()

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i+1}: Train Size {len(train_index)} | Test Size {len(test_index)}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    X_train = [str(i) for i in X_train]
    X_test = [str(i) for i in X_test]

    unique_labels = set(label for label in y_train)
    label2id = {label: id for id, label in enumerate(unique_labels)}
    id2label = {id: label for label, id in label2id.items()}

    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    test_encodings  = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    train_dataset = MyDataset(train_encodings, train_labels_encoded)
    test_dataset = MyDataset(test_encodings, test_labels_encoded)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device_name)
    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics
    )
    trainer.train()
    trainer.evaluate()
    predicted_results = trainer.predict(test_dataset)
    outputs = predicted_results.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)

    # roc_auc_score(test_labels_encoded, probas)
    folds[i] = {}
    folds[i]['pre'] = precision_score(test_labels_encoded, preds)
    folds[i]['rec'] = recall_score(test_labels_encoded, preds)
    folds[i]['acc'] = accuracy_score(test_labels_encoded, preds)
    folds[i]['auc'] = roc_auc_score(test_labels_encoded, probas)
    folds[i]['f1'] = f1_score(test_labels_encoded, preds)

    print(f"Fold {i+1}=> PRE: {folds[i]['pre']}; REC: {folds[i]['rec']}; ACC: {folds[i]['acc']}; F1S: {folds[i]['f1']}; AUC: {folds[i]['auc']}")

    item_metric = {'Fold':i+1, 'PRE':folds[i]['pre'], 'REC':folds[i]['rec'], 'ACC':folds[i]['acc'], 'F1S':folds[i]['f1'], 'AUC':folds[i]['auc']}
    list_my_metrics.append(item_metric)

Fold 1: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.5714285714285714; REC: 0.9523809523809523; ACC: 0.5555555555555556; F1S: 0.7142857142857142; AUC: 0.5396825396825397
Fold 2: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.5833333333333334; REC: 1.0; ACC: 0.5833333333333334; F1S: 0.7368421052631579; AUC: 0.7333333333333334
Fold 3: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.6111111111111112; REC: 1.0; ACC: 0.6111111111111112; F1S: 0.7586206896551725; AUC: 0.6038961038961039
Fold 4: Train Size 143 | Test Size 36


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.6111111111111112; REC: 1.0; ACC: 0.6111111111111112; F1S: 0.7586206896551725; AUC: 0.487012987012987
Fold 5: Train Size 144 | Test Size 35


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.6; REC: 1.0; ACC: 0.6; F1S: 0.7499999999999999; AUC: 0.5306122448979592


## Kafka issues results

In [43]:
df_my_metrics_kafka_issues = pd.DataFrame(list_my_metrics)
df_my_metrics_kafka_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.571429,0.952381,0.555556,0.714286,0.539683
1,2,0.583333,1.0,0.583333,0.736842,0.733333
2,3,0.611111,1.0,0.611111,0.758621,0.603896
3,4,0.611111,1.0,0.611111,0.758621,0.487013
4,5,0.6,1.0,0.6,0.75,0.530612


# C) Testes de Inspeção de Issues do Hadoop

## Original Dataset

In [44]:
!wget https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/hadoop_issues_inspected.xlsx

--2024-08-23 15:52:46--  https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/hadoop_issues_inspected.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 137652 (134K) [application/octet-stream]
Saving to: ‘hadoop_issues_inspected.xlsx’


2024-08-23 15:52:46 (6.20 MB/s) - ‘hadoop_issues_inspected.xlsx’ saved [137652/137652]



In [45]:
df_hadoop_issues_inspected = pd.read_excel('hadoop_issues_inspected.xlsx')

In [46]:
def transform_dataset(coluna_uteis, minhas_colunas, df_x_issues_inspected):
	df_x_issues_inspected = df_x_issues_inspected[coluna_uteis]
	df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)
	df_x_issues_inspected['Textual_Type'] = 'AI_Yes'
	df_x_issues_inspected.loc[df_x_issues_inspected['label']==0, 'Textual_Type'] = 'AI_No'
	df_x_issues_inspected['SummaryDescriptionComments']= df_x_issues_inspected.apply(lambda row: row['summary'] + ' ' + row['description'] + ' ' + row['comments'],axis=1).values
	df_x_issues_inspected['processed_text'] = df_x_issues_inspected['SummaryDescriptionComments'].apply(preprocess_text)
	df_x_issues_inspected = df_x_issues_inspected[minhas_colunas]

	return df_x_issues_inspected

In [47]:
df_hadoop_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_hadoop_issues_inspected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['Textual_Type'] = 'AI_Yes'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['SummaryDescrip

In [48]:
df_hadoop_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,HADOOP-6252,Provide method to determine if a deprecated ke...,provide method determine deprecated key set co...,AI_No
1,HADOOP-6184,Provide a configuration dump in json format. ...,provide configuration dump json format . confi...,AI_Yes
2,HADOOP-6165,Add metadata to Serializations The Serializat...,add metadata serializations serialization fram...,AI_Yes
3,HADOOP-6161,Add get/setEnum to Configuration It would be ...,add get/setenum configuration would useful con...,AI_No
4,HADOOP-6103,Configuration clone constructor does not clone...,configuration clone constructor clone members ...,AI_No


## Cleaning diretories

In [49]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [50]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_hadoop_issues_inspected['processed_text'], df_hadoop_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

<generator object _BaseKFold.split at 0x78a4edca9380>

## Performing tests in the model


In [51]:
def perform_tests_in_model(X,y, label2id):
  list_my_metrics = list()
  for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i+1}: Train Size {len(train_index)} | Test Size {len(test_index)}")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    X_train = [str(i) for i in X_train]
    X_test = [str(i) for i in X_test]

    unique_labels = set(label for label in y_train)
    label2id = {label: id for id, label in enumerate(unique_labels)}
    id2label = {id: label for label, id in label2id.items()}
    train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
    test_encodings  = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

    train_labels_encoded = [float(label2id[yi]) for yi in y_train]
    test_labels_encoded  = [float(label2id[yi]) for yi in y_test]

    train_dataset = MyDataset(train_encodings, train_labels_encoded)
    test_dataset = MyDataset(test_encodings, test_labels_encoded)

    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device_name)
    trainer = Trainer(
	      model=model,
	      args=training_args,
	      train_dataset=train_dataset,
	      eval_dataset=test_dataset,
	      compute_metrics=compute_metrics
	    )
    trainer.train()
    trainer.evaluate()

    predicted_results = trainer.predict(test_dataset)
    outputs = predicted_results.predictions.flatten().tolist()
    probas = [cap_number(x) for x in outputs]
    preds = np.array(np.array(probas) > 0.5, dtype=int)

    # roc_auc_score(test_labels_encoded, probas)
    folds[i] = {}
    folds[i]['pre'] = precision_score(test_labels_encoded, preds)
    folds[i]['rec'] = recall_score(test_labels_encoded, preds)
    folds[i]['acc'] = accuracy_score(test_labels_encoded, preds)
    folds[i]['auc'] = roc_auc_score(test_labels_encoded, probas)
    folds[i]['f1'] = f1_score(test_labels_encoded, preds)

    print(f"Fold {i+1}=> PRE: {folds[i]['pre']}; REC: {folds[i]['rec']}; ACC: {folds[i]['acc']}; F1S: {folds[i]['f1']}; AUC: {folds[i]['auc']}")
    item_metric = {'Fold':i+1, 'PRE':folds[i]['pre'], 'REC':folds[i]['rec'], 'ACC':folds[i]['acc'], 'F1S':folds[i]['f1'], 'AUC':folds[i]['auc']}
    list_my_metrics.append(item_metric)

  return list_my_metrics

In [52]:
list_my_metrics = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 118 | Test Size 30


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.6666666666666666; REC: 1.0; ACC: 0.6666666666666666; F1S: 0.8; AUC: 0.83
Fold 2: Train Size 118 | Test Size 30


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.6666666666666666; REC: 1.0; ACC: 0.6666666666666666; F1S: 0.8; AUC: 0.535
Fold 3: Train Size 118 | Test Size 30


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.6206896551724138; REC: 0.9473684210526315; ACC: 0.6; F1S: 0.75; AUC: 0.5741626794258373
Fold 4: Train Size 119 | Test Size 29


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.6551724137931034; REC: 1.0; ACC: 0.6551724137931034; F1S: 0.7916666666666666; AUC: 0.7315789473684211
Fold 5: Train Size 119 | Test Size 29


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.6551724137931034; REC: 1.0; ACC: 0.6551724137931034; F1S: 0.7916666666666666; AUC: 0.6157894736842104


## Hadoop issues results


In [53]:
df_my_metrics_hadoop_issues = pd.DataFrame(list_my_metrics)
df_my_metrics_hadoop_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.666667,1.0,0.666667,0.8,0.83
1,2,0.666667,1.0,0.666667,0.8,0.535
2,3,0.62069,0.947368,0.6,0.75,0.574163
3,4,0.655172,1.0,0.655172,0.791667,0.731579
4,5,0.655172,1.0,0.655172,0.791667,0.615789


# D) Testes de Inspeção de Issues do ActiveMQ

## Original Dataset

In [54]:
!wget https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/activemq_issues_inspected.xlsx

df_activemq_issues_inspected = pd.read_excel('activemq_issues_inspected.xlsx')

--2024-08-23 15:55:02--  https://raw.githubusercontent.com/armandossrecife/mysentimentanalysis/main/activemq_issues_inspected.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93147 (91K) [application/octet-stream]
Saving to: ‘activemq_issues_inspected.xlsx’


2024-08-23 15:55:02 (6.85 MB/s) - ‘activemq_issues_inspected.xlsx’ saved [93147/93147]



In [55]:
df_activemq_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_activemq_issues_inspected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_x_issues_inspected['label'] = df_x_issues_inspected['architectural_impact'].apply(lambda x: 1 if x == 'YES' else 0)


In [56]:
df_activemq_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,AMQ-2149,Shared Filesystem Master Slave: missing messag...,shared filesystem master slave : missing messa...,AI_Yes
1,AMQ-2128,close() from MessageListener.onMessage() with ...,close ( ) messagelistener.onmessage ( ) auto d...,AI_No
2,AMQ-2106,Allow broker to evenly distribute message grou...,allow broker evenly distribute message groups ...,AI_Yes
3,AMQ-2103,Memory leak when marshaling ActiveMQTextMessag...,memory leak marshaling activemqtextmessage per...,AI_No
4,AMQ-2087,Redelivery after rollback does not seem to wor...,redelivery rollback seem work well test case r...,AI_No


## Cleaning diretories

In [57]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [58]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_activemq_issues_inspected['processed_text'], df_activemq_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

<generator object _BaseKFold.split at 0x78a4edcaac70>

## Performing tests in model

In [59]:
list_my_metrics_activemq = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 105 | Test Size 27


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.6153846153846154; REC: 1.0; ACC: 0.6296296296296297; F1S: 0.761904761904762; AUC: 0.4375
Fold 2: Train Size 105 | Test Size 27


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.6153846153846154; REC: 1.0; ACC: 0.6296296296296297; F1S: 0.761904761904762; AUC: 0.6079545454545454
Fold 3: Train Size 106 | Test Size 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.5769230769230769; REC: 1.0; ACC: 0.5769230769230769; F1S: 0.7317073170731707; AUC: 0.5333333333333333
Fold 4: Train Size 106 | Test Size 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.5769230769230769; REC: 1.0; ACC: 0.5769230769230769; F1S: 0.7317073170731707; AUC: 0.5696969696969697
Fold 5: Train Size 106 | Test Size 26


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.5769230769230769; REC: 1.0; ACC: 0.5769230769230769; F1S: 0.7317073170731707; AUC: 0.3878787878787879


## ActiveMQ issues results

In [60]:
df_my_metrics_activemq_issues = pd.DataFrame(list_my_metrics_activemq)
df_my_metrics_activemq_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.615385,1.0,0.62963,0.761905,0.4375
1,2,0.615385,1.0,0.62963,0.761905,0.607955
2,3,0.576923,1.0,0.576923,0.731707,0.533333
3,4,0.576923,1.0,0.576923,0.731707,0.569697
4,5,0.576923,1.0,0.576923,0.731707,0.387879


# E) Performing tests with Kafka, Hadoop and ActiveMQ

## Dataset

In [61]:
dataset = pd.concat([df_treino, df_teste], axis=0)

In [62]:
df_cassandra_issues_inspected = dataset[['issue_key', 'summary', 'description', 'comments_text', 'architectural_impact']]


In [63]:
df_cassandra_issues_inspected.rename(columns={'comments_text': 'comments'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cassandra_issues_inspected.rename(columns={'comments_text': 'comments'}, inplace=True)


In [64]:
df_cassandra_issues_inspected = transform_dataset(coluna_uteis, minhas_colunas, df_cassandra_issues_inspected)

In [65]:
df_cassandra_issues_inspected.head()

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot...",AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...,AI_No
2,CASSANDRA-5426,Redesign repair messages Many people have been...,redesign repair messages many people reporting...,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...,AI_Yes


## Faz o merge dos datases

Cassandra, Kafka, Hadoop e ActiveMQ

In [66]:
df_all_issues_inspected = pd.concat([df_cassandra_issues_inspected,df_kafka_issues_inspected, df_hadoop_issues_inspected, df_activemq_issues_inspected], axis=0)


In [67]:
df_all_issues_inspected

Unnamed: 0,issue_key,SummaryDescriptionComments,processed_text,Textual_Type
0,CASSANDRA-3489,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot...",AI_No
1,CASSANDRA-16780,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...,AI_No
2,CASSANDRA-5426,Redesign repair messages Many people have been...,redesign repair messages many people reporting...,AI_Yes
3,CASSANDRA-5121,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...,AI_No
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...,AI_Yes
...,...,...,...,...
127,AMQ-8413,Support different username and password for re...,support different username password remote bro...,AI_No
128,AMQ-9202,Reentrant locks should always be locked outsid...,reentrant locks always locked outside try bloc...,AI_No
129,AMQ-9157,Add a new advisory type for dispatched message...,add new advisory type dispatched messages use ...,AI_No
130,AMQ-9153,Fix Slow Consumer Advisory for Queue subscript...,fix slow consumer advisory queue subscriptions...,AI_No


## Cleaning diretories

In [68]:
!echo 'Cria a pasta results'
!rm -rf results
!mkdir results
!echo 'Cria a pasta logs'
!rm -rf logs
!mkdir logs

Cria a pasta results
Cria a pasta logs


## Create a StratifiedKFold

In [69]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=51)
X, y = df_all_issues_inspected['processed_text'], df_all_issues_inspected['Textual_Type']
skf.get_n_splits(X, y)
folds = {}

skf.split(X, y)

<generator object _BaseKFold.split at 0x78a4fda3f0d0>

## Performing tests in the model

In [70]:
list_my_metrics_all_repositories = perform_tests_in_model(X,y, label2id=label2id)

Fold 1: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 1=> PRE: 0.7301587301587301; REC: 0.5287356321839081; ACC: 0.5766423357664233; F1S: 0.6133333333333333; AUC: 0.6517241379310346
Fold 2: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 2=> PRE: 0.7567567567567568; REC: 0.6436781609195402; ACC: 0.6423357664233577; F1S: 0.6956521739130436; AUC: 0.6735632183908046
Fold 3: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 3=> PRE: 0.6694915254237288; REC: 0.9080459770114943; ACC: 0.656934306569343; F1S: 0.7707317073170731; AUC: 0.6180459770114942
Fold 4: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 4=> PRE: 0.7714285714285715; REC: 0.627906976744186; ACC: 0.6496350364963503; F1S: 0.6923076923076923; AUC: 0.6956224350205198
Fold 5: Train Size 548 | Test Size 137


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Fold 5=> PRE: 0.6277372262773723; REC: 1.0; ACC: 0.6277372262773723; F1S: 0.7713004484304933; AUC: 0.6438668490652074


In [95]:
df_my_metrics_all_issues = pd.DataFrame(list_my_metrics_all_repositories)
df_my_metrics_all_issues

Unnamed: 0,Fold,PRE,REC,ACC,F1S,AUC
0,1,0.730159,0.528736,0.576642,0.613333,0.651724
1,2,0.756757,0.643678,0.642336,0.695652,0.673563
2,3,0.669492,0.908046,0.656934,0.770732,0.618046
3,4,0.771429,0.627907,0.649635,0.692308,0.695622
4,5,0.627737,1.0,0.627737,0.7713,0.643867


# F) Create the Model (based on example from Hugging face DestilBert)

In [71]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
task = "issue-analysis"
MY_HUGGING_FACE_DATASET = "armandoufpi/cassandraissuesgroundtruth"

In [72]:
# Load pre-trained DistilBERT model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Dataset da minha conta Hugging Fase

https://huggingface.co/datasets/armandoufpi/cassandraissuesgroundtruth

In [73]:
#dataset da minha conta Hugging Fase
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
df_treino = pd.read_json("hf://datasets/armandoufpi/cassandraissuesgroundtruth/" + splits["train"])
df_teste = pd.read_json("hf://datasets/armandoufpi/cassandraissuesgroundtruth/" + splits["test"])

In [74]:
df_treino['SummaryDescriptionComments']= df_treino.apply(lambda row: row['summary'] + ' ' + row['description'] + ' ' + row['comments_text'],axis=1).values
df_treino['processed_text'] = df_treino['SummaryDescriptionComments'].apply(preprocess_text)

df_teste['SummaryDescriptionComments']= df_teste.apply(lambda row: row['summary'] + ' ' + row['description'] + ' ' + row['comments_text'],axis=1).values
df_teste['processed_text'] = df_teste['SummaryDescriptionComments'].apply(preprocess_text)

In [75]:
df_treino

Unnamed: 0,issue_key,summary,issue_type,issue_status,issue_priority,description,comments,architectural_impact,comments_text,label,label_text,SummaryDescriptionComments,processed_text
0,CASSANDRA-3489,EncryptionOptions should be instantiated,Bug,Resolved,Low,"As the title says, otherwise you get an NPE wh...","['There\'s a bunch of ""if encryption options i...",NO,"There\'s a bunch of ""if encryption options is ...",0,negative,EncryptionOptions should be instantiated As th...,"encryptionoptions instantiated title says , ot..."
1,CASSANDRA-16780,Log when writing many tombstones to a partition,Improvement,Resolved,Normal,Log when writing many tombstones to a partitio...,['https://github.com/krummas/cassandra/commits...,NO,https://github.com/krummas/cassandra/commits/m...,0,negative,Log when writing many tombstones to a partitio...,log writing many tombstones partition log writ...
2,CASSANDRA-5426,Redesign repair messages,Improvement,Resolved,Low,Many people have been reporting 'repair hang' ...,['Work in progress is pushed to: https://githu...,YES,https://github.com/yukim/cassandra/commits/542...,1,positive,Redesign repair messages Many people have been...,redesign repair messages many people reporting...
3,CASSANDRA-5121,system.peers.tokens is empty after node restart,Bug,Resolved,Low,Using a 2 nodes fresh cluster (127.0.0.1 & 127...,"['In StorageService.handleStateNormal, when we...",NO,removeEndpoint should be used instead\n [ju...,0,negative,system.peers.tokens is empty after node restar...,system.peers.tokens empty node restart using 2...
4,CASSANDRA-11944,sstablesInBounds might not actually give all s...,Bug,Resolved,Normal,Same problem as with CASSANDRA-11886 - if we t...,['https://github.com/krummas/cassandra/commits...,YES,https://github.com/krummas/cassandra/commits/m...,1,positive,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,CASSANDRA-18617,Disable the deprecated keyspace/table threshol...,Improvement,Resolved,Normal,The non-guardrail thresholds 'keyspace_count_w...,"[""Part of this change is to add converters tha...",YES,\xa0[https://github.com/apache/cassandra/pull/...,1,positive,Disable the deprecated keyspace/table threshol...,disable deprecated keyspace/table thresholds c...
196,CASSANDRA-5244,Compactions don't work while node is bootstrap...,Bug,Resolved,Urgent,It seems that there is a race condition in Sto...,"[""Thanks for the detective work, Jouni. I'll ...",NO,BLOCKED (on object monitor)\n at org.apache...,0,negative,Compactions don't work while node is bootstrap...,compactions n't work node bootstrapping seems ...
197,CASSANDRA-173,add getPendingTasks to CFSMBean,Improvement,Resolved,Low,need to add an atomicint and inc/decr it whene...,['rebased patch as 0001-CASSANDRA-173-added-CF...,NO,rebased patch as 0001-CASSANDRA-173-added-CFS-...,0,negative,add getPendingTasks to CFSMBean need to add an...,add getpendingtasks cfsmbean need add atomicin...
198,CASSANDRA-359,CFS readStats_ and diskReadStats_ are missing,Bug,Resolved,Normal,There is no description,"[""shouldn't we also get rid of getReadDiskHits...",NO,"[""shouldn't we also get rid of getReadDiskHits...",0,negative,CFS readStats_ and diskReadStats_ are missing ...,cfs readstats_ diskreadstats_ missing descript...


In [76]:
df_teste

Unnamed: 0,issue_key,summary,issue_type,issue_status,issue_priority,description,comments,architectural_impact,comments_text,label,label_text,SummaryDescriptionComments,processed_text
0,CASSANDRA-11944,sstablesInBounds might not actually give all s...,Bug,Resolved,Normal,Same problem as with CASSANDRA-11886 - if we t...,['https://github.com/krummas/cassandra/commits...,YES,https://github.com/krummas/cassandra/commits/m...,1,positive,sstablesInBounds might not actually give all s...,sstablesinbounds might actually give sstables ...
1,CASSANDRA-12988,make the consistency level for user-level auth...,Improvement,Resolved,Low,Most reads for the auth-related tables execute...,['Linked patch allows an operator to set the r...,YES,[Link|https://app.circleci.com/pipelines/githu...,1,positive,make the consistency level for user-level auth...,make consistency level user-level auth reads w...
2,CASSANDRA-15004,Anti-compaction briefly corrupts sstable state...,Bug,Resolved,Urgent,Since we use multiple sstable rewriters in ant...,['|[3.0|https://github.com/bdeggleston/cassand...,YES,not sure what is going on with the dtests thou...,1,positive,Anti-compaction briefly corrupts sstable state...,anti-compaction briefly corrupts sstable state...
3,CASSANDRA-15265,Index summary redistribution can start even wh...,Bug,Resolved,Normal,When we pause autocompaction for upgradesstabl...,['Patch adds a flag in `CompactionManager` whi...,YES,[3.0|https://circleci.com/workflow-run/8882a8a...,1,positive,Index summary redistribution can start even wh...,index summary redistribution start even compac...
4,CASSANDRA-18029,fix starting Paxos auto repair,Bug,Resolved,Normal,This test was not run in CI because of its nam...,['I fixed here what I could: [https://github.c...,YES,repaired}} rely on running regular/incremental...,1,positive,fix starting Paxos auto repair This test was n...,fix starting paxos auto repair test run ci nam...
5,CASSANDRA-18058,In-memory index and query path,New Feature,Resolved,Normal,An in-memory index using the in-memory trie st...,['The github PR for this ticket is here:\xa0\r...,YES,[https://app.circleci.com/pipelines/github/ade...,1,positive,In-memory index and query path An in-memory in...,in-memory index query path in-memory index usi...
6,CASSANDRA-18617,Disable the deprecated keyspace/table threshol...,Improvement,Resolved,Normal,The non-guardrail thresholds 'keyspace_count_w...,"[""Part of this change is to add converters tha...",YES,\xa0[https://github.com/apache/cassandra/pull/...,1,positive,Disable the deprecated keyspace/table threshol...,disable deprecated keyspace/table thresholds c...
7,CASSANDRA-1919,Add shutdownhook to flush commitlog,Improvement,Resolved,Low,this replaces the periodic_with_flush approach...,"[""The approach I took was to add a shutdownBlo...",YES,Could not create ServerSocket on address /127....,1,positive,Add shutdownhook to flush commitlog this repla...,add shutdownhook flush commitlog replaces peri...
8,CASSANDRA-414,remove sstableLock,Improvement,Resolved,Normal,There is no description,['rebased.\n\n02\n remove sstableLock. re-...,YES,the cleanup does happen. If it were the SSTR ...,1,positive,remove sstableLock There is no description the...,remove sstablelock description cleanup happen ...
9,CASSANDRA-5426,Redesign repair messages,Improvement,Resolved,Low,Many people have been reporting 'repair hang' ...,['Work in progress is pushed to: https://githu...,YES,https://github.com/yukim/cassandra/commits/542...,1,positive,Redesign repair messages Many people have been...,redesign repair messages many people reporting...


## Carrega os dados de treino e teste

In [77]:
# Load the dataset from armandoufpi hugging face
train_data = load_dataset(MY_HUGGING_FACE_DATASET, split="train")
test_data = load_dataset(MY_HUGGING_FACE_DATASET, split="test")

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/169k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/26 [00:00<?, ? examples/s]

In [78]:
train_data

Dataset({
    features: ['summary', 'architectural_impact', 'comments_text', 'label_text', 'comments', 'issue_status', 'description', 'issue_priority', 'issue_type', 'issue_key', 'label'],
    num_rows: 200
})

In [79]:
test_data

Dataset({
    features: ['summary', 'architectural_impact', 'comments_text', 'label_text', 'comments', 'issue_status', 'description', 'issue_priority', 'issue_type', 'issue_key', 'label'],
    num_rows: 26
})

In [80]:
print(f"len(train_data['summary']): {len(train_data['summary'])}")
print(f"train_data['summary'][0]: {train_data['summary'][0]}")
print(f"train_data['label'][0]: {train_data['label'][0]}")
print(f"train_data['label_text'][0]: {train_data['label_text'][0]}")
print(f"train_data['description'][0]: {train_data['description'][0]}")

len(train_data['summary']): 200
train_data['summary'][0]: EncryptionOptions should be instantiated
train_data['label'][0]: 0
train_data['label_text'][0]: negative
train_data['description'][0]: As the title says, otherwise you get an NPE when the options are missing from the yaml.  It's included in my second patch on CASSANDRA-3045 and is a one line fix.


## Processa os dados de treino e teste

In [81]:
# Function to preprocess text data
def preprocess_function_description(examples):
  return tokenizer(examples["description"], padding="max_length", truncation=True)

def preprocess_function_description(examples):
  return tokenizer(examples["description"], padding="max_length", truncation=True)

# Function to preprocess text data
def preprocess_function(examples):
  return tokenizer(examples["summary"], padding="max_length", truncation=True)

In [82]:
# Preprocess train and test data
train_data = train_data.map(preprocess_function, batched=True)
test_data = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [83]:
train_data

Dataset({
    features: ['summary', 'architectural_impact', 'comments_text', 'label_text', 'comments', 'issue_status', 'description', 'issue_priority', 'issue_type', 'issue_key', 'label', 'input_ids', 'attention_mask'],
    num_rows: 200
})

In [84]:
test_data

Dataset({
    features: ['summary', 'architectural_impact', 'comments_text', 'label_text', 'comments', 'issue_status', 'description', 'issue_priority', 'issue_type', 'issue_key', 'label', 'input_ids', 'attention_mask'],
    num_rows: 26
})

In [85]:
# Access the 'input_ids' from the preprocessed data
#train_inputs = train_data["input_ids"]
#test_inputs = test_data["input_ids"]

## Treina o modelo

In [86]:
!rm -rf results
!mkdir results
!ls -l

total 388
-rw-r--r-- 1 root root  93147 Aug 23 15:55 activemq_issues_inspected.xlsx
-rw-r--r-- 1 root root 137652 Aug 23 15:52 hadoop_issues_inspected.xlsx
-rw-r--r-- 1 root root 147840 Aug 23 15:50 kafka_issues_inspected.xlsx
drwxr-xr-x 2 root root   4096 Aug 23 16:05 logs
drwxr-xr-x 2 root root   4096 Aug 23 16:05 results
drwxr-xr-x 1 root root   4096 Aug 21 13:28 sample_data


In [87]:
training_args = TrainingArguments(
    output_dir="results",  # Fixed typo (removed extra space)
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,  # Assuming you meant "size" here
    learning_rate=2e-5,
)

In [88]:
# Create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics="accuracy",
)

In [89]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=39, training_loss=0.819644047663762, metrics={'train_runtime': 39.6747, 'train_samples_per_second': 15.123, 'train_steps_per_second': 0.983, 'total_flos': 79480439193600.0, 'train_loss': 0.819644047663762, 'epoch': 3.0})

## Faz as previsões baseadas no modelo treinado

In [90]:
# TODO: fazer a analise do issue baseado em varios fields ao mesmo tempo
def analyse_issue(issue_field):
  inputs = tokenizer(issue_field, padding="max_length", truncation=True, return_tensors="pt")

  # Move the model and input to GPU if available
  if torch.cuda.is_available():
    model.to('cuda')
    inputs.to('cuda')

  with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

  # Print the predicted sentiment
  if predictions == 1:
    return "YES"
  else:
    return "NO"

### Dados de testes

In [91]:
df_teste[['issue_key', 'summary', 'architectural_impact']]

Unnamed: 0,issue_key,summary,architectural_impact
0,CASSANDRA-11944,sstablesInBounds might not actually give all s...,YES
1,CASSANDRA-12988,make the consistency level for user-level auth...,YES
2,CASSANDRA-15004,Anti-compaction briefly corrupts sstable state...,YES
3,CASSANDRA-15265,Index summary redistribution can start even wh...,YES
4,CASSANDRA-18029,fix starting Paxos auto repair,YES
5,CASSANDRA-18058,In-memory index and query path,YES
6,CASSANDRA-18617,Disable the deprecated keyspace/table threshol...,YES
7,CASSANDRA-1919,Add shutdownhook to flush commitlog,YES
8,CASSANDRA-414,remove sstableLock,YES
9,CASSANDRA-5426,Redesign repair messages,YES


In [92]:
yes_count = df_teste.architectural_impact.to_list().count("YES")
no_count = df_teste.architectural_impact.to_list().count("NO")

# Print the counts
print("YES count:", yes_count)
print("NO count:", no_count)

YES count: 10
NO count: 16


### Roda o modelo com os dados de testes (baseado apenas no campo Summary)

In [93]:
lista_analisa_summary_yes_no = []

for index, row in df_teste.iterrows():
  field = row['summary']
  issue_key = row['issue_key']
  summary = truncate_string(text=row['summary'], max_length=50)
  previsao = analyse_issue(issue_field=field)
  print(f"{issue_key}, {summary}, Architectural Impact:{previsao}")
  lista_analisa_summary_yes_no.append(previsao)

CASSANDRA-11944, sstablesInBounds might not actually give all sstab..., Architectural Impact:NO
CASSANDRA-12988, make the consistency level for user-level auth rea..., Architectural Impact:YES
CASSANDRA-15004, Anti-compaction briefly corrupts sstable state for..., Architectural Impact:NO
CASSANDRA-15265, Index summary redistribution can start even when c..., Architectural Impact:YES
CASSANDRA-18029, fix starting Paxos auto repair, Architectural Impact:NO
CASSANDRA-18058, In-memory index and query path, Architectural Impact:YES
CASSANDRA-18617, Disable the deprecated keyspace/table thresholds a..., Architectural Impact:YES
CASSANDRA-1919, Add shutdownhook to flush commitlog, Architectural Impact:NO
CASSANDRA-414, remove sstableLock, Architectural Impact:YES
CASSANDRA-5426, Redesign repair messages, Architectural Impact:YES
CASSANDRA-11540, The JVM should exit if jmx fails to bind, Architectural Impact:NO
CASSANDRA-6013, CAS may return false but still commit the insert, Architectural Imp

In [94]:
yes_count_summary = lista_analisa_summary_yes_no.count("YES")
no_count_summary = lista_analisa_summary_yes_no.count("NO")

# Print the counts
print("YES count:", yes_count_summary)
print("NO count:", no_count_summary)

YES count: 7
NO count: 19
