<a href="https://colab.research.google.com/github/armandossrecife/teste/blob/main/testes_json_inspection_issues.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import os
import json
import requests
import zipfile

MY_ISSUES_ZIP = "https://github.com/Technical-Debt-Large-Scale/my_validation/raw/main/cassandra/my_issues_to_inspection_cassandra.zip"
DIRETORIO_CORRENTE = os.getcwd()
filename = "my_issues_to_inspection_cassandra.zip"
DIRETORIO_ISSUES = os.path.join(DIRETORIO_CORRENTE,"downloads")
PATH_ARQUIVO_LOCAL = os.path.join(DIRETORIO_ISSUES,filename)
PATH_ARQUIVOS_DESCOMPACTADOS =  os.path.join(DIRETORIO_CORRENTE, "my_issues")

In [68]:
# Dada uma url e um local de arquivo (destination) faz o download do conteudo da url no arquivo
def download_file(url, destination):
    try:
        response = requests.get(url) # Faz a requisicao do arquivo
        response.raise_for_status()  # Verifica se houve algum erro na requisição
        conteudo = response.content  # Guarda o conteudo binario da resposta da requisicao
        # Coloca o conteudo da requisicao em um arquivo local
        # Cria um novo arquivo e insere o conteudo neste arquivo
        with open(destination, mode='wb') as file:
            file.write(conteudo)
    except requests.exceptions.MissingSchema:
        # Caso seja uma excecao de url invalida
        print("URL inválida. Certifique-se de fornecer uma URL válida.")
        print("Download cancelado!")
        raise ValueError("URL inválida. Certifique-se de fornecer uma URL válida.")
    except requests.exceptions.ConnectionError:
        # Caso seja uma excecao de comunicacao de rede
        print(f"Erro na conexão!")
        print("Download cancelado!")
        raise ValueError("Erro na conexão!")
    except IOError:
        # Caso aconteca um erro de IO do arquivo
        print(f"Arquivo {destination} inválido!")
        print("Download cancelado!")
        raise ValueError(f"Arquivo {destination} inválido!")

def unzip_file(my_file, path_to_unzip=None):
    try:
        with zipfile.ZipFile(my_file, 'r') as zip_ref:
            zip_ref.extractall(path_to_unzip)
    except Exception as ex:
        raise ValueError(f"Erro ao descompactar: {str(ex)}")

def get_field_content(field_name, file_content):
    try:
        my_field = file_content.split(f"{field_name}:")[1]
        my_field = my_field.split('\n')[0]
        return my_field
    except Exception:
        print(f"{field_name} não existe")

In [69]:
def convert_list_of_dict_in_json(data):
    # Open the JSON file for writing in write mode ('w')
    with open("issues_to_inspection.json", "w") as outfile:
        # Use json.dump() to write the list of dictionaries to the file
        json.dump(data, outfile, indent=4)  # Optional: Add indentation for readability

def generate_selected_issues(filename):
    print('Aguarde...')
    try:
        print(f'Fazendo o download do arquivo {filename}')
        download_file(url=MY_ISSUES_ZIP, destination=PATH_ARQUIVO_LOCAL)
        print(f"Arquivo salvo em: {PATH_ARQUIVO_LOCAL}")
        print(f'Descompactando o arquivo {filename}')
        unzip_file(PATH_ARQUIVO_LOCAL)
        print(f"Arquivo {filename} descompactado com sucesso")
    except Exception as ex:
        print(f"Erro: {str(ex)}")

def convert_issues_to_json(filename, path_arquivos_descompactados):
  # Lista de issues para serem inspecionados [dict1, dict2, .., dictN]
  issues_to_inspection = []
  print("Ler os arquivos")
  # Get list of files
  filenames = os.listdir(path_arquivos_descompactados)
  print(f"Lendo {len(filenames)} arquivos...")
  for filename in filenames:
    issue = {}
    file_path = os.path.join(path_arquivos_descompactados, filename)
    with open(file_path, 'r') as file:
      file_content = file.read()
      issue_type, summary, description, comments = None, None, None, None
      try:
        issue_type = get_field_content('issue_type', file_content)
        summary = get_field_content('summary', file_content)
        description = get_field_content('description', file_content)
        comments = get_field_content('comments', file_content)
        issue["issue_id"] = filename
        issue["issue_type"] = issue_type
        issue["summary"] = summary
        issue["description"] = description
        issue["comments"] = comments
        issues_to_inspection.append(issue)
      except Exception as ex:
        print(f"{filename} com problema em: {str(ex)}")
  convert_list_of_dict_in_json(issues_to_inspection)
  print("List of dictionaries converted to JSON file: issues_to_inspection.json")

In [100]:
def convert_comment_to_text(comments_string):
  # Remove the unnecessary parts from the string
  comments_string = comments_string.split(": ")[-1]  # Assuming the colon and space separate the key-value pair
  comments_list = comments_string.split(",")
  comments = ""
  for comment in comments_list:
    comment = comment.replace("['", "")
    comment = comment.replace("']", "")
    comment = comment.replace(", '", " ")
    comment = comment.replace("'", "")
    # Replace all occurrences of '\n' with a new line character
    comment = comment.replace("\\n", "\n")
    comment = comment.replace("\\r", "\r")
    comments = comments + comment
  return comments

In [72]:
generate_selected_issues(filename)

Aguarde...
Fazendo o download do arquivo my_issues_to_inspection_cassandra.zip
Arquivo salvo em: /content/downloads/my_issues_to_inspection_cassandra.zip
Descompactando o arquivo my_issues_to_inspection_cassandra.zip
Arquivo my_issues_to_inspection_cassandra.zip descompactado com sucesso


In [73]:
convert_issues_to_json(filename, PATH_ARQUIVOS_DESCOMPACTADOS)

Ler os arquivos
Lendo 226 arquivos...
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
description não existe
comments não existe
List of dictionaries converted to JSON file: issues_to_inspection.json


In [88]:
comments_string = """comments: ['Working on this.', 'Still working on this, Todd?', ... , 'Committed.', 'Integrated in Cassandra #567 (See [https://hudson.apache.org/hudson/job/Cassandra/567/])']"""

print(comments_string)

comments: ['Working on this.', 'Still working on this, Todd?', ... , 'Committed.', 'Integrated in Cassandra #567 (See [https://hudson.apache.org/hudson/job/Cassandra/567/])']


In [101]:
my_comment = convert_comment_to_text(comments_string)
print(my_comment)

Working on this. Still working on this Todd? ...  Committed. Integrated in Cassandra #567 (See [https://hudson.apache.org/hudson/job/Cassandra/567/])


In [94]:
with open("issues_to_inspection.json", "r") as json_file:
    json_data = json.load(json_file)

In [102]:
for issue in json_data:
  print(f"issue_id: {issue['issue_id']}")
  print(f"issue_type: {issue['issue_type']}")
  print(f"summary: {issue['summary']}")
  print(f"description: {issue['description']}")
  print(f"comments: {issue['comments']}")
  if issue["comments"]:
    my_comment = convert_comment_to_text(issue['comments'])
    print(f"my_comment: {my_comment}")
  print("---"*50)

issue_id: CASSANDRA-6012
issue_type:  Bug 
summary:  CAS does not always correctly replay inProgress rounds 
description:  Paxos says that on receiving the result of a prepare from a quorum of acceptors, the proposer should propose the value of the higher-number proposal accepted amongst the ones returned by the acceptors, and only propose his own value if no acceptor has send us back a previously accepted value.
comments:  ['Attaching fix: as far as checking if we should finish an inProgress round, we only need to keep the most recent inProgress commit that has a value. But so as to not break the optimization of CASSANDRA-5667, the patch also keep the most recent inProgress, regardless of whether it has a value or not.\n', '+1', 'Committed, thanks'] 
my_comment: as far as checking if we should finish an inProgress round we only need to keep the most recent inProgress commit that has a value. But so as to not break the optimization of CASSANDRA-5667 the patch also keep the most recent 

In [103]:
!wget https://github.com/Technical-Debt-Large-Scale/my_validation/raw/main/cassandra_issues_inspected_merged.xlsx

--2024-07-01 20:07:47--  https://github.com/Technical-Debt-Large-Scale/my_validation/raw/main/cassandra_issues_inspected_merged.xlsx
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Technical-Debt-Large-Scale/my_validation/main/cassandra_issues_inspected_merged.xlsx [following]
--2024-07-01 20:07:47--  https://raw.githubusercontent.com/Technical-Debt-Large-Scale/my_validation/main/cassandra_issues_inspected_merged.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 573631 (560K) [application/octet-stream]
Saving to: ‘cassandra_issues_inspected_merged.xlsx’


2024-07-01 20:07:47 (10.6 MB/s) - ‘ca

In [104]:
import pandas

In [112]:
df_issues_gt = pandas.read_excel("cassandra_issues_inspected_merged.xlsx")
df_issues_gt.head(3)

  warn(msg)


Unnamed: 0,issue_key,summary,issue_type,issue_status,issue_priority,description,comments,architectural_impact_manual,architectural_impact,TP,TN,FP,FN
0,CASSANDRA-11272,NullPointerException (NPE) during bootstrap st...,Bug,Resolved,Normal,After bootstrapping fails due to stream closed...,"['Which Cassandra version are you using? ', 'T...",YES,YES,1,0,0,0
1,CASSANDRA-1321,loadSchemaFromYaml should push migrations to c...,Improvement,Resolved,Normal,,"['announces the last migration, which ends up ...",YES,YES,1,0,0,0
2,CASSANDRA-1641,auto-guessed memtable sizes are too high,Bug,Resolved,Normal,I've seen two cases now of the memtable sizes ...,"[""I'd like to introduce a dependency on number...",YES,YES,1,0,0,0


In [113]:
colunas_uteis = ['issue_key', 'issue_type','summary', 'description', 'comments', 'architectural_impact_manual']
df_issues_gt = df_issues_gt[colunas_uteis]
df_issues_gt

Unnamed: 0,issue_key,issue_type,summary,description,comments,architectural_impact_manual
0,CASSANDRA-11272,Bug,NullPointerException (NPE) during bootstrap st...,After bootstrapping fails due to stream closed...,"['Which Cassandra version are you using? ', 'T...",YES
1,CASSANDRA-1321,Improvement,loadSchemaFromYaml should push migrations to c...,,"['announces the last migration, which ends up ...",YES
2,CASSANDRA-1641,Bug,auto-guessed memtable sizes are too high,I've seen two cases now of the memtable sizes ...,"[""I'd like to introduce a dependency on number...",YES
3,CASSANDRA-2296,Bug,"Scrub resulting in ""bloom filter claims to be ...",Doing a scrub on a node which I upgraded from ...,"[""With debug logging turned on it looks like t...",YES
4,CASSANDRA-3117,Bug,StorageServiceMBean is missing a getCompaction...,"Without a getter, you can assign a new value b...","['+1', 'committed.', 'Integrated in Cassandra-...",YES
...,...,...,...,...,...,...
221,CASSANDRA-9631,Bug,Unnecessary required filtering for query on in...,Let's create and populate a simple table compo...,['Is there a chance that CASSANDRA-8418 introd...,NO
222,CASSANDRA-9636,Bug,Duplicate columns in selection causes Assertio...,"Prior to CASSANDRA-9532, unaliased duplicate f...","[""I'm seeing the same error als on a query not...",NO
223,CASSANDRA-9858,Bug,SelectStatement.Parameters fields should be in...,SelectStatement.Parameters fields should be in...,['https://github.com/JeremiahDJordan/cassandra...,NO
224,CASSANDRA-9880,Bug,ScrubTest.testScrubOutOfOrder should generate ...,ScrubTest#testScrubOutOfOrder is failing on tr...,"[""Patch attached as link.\nLet's see what cass...",NO


In [114]:
# prompt: convert df_issues_gt to json file

import json

df_issues_gt_json = df_issues_gt.to_json(orient="records")

with open("df_issues_gt.json", "w") as json_file:
    json_file.write(df_issues_gt_json)


In [115]:
with open("df_issues_gt.json", "r") as json_file:
    json_data = json.load(json_file)

for issue in json_data:
  print(f"issue_id: {issue['issue_key']}")
  print(f"issue_type: {issue['issue_type']}")
  print(f"summary: {issue['summary']}")
  print(f"description: {issue['description']}")
  print(f"architectural impact: {issue['architectural_impact_manual']}")
  print(f"comments: {issue['comments']}")
  if issue["comments"]:
    my_comment = convert_comment_to_text(issue['comments'])
    print(f"my_comment: {my_comment}")
  print("---"*50)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
DEBUG [pool-2-thread-359] 2011-10-18 10:34:45,376 StorageProxy.java (line 821) %s disagrees (%s)


simple fix: replace with %s with {} ... may want to consider logging better comment?
architectural impact: NO
comments: ['Fixed the erroneous debug logging statement by replacing %s with {}, as supported by SLF4J. Also made use of the {}-notation on some of the other debug logging statements in the class.', 'committed.  Thanks, Jackson and Tommy!']
my_comment: Fixed the erroneous debug logging statement by replacing %s with {} as supported by SLF4J. Also made use of the {}-notation on some of the other debug logging statements in the class. committed.  Thanks Jackson and Tommy!
------------------------------------------------------------------------------------------------------------------------------------------------------
issue_id: CASSANDRA-5571
issue_type: Improvement
summary: Reject bootstrapping endpoints tha