## Setup logging

In [1]:
from IPython.display import clear_output
import logging
from ipylogging import DisplayHandler, HTMLFormatter

handler = DisplayHandler()
handler.setFormatter(HTMLFormatter())

log = logging.getLogger()
log.addHandler(handler)
log.setLevel(logging.INFO)

## Importing dataset and modules

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from minirig import load_csv_dataset, save_csv_dataset
import os
import json

data_dir = Path('../data')
bots_dataset_path = data_dir.joinpath('bots-dataset.csv')
bots_issues_dir = data_dir.joinpath('bots-issues')
bot_dataset = load_csv_dataset(bots_dataset_path)
projects_dataset_path = data_dir.joinpath('projects-dataset.csv')
issues_bots_datasets_dir = data_dir.joinpath('datasets-per-bot')

def plot_hist(data, x_label="", y_label="", title="", bins=5):
    # matplotlib histogram
    plt.hist(data, color = 'blue', edgecolor = 'black', bins=bins)

    # Add labels
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

def get_issues_per_bot(issues_dir = None, bot = None):
    for owner in os.listdir(issues_dir.joinpath(bot)):
        for project in os.listdir(issues_dir.joinpath(bot).joinpath(owner)):
            for issue in os.listdir(issues_dir.joinpath(bot).joinpath(owner).joinpath(project).joinpath('issues')):
                issue_path = issues_dir.joinpath(bot).joinpath(owner).joinpath(project).joinpath('issues').joinpath(issue)
                with open(issue_path.joinpath('json')) as f:
                    yield [issue_path.joinpath('json'), json.load(f)]
                
    

## Filtering bots with few than 100 issues

In [3]:
filtered_bots_dataset = [x for x in bot_dataset if x['issue_count'] != 'na' and int(x['issue_count']) >= 100]

## Creating CSV

In [12]:
from helpers import *
import csv
import os

def get_owner_project(path):
    return f"{split_path[4]}_{split_path[5]}"
    
def get_issue_json(path):
    try:
        with open(path, 'r') as file:
            return json.load(file)
    except Exception as e:
        return None

project_data = {}
for bot in filtered_bots_dataset:
    cnt = 1
    #if f"{bot['account']}.csv" in os.listdir(data_dir.joinpath('datasets-per-bot')) or bot['account'] == "brotherlogic":
    if bot['account'] in ['gopherbot', 'glassfishrobot', 'k8s-ci-robot', 'fejta-bot', 'brotherlogic']: 
        continue
    dataset = []
    for path in bots_issues_dir.joinpath(bot['account']).rglob( '*json' ):
        split_path = str(path).split('/')
        issue_file = get_issue_json(path)        
        if issue_file:
            owner_project = get_owner_project(split_path)
            if 'comments' in split_path:
                for i, c in enumerate(issue_file):
                    print(path)
                    try:
                        row = create_dataset_row(bot['account'], c, text_section='body', is_comment=True, comment_number=i, owner_project=owner_project)
                    except:
                        pass
                    dataset.append(row)
            else:
                
                row_1 = create_dataset_row(bot['account'], issue_file, text_section='title', is_comment=False, owner_project=owner_project)
                dataset.append(row_1)
    
                row_2 = create_dataset_row(bot['account'], issue_file, text_section='body', is_comment=False, owner_project=owner_project)
                dataset.append(row_2)
            cnt += 1
            clear_output()
            print(path)
            logging.info(f'Processing sectiong for {bot["account"]} ({cnt})...')
                
    with open(data_dir.joinpath('datasets-per-bot').joinpath(f"{bot['account']}.csv"), 'w') as file:
        clear_output()
        logging.info(f'Writing csv file for {bot["account"]}...')
        fields = dataset[0].keys()
        writer = csv.DictWriter(file, fieldnames = fields, escapechar='\\') 
        writer.writeheader() 
        writer.writerows(dataset)
    logging.info('Finished...')

../data/bots-issues/fire-bot/doughogan/testrepo/issues/1/comments/json


KeyError: 'comments'

## Extracting the the number of projects per bot

In [None]:
import pandas as pd
bots_dataset = pd.read_csv(bots_dataset_path)
bots_dataset.fillna(0, inplace=True)
bots_dataset['issue_count'].replace('na', 0, inplace=True)
bots_dataset.issue_count = pd.to_numeric(bots_dataset.issue_count, errors='coerce')
filtered_bots = bots_dataset.loc[bots_dataset['issue_count'] >= 100]

for i in filtered_bots.index:
    bot = filtered_bots['account'][i]
    print(bot)
    if bot == "brotherlogic":
        continue
    bot_df = pd.read_csv(issues_bots_datasets_dir.joinpath(f'{bot}.csv'))
    n_projects = len(bot_df.loc[:, ['td-label-li2022-emse', 'owner_project']].groupby('owner_project').count())
    bots_dataset['n-projects'][i] = n_projects
    bots_dataset.to_csv(bots_dataset_path, index=False)

## Extracting the proportion of SATD per bot

In [86]:
bot_df.loc[:,['owner_project', 'issue', 'td-label-li2022-emse']].groupby(['owner_project']).value_counts()

owner_project                                 issue  td-label-li2022-emse
alwin-joseph_target-test-jsf-issue-migration  329    non-SATD                31
                                              322    non-SATD                29
                                              316    non-SATD                29
                                              323    non-SATD                27
                                              6      non-SATD                26
                                                                             ..
javaee_wadl                                   75     SATD                     2
                                                     non-SATD                 2
                                              34     SATD                     2
                                              56     SATD                     2
                                              53     SATD                     2
Name: count, Length: 76887, dtype: int64

In [8]:
for i in filtered_bots_dataset.index:
    bot = filtered_bots['account'][i]
    if bot == "brotherlogic":
        continue
    bot_df = pd.read_csv(issues_bots_datasets_dir.joinpath(f'{bot}.csv'))
    print(f"{bot}: {len(bot_df.loc[bot_df['type'].str.contains('comment')])}")

TypeError: 'builtin_function_or_method' object is not iterable

In [10]:
len(dataset)

244245