# Data Collection

## Set up logging

In [2]:
import logging
from ipylogging import DisplayHandler, HTMLFormatter

# Attach an ipylogging DisplayHandler (formatted by HTMLFormatter) to the root logger.
handler = DisplayHandler()
handler.setFormatter(HTMLFormatter())

log = logging.getLogger()
# Re-running this cell must not stack another DisplayHandler on the root
# logger, otherwise every record is emitted once per execution of the cell.
if not any(isinstance(existing, DisplayHandler) for existing in log.handlers):
    log.addHandler(handler)
log.setLevel(logging.INFO)

## Global variables

In [1]:
from pathlib import Path

# Relative locations shared by every step of the notebook.
data_dir = Path('../data')
cache_dir_github = data_dir.joinpath('github')
bots_dataset_path = data_dir.joinpath('bots-dataset.csv')
bots_issues_dir = data_dir.joinpath('bots-issues')

# The GitHub token is the first line of ../gh-token.txt. The context manager
# closes the file instead of leaving the handle to the garbage collector.
with open('../gh-token.txt', 'r') as token_file:
    github_token = token_file.readline().strip()
if not github_token:
    # Fail here with a clear message rather than later with an obscure API error.
    raise ValueError("../gh-token.txt does not contain a GitHub token on its first line")


## Step 1 - Collect bots from [Golzadeh et al.](https://zenodo.org/record/4000388)'s dataset

Download dataset from [Golzadeh et al.](https://zenodo.org/record/4000388)

In [None]:
import gzip
import os
import shutil
# `import urllib` alone does not guarantee that the `urllib.request`
# submodule is loaded; import it explicitly.
import urllib.request

url_bot_dataset = "https://zenodo.org/record/4000388/files/groundtruthbots.csv.gz"
path_bot_dataset = data_dir.joinpath('groundtruthbots.csv')

# Make sure the target directory exists before writing into it.
data_dir.mkdir(parents=True, exist_ok=True)

# urlretrieve() without a target filename stores the archive in a temporary file.
gz_path, _ = urllib.request.urlretrieve(url_bot_dataset)
try:
    with gzip.open(gz_path, "rb") as f_in, open(path_bot_dataset, "wb") as f_out:
        # Stream the decompressed data instead of loading it all into memory.
        shutil.copyfileobj(f_in, f_out)
finally:
    # Remove the temporary file left behind by urlretrieve().
    urllib.request.urlcleanup()

Extract the accounts labeled as bots and save them to `bots-dataset.csv`

In [None]:
from minirig import load_csv_dataset, save_csv_dataset

# Keep only the account name of the rows whose type is 'Bot', then persist
# the reduced dataset.
bot_dataset = [
    {'account': entry['account']}
    for entry in load_csv_dataset(path_bot_dataset)
    if entry['type'] == 'Bot'
]
save_csv_dataset(bots_dataset_path, data=bot_dataset)

## Step 2 - Collect the number of issues per bot

In [None]:
from minirig import load_csv_dataset, save_csv_dataset
from minirig import GHRequests

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token, cache_dir=cache_dir_github)

for bot in bot_dataset:
    try:
        bot['issue_count'] = gh_api.get_number_issues_involving_user(bot['account'], force=True)
    except Exception:
        # Best effort: a failing account is marked 'na' and the loop goes on.
        # `except Exception` (not a bare except) lets Ctrl-C interrupt the cell.
        logging.warning(f"Could not get the issue count for {bot['account']}", exc_info=True)
        bot['issue_count'] = 'na'

# Largest counts first; accounts without a count ('na') sort last.
bot_dataset.sort(reverse=True, key=lambda x: -1 if x['issue_count'] == 'na' else x['issue_count'])
# NOTE(review): later steps load `bots_dataset_path` and read 'issue_count'
# from it, while this cell writes to a different file - confirm which file is
# the intended hand-over between steps.
save_csv_dataset('../data/new-bot-dataset.csv', data=bot_dataset)

## Step 3 - Download issues for each bot

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token, cache_dir=cache_dir_github)

issues_errors = []

for bot in bot_dataset:
    # 'issue_count' is only used for the progress message; the dataset saved in
    # Step 1 has no such column, so fall back instead of raising KeyError.
    expected_total = bot['issue_count'] if 'issue_count' in bot else '?'
    for cnt, issue in enumerate(gh_api.get_issues_involving_user(bot['account']), start=1):
        # Mirror the issue URL below the bot's directory:
        # <bots_issues_dir>/<account>/<path part of html_url>/json
        relative_issue_path = issue['html_url'].replace('https://github.com/', '')
        bot_issue_path = bots_issues_dir.joinpath(bot['account']).joinpath(relative_issue_path)
        owner, project = issue['repository_url'].replace('https://api.github.com/repos/', '').split('/')

        # Issues already on disk are not fetched again.
        if not bot_issue_path.joinpath('json').exists():
            full_issue = gh_api.get_issue_info(issue['number'], owner, project)
            bot_issue_path.mkdir(parents=True, exist_ok=True)
            with open(bot_issue_path.joinpath('json'), 'w') as f:
                json.dump(full_issue, f)

        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {expected_total}")

## Step 4 - Download comments for each issue

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token, cache_dir=cache_dir_github)

issues_errors = []

for bot in bot_dataset:
    # 'issue_count' is only used for the progress message; the dataset saved in
    # Step 1 has no such column, so fall back instead of raising KeyError.
    expected_total = bot['issue_count'] if 'issue_count' in bot else '?'
    for cnt, issue in enumerate(gh_api.get_issues_involving_user(bot['account']), start=1):
        # Comments are stored next to the issue:
        # <bots_issues_dir>/<account>/<path part of html_url>/comments/json
        relative_issue_path = issue['html_url'].replace('https://github.com/', '')
        bot_issue_path = bots_issues_dir.joinpath(bot['account']).joinpath(relative_issue_path).joinpath('comments')
        bot_issue_path.mkdir(parents=True, exist_ok=True)
        owner, project = issue['repository_url'].replace('https://api.github.com/repos/', '').split('/')
        comments = gh_api.get_comments_per_issue(issue['number'], owner, project, force=True)
        # Any previously stored comments file is overwritten.
        with open(bot_issue_path.joinpath('json'), 'w') as f:
            json.dump(comments, f)
        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {expected_total}")

## Step 5 - Labeling sections with [Li et al.](http://doi.org/10.1007/s10664-022-10128-3)'s Model 

In [None]:
from IPython.display import clear_output
from minirig import load_csv_dataset,  save_csv_dataset
import pandas as pd
import json
import os
from model_li2022_emse import *

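# NOTE: the labeling cells below use `model1_li2022_emse`. Unless the star import above
# already provides an object with that name, uncomment these lines to create it first.
#nltk.download('punkt')
#model1_li2022_emse = Model1_IssueTracker_Li2022_ESEM('../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-weight_file.hdf5', 
                 #'../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-word_embedding_file.bin')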

def load_issues_per_bot(bot, issue_count):
    """Yield every stored issue of ``bot`` as ``{'path': ..., 'content': ...}``.

    Walks ``bots_issues_dir/<bot>/<owner>/<project>/issues/<issue>/json``.
    Files whose content cannot be parsed as JSON are skipped; once the walk is
    exhausted their directories are written, one per line, to
    ``data_dir/issues_error/issue_error_<bot>.txt``.

    Args:
        bot: Account name, i.e. the directory name below ``bots_issues_dir``.
        issue_count: Unused; kept so existing callers keep working.

    Yields:
        dict: ``'path'`` is the issue directory, ``'content'`` the parsed JSON.

    Raises:
        OSError: If a directory or an issue file cannot be opened.
    """
    unreadable = []
    bot_dir = bots_issues_dir.joinpath(bot)
    for owner in os.listdir(bot_dir):
        for project in os.listdir(bot_dir.joinpath(owner)):
            issues_dir = bot_dir.joinpath(owner).joinpath(project).joinpath('issues')
            for issue in os.listdir(issues_dir):
                issue_path = issues_dir.joinpath(issue)
                try:
                    with open(issue_path.joinpath('json')) as f:
                        issue_file = json.load(f)
                except ValueError:
                    # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                    unreadable.append(f'{str(issue_path)}\n')
                    continue
                # Yield outside the try/with: the file is closed before the
                # consumer rewrites it, and closing the generator early is not
                # mistaken for a parse error.
                yield {'path': issue_path, 'content': issue_file}
    error_dir = data_dir.joinpath('issues_error')
    error_dir.mkdir(parents=True, exist_ok=True)
    with open(error_dir.joinpath(f'issue_error_{bot}.txt'), 'w') as f:
        f.writelines(unreadable)
        
def store_labeled_issues(batch, predictions):
    """Attach each prediction to its issue and rewrite the issue's JSON file.

    ``predictions[k]`` is stored, converted to ``str``, under the key
    ``'td-label-li2022-emse'`` of ``batch[k]['content']``; the content is then
    dumped to ``<batch[k]['path']>/json``.
    """
    for position, record in enumerate(batch):
        label = str(predictions[position])
        payload = record['content']
        payload['td-label-li2022-emse'] = label
        target = Path(record['path']).joinpath('json')
        with open(target, 'w') as out:
            json.dump(payload, out)

def load_issues_comments_per_bot(bot, issue_count):
    """Yield the stored comments file of every issue of ``bot``.

    Walks ``bots_issues_dir/<bot>/<owner>/<project>/issues/<issue>/comments/json``.

    Args:
        bot: Account name, i.e. the directory name below ``bots_issues_dir``.
        issue_count: Unused; kept so existing callers keep working.

    Yields:
        dict: ``'path'`` is the comments directory, ``'content'`` the parsed
        JSON and ``'n-comments'`` is ``len()`` of that content.

    Raises:
        OSError: If a directory or comments file cannot be opened.
        ValueError: If a comments file is not valid JSON.
    """
    bot_dir = bots_issues_dir.joinpath(bot)
    for owner in os.listdir(bot_dir):
        for project in os.listdir(bot_dir.joinpath(owner)):
            issues_dir = bot_dir.joinpath(owner).joinpath(project).joinpath('issues')
            for issue in os.listdir(issues_dir):
                comments_path = issues_dir.joinpath(issue).joinpath('comments')
                with open(comments_path.joinpath('json')) as f:
                    comments_file = json.load(f)
                # Yield after the file is closed: the consumer rewrites this
                # very file while the generator is suspended.
                yield {'path': comments_path, 'content': comments_file, 'n-comments': len(comments_file)}

def prepare_comments_batch(batch):
    """Flatten the ``'body'`` of every comment in ``batch`` into one list.

    The order follows the batch order and, within one comments file, the order
    of its ``'content'`` entries.
    """
    return [
        comment['body']
        for comments_file in batch
        for comment in comments_file['content']
    ]

def store_labeled_comments(batch, predictions):
    """Write the flat ``predictions`` back onto the comments of each file.

    Predictions are consumed in batch order: the first ``'n-comments'`` values
    belong to the first comments file, the next ones to the second, and so on.
    Each label is stored as ``str`` under ``'td-label-li2022-emse'`` and the
    file ``<path>/json`` is rewritten once its comments have been updated.
    """
    consumed = 0
    for entry in batch:
        amount = int(entry['n-comments'])
        for offset in range(amount):
            label = str(predictions[consumed + offset])
            entry['content'][offset]['td-label-li2022-emse'] = label
        consumed += amount

        destination = Path(entry['path']).joinpath('json')
        with open(destination, 'w') as out:
            json.dump(entry['content'], out)

def check_if_labeled(comments):
    """Tell whether every dict entry of ``comments['content']`` carries a label.

    Entries that are not dicts are ignored, so content without any dict entry
    (including empty content) counts as labeled.
    """
    return all(
        not isinstance(entry, dict) or 'td-label-li2022-emse' in entry
        for entry in comments['content']
    )

In [None]:
batch_size = 128
bots_dataset = pd.read_csv('../data/bots-dataset-labeling-management.csv', delimiter=',')

for i in bots_dataset.index:
    # Bots already marked as finished are skipped, which makes the cell resumable.
    if bots_dataset.loc[i, 'label_finished'] == 'yes':
        continue
    bot = bots_dataset.loc[i, 'account']
    issue_count = bots_dataset.loc[i, 'issue_count']
    batch = []
    for cnt, issue in enumerate(load_issues_per_bot(bot, issue_count), start=1):
        # NOTE(review): the session is cleared for every issue here, whereas
        # the comments cell clears it once per batch - confirm which is intended.
        model1_li2022_emse.clear_model_session()
        content = issue['content']
        # Only issues that have a body and are not labeled yet are sent to the model.
        if content.get('body') is not None and 'td-label-li2022-emse' not in content:
            batch.append(issue)
        if len(batch) >= batch_size:
            predictions = model1_li2022_emse.label_sections_in_batch(comments=[x['content']['body'] for x in batch], batch_size=batch_size)
            store_labeled_issues(batch, predictions)
            batch = []
        clear_output()
        logging.info(f'{bot}: {cnt}/{issue_count} (len batch:{len(batch)})')
    # Label whatever is left over from the last, incomplete batch.
    if len(batch) > 0:
        predictions = model1_li2022_emse.label_sections_in_batch(comments=[x['content']['body'] for x in batch], batch_size=batch_size)
        store_labeled_issues(batch, predictions)
        batch = []
    # A single .loc assignment writes to the frame itself; the chained form
    # df[col][i] = ... may update a temporary copy and leave the frame unchanged.
    bots_dataset.loc[i, 'label_finished'] = 'yes'
    bots_dataset.to_csv('../data/bots-dataset-labeling-management.csv', index=False)

In [None]:
batch_size = 2
# 1-based position (in iteration order) of a comments file that is never
# sent to the model.
# NOTE(review): this position is applied to every bot; the reason for the skip
# is not visible here - consider identifying the file by its path instead.
SKIPPED_COMMENTS_FILE_POSITION = 3148
bots_dataset = pd.read_csv('../data/bots-dataset-labeling-management.csv', delimiter=',')


def _label_and_store_comments(pending):
    """Label all comments of the ``pending`` files in one model call and store the labels.

    Returns the number of comment bodies that were sent to the model.
    """
    bodies = prepare_comments_batch(pending)
    if bodies:
        model1_li2022_emse.clear_model_session()
        predictions = model1_li2022_emse.label_sections_in_batch(bodies, batch_size=len(bodies))
        store_labeled_comments(pending, predictions)
    return len(bodies)


for i in bots_dataset.index:
    # Bots already marked as finished are skipped, which makes the cell resumable.
    if bots_dataset.loc[i, 'label_comments_finished'] == 'yes':
        continue
    bot = bots_dataset.loc[i, 'account']
    issue_count = bots_dataset.loc[i, 'issue_count']
    batch = []
    # enumerate() advances the position on every iteration. The manual counter
    # was not advanced on the skip branch, so once it reached the skipped
    # position every remaining file of the bot was skipped as well.
    for cnt, comments in enumerate(load_issues_comments_per_bot(bot, issue_count), start=1):
        if cnt == SKIPPED_COMMENTS_FILE_POSITION:
            continue
        if comments is None or check_if_labeled(comments):
            clear_output()
            logging.info(f'{bot}: {cnt}/{issue_count}')
            continue
        batch.append(comments)
        n_sent = 0
        if len(batch) >= batch_size:
            n_sent = _label_and_store_comments(batch)
            batch = []
        clear_output()
        logging.info(f'{bot}: {cnt}/{issue_count} (len batch:{n_sent})')
    # Flush the incomplete last batch. The bodies are rebuilt from `batch`
    # here, and a single leftover file is labeled as well.
    if batch:
        _label_and_store_comments(batch)
        batch = []

    # A single .loc assignment writes to the frame itself; the chained form
    # df[col][i] = ... may update a temporary copy and leave the frame unchanged.
    bots_dataset.loc[i, 'label_comments_finished'] = 'yes'
    bots_dataset.to_csv('../data/bots-dataset-labeling-management.csv', index=False)

In [None]:
from helpers import *
import csv
import os

def get_owner_project(path):
    """Return ``"<owner>_<project>"`` for an already split issue path.

    Args:
        path: Sequence of path components. Elements 4 and 5 are used; with the
            layout ``../data/bots-issues/<bot>/<owner>/<project>/...`` these
            are the owner and the project.

    Returns:
        str: The two components joined by an underscore.
    """
    # Use the argument rather than the global `split_path` of the calling cell.
    return f"{path[4]}_{path[5]}"
    
def get_issue_json(path):
    """Return the parsed JSON stored at ``path``.

    Any ``Exception`` raised while opening or parsing the file results in
    ``None`` being returned instead.
    """
    try:
        with open(path, 'r') as handle:
            parsed = json.load(handle)
    except Exception:
        return None
    return parsed

project_data = {}
# Make sure the output directory exists before any CSV file is opened.
data_dir.joinpath('datasets-per-bot').mkdir(parents=True, exist_ok=True)
# NOTE(review): `filtered_bots_dataset` is not defined in any visible cell;
# presumably it comes from `helpers` or from earlier kernel state - confirm.
for bot in filtered_bots_dataset:
    cnt = 1
    dataset = []
    for path in bots_issues_dir.joinpath(bot['account']).rglob( '*json' ):
        split_path = str(path).split('/')
        issue_file = get_issue_json(path)
        # Files that could not be read or parsed (None), or that are empty, are skipped.
        if issue_file:
            owner_project = get_owner_project(split_path)
            if 'comments' in split_path:
                # A comments file: one row per comment body.
                for i, c in enumerate(issue_file):
                    row = create_dataset_row(bot['account'], c, text_section='body', is_comment = True, comment_number = i, owner_project = owner_project)
                    dataset.append(row)
            else:
                # An issue file: one row for the title and one for the body.
                row_1 = create_dataset_row(bot['account'], issue_file, text_section='title', is_comment=False, owner_project=owner_project)
                dataset.append(row_1)

                row_2 = create_dataset_row(bot['account'], issue_file, text_section='body', is_comment=False, owner_project=owner_project)
                dataset.append(row_2)

            cnt += 1
            clear_output()
            print(path)
            logging.info(f'Processing sectiong for {bot["account"]} ({cnt})...')

    if not dataset:
        # Without rows there is no header to derive; skip the bot instead of
        # failing on dataset[0] after the CSV file has already been truncated.
        logging.warning(f'No rows collected for {bot["account"]}; no csv file written.')
        continue

    # newline='' is what the csv module expects for files it writes to.
    with open(data_dir.joinpath('datasets-per-bot').joinpath(f"{bot['account']}.csv"), 'w', newline='') as file:
        clear_output()
        logging.info(f'Writing csv file for {bot["account"]}...')
        fields = dataset[0].keys()
        writer = csv.DictWriter(file, fieldnames = fields, escapechar='\\')
        writer.writeheader()
        writer.writerows(dataset)
    logging.info('Finished...')

## Step 6 - Generating dataset

#### Step 6.1 - Collecting the number of issue types per task

In [None]:
from IPython.display import clear_output
from minirig import load_csv_dataset, save_csv_dataset
import pandas as pd
import json
import os

def load_issues_per_bot(bot, issue_count):
    """Yield the directory of every stored issue of ``bot``.

    Walks ``bots_issues_dir/<bot>/<owner>/<project>/issues`` and yields one
    path per entry found there. ``issue_count`` is accepted but not used.

    Note: this replaces the function of the same name defined in Step 5, which
    yields the parsed issue content instead of the path alone.
    """
    bot_root = bots_issues_dir.joinpath(bot)
    for owner in os.listdir(bot_root):
        owner_root = bot_root.joinpath(owner)
        for project in os.listdir(owner_root):
            issues_root = owner_root.joinpath(project).joinpath('issues')
            yield from (issues_root.joinpath(name) for name in os.listdir(issues_root))


def _count_task(row, task, has_td):
    """Increment the '<task>-satd' or '<task>-non-satd' counter of ``row``."""
    row[f'{task}-satd' if has_td else f'{task}-non-satd'] += 1


bots_more_100_issues = [x for x in load_csv_dataset(bots_dataset_path) if int(x['issue_count']) >= 100]
bots_dataset_totals = []
issues_not_labeled = []
issue_error = []
comments_error = []
for bot_i in bots_more_100_issues:
    cnt = 1
    bot = bot_i['account']
    row = {"bot":bot,"n-issues-td":0, "n-issues-non-td": 0, "opened-satd": 0, "opened-non-satd": 0, "closed-satd": 0, "closed-non-satd": 0, "comments-satd": 0, "comments-non-satd": 0}
    for issue_path in load_issues_per_bot(bot, bot_i['issue_count']):
        # Issues whose file is missing, unparsable or has no 'state' are recorded and skipped.
        try:
            with open(issue_path.joinpath('json')) as f:
                issue = json.load(f)
            is_closed = issue['state'] == 'closed'
        except Exception:
            issue_error.append(f'{str(issue_path)}\n')
            continue

        # Issues without a label are recorded and skipped.
        try:
            has_td = issue['td-label-li2022-emse'] == 'SATD'
        except KeyError:
            issues_not_labeled.append(issue_path)
            continue

        if has_td:
            row['n-issues-td'] += 1
        else:
            row['n-issues-non-td'] += 1

        # Task 1: the bot opened the issue.
        if issue['user']['login'] == bot:
            _count_task(row, 'opened', has_td)

        # Task 2: the bot closed the issue.
        if is_closed and issue['closed_by'] is not None:
            if issue['closed_by']['login'] == bot:
                _count_task(row, 'closed', has_td)

        # Task 3: the bot commented on the issue. The context manager closes
        # the comments file, which previously stayed open for every issue.
        try:
            with open(issue_path.joinpath('comments').joinpath('json')) as f:
                comments = json.load(f)
        except Exception:
            comments_error.append(str(issue_path)+'\n')
            continue

        if isinstance(comments, list):
            # Counted once per issue, no matter how many comments the bot wrote.
            if any(c['user']['login'] == bot for c in comments):
                _count_task(row, 'comments', has_td)

        elif isinstance(comments, dict):
            if 'user' in comments.keys():
                if comments['user']['login'] == bot:
                    _count_task(row, 'comments', has_td)
            else:
                comments_error.append(str(issue_path)+'\n')
                continue

        else:
            comments_error.append(str(issue_path)+'\n')
            continue

        clear_output()
        cnt += 1
        print(bot,cnt,bot_i['issue_count'])
    bots_dataset_totals.append(row)
    # Saved after every bot so that partial results survive an interruption.
    save_csv_dataset(data_dir.joinpath('bots-dataset-tasks.csv'), bots_dataset_totals)

#### Step 6.2 - Normalizing the data

In [38]:
import pandas as pd
df = pd.read_csv(data_dir.joinpath('bots-dataset-tasks.csv'))

# Step 6.1 writes the issue totals as 'n-issues-td' / 'n-issues-non-td', while
# this cell used to read 'total-issues-td' / 'total-issues-non-td'. Prefer the
# names Step 6.1 produces and fall back to the former ones if only they exist.
td_column = 'n-issues-td' if 'n-issues-td' in df.columns else 'total-issues-td'
non_td_column = 'n-issues-non-td' if 'n-issues-non-td' in df.columns else 'total-issues-non-td'

total_issues_td = df[td_column].sum()
total_issues_non_td = df[non_td_column].sum()
total_issues = total_issues_td + total_issues_non_td
# Share of SATD / non-SATD issues over all bots.
proportion_td = total_issues_td / total_issues
proportion_non_td = total_issues_non_td / total_issues

tasks = ('opened', 'closed', 'comments')

# Per-bot normalisation: share of SATD / non-SATD issues within each task.
for task in tasks:
    task_total = df[f'{task}-satd'] + df[f'{task}-non-satd']
    df[f'{task}-satd-norm-bot'] = df[f'{task}-satd'] / task_total
    df[f'{task}-non-satd-norm-bot'] = df[f'{task}-non-satd'] / task_total

# Population normalisation: per-bot share relative to the overall proportion.
for task in tasks:
    df[f'{task}-satd-norm-pop'] = df[f'{task}-satd-norm-bot'] / proportion_td
    df[f'{task}-non-satd-norm-pop'] = df[f'{task}-non-satd-norm-bot'] / proportion_non_td

df = df.set_index('bot')
df.to_csv(data_dir.joinpath('final-dataset.csv'))