# Data Collection

## Set up logging

In [1]:
import logging
from ipylogging import DisplayHandler, HTMLFormatter

# Route log records of the root logger to the notebook output area.
log = logging.getLogger()

# Re-running this cell must not stack a second DisplayHandler on the root
# logger, otherwise every record would be displayed more than once.
handler = next((h for h in log.handlers if isinstance(h, DisplayHandler)), None)
if handler is None:
    handler = DisplayHandler()
    handler.setFormatter(HTMLFormatter())
    log.addHandler(handler)

log.setLevel(logging.INFO)

## Global variables

In [2]:
import logging
import os
from pathlib import Path

# Locations shared by every step of the notebook.
data_dir = Path('../data')
cache_dir_github = data_dir.joinpath('github')
bots_dataset_path = data_dir.joinpath('bots-dataset.csv')
bots_issues_dir = data_dir.joinpath('bots-issues')


def read_github_token(token_file=Path('../gh-token.txt')):
    """Return the GitHub API token, or None when none is configured.

    The `GITHUB_TOKEN` environment variable takes precedence; otherwise the
    first line of `token_file` is used. Surrounding whitespace is stripped.
    """
    env_token = os.environ.get('GITHUB_TOKEN', '').strip()
    if env_token:
        return env_token
    if token_file.is_file():
        lines = token_file.read_text().splitlines()
        if lines and lines[0].strip():
            return lines[0].strip()
    return None


# Later cells pass `github_token` to GHRequests, so it has to be defined here.
github_token = read_github_token()
if github_token is None:
    logging.getLogger().warning(
        'No GitHub token found: set GITHUB_TOKEN or create ../gh-token.txt'
    )


## Step 1 - Collect bots from [Golzadeh et al.](https://zenodo.org/record/4000388)'s dataset

Download the ground-truth dataset published by [Golzadeh et al.](https://zenodo.org/record/4000388).

In [None]:
import urllib.request  # `import urllib` alone does not load the `request` submodule
import gzip
import os
import shutil

url_bot_dataset = "https://zenodo.org/record/4000388/files/groundtruthbots.csv.gz"
path_bot_dataset = data_dir.joinpath('groundtruthbots.csv')

# The output directory may not exist yet on a fresh checkout.
data_dir.mkdir(parents=True, exist_ok=True)

# urlretrieve without a target filename stores the archive in a temporary file.
gz_path, _ = urllib.request.urlretrieve(url_bot_dataset)
try:
    with gzip.open(gz_path, "rb") as f_in, open(path_bot_dataset, "wb") as f_out:
        # Stream the decompressed bytes instead of holding the whole file in memory.
        shutil.copyfileobj(f_in, f_out)
finally:
    # Remove the temporary download left behind by urlretrieve.
    urllib.request.urlcleanup()

Extract the accounts labelled as bots and save them to `bots-dataset.csv`

In [None]:
from minirig import load_csv_dataset, save_csv_dataset

# Keep only the rows whose `type` is 'Bot', reduced to their `account` field.
ground_truth_rows = load_csv_dataset(path_bot_dataset)

bot_dataset = []
for entry in ground_truth_rows:
    if entry['type'] != 'Bot':
        continue
    bot_dataset.append({'account': entry['account']})

save_csv_dataset(bots_dataset_path, data=bot_dataset)

## Step 2 - Collect the number of issues per bot

In [None]:
from minirig import load_csv_dataset, save_csv_dataset
from minirig import GHRequests

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

for bot in bot_dataset:
    try:
        bot['issue_count'] = gh_api.get_number_issues_involving_user(bot['account'], force=True)
    except Exception:
        # Best effort: keep going with the remaining bots, but leave a trace of
        # the failure. `Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        logging.warning(f"Could not get the issue count of {bot['account']}", exc_info=True)
        bot['issue_count'] = 'na'

# Largest issue count first; bots whose count could not be fetched ('na') go last.
bot_dataset.sort(reverse=True, key=lambda x: -1 if x['issue_count'] == 'na' else x['issue_count'])
# NOTE(review): this writes to new-bot-dataset.csv while the following steps read
# `bots_dataset_path` and expect an `issue_count` column there — confirm the intended target.
save_csv_dataset('../data/new-bot-dataset.csv', data=bot_dataset)

## Step 3 - Download issues for each bot

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

issues_errors = []

for bot in bot_dataset:
    cnt = 1  # progress counter, restarted for every bot
    for issue in gh_api.get_issues_involving_user(bot['account']):
        # The issue is stored under <bots_issues_dir>/<account>/<html_url without the host>.
        issue_rel_path = issue['html_url'].replace('https://github.com/','')
        bot_issue_path = bots_issues_dir / bot['account'] / issue_rel_path

        repo_slug = issue['repository_url'].replace('https://api.github.com/repos/', '')
        owner, project = repo_slug.split('/')

        issue_json_path = bot_issue_path / 'json'
        # Only fetch issues that have not been written to disk yet.
        if not issue_json_path.exists():
            full_issue = gh_api.get_issue_info(issue['number'], owner, project)
            bot_issue_path.mkdir(parents=True, exist_ok=True)
            with issue_json_path.open('w') as f:
                json.dump(full_issue, f)

        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {bot['issue_count']}")
        cnt += 1

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

# Same download as the previous cell, restricted to the 'vaadin-bot' account and
# calling the API with force=True.
vaadin_bot = [x for x in bot_dataset if x['account'] == 'vaadin-bot']

for bot in vaadin_bot:
    # Progress counter. It must be initialised here: the cell otherwise depends on
    # a `cnt` left over in the kernel by another cell.
    cnt = 1
    for issue in gh_api.get_issues_involving_user(bot['account'], force= True):

        bot_issue_path = bots_issues_dir.joinpath(bot['account']).joinpath(issue['html_url'].replace('https://github.com/',''))
        owner, project = issue['repository_url'].replace('https://api.github.com/repos/', '').split('/')

        # Only fetch issues that have not been written to disk yet.
        if not bot_issue_path.joinpath('json').exists():
            full_issue = gh_api.get_issue_info(issue['number'], owner, project)
            bot_issue_path.mkdir(parents=True, exist_ok=True)
            with open(bot_issue_path.joinpath('json'), 'w') as f:
                json.dump(full_issue, f)

        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {bot['issue_count']}")
        cnt += 1

## Step 4 - Download comments for each issue

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

issues_errors = []

for bot in bot_dataset:
    cnt = 1  # progress counter, restarted for every bot
    for issue in gh_api.get_issues_involving_user(bot['account']):
        # Comments are stored in a `comments` directory next to the issue's own files.
        issue_rel_path = issue['html_url'].replace('https://github.com/','')
        bot_issue_path = bots_issues_dir / bot['account'] / issue_rel_path / 'comments'
        bot_issue_path.mkdir(parents=True, exist_ok=True)

        repo_slug = issue['repository_url'].replace('https://api.github.com/repos/', '')
        owner, project = repo_slug.split('/')

        comments = gh_api.get_comments_per_issue(issue['number'], owner, project, force = True)
        with (bot_issue_path / 'json').open('w') as f:
            json.dump(comments, f)

        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {bot['issue_count']}")
        print(bot_issue_path)
        cnt += 1

## Step 6 - Labeling sections with [Li et al.](http://doi.org/10.1007/s10664-022-10128-3)'s Model 

In [3]:
# Imported explicitly: `nltk` was previously only reachable through the
# wildcard import below.
import nltk

# NOTE(review): the wildcard import is kept because other cells may rely on names
# it exposes; prefer importing Model1_IssueTracker_Li2022_ESEM by name once confirmed.
from model_li2022_emse import *

nltk.download('punkt')
model1_li2022_emse = Model1_IssueTracker_Li2022_ESEM(
    '../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-weight_file.hdf5',
    '../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-word_embedding_file.bin',
)

2023-05-10 19:27:42.477399: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/mambauser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading model ../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-weight_file.hdf5...




In [None]:
from IPython.display import clear_output
from minirig import load_csv_dataset
import pandas as pd
import json
import os

# Management sheet with one row per bot. Its `label_finished` column is set to
# 'yes' once all issues of a bot are labelled, so those bots are skipped on re-runs.
labeling_management_path = '../data/bots-dataset-labeling-management.csv'
label_key = 'td-label-li2022-emse'

# Issue files whose JSON could not be parsed; reported once the loop is done
# because clear_output() wipes anything displayed in between.
unreadable_issue_files = []

bots_dataset = pd.read_csv(labeling_management_path, delimiter=';')
for i in bots_dataset.index:
    cnt = 1
    if bots_dataset.loc[i, 'label_finished'] == 'yes':
        continue
    bot = bots_dataset.loc[i, 'account']
    bot_dir = bots_issues_dir.joinpath(bot)
    # Directory layout walked here: <bot>/<owner>/<project>/issues/<issue>/
    for owner in os.listdir(bot_dir):
        for project in os.listdir(bot_dir.joinpath(owner)):
            issues_dir = bot_dir.joinpath(owner, project, 'issues')
            for issue in os.listdir(issues_dir):
                issue_path = issues_dir.joinpath(issue)
                issue_json_path = issue_path.joinpath('json')
                comments_json_path = issue_path.joinpath('comments', 'json')

                try:
                    with open(issue_json_path) as f:
                        issue_file = json.load(f)
                except ValueError:
                    # Unparseable JSON: do not fall through with the previous
                    # iteration's `issue_file`, which would then be written over this file.
                    issue_file = None
                    unreadable_issue_files.append(issue_json_path)

                # Label the issue body unless a label is already stored.
                if issue_file is not None and label_key not in issue_file:
                    issue_file[label_key] = model1_li2022_emse.label(issue_file['body'])

                    with open(issue_json_path, 'w') as f:
                        json.dump(issue_file, f)

                with open(comments_json_path) as f:
                    comments = json.load(f)

                # Label every comment that has no label yet.
                for c in comments:
                    if label_key not in c:
                        c[label_key] = model1_li2022_emse.label(c['body'])

                with open(comments_json_path, 'w') as f:
                    json.dump(comments, f)

                clear_output()
                logging.info(f'{bot}: {cnt}/{bots_dataset["issue_count"][i]}')
                cnt += 1
    # .loc avoids chained assignment, which may write to a temporary copy.
    bots_dataset.loc[i, 'label_finished'] = 'yes'
    # `sep` is the to_csv keyword (`delimiter` is only accepted by read_csv);
    # index=False keeps the file free of an extra index column when it is read back.
    bots_dataset.to_csv(labeling_management_path, sep=';', index=False)

if unreadable_issue_files:
    logging.warning(f'Skipped {len(unreadable_issue_files)} unreadable issue files: {unreadable_issue_files}')



## Step 7 - Labeling sections with [Li et al.](http://doi.org/10.1109/TSE.2022.3224378)'s Model 


In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json
from model_li2022_tse import *

# Classifier used by the next cell; built from the two files under ../model.
v = Model1_IssueTracker_Li2022_TSE(
    '../model/satd-issue_mul.hdf5',
    '../model/fasttext_issue_300.bin',
)

# Dataset whose `text` column is classified in the next cell.
issues_dataset = pd.read_csv(
    '../data/dataset-backup.csv',
)

In [None]:
# '-' is the placeholder for rows that have not been classified yet.
issues_dataset['td-label-li-tse'] = '-'
len_dataset = len(issues_dataset)
for j, i in enumerate(issues_dataset.index, start=1):
    # Write into the column initialised above. `.at` sets a single cell and
    # avoids chained assignment.
    issues_dataset.at[i, 'td-label-li-tse'] = v.classify_prob_comment(issues_dataset.at[i, 'text'])
    clear_output()
    # Report the position in the loop rather than the index label.
    log.info(f'Handling {j} out {len_dataset} lines')