# Data Collection

## Set up logging

In [1]:
import logging
from ipylogging import DisplayHandler, HTMLFormatter

# Configure the root logger for this notebook.
# The DisplayHandler is attached only once: re-running this cell would otherwise
# register one more handler each time, and every log record would then be
# emitted once per registered handler.
log = logging.getLogger()
handler = next((h for h in log.handlers if isinstance(h, DisplayHandler)), None)
if handler is None:
    handler = DisplayHandler()
    handler.setFormatter(HTMLFormatter())
    log.addHandler(handler)
log.setLevel(logging.INFO)

## Global variables

In [3]:
import os
from pathlib import Path

# Locations shared by every step of the notebook (relative to the notebook's
# working directory).
data_dir = Path('../data')
cache_dir_github = data_dir.joinpath('github')
bots_dataset_path = data_dir.joinpath('bots-dataset.csv')
bots_issues_dir = data_dir.joinpath('bots-issues')


def _load_github_token(token_file=Path('../gh-token.txt')):
    """Return the GitHub token used by the collection steps, or None.

    The GITHUB_TOKEN environment variable takes precedence; otherwise the first
    line of `token_file` is used when that file exists and is not empty.
    Surrounding whitespace is stripped in both cases.
    """
    token = os.environ.get('GITHUB_TOKEN', '').strip()
    if token:
        return token
    if token_file.is_file():
        # read_text() opens and closes the file, so no handle is leaked.
        lines = token_file.read_text().splitlines()
        if lines and lines[0].strip():
            return lines[0].strip()
    return None


# Later cells pass `github_token` to GHRequests, so the name must always be bound.
# NOTE(review): how GHRequests behaves with token=None is not visible here — confirm.
github_token = _load_github_token()

## Step 1 - Collect bots from [Golzadeh et al.](https://zenodo.org/record/4000388)'s dataset

Download the ground-truth dataset published by [Golzadeh et al.](https://zenodo.org/record/4000388) and decompress it into the data directory.

In [None]:
import gzip
import os
import shutil
import urllib.request  # a bare `import urllib` does not load the `request` submodule

url_bot_dataset = "https://zenodo.org/record/4000388/files/groundtruthbots.csv.gz"
path_bot_dataset = data_dir.joinpath('groundtruthbots.csv')

# Make sure the target directory exists before writing into it.
path_bot_dataset.parent.mkdir(parents=True, exist_ok=True)

# urlretrieve stores the archive in a temporary file and returns its path.
gz_path, _ = urllib.request.urlretrieve(url_bot_dataset)
try:
    with gzip.open(gz_path, "rb") as f_in, open(path_bot_dataset, "wb") as f_out:
        # Stream the decompressed bytes instead of loading the whole file in memory.
        shutil.copyfileobj(f_in, f_out)
finally:
    # Remove the temporary file left behind by urlretrieve.
    urllib.request.urlcleanup()

Extract the accounts labeled as bots and save them to the bots dataset file.

In [None]:
from minirig import load_csv_dataset, save_csv_dataset

# Keep only the rows whose type is 'Bot', retaining just the account name,
# and store the result as the bots dataset.
ground_truth_rows = load_csv_dataset(path_bot_dataset)
bot_dataset = []
for row in ground_truth_rows:
    if row['type'] == 'Bot':
        bot_dataset.append({'account': row['account']})
save_csv_dataset(bots_dataset_path, data=bot_dataset)

## Step 2 - Collect the number of issues per bot

In [None]:
import logging

from minirig import load_csv_dataset, save_csv_dataset
from minirig import GHRequests

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

# Annotate every bot with the number of issues that involve it.
for bot in bot_dataset:
    try:
        bot['issue_count'] = gh_api.get_number_issues_involving_user(bot['account'])
    except Exception as exc:
        # Best effort: a failing lookup is recorded as 'na' so the remaining bots
        # are still processed. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt working, so the loop can still be stopped by hand.
        logging.warning("Could not get the issue count for %s: %s", bot['account'], exc)
        bot['issue_count'] = 'na'

# Most active bots first; bots without a count ('na') sort last.
bot_dataset.sort(reverse=True, key=lambda x: -1 if x['issue_count'] == 'na' else x['issue_count'])
save_csv_dataset(bots_dataset_path, data=bot_dataset)

## Step 3 - Download issues for each bot

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

issues_errors = []

# Store the full JSON of every issue involving each bot under
# <bots_issues_dir>/<account>/<owner>/<project>/issues/<number>/json.
for bot in bot_dataset:
    for cnt, issue in enumerate(gh_api.get_issues_involving_user(bot['account']), start=1):
        # `bots_issues_dir` is the directory defined with the global variables
        # (the previous spelling `bot_issues_dir` was never defined).
        bot_issue_path = bots_issues_dir.joinpath(bot['account']).joinpath(issue['html_url'].replace('https://github.com/',''))
        bot_issue_path.mkdir(parents=True, exist_ok=True)
        issue_file = bot_issue_path.joinpath('json')
        if not issue_file.exists():
            # Only query the API for issues that have not been stored yet.
            owner, project = issue['repository_url'].replace('https://api.github.com/repos/', '').split('/')
            full_issue = gh_api.get_issue_info(issue['number'], owner, project)
            with open(issue_file, 'w') as f:
                json.dump(full_issue, f)

        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {bot['issue_count']}")

## Step 4 - Download comments for each issue

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token, cache_dir=cache_dir_github)

issues_errors = []

# For every issue involving a bot, fetch its comments and write them to
# <issue directory>/comments/json.
for bot in bot_dataset:
    account = bot['account']
    for cnt, issue in enumerate(gh_api.get_issues_involving_user(account), start=1):
        issue_rel_path = issue['html_url'].replace('https://github.com/', '')
        comments_dir = bots_issues_dir / account / issue_rel_path / 'comments'
        comments_dir.mkdir(parents=True, exist_ok=True)

        repo_slug = issue['repository_url'].replace('https://api.github.com/repos/', '')
        owner, project = repo_slug.split('/')
        comments = gh_api.get_comments_per_issue(issue['number'], owner, project, force=True)

        with open(comments_dir / 'json', 'w') as out_file:
            json.dump(comments, out_file)

        clear_output()
        logging.info(f"{bot['account']}: {cnt} of {bot['issue_count']}")
        print(comments_dir)

## Step 5 - Creating Dataset

In [30]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset, save_csv_dataset
import pandas as pd
import json
from helpers import *
    
bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token, cache_dir=cache_dir_github)
dataset_name = 'dataset-backup.csv'
dataset = []

# Column order passed to save_csv_dataset below.
headers = [
    'bot',
    'owner',
    'project',
    'issue',
    'text',
    'type',
    'author-login',
    'open-date',
    'state',
    'close-date',
    'closed-by',
    'n-comments',
    'td-label',
]

# One row for each issue body plus one row per comment, read from the JSON
# files written by the download steps.
for bot in bot_dataset:
    account = bot['account']
    cnt = 1
    for issue in gh_api.get_issues_involving_user(account):
        repo_slug = issue['repository_url'].replace('https://api.github.com/repos/', '')
        owner, project = repo_slug.split('/')
        issue_path = bots_issues_dir.joinpath(f"{bot['account']}/{owner}/{project}/issues/{issue['number']}")

        with open(issue_path / 'json', 'r') as issue_file:
            issue_json = json.load(issue_file)
        rows = [
            create_dataset_row(account, issue_json, text_section='body', is_comment=False,
                               comment_number=None, owner=owner, project=project)
        ]

        # The comments file is only read for issues that report at least one comment.
        if issue['comments'] > 0:
            with open(issue_path / 'comments' / 'json', 'r') as comments_file:
                comments = json.load(comments_file)
            rows.extend(
                create_dataset_row(account, comment, is_comment=True, comment_number=i,
                                   owner=owner, project=project)
                for i, comment in enumerate(comments)
            )

        dataset.extend(rows)
        cnt += 1
    # The whole accumulated dataset is rewritten after each bot has been processed.
    save_csv_dataset(data=dataset, header=headers, filename=data_dir.joinpath(dataset_name))

NameError: name 'bot_issues_dir' is not defined

## Step 6 - Labeling sections with [Li et al.](http://doi.org/10.1007/s10664-022-10128-3)'s Model 

In [31]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json
from model_li2022_emse import *

# Files handed to the Li et al. (EMSE 2022) issue-tracker model.
li2022_emse_weight_file = '../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-weight_file.hdf5'
li2022_emse_word_embedding_file = '../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-word_embedding_file.bin'
model1_li2022_emse = Model1_IssueTracker_Li2022_ESEM(li2022_emse_weight_file, li2022_emse_word_embedding_file)

# Dataset to be labeled.
issues_dataset = pd.read_csv('../data/dataset-backup.csv')

Loading model ../model/mode1-issue-tracker-li2022-emse/model1-issue-tracker-li2022-esem-weight_file.hdf5...




In [33]:
# Label every row of the dataset with the EMSE model and save the result.
# NOTE(review): 'td-label-li-emse' is initialised here but the predictions are
# stored in 'td-label' — confirm which column is meant to receive them.
issues_dataset['td-label-li-emse'] = '-'
len_dataset = len(issues_dataset)
for j, i in enumerate(issues_dataset.index):
    # Assign through .at: the chained form `df[col][i] = value` can write to a
    # temporary object and leave the DataFrame itself unchanged.
    issues_dataset.at[i, 'td-label'] = model1_li2022_emse.predict(str(issues_dataset.at[i, 'text']))
    clear_output()
    log.info(f'Handling {i+1} out {len_dataset} lines')
issues_dataset.to_csv('../data/dataset-labeled.csv')

Traceback (most recent call last):
  File "C:\tools\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\joao\AppData\Local\Temp\ipykernel_11548\2361851197.py", line 4, in <module>
    issues_dataset['td-label'][i] = model1_li2022_emse.predict(str(issues_dataset['text'][i]))
  File "C:\Users\joao\Documents\bots-td-daniel-version\scripts\model_li2022_emse.py", line 84, in predict
    y_pred = self._model.predict(input_x)
  File "C:\Users\joao\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "C:\Users\joao\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2317, in predict
    data_handler = data_adapter.get_data_handler(
  File "C:\Users\joao\AppData\Roaming\Python\Python39\site-packages\keras\engine\data_adapter.py", line 1579, in get_data_handler
    return DataHa

TypeError: object of type 'NoneType' has no len()

## Step 7 - Labeling sections with [Li et al.](http://doi.org/10.1109/TSE.2022.3224378)'s Model 


In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json
from model_li2022_tse import *

# Files handed to the Li et al. (TSE 2022) issue-tracker model.
li2022_tse_model_file = '../model/satd-issue_mul.hdf5'
li2022_tse_embedding_file = '../model/fasttext_issue_300.bin'
v = Model1_IssueTracker_Li2022_TSE(li2022_tse_model_file, li2022_tse_embedding_file)

# Dataset to be labeled.
issues_dataset = pd.read_csv('../data/dataset-backup.csv')

In [None]:
# Label every row of the dataset with the TSE model.
# NOTE(review): 'td-label-li-tse' is initialised here but the predictions are
# stored in 'td-label' — confirm which column is meant to receive them.
issues_dataset['td-label-li-tse'] = '-'
len_dataset = len(issues_dataset)
for j, i in enumerate(issues_dataset.index):
    # Assign through .at: the chained form `df[col][i] = value` can write to a
    # temporary object and leave the DataFrame itself unchanged.
    issues_dataset.at[i, 'td-label'] = v.classify_prob_comment(issues_dataset.at[i, 'text'])
    clear_output()
    log.info(f'Handling {i+1} out {len_dataset} lines')