# Data Collection

## Set up logging

In [1]:
import logging
from ipylogging import DisplayHandler, HTMLFormatter

# Attach a DisplayHandler (formatted by HTMLFormatter) to the root logger.
# Re-running this cell reuses a DisplayHandler that is already attached
# instead of adding another one; stacking handlers would make every log
# record appear once per attached handler.
log = logging.getLogger()
handler = next((h for h in log.handlers if isinstance(h, DisplayHandler)), None)
if handler is None:
    handler = DisplayHandler()
    handler.setFormatter(HTMLFormatter())
    log.addHandler(handler)

# Records below INFO are dropped by the root logger.
log.setLevel(logging.INFO)

## Global variables

In [2]:
from pathlib import Path

# Locations of the on-disk artefacts, relative to the current working directory.
data_dir = Path('../data')
cache_dir_github = data_dir.joinpath('github')  # passed to GHRequests as cache_dir
bots_dataset_path = data_dir.joinpath('bots-dataset.csv')
bots_issues_dir = data_dir.joinpath('bots-issues')

# The GitHub token is the first line of ../gh-token.txt. The file is read
# inside a `with` block so the handle is closed right away, and an empty
# first line fails here with a clear message rather than later as an
# authentication error.
with open('../gh-token.txt', 'r') as token_file:
    github_token = token_file.readline().strip()
if not github_token:
    raise ValueError("../gh-token.txt does not contain a token on its first line")

## Step 1 - Collect bots from [Golzadeh et al.](https://zenodo.org/record/4000388)'s dataset

Download the ground-truth dataset from [Golzadeh et al.](https://zenodo.org/record/4000388) and decompress it into the data directory.

In [4]:
import gzip
import os
import shutil
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded; import it explicitly so this cell works on a fresh kernel.
import urllib.request

url_bot_dataset = "https://zenodo.org/record/4000388/files/groundtruthbots.csv.gz"
path_bot_dataset = data_dir.joinpath('groundtruthbots.csv')

# Make sure the target directory exists before writing into it.
data_dir.mkdir(parents=True, exist_ok=True)

# urlretrieve() without a target filename stores the download in a temporary
# file and returns its path.
gz_path, _ = urllib.request.urlretrieve(url_bot_dataset)
try:
    # Decompress in chunks instead of loading the whole file into memory.
    with gzip.open(gz_path, "rb") as f_in, open(path_bot_dataset, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
finally:
    # Remove the temporary file left behind by urlretrieve().
    urllib.request.urlcleanup()

Keep only the accounts labelled `Bot` and save them as the bots dataset.

In [5]:
from minirig import load_csv_dataset, save_csv_dataset

# Keep one {'account': ...} record per ground-truth row whose type is 'Bot',
# then write the filtered list to the bots dataset file.
bot_dataset = [
    {'account': entry['account']}
    for entry in load_csv_dataset(path_bot_dataset)
    if entry['type'] == 'Bot'
]
save_csv_dataset(bots_dataset_path, data=bot_dataset)

## Step 2 - Collect the number of issues per bot

In [None]:
from minirig import load_csv_dataset, save_csv_dataset
from minirig import GHRequests

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

# Annotate every bot with its issue count. A failed lookup is recorded as
# 'na' so that one failing account does not abort the whole run.
for bot in bot_dataset:
    try:
        bot['issue_count'] = gh_api.get_number_issues_involving_user(bot['account'])
    except Exception as exc:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so the cell can still be interrupted. The
        # failure is logged instead of being silently discarded.
        logging.warning("Issue count unavailable for %r: %s", bot, exc)
        bot['issue_count'] = 'na'

# Highest issue count first; 'na' entries are keyed as -1 so they sort last.
bot_dataset.sort(reverse=True, key=lambda x: -1 if x['issue_count'] == 'na' else x['issue_count'])
save_csv_dataset(bots_dataset_path, data=bot_dataset)

## Step 3 - Download issues for each bot

In [None]:
from IPython.display import clear_output
from minirig import GHRequests, load_csv_dataset
import pandas as pd
import json

bot_dataset = load_csv_dataset(bots_dataset_path)
gh_api = GHRequests(token=github_token,cache_dir=cache_dir_github)

# For every bot, store each issue returned by the API as
# <bots_issues_dir>/<account>/<issue path on github.com>/json.
for bot in bot_dataset:
    bot_issues_dir = bots_issues_dir.joinpath(bot['account'])
    for cnt, issue in enumerate(gh_api.get_issues_involving_user(bot['account']), start=1):
        # The issue's URL path (without the host prefix) is used as its directory.
        bot_issue_path = bot_issues_dir.joinpath(issue['html_url'].replace('https://github.com/',''))
        bot_issue_path.mkdir(parents=True, exist_ok=True)

        issue_json_path = bot_issue_path.joinpath('json')
        if not issue_json_path.exists():
            # Write to a temporary file and rename it afterwards. If the run
            # is interrupted mid-write, no truncated 'json' file is left
            # behind to be mistaken for a finished download on the next run.
            tmp_json_path = bot_issue_path.joinpath('json.tmp')
            with open(tmp_json_path, 'w') as f:
                json.dump(issue, f)
            tmp_json_path.replace(issue_json_path)

        # wait=True postpones clearing until the next output arrives, which
        # keeps the progress line from flickering.
        clear_output(wait=True)
        logging.info(f"{bot['account']}: {cnt} of {bot['issue_count']}")
