In [4]:
import pandas as pd
import numpy as np
import collections
import datetime
import time
from github import Github, RateLimitExceededException
import os
import itertools
import seaborn as sns
import io
from dotenv import load_dotenv, find_dotenv
import boto3

In [None]:
# Locate a .env file and load the variables it defines before any
# os.getenv / os.environ lookups below.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

In [None]:
# GitHub API client built from the GITHUB_ACCESS_TOKEN environment variable
# (os.getenv yields None when the variable is unset).
access_token = os.getenv('GITHUB_ACCESS_TOKEN')
g = Github(access_token)

In [None]:
# REPO_NAME selects what to mine: a value containing '/' ("owner/repo") is
# treated as a single repository, anything else as a user/organization login.
name = os.getenv('REPO_NAME')

# os.getenv returns None for an unset variable, which would otherwise surface
# below as an opaque TypeError; an empty value would leave both USER and REPO
# blank. Fail early with an actionable message instead.
if not name:
    raise ValueError(
        "REPO_NAME is not set; expected 'owner/repo' or a user/organization login"
    )

# Exactly one of REPO / USER is non-empty after this point.
if '/' in name:
    REPO = name
    USER = ''
else:
    USER = name
    REPO = ''

In [None]:
# Storage switch: when True, an S3 client for the ceph object store is created
# from connection settings held in environment variables.

use_ceph = True

if use_ceph:
    # Connection settings; a missing variable raises KeyError right here.
    s3_endpoint_url = os.environ["OBJECT_STORAGE_ENDPOINT_URL"]
    s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
    s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
    s3_bucket = os.environ["OBJECT_STORAGE_BUCKET_NAME"]

    s3_client_options = {
        "service_name": "s3",
        "aws_access_key_id": s3_access_key,
        "aws_secret_access_key": s3_secret_key,
        "endpoint_url": s3_endpoint_url,
    }
    s3 = boto3.client(**s3_client_options)

In [None]:
# Build the list of repositories to process: every repository returned for
# USER when a login was given, otherwise just the single REPO.
if USER:
    org = g.get_user(USER)
    repos = org.get_repos()
    repos_ = [repo for repo in repos]
else:
    to_add = g.get_repo(REPO)
    repos_ = [to_add]

In [None]:
def get_issues_for_repo(repo, sleep_seconds=60*61):
    """
    takes in a pygithub repo object and returns the list of pygithub issue objects

    Pull requests (entries whose ``pull_request`` attribute is not None) are
    left out. Issues are requested in ascending order; when the API rate limit
    is hit the function sleeps and then resumes from the creation date of the
    last new issue it saw.

    Parameters
    ----------
    repo : object exposing ``get_issues(...)`` and ``full_name``
    sleep_seconds : number, optional
        seconds to wait after a RateLimitExceededException before retrying
        (defaults to 61 minutes)

    Returns
    -------
    list of issue objects; each issue number appears at most once
    """
    issues = []
    # GitHub's `since` filter matches on last-update time, so a resumed listing
    # can return issues that were already processed; remember their numbers so
    # a retry never appends the same issue twice.
    seen_numbers = set()
    resume_from = None
    while True:
        query = {'state': 'all', 'direction': 'asc'}
        # Only send `since` when resuming; the first pass is unbounded.
        if resume_from is not None:
            query['since'] = resume_from
        try:
            for issue in repo.get_issues(**query):
                if issue.number in seen_numbers:
                    continue
                seen_numbers.add(issue.number)
                # Advance the resume point only on unseen entries so it never
                # moves backwards while replaying older, recently-updated ones.
                resume_from = issue.created_at
                if issue.pull_request is None:
                    issues.append(issue)
            return issues
        except RateLimitExceededException:
            # rate limit exception: report progress, wait, then resume
            print(len(issues), 'issues added for', repo.full_name)
            print(f'SLEEPING NOW FOR {sleep_seconds} SECONDS')
            time.sleep(sleep_seconds)

In [None]:
# Collect the issues of every selected repository, keyed by the repository's
# full name.
all_issues = {}

if USER:
    print('getting issues for repos')
    targets = repos_
else:
    # single-repository mode: only the first (and only) entry is used
    targets = [repos_[0]]

for repo in targets:
    all_issues[repo.full_name] = get_issues_for_repo(repo)

total_issues = sum(map(len, all_issues.values()))
print(total_issues, 'total issues')

In [None]:
# create a dataframe for issues: one row per issue, with its labels stored as
# a tab-separated string (NaN when the issue has no labels)

issue_data = []

for repo, issues in all_issues.items():
    for issue in issues:
        label_names = [lbl.name for lbl in issue.labels]
        # Count the labels BEFORE joining: len() of the joined string would be
        # a character count, and len(np.nan) raises TypeError for unlabeled issues.
        num_labels = len(label_names)
        labels = '\t'.join(label_names) if label_names else np.nan
        # id is "<repo full name>/<issue number>"
        id_ = repo + '/' + str(issue.number)
        data = [id_, repo, issue.title, issue.body, issue.created_at, issue.user.login, num_labels, labels]
        issue_data.append(data)
cols = ['id', 'repo', 'title', 'body', 'created_at', 'created_by', 'num_labels', 'labels']
issues_df = pd.DataFrame(issue_data, columns = cols)

In [None]:
# show a few random rows of the issues dataframe (at most 5: asking for more
# rows than the frame holds raises ValueError, so cap at the frame length)
issues_df.sample(min(5, len(issues_df)))

In [None]:
# view most common labels: split each tab-separated label string back into
# individual names (rows without labels are dropped) and tally them

label_lst = [
    label
    for joined in issues_df.labels.dropna()
    for label in joined.split('\t')
]
counter = collections.Counter(label_lst)

counter.most_common(20)

In [None]:
# view most common issue creators (top 20 by number of issues opened)

creator_counts = issues_df.created_by.value_counts()
creator_counts.iloc[:20]

In [None]:
# view distribution of the number of labels per issue

# A histogram needs at least one bin; the largest count is 0 when no issue
# carries a label, so clamp the bin count to a minimum of 1.
num_bins = max(1, int(max(issues_df.num_labels)))
ax = sns.histplot(issues_df.num_labels, bins = num_bins)
# only the 0-6 range is displayed
ax.set_xlim((0,6))

In [None]:
# save dataframe
# File name: the user login, or the repo name with '/' replaced by '-_-'.
savename = USER if USER else REPO.replace('/', '-_-')

if use_ceph:
    # Serialize to an in-memory CSV and upload it under data/<savename>.csv.
    with io.StringIO() as csv_buffer:
        issues_df.to_csv(csv_buffer, index=False)

        response = s3.put_object(
            Bucket=s3_bucket, Key=f'data/{savename}.csv', Body=csv_buffer.getvalue()
        )

# NOTE(review): the local copy is written even when use_ceph is True, although
# the config comment reads "use ceph or store locally" - confirm whether this
# was meant to be an else branch.
local_dir = '../../data'
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(local_dir, exist_ok=True)
issues_df.to_csv(os.path.join(local_dir, savename + '.csv'), index=False)