In [5]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import collections
import datetime
import time
from github import Github, RateLimitExceededException
import os
import itertools
import seaborn as sns

In [6]:
# build the GitHub API client from the personal access token stored in the
# GITHUB_ACCESS_TOKEN environment variable (None when the variable is unset)
github_token = os.environ.get('GITHUB_ACCESS_TOKEN')
g = Github(github_token)

In [17]:
# exactly one of USER or REPO must be set; the other must be the empty string

# username of person or organization
USER = 'thoth-station'
# OR  person-org-name/repo-name
REPO = ''

# fail fast on a bad configuration: with both empty the later repo lookup is
# attempted with an empty name, and with both set REPO would be silently ignored
if bool(USER) == bool(REPO):
    raise ValueError('set exactly one of USER or REPO and leave the other empty')

In [18]:
# collect the repositories to scan: the single REPO when no USER is given,
# otherwise every repository returned for USER
repos_ = []

if not USER:
    to_add = g.get_repo(REPO)
    repos_.append(to_add)
else:
    org = g.get_user(USER)
    repos = org.get_repos()
    # the listing is iterated here, which is what actually fetches the repos
    repos_.extend(repos)

In [19]:
def get_issues_for_repo(repo, sleep_seconds=60 * 61):
    """
    takes in a pygithub repo object and returns the list of pygithub issue objects

    Issues of every state are requested in ascending order; entries whose
    ``pull_request`` attribute is set are skipped. When the rate limit is hit,
    the function reports progress, sleeps and then resumes the listing from the
    newest creation date seen so far.

    Parameters
    ----------
    repo : pygithub repository object
        Must provide ``get_issues(**kwargs)`` and ``full_name``.
    sleep_seconds : int or float, optional
        How long to wait after a rate limit error before resuming
        (defaults to 61 minutes).

    Returns
    -------
    list
        The issue objects, each issue number appearing at most once.
    """
    issues = []
    # numbers already collected: GitHub's `since` filter works on the update
    # time, so a resumed listing can hand back issues we have stored already
    seen_numbers = set()
    # newest creation date seen so far; None until the first issue arrives
    max_date = None
    issues_left = True
    while issues_left:
        issues_left = False
        query = {'state': 'all', 'direction': 'asc'}
        if max_date is not None:
            # only narrow the listing when resuming after a rate limit
            query['since'] = max_date
        try:
            for issue in repo.get_issues(**query):
                # only ever move the resume point forward: a resumed listing
                # may start with older issues that were updated recently
                if max_date is None or issue.created_at > max_date:
                    max_date = issue.created_at
                if issue.pull_request is not None:
                    continue
                if issue.number in seen_numbers:
                    continue
                seen_numbers.add(issue.number)
                issues.append(issue)
        except RateLimitExceededException:
            # rate limit exception: report progress, wait, then resume
            print(len(issues), 'issues added for', repo.full_name)
            print('SLEEPING NOW')
            time.sleep(sleep_seconds)
            issues_left = True
    return issues

In [None]:
# fetch the issues of every selected repository, keyed by the repo's full name
all_issues = {}

if not USER:
    # single-repo mode: repos_ holds exactly the one requested repository
    repo = repos_[0]
    all_issues[repo.full_name] = get_issues_for_repo(repo)
else:
    print('getting issues for repos')
    for repo in tqdm(repos_):
        all_issues[repo.full_name] = get_issues_for_repo(repo)

# total number of issues collected across all repositories
total_issues = sum(len(repo_issues) for repo_issues in all_issues.values())
print(total_issues, 'total issues')

getting issues for repos


  0%|          | 0/161 [00:00<?, ?it/s]

87 issues added for {'thoth-station/slo-reporter'}
SLEEPING NOW


In [None]:
# flatten the collected issues into one row per issue and build a dataframe

issue_columns = ['id', 'repo', 'title', 'body', 'created_at', 'created_by', 'num_labels', 'labels']
issue_data = []

for repo, issues in tqdm(all_issues.items()):
    for issue in issues:
        labels = [label.name for label in issue.labels]
        # label names are stored tab-separated; issues without labels get NaN
        joined_labels = '\t'.join(labels) if labels else np.nan
        issue_data.append([
            '/'.join((repo, str(issue.number))),  # id: <repo full name>/<issue number>
            repo,
            issue.title,
            issue.body,
            issue.created_at,
            issue.user.login,
            len(labels),
            joined_labels,
        ])

issues_df = pd.DataFrame(issue_data, columns = issue_columns)

In [None]:
# peek at five randomly chosen rows of the issues dataframe
issues_df.sample(n = 5)

In [None]:
# count how often each label occurs and show the 20 most frequent ones

# the labels column holds tab-separated label names (NaN for unlabeled issues),
# so drop the NaNs and split every remaining entry back into single names
label_lst = [
    label_name
    for joined_labels in issues_df.labels.dropna()
    for label_name in joined_labels.split('\t')
]
counter = collections.Counter(label_lst)

counter.most_common(20)

In [None]:
# show the 20 accounts that opened the most issues

issues_df['created_by'].value_counts().head(20)

In [None]:
# view distribution of the number of labels per issue

# discrete=True gives every integer label count its own unit-wide bar centred
# on the value; a bin count derived from the maximum would merge the two
# largest counts into one bin and breaks when no issue has any label
ax = sns.histplot(issues_df.num_labels, discrete = True)
# pad by half a unit so the bars for 0 and 6 labels are drawn whole
ax.set_xlim((-0.5, 6.5))

In [None]:
# write the issues dataframe to a csv file named after the user, or after the
# repo with its '/' replaced so the name stays a single path component
SAVENAME = REPO.replace('/', '-_-') if not USER else USER
csv_path = os.path.join('../../data', SAVENAME + '.csv')
# the row index carries no information, so it is left out of the file
issues_df.to_csv(csv_path, index = False)