In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import collections
import datetime
import time
from github import Github
import os
import itertools
from github import RateLimitExceededException
from sklearn.model_selection import KFold
import seaborn as sns

In [2]:
# GitHub client used by every cell below; the access token is read from the environment
g = Github(os.environ.get('GITHUB_ACCESS_TOKEN'))

In [3]:
# Exactly one of USER or REPO must be set; the other must be the empty string.

# username of a person or organization (all of its repos are collected)
USER = 'thoth-station'
# OR a single repository, written as person-org-name/repo-name
REPO = ''

# Fail fast on an ambiguous or empty configuration instead of silently
# preferring USER (both set) or requesting a repo with an empty name (neither set).
if bool(USER) == bool(REPO):
    raise ValueError('exactly one of USER or REPO must be non-empty')

In [4]:
# collect the repos to process: every repo of USER, or only the single REPO
if USER:
    org = g.get_user(USER)
    repos = org.get_repos()
    repos_ = [repo for repo in repos]
else:
    to_add = g.get_repo(REPO)
    repos_ = [to_add]

In [5]:
def get_issues_for_repo(repo, retry_wait_seconds=60 * 61):
    """
    takes in a pygithub repo object and returns the list of pygithub issue objects

    Entries whose ``pull_request`` attribute is set are skipped. When a
    RateLimitExceededException is raised while listing, the function sleeps and
    then resumes the listing from the creation time of the last entry it saw.

    Parameters
    ----------
    repo : pygithub repo object
        must provide ``get_issues(...)`` and ``full_name``.
    retry_wait_seconds : int or float, optional
        how long to sleep after a rate limit error before resuming;
        defaults to 61 minutes.

    Returns
    -------
    list
        the repo's non-pull-request issues in the order they were listed,
        each issue appearing exactly once.
    """
    issues = []
    # issue numbers already collected; needed because a resumed listing can
    # return entries that were handed out before the rate limit was hit
    seen_numbers = set()
    # creation time of the last entry seen; None until the first entry arrives
    resume_from = None
    while True:
        query = {'state': 'all', 'direction': 'asc'}
        if resume_from is not None:
            # GitHub's `since` filters on the last *update* time, so this also
            # brings back older issues that were updated later; those are
            # dropped below through seen_numbers.
            query['since'] = resume_from
        try:
            for issue in repo.get_issues(**query):
                resume_from = issue.created_at
                if issue.pull_request is not None:
                    continue
                if issue.number in seen_numbers:
                    continue
                seen_numbers.add(issue.number)
                issues.append(issue)
        except RateLimitExceededException:
            # rate limit exception: wait, then loop around and resume the listing
            print(len(issues), 'issues added for', repo.full_name)
            print('SLEEPING NOW')
            time.sleep(retry_wait_seconds)
        else:
            # the listing was exhausted without hitting the rate limit
            return issues

In [6]:
# map each repo's full name to the list of its issues
all_issues = {}

if not USER:
    # single-repo mode: repos_ holds exactly the one requested repo
    repo = repos_[0]
    all_issues[repo.full_name] = get_issues_for_repo(repo)
else:
    print('getting issues for repos')
    for repo in tqdm(repos_):
        all_issues[repo.full_name] = get_issues_for_repo(repo)

total_issues = sum(len(repo_issues) for repo_issues in all_issues.values())
print(total_issues, 'total issues')

getting issues for repos


  0%|          | 0/160 [00:00<?, ?it/s]

84 issues added for {'thoth-station/solver'}
SLEEPING NOW
6173 total issues


In [8]:
# build one row per issue for the issues dataframe
# and one row per (issue, label) pair for the labels dataframe

issue_data = []
label_data = []

for repo, issues in tqdm(all_issues.items()):
    for issue in issues:
        # '<repo full name>/<issue number>' is the id shared by both dataframes
        issue_id = f'{repo}/{issue.number}'
        issue_labels = issue.labels
        issue_data.append([
            issue_id,
            repo,
            issue.title,
            issue.body,
            issue.created_at,
            issue.user.login,
            len(issue_labels),
        ])
        label_data.extend([issue_id, label.name] for label in issue_labels)

issues_df = pd.DataFrame(
    issue_data,
    columns = ['id', 'repo', 'title', 'body', 'created_at', 'created_by', 'num_labels'],
)
labels_df = pd.DataFrame(
    label_data,
    columns = ['id', 'name'],
)

  0%|          | 0/160 [00:00<?, ?it/s]

In [9]:
# preview five randomly chosen rows of the issues dataframe
issues_df.sample(n = 5)

Unnamed: 0,id,repo,title,body,created_at,created_by,num_labels
2694,thoth-station/management-api/730,thoth-station/management-api,Provide a sync endpoint,**Is your feature request related to a problem...,2021-03-08 16:09:39,fridex,2
5505,thoth-station/thoth-application/659,thoth-station/thoth-application,FileNotFoundError: No source distribution foun...,**Describe the bug**\r\nFileNotFoundError: No ...,2020-11-18 15:15:39,pacospace,2
1471,thoth-station/graph-sync-job/64,thoth-station/graph-sync-job,Failed to update dependencies to their latest ...,\nAutomatic dependency update failed for the c...,2018-09-15 21:26:02,sesheta,1
3540,thoth-station/package-releases-job/477,thoth-station/package-releases-job,New patch release,"Hey, Kebechet!\r\n\r\nCreate a new patch relea...",2020-09-16 17:07:36,pacospace,1
3670,thoth-station/python/28,thoth-station/python,New minor release,,2018-12-11 18:16:25,fridex,0


In [10]:
# preview five randomly chosen rows of the labels dataframe
labels_df.sample(n = 5)

Unnamed: 0,id,name
303,thoth-station/adviser/1926,kind/feature
2944,thoth-station/package-extract/102,bot
4681,thoth-station/thoth-application/726,sig/pipelines
5410,thoth-station/user-api/1295,bot
3687,thoth-station/slo-reporter/159,bot


In [None]:
# the 20 most frequently applied labels, most common first

labels_df['name'].value_counts().head(20)

In [None]:
# the 20 users who opened the most issues, most active first

issues_df['created_by'].value_counts().head(20)

In [None]:
# view distribution of the number of labels per issue

# discrete=True draws one unit-wide bar per integer label count, centered on
# that count. The previous bins=max(num_labels) asked for 0 bins when no issue
# had a label and put the two highest counts into the same bin.
ax = sns.histplot(issues_df.num_labels, discrete = True)
ax.set(xlabel = 'labels per issue', ylabel = 'number of issues')
# bars are centered on integers, so pad by half a bar to show counts 0 through 6 in full
ax.set_xlim((-0.5, 6.5));

In [11]:
# save both dataframes as CSV files named after the scraped user or repo
# ('/' in a repo name is replaced so the name is a valid single file name)
SAVENAME = USER if USER else REPO.replace('/', '-_-')
DATA_DIR = '../../data'
# create the output directory up front so a long scrape is not lost to a missing folder
os.makedirs(DATA_DIR, exist_ok = True)
# index = False: the positional row index carries no information and is not written
issues_df.to_csv(os.path.join(DATA_DIR, SAVENAME + '-issues.csv'), index = False)
labels_df.to_csv(os.path.join(DATA_DIR, SAVENAME + '-labels.csv'), index = False)