In [None]:
import json
import os
from collections import defaultdict

import regex as re
import requests

#from github import Github

## Collecting issues with commits 

In [None]:
# Load the raw issue dump. The file is opened in a `with` block so the handle
# is closed deterministically instead of being left to the garbage collector.
with open('drive/MyDrive/issues.json', 'r') as issues_file:
  issues = json.load(issues_file)

# Keep only the entries that do not carry a "pull_request" key.
non_prs = [issue for issue in issues if "pull_request" not in issue]

In [None]:
# Authorization header sent with every API request below.
# The token is taken from the GITHUB_TOKEN environment variable so that a real
# credential never has to be pasted into (and saved with) the notebook; when
# the variable is unset or empty the original '<token>' placeholder is used.
headers = {'Authorization': 'token ' + (os.environ.get('GITHUB_TOKEN') or '<token>')}

In [None]:
# For every non-PR issue, fetch its timeline and collect the commits referenced
# there. The result maps
#   issue number -> {'body': ..., 'commits': [[node_id, message, date, files], ...], 'labels': [...]}
# and only contains issues whose timeline has at least one item with a commit URL.
# NOTE(review): a single GET is issued per timeline, so only the first page of
# events is inspected if the endpoint paginates — confirm that is acceptable.
REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled connection blocks the whole crawl


def _fetch_json(url):
  """GET `url` with the auth headers and return the decoded JSON body.

  Returns None when the request fails (connection error, timeout, invalid
  URL) or when the response body is not valid JSON, so the caller can skip
  the record instead of aborting the crawl.
  """
  try:
    return requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).json()
  except (requests.RequestException, ValueError):
    # Deliberately narrower than a bare `except:` so Ctrl-C still interrupts.
    return None


issues_with_commits = {}
for issue in non_prs:
  timeline_url = issue.get('timeline_url')
  if not timeline_url:
    continue
  number = issue['number']
  labels = [t['name'] for t in (issue.get('labels') or [])]

  timeline = _fetch_json(timeline_url)
  if not isinstance(timeline, list):
    # Request failed, or the API answered with an error object instead of a
    # list of events; report the issue number and move on.
    print(number)
    continue

  has_commit = False
  commits = []
  for item in timeline:
    commit_url = item.get('commit_url') if isinstance(item, dict) else None
    if commit_url is None:
      continue
    has_commit = True
    commit = _fetch_json(commit_url)
    if commit is None:
      print(f'{number}: could not fetch {commit_url}')
      continue
    # Responses without a 'files' entry are skipped, but the issue still
    # counts as having a commit.
    if isinstance(commit, dict) and 'files' in commit:
      files = [f['filename'] for f in commit['files']]
      commits.append([
        commit['node_id'],
        commit['commit']['message'],
        commit['commit']['committer']['date'],
        files,
      ])

  if has_commit:
    issues_with_commits[number] = {
      'body': issue.get('body'),
      'commits': commits,
      'labels': labels,
    }

In [None]:
# Shallow copy into a plain dict (issue number -> record) for serialisation.
to_save = {number: record for number, record in issues_with_commits.items()}

In [None]:
# Export as JSON lines: one object per issue, with the issue number stored
# alongside the record's own fields.
with open('issues_with_commits.jsonl', 'w') as out_file:
  for number, record in to_save.items():
    out_file.write(json.dumps({'issue_number': number, **record}) + '\n')

## Pruning to bug-related issues

In [None]:
# Read the JSON-lines file into memory, one decoded record per line.
# NOTE(review): this opens 'issues.jsonl', whereas the export cell earlier in
# the notebook writes 'issues_with_commits.jsonl' — confirm this is the intended input.
with open('issues.jsonl', 'r') as jsonl_file:
  final = [json.loads(row) for row in jsonl_file]
print(len(final))

3206


In [None]:
# Numbers of all non-PR issues that carry a label named exactly 'bug'.
all_bugs = {
  entry['number']
  for entry in non_prs
  if 'bug' in [tag['name'] for tag in entry.get('labels', [])]
}

In [None]:
# Issue numbers of every record loaded into `final`.
# Note: this rebinds `issues_with_commits`, which earlier held the collected records.
issues_with_commits = {record['issue_number'] for record in final}

### Bug label

In [None]:
# Issues whose stored label list contains 'bug'.
issues_with_bug_label = {
  record['issue_number']
  for record in final
  if 'bug' in record['labels']
}
print(len(issues_with_bug_label))

1202


### "Fix" in commit

In [None]:
# Issues where at least one linked commit message contains "fixes #<digits>"
# (case-insensitive, because the messages are lower-cased before matching).
search_pattern = r'fixes #\d+'  # same for every issue, so defined once


def _joined_commit_messages(record):
  """Return all commit messages of `record`, lower-cased and newline-joined."""
  # Each commit entry stores its message at index 1.
  return "\n".join(entry[1].lower() for entry in record['commits'])


issues_with_fix_in_commit = {
  record['issue_number']
  for record in final
  if re.search(search_pattern, _joined_commit_messages(record))
}
print(len(issues_with_fix_in_commit))

2240


### Bug-related keywords in title/body (regex)

In [None]:
# Issues whose body or title contains one of the bug-related phrases below.
# NOTE(review): the spaces around each `|` are literal parts of the pattern, so
# e.g. "bug." or "broken," do not match while "debug " does. The pattern is
# left unchanged here to keep earlier counts reproducible — confirm whether
# word boundaries (\b) were intended.
search_pattern = r"bug | does not work | doesn't work | broken | not working | to reproduce"

issues_with_buggy_in_text = set()
for issue in final:
  # A body can be stored as null and a record may have no title at all;
  # treat both as empty text instead of crashing on `.lower()` / KeyError.
  body = issue.get("body") or ""
  title = issue.get("title") or ""
  if(re.search(search_pattern, body.lower()+" "+title.lower())):
    issues_with_buggy_in_text.add(issue["issue_number"])
print(len(issues_with_buggy_in_text))

373


In [None]:
# Every issue flagged by at least one of the three heuristics.
all_issues = issues_with_bug_label | issues_with_fix_in_commit | issues_with_buggy_in_text
print(len(all_issues))

2617


### Final dataset

In [None]:
# Tag each record with the heuristics that flagged it. The order of the tags
# is fixed: 'label', then 'title/body', then 'commit'.
# NOTE(review): every record of `final` is kept, including those whose
# 'source' list ends up empty — confirm whether unflagged issues should be dropped.
_source_sets = (
  ('label', issues_with_bug_label),
  ('title/body', issues_with_buggy_in_text),
  ('commit', issues_with_fix_in_commit),
)

final_dataset = []
for issue in final:
  issue_number = issue['issue_number']
  # The record is annotated in place, so `final` sees the new key as well.
  issue['source'] = [tag for tag, members in _source_sets if issue_number in members]
  final_dataset.append(issue)
print(len(final_dataset))

3206


In [None]:
# Persist the annotated records as JSON lines, one record per line.
with open('bug_issues.jsonl', 'w') as out_file:
  out_file.writelines(json.dumps(record) + '\n' for record in final_dataset)