In [1]:
import glob 
import json 
import regex as re

In [2]:
# Build repo_map: repo name -> {issue number: [title, body, created_at, closed_at]}
# from every JSON dump found under all_issues/.
all_issue_files = glob.glob('all_issues/*.json')
repo_map = {}
for file in all_issue_files:
    # Repo name = text after the last '_' in the part of the path that precedes
    # the final extension (e.g. 'all_issues/x_pandas.json' -> 'pandas').
    name = file.split('.')[-2].split('_')[-1]
    # Context manager closes the handle right away instead of leaking it.
    with open(file, 'r') as f:
        issues = json.load(f)

    all_map = {
        issue["number"]: [
            issue["title"],
            issue["body"],
            issue['created_at'],
            issue['closed_at'],
        ]
        for issue in issues
    }

    # Several files can resolve to the same repo name: merge them, with later
    # files overriding earlier ones on duplicate issue numbers.
    repo_map.setdefault(name, {}).update(all_map)

In [4]:
# Report how many issues were loaded per repo (youtube-dl excluded) and overall.
# The accumulator is deliberately not called `sum`, so the builtin stays usable.
total_issues = 0
for repo, issues_by_number in repo_map.items():
    if repo == 'youtube-dl':
        continue
    print(repo, len(issues_by_number))
    total_issues += len(issues_by_number)
print(total_issues)

scikit-learn 22468
ipython 12262
pandas 46952
ansible 77728
159410


In [120]:
# Accumulator filled by the cells below: repo name -> {issue number: issue record}.
final_data = {}

## Issues 

In [121]:
# Load the issue dumps (one JSON object per line) into
# final_data[repo][issue_number], keeping only issues that have a body.
relevant_files = glob.glob("misc/issues/*.jsonl")
issue_total = 0  # not named `sum`, so the builtin is not shadowed
for file in relevant_files:
    # 'misc/issues/<repo>.jsonl' -> '<repo>'
    name = file.split('.')[-2].split('/')[-1]
    if name == 'youtube-dl':
        continue
    print(name)
    final_data[name] = {}
    with open(file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            issue_entry = json.loads(line)
            issue_number = issue_entry['issue_number']
            # Identity comparison is the idiomatic None test.
            if issue_entry['body'] is not None:
                final_data[name][issue_number] = issue_entry
    issue_total += len(final_data[name])
print(issue_total)

ansible
ipython
pandas
scikit-learn
13428


## PRs

In [122]:
# Attach PR data to the issues each PR links to. For every linked issue number:
#   * already in final_data -> record the link and append the PR's commits;
#   * not yet known         -> create an entry from the PR's own title/body/labels
#                              (only when the PR has a body).
relevant_files = glob.glob("misc/prs/*.jsonl")
linked_total = 0  # not named `sum`, so the builtin is not shadowed
for file in relevant_files:
    # 'misc/prs/<repo>.jsonl' -> '<repo>'
    name = file.split('.')[-2].split('/')[-1]
    if name == 'youtube-dl':
        continue
    print(name)
    # A repo may have a PR dump but no issue dump; start it empty rather than
    # raising KeyError on the membership test below.
    repo_issues = final_data.setdefault(name, {})
    with open(file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            issue_entry = json.loads(line)
            linked_total += len(issue_entry['linked_number'])
            for i, issue_number in enumerate(issue_entry['linked_number']):
                if issue_number in repo_issues:
                    linked = repo_issues[issue_number]
                    # The guard used to test 'linked_issue_number' while
                    # initialising 'linked_pr_number', so the list was reset on
                    # every hit; setdefault keeps the links already recorded.
                    # NOTE(review): the value stored is the issue's own number,
                    # not the PR's. The field holding the PR number is not
                    # visible here -- confirm the schema and store that instead.
                    linked.setdefault('linked_pr_number', []).append(issue_number)
                    linked['commits'].extend(issue_entry['commits'])
                elif issue_entry['body'] is not None:
                    repo_issues[issue_number] = {
                        'issue_number': issue_number,
                        'title': issue_entry['title'],
                        'body': issue_entry['body'],
                        # Copy the list: one PR can link several issues, and a
                        # later .extend() on one of them must not leak commits
                        # into the others through a shared list object.
                        'commits': list(issue_entry['commits']),
                        'labels': issue_entry['labels'],
                        'created_at': issue_entry['linked_created_at'][i],
                        'closed_at': issue_entry['linked_closed_at'][i],
                        'linked_pr_number': [issue_number],
                    }
print(linked_total)


ansible
ipython
pandas
scikit-learn
9425


In [123]:
# Report how many entries final_data holds per repo (youtube-dl excluded) and overall.
# The accumulator is deliberately not called `sum`, so the builtin stays usable.
total_entries = 0
for repo, repo_issues in final_data.items():
    if repo == 'youtube-dl':
        continue
    print(repo, len(repo_issues))
    total_entries += len(repo_issues)
print(total_entries)

ansible 7166
ipython 1881
pandas 8190
scikit-learn 1567
18804


## Only Python commits

In [124]:
# Restrict every issue to the commits that touch at least one '.py' file, and
# every kept commit to just those files. Issues left without commits are dropped.
# The file list is taken to be the last element of each commit row.
sum = 0  # running count of surviving issues; the next cell displays it
for repo in final_data:
    og_files = 0
    new_files = 0

    og_issues = 0
    new_issues = 0

    issues_to_keep = {}
    for issue_number, issue in final_data[repo].items():
        commits_to_keep = []
        for commit in issue['commits']:
            files = commit[-1]
            py_files = [path for path in files if path.endswith('.py')]
            og_files += len(files)
            new_files += len(py_files)
            if py_files:
                # Narrow the commit's file list in place to the Python files only.
                commit[-1] = py_files
                commits_to_keep.append(commit)
        if commits_to_keep:
            issue['commits'] = commits_to_keep
            issues_to_keep[issue_number] = issue
    final_data[repo] = issues_to_keep
    sum += len(issues_to_keep)

In [125]:
# Number of issues left after the Python-only filter (accumulated by the previous cell).
sum

16100

## Only Bugs

In [126]:
# NOTE(review): initialised here but not referenced by any other cell in this view.
bug_issues = {}

### Label

In [127]:
# Label-based bug detection: per repo, collect the issue numbers carrying at
# least one label that contains "bug" (case-insensitive substring match).
repowise_label = {}
label_total = 0  # not named `sum`, so the builtin is not shadowed
for repo in final_data:
    issues_with_bug_label = set()
    all_labels = []  # every label seen in this repo, duplicates included
    for issue in final_data[repo].values():
        all_labels.extend(issue['labels'])
        if any('bug' in label.lower() for label in issue['labels']):
            issues_with_bug_label.add(issue['issue_number'])
    repowise_label[repo] = issues_with_bug_label
    label_total += len(issues_with_bug_label)
    print(repo, len(issues_with_bug_label))
print(label_total)

ansible 4232
ipython 391
pandas 2741
scikit-learn 266
7630


### Title/body

In [128]:
# Text-based bug detection: per repo, collect the issue numbers whose body or
# title contains one of the phrases below.
# NOTE(review): these are plain substring alternatives, so "bug" also matches
# words such as "debug", and the "buggy" alternative is already covered by "bug".
search_pattern = r"bug|does not work|doesn't work|fail|broken|not working|buggy|to reproduce"
bug_regex = re.compile(search_pattern)  # compiled once instead of once per issue
repowise_text = {}
text_total = 0  # not named `sum`, so the builtin is not shadowed
for repo in final_data:
    issues_with_buggy_in_text = set()
    for issue in final_data[repo].values():
        # Lower-casing both fields first makes the match case-insensitive.
        text = issue["body"].lower() + " " + issue["title"].lower()
        if bug_regex.search(text):
            issues_with_buggy_in_text.add(issue["issue_number"])
    repowise_text[repo] = issues_with_buggy_in_text
    text_total += len(issues_with_buggy_in_text)
    print(repo, len(issues_with_buggy_in_text))
print(text_total)

ansible 4610
ipython 503
pandas 3181
scikit-learn 492
8786


In [129]:
# NOTE(review): titles_bodies is not referenced by any other cell in this view.
titles_bodies = {'titles':[], 'bodies':[]}
# Receives one (title, body) tuple per exported bug issue in the export cell below.
tb_list = []

In [130]:
# Export the bug issues of each repo (label match OR text match) to
# bugs4py/<repo>.jsonl, one JSON record per line. Each record is the issue plus
# a 'method' list naming the detector(s) that flagged it, with its commits
# expanded from positional rows into dicts.
# NOTE: `sum` shadows the builtin; the name is kept because the next cell displays it.
sum = 0
for repo in final_data:
    merged = repowise_label[repo].union(repowise_text[repo])
    print(repo, len(merged))
    with open(f'bugs4py/{repo}.jsonl', 'w') as f:
        for issue in final_data[repo].values():
            issue_number = issue['issue_number']
            if issue_number not in merged:
                continue
            sum += 1
            tb_list.append((issue['title'], issue['body']))

            # Which detector(s) flagged this issue.
            source = []
            if issue_number in repowise_label[repo]:
                source.append('label')
            if issue_number in repowise_text[repo]:
                source.append('regex')

            # Positional commit rows -> self-describing dicts.
            commits = [
                {
                    'node_id': commit[0],
                    'commit_message': commit[1],
                    'commit_timestamp': commit[2],
                    'files': commit[3],
                }
                for commit in issue['commits']
            ]

            # Serialise a copy and leave final_data untouched. Overwriting
            # issue['commits'] with dicts made a second run of this cell fail
            # on commit[0].
            record = {**issue, 'method': source, 'commits': commits}
            f.write(json.dumps(record) + '\n')

ansible 4889
ipython 727
pandas 4328
scikit-learn 606


In [131]:
# Number of bug issues written to bugs4py/ (accumulated by the previous cell).
sum

10550