##  Building a pull request classification dataset

In [13]:
import json
import os

import javalang
import numpy as np
from javalang.tokenizer import LexerError
from unidiff import PatchSet


# Repositories whose pull requests make up the dataset
repositories = [
    'spring-projects/spring-boot',
    'spring-projects/spring-framework',
    'spring-projects/spring-integration',
    'spring-projects/spring-security',
]

# Only files with this filename extension are treated as code files
code_ext = '.java'

In [14]:
def is_code_change(diff, code_ext):
    """
    Tells whether a pull request diff touches at least one code file
    :param diff: pull request diff, parsed here with PatchSet
    :param code_ext: filename extension for code files, compared against the
                     extension given by os.path.splitext (so it includes the dot)
    :return: True if any added, modified or removed file of the diff has the
             code_ext extension, False otherwise
    """
    patch = PatchSet(diff)
    changed_files = patch.added_files + patch.modified_files + patch.removed_files
    return any(os.path.splitext(changed.path)[1] == code_ext
               for changed in changed_files)

In [None]:
def get_tokens(code_changes):
    """
    Returns tokens from Java code snippets.

    Every snippet is first tokenized as a whole. When the lexer rejects it, the
    snippet is tokenized line by line instead: lines that cannot be tokenized and
    lines whose first token is '*' are dropped.
    NOTE: This requires Java code as input and doesn't work on diffs
    :param code_changes: iterable of Java code snippets, each encoded as a string
    :return: list of strings, each holding the tokens of one snippet separated by
             single spaces. A snippet that went through the line by line fallback
             and produced no tokens gets no entry, so the result can be shorter
             than the input
    """
    tokens = list()

    for file_changes in code_changes:
        try:
            # The tokenizer is lazy, so it has to be consumed inside the try block
            # for a LexerError to be caught here
            file_tokens = [x.value.strip() for x in javalang.tokenizer.tokenize(file_changes)]
        except LexerError:
            # Fallback: keep whatever can be tokenized one line at a time
            file_tokens = list()
            for line in file_changes.split('\n'):
                try:
                    line_tokens = [x.value.strip() for x in javalang.tokenizer.tokenize(line)]
                except Exception:
                    # Best effort: a line the tokenizer cannot handle is skipped
                    continue
                # Lines starting with '*' are skipped (presumably continuation
                # lines of block comments)
                if line_tokens and line_tokens[0] != '*':
                    file_tokens.append(' '.join(line_tokens))
            if file_tokens:
                tokens.append(' '.join(file_tokens))
        else:
            tokens.append(' '.join(file_tokens))

    return tokens

In [15]:
# Pairs of (pull request metadata, pull request diff) for every pull request that
# changes at least one code file and carries a label whose name starts with 'type'
labelled_prs = list()

for repo_name in repositories:
    repo_dir = os.path.join('data', repo_name)

    # Pull request metadata of the repository
    with open(os.path.join(repo_dir, 'pr_metadata.json'), 'r') as metadata_file:
        metadata_dict = json.load(metadata_file)

    # Pull request diffs of the repository, looked up by pull request number (as a string)
    with open(os.path.join(repo_dir, 'pr_diffs.json'), 'r') as diff_file:
        diff_dict = json.load(diff_file)

    for pull_request in metadata_dict:
        diff = diff_dict[str(pull_request['number'])]
        if not is_code_change(diff, code_ext):
            continue
        # A pull request is added once, no matter how many 'type' labels it has
        if any(label['name'].startswith('type') for label in pull_request['labels']):
            labelled_prs.append((pull_request, diff))

print("Number of labelled pull requests:", len(labelled_prs))

Number of labelled pull requests: 1989


In [None]:
# List containing all the dataset samples
# NOTE(review): nothing in this cell appends to `dataset`, so it is still empty when
# it is split below - the step that turns `diff_changes` into samples (and calls
# `get_tokens`) appears to be missing.
dataset = list()

for pull_request, diff in labelled_prs:
    # Changed code of this pull request, one string per code file
    diff_changes = list()
    patch = PatchSet(diff)
    for file in patch.added_files + patch.modified_files + patch.removed_files:
        # Only files with the code extension are considered
        if os.path.splitext(file.path)[1] == code_ext:
            file_changes = list()
            for hunk in file:
                # Every line of the hunk is visited, no filter on the line type is applied
                for line in hunk:
                    # Skip whitespace-only lines and collapse each run of whitespace
                    # into a single space
                    if not line.value.isspace():
                        file_changes.append(' '.join(line.value.split()))
            diff_changes.append('\n'.join(file_changes))
    # NOTE(review): `diff_changes` is rebuilt for every pull request and never used
                
# NOTE(review): `split_dataset` and `groups` are not defined in the visible part of
# this notebook - confirm that an earlier cell defines them.
train_split, dev_split, test_split = (list(x) for x in split_dataset(np.array(dataset, dtype='O'), groups))

# Each split goes to its own file, one sample per line with tab separated fields.
# '\t'.join requires every field of a sample to be a string - TODO confirm the sample format.
with open('train.tsv', 'w') as tsv_file:
    tsv_file.write('\n'.join(['\t'.join(x) for x in train_split]))
with open('dev.tsv', 'w') as tsv_file:
    tsv_file.write('\n'.join(['\t'.join(x) for x in dev_split]))
with open('test.tsv', 'w') as tsv_file:
    tsv_file.write('\n'.join(['\t'.join(x) for x in test_split]))
