##  Building a pull request classification dataset

In [21]:
import json
import os
from collections import defaultdict

import javalang
from javalang.tokenizer import LexerError
from sklearn.model_selection import train_test_split
from unidiff import PatchSet


# Repositories ("owner/name") whose pull requests make up the dataset
repositories = [
    'spring-projects/spring-boot',
    'spring-projects/spring-framework',
    'spring-projects/spring-integration',
    'spring-projects/spring-security',
]

# Class id for every pull request label name that is used for labelling:
# 1 for enhancements, 2 for bugs and 0 for all the remaining "type" labels.
labels = dict.fromkeys(
    (
        'type: task',
        'type: documentation',
        'type: dependency-upgrade',
        'type: regression',
        'type: blocker',
        'type: backport',
    ),
    0,
)
labels['type: enhancement'] = 1
labels['type: bug'] = 2

# Filename extension (with the leading dot) that identifies code files
code_ext = '.java'

In [22]:
def is_code_change(diff, code_ext):
    """
    Checks whether a pull request diff touches at least one code file
    :param diff: pull request diff, in a form accepted by ``PatchSet``
    :param code_ext: filename extension for code files; it is compared with
                     the result of ``os.path.splitext``, so it must include
                     the leading dot
    :return: True if any added, modified or removed file in the diff has the
             extension ``code_ext``, False otherwise
    """
    parsed = PatchSet(diff)
    touched_files = parsed.added_files + parsed.modified_files + parsed.removed_files
    return any(os.path.splitext(touched.path)[1] == code_ext for touched in touched_files)

In [23]:
def get_tokens(code_changes):
    """
    Returns tokens from Java code snippets.

    Each snippet is first tokenized as a whole. If the lexer rejects it, the
    snippet is tokenized line by line instead, keeping whatever lines can be
    tokenized.
    NOTE: This requires well formed Java code as input and doesn't work on diffs
    :param code_changes: iterable of strings, each holding one Java code snippet
    :return: list of strings, each holding the space separated tokens of one
             snippet. A snippet that yields no tokens in the line by line
             fallback is omitted, so the result can be shorter than the input.
    """
    tokens = list()

    for file_changes in code_changes:
        try:
            # The tokenizer is consumed inside the try block so that lexer
            # errors raised while iterating are caught here.
            file_tokens = [x.value.strip() for x in javalang.tokenizer.tokenize(file_changes)]
            tokens.append(' '.join(file_tokens))

        except LexerError:
            # Fallback: salvage the lines that can be tokenized on their own
            file_tokens = list()
            for line in file_changes.split('\n'):
                try:
                    line_tokens = [x.value.strip() for x in javalang.tokenizer.tokenize(line)]
                except Exception:
                    # Best effort: skip lines the lexer cannot handle. Unlike a
                    # bare except, this still lets KeyboardInterrupt through.
                    continue
                # Lines starting with '*' are dropped; presumably these are
                # continuation lines of block comments.
                if line_tokens and line_tokens[0] != '*':
                    file_tokens.append(' '.join(line_tokens))
            if file_tokens:
                tokens.append(' '.join(file_tokens))

    return tokens

In [24]:
def split_dataset(dataset, test_size=0.15, dev_size=0.176, random_state=42):
    """
    Splits the dataset into train, dev and test subsets.

    The test subset is split off first and the dev subset is then taken from
    what remains, so ``dev_size`` is relative to the remainder and not to the
    full dataset. The defaults give roughly a 70/15/15 split.
    :param dataset: samples to split, in a form accepted by ``train_test_split``
    :param test_size: ``test_size`` passed to the first split (test subset)
    :param dev_size: ``test_size`` passed to the second split (dev subset)
    :param random_state: ``random_state`` passed to both splits
    :return: tuple of (train_split, dev_split, test_split)
    """
    remainder, test_split = train_test_split(dataset, test_size=test_size, random_state=random_state)
    train_split, dev_split = train_test_split(remainder, test_size=dev_size, random_state=random_state)
    print("Train, dev and test split sizes:", len(train_split), len(dev_split), len(test_split))
    return train_split, dev_split, test_split

In [25]:
# List containing labelled pull requests as (metadata, diff, class id) tuples
labelled_prs = list()

# Number of selected pull requests per label name
label_counts = defaultdict(int)
for repo_name in repositories:
    # Load pull request metadata (iterated below as a sequence of pull requests).
    # JSON is read as UTF-8 so the result does not depend on the platform default.
    with open(os.path.join('data', repo_name, 'pr_metadata.json'), 'r', encoding='utf-8') as metadata_file:
        metadata_dict = json.load(metadata_file)

    # Load pull request diffs, keyed by the pull request number as a string
    with open(os.path.join('data', repo_name, 'pr_diffs.json'), 'r', encoding='utf-8') as diff_file:
        diff_dict = json.load(diff_file)

    for pull_request in metadata_dict:
        diff = diff_dict[str(pull_request['number'])]
        if not is_code_change(diff, code_ext):
            continue
        for label in pull_request['labels']:
            label_name = label['name']
            if not label_name.startswith('type'):
                continue
            if label_name not in labels:
                # A "type" label without a class id would raise KeyError below;
                # report it and keep looking at the remaining labels instead.
                print("Skipping unmapped label:", label_name, "on pull request", pull_request['number'])
                continue
            # Only the first mapped "type" label of a pull request is used
            labelled_prs.append((pull_request, diff, labels[label_name]))
            label_counts[label_name] += 1
            break

print("Number of labelled pull requests:", len(labelled_prs))
print("Dataset distribution:", label_counts)

Number of labelled pull requests: 1989
Dataset distribution: defaultdict(<class 'int'>, {'type: task': 525, 'type: enhancement': 1010, 'type: bug': 339, 'type: documentation': 79, 'type: dependency-upgrade': 8, 'type: regression': 8, 'type: blocker': 16, 'type: backport': 4})


In [26]:
# List containing all the dataset samples as
# (repository full name, pull request number, per-file code changes, label) tuples
dataset = list()

for pull_request, diff, label in labelled_prs:
    # One string per changed code file, holding the lines of that file's hunks
    diff_changes = list()
    patch = PatchSet(diff)

    for patched_file in patch.added_files + patch.modified_files + patch.removed_files:
        if os.path.splitext(patched_file.path)[1] != code_ext:
            continue
        # Keep every non-blank hunk line with its whitespace runs collapsed to
        # single spaces. Lines are not filtered by type.
        # NOTE(review): confirm that keeping context lines is intended.
        file_changes = [' '.join(line.value.split())
                        for hunk in patched_file
                        for line in hunk
                        if not line.value.isspace()]
        diff_changes.append('\n'.join(file_changes))

    dataset.append((pull_request['base']['repo']['full_name'], pull_request['number'], diff_changes, label))

train_split, dev_split, test_split = split_dataset(dataset)

# Write each split as one tab separated sample per line. Every field is written
# with str(), so the list of per-file changes is stored as its Python repr.
# UTF-8 is set explicitly so that non-ASCII source text does not depend on the
# platform's default encoding.
for tsv_name, split in (('train.tsv', train_split), ('dev.tsv', dev_split), ('test.tsv', test_split)):
    with open(os.path.join('data', 'spring-projects', tsv_name), 'w', encoding='utf-8') as tsv_file:
        tsv_file.write('\n'.join('\t'.join(str(y) for y in x) for x in split))


Train, dev and test split sizes: 1392 298 299
