In [1]:
import os
import random
import pandas as pd
from typing import List, Tuple
import javalang



In [2]:
def get_relevant_line_indices(lines,num_label_lines=1,min_lines_above=1,min_lines_below=1):
    """
    Takes lines of code and returns the indices of the lines that may be used as the start of a label.

    A line is relevant when it is not blank, is not a single line comment (``//`` or ``#``),
    is not an ``import`` / ``package`` statement and does not belong to a ``/* ... */`` comment.

    Args:
        lines: list of source lines (for example the result of ``readlines()``).
        num_label_lines: number of lines that the label will span.
        min_lines_above: minimum number of relevant lines to keep in front of the label.
        min_lines_below: minimum number of relevant lines to keep behind the label.

    Returns:
        A list of indices into ``lines``. The first ``min_lines_above + 1`` and the last
        ``num_label_lines + min_lines_below`` relevant lines are left out.
    """

    header_keywords = ('import', 'package')

    # To handle multiline comments, we can use a flag
    in_multiline_comment = False
    relevant_line_indices = []

    for index, line in enumerate(lines):
        stripped = line.strip()

        # Blank lines and single line comments never count and never touch the comment flag.
        if stripped == '' or stripped.startswith('//') or stripped.startswith('#'):
            continue

        # Compare the first whole word so that code such as `importantCall();` or
        # `packageCount = 1;` is not mistaken for an import / package statement.
        if stripped.split(None, 1)[0] in header_keywords:
            continue

        if stripped.startswith('/*'):
            in_multiline_comment = True

        if not in_multiline_comment:
            relevant_line_indices.append(index)

        if stripped.endswith('*/'):
            in_multiline_comment = False

    # NOTE(review): one more line than `min_lines_above` is skipped at the front; kept as is
    # because callers may rely on it - confirm whether the extra line is intended.
    first_kept = min_lines_above + 1

    # Use an explicit stop position: a slice ending in `-0` would be empty instead of
    # "trim nothing" when num_label_lines + min_lines_below == 0.
    stop = max(len(relevant_line_indices) - (num_label_lines + min_lines_below), 0)

    return relevant_line_indices[first_kept:stop]

In [3]:
def get_relevant_token_indices(lines, num_label_tokens=1, min_tokens_before=1, min_tokens_after=1):
    """
    Takes lines of code and returns the indices of the tokens that may be used as the start of a label.

    The lines are joined and tokenized with ``javalang``. Every token that belongs to an
    ``import`` or ``package`` statement (from the keyword up to and including the closing ``;``)
    is skipped, as are whitespace-only and comment tokens.

    Args:
        lines: list of source lines (for example the result of ``readlines()``).
        num_label_tokens: number of consecutive tokens that the label will span.
        min_tokens_before: number of relevant tokens dropped at the front of the result.
        min_tokens_after: minimum number of relevant tokens to keep behind the label.

    Returns:
        A list of indices into the token list produced by ``javalang.tokenizer.tokenize``.
    """

    code = "".join(lines)
    tokens = list(javalang.tokenizer.tokenize(code))

    relevant_token_indices = []
    in_header_statement = False

    for index, token in enumerate(tokens):
        if in_header_statement:
            # Still inside `import ...;` / `package ...;` - the ';' closes the statement.
            if token.value == ';':
                in_header_statement = False
            continue

        if token.value in ('import', 'package'):
            in_header_statement = True
            continue

        # Skip whitespace-only values and any token whose class name marks it as a comment.
        if token.value.strip() != '' and 'Comment' not in token.__class__.__name__:
            relevant_token_indices.append(index)

    # Use an explicit stop position: a slice ending in `-0` would be empty instead of
    # "trim nothing" when num_label_tokens + min_tokens_after == 0.
    stop = max(len(relevant_token_indices) - (num_label_tokens + min_tokens_after), 0)

    return relevant_token_indices[min_tokens_before:stop]

In [4]:
def create_prompt(prefix_lines,suffix_lines,infill=True,max_lines_above=50,max_lines_below=50):
    """
    Takes the prefix_lines and suffix_lines and uses them to construct the prompt. Defines prompt style. Limits prompt size.

    Args:
        prefix_lines: list of strings that come before the removed label.
        suffix_lines: list of strings that come after the removed label.
        infill: when true the prompt is ``<PRE> {prefix} <SUF>{suffix} <MID>``,
            otherwise the prompt is only the joined prefix.
        max_lines_above: keep at most this many entries from the end of ``prefix_lines``.
        max_lines_below: keep at most this many entries from the start of ``suffix_lines``.

    Returns:
        The prompt as a single string.
    """

    # Limit the number of lines in the prefix and suffix to a maximum number of lines.
    # The prefix is cut from an explicit start offset because `[-0:]` would keep the
    # whole list when max_lines_above is 0.
    first_kept = max(len(prefix_lines) - max_lines_above, 0)
    prefix_text = "".join(prefix_lines[first_kept:])
    suffix_text = "".join(suffix_lines[:max_lines_below])

    if infill:
        return f'<PRE> {prefix_text} <SUF>{suffix_text} <MID>'

    return prefix_text

In [50]:
def line_removal_prompts(lines,line_to_remove_index=None,num_label_lines=1,infill=True,max_lines_above=50,max_lines_below=50):
    """
    Given lines of code, will remove either a single line or multiple lines in order to create an infill prompt
    and return [prompt, label, start line of label, end line of label].

    Args:
        lines: list of source lines (for example the result of ``readlines()``).
        line_to_remove_index: 0-based index of the first line of the label. When None a
            random index is taken from ``get_relevant_line_indices``.
        num_label_lines: number of consecutive lines that make up the label.
        infill, max_lines_above, max_lines_below: forwarded to ``create_prompt``.

    Returns:
        A list ``[prompt, label, start, end]`` where ``start`` and ``end`` are the 1-based,
        inclusive line numbers of the label.

    Raises:
        IndexError: from ``random.choice`` when no index is given and no line is eligible.
    """

    if line_to_remove_index is None:
        # Pass the label size on so that the chosen start leaves room for the whole label.
        relevant_line_indices = get_relevant_line_indices(lines, num_label_lines=num_label_lines)
        line_to_remove_index = random.choice(relevant_line_indices)

    # Clamp to the file length so the reported end line matches the lines actually taken.
    last_line_to_remove_index = min(line_to_remove_index + num_label_lines, len(lines))
    label = "".join(lines[line_to_remove_index:last_line_to_remove_index])
    prefix_lines = lines[:line_to_remove_index]
    suffix_lines = lines[last_line_to_remove_index:]

    prompt = create_prompt(prefix_lines, suffix_lines, infill=infill,  max_lines_above=max_lines_above, max_lines_below=max_lines_below)

    return [prompt, label, line_to_remove_index + 1, last_line_to_remove_index]

In [31]:
def token_removal_prompts(lines,starting_token_index = None,num_label_tokens=1,infill=True,max_lines_above=50,max_lines_below=50):
    """
    Given lines of code, cuts out a run of consecutive tokens to create an infill prompt.

    When ``starting_token_index`` is None a random start is drawn from
    ``get_relevant_token_indices``. The label is the source text from the first character
    of the first removed token up to the last character of the last removed token.

    Returns:
        A list ``[prompt, label, start_line, end_line]`` where the line numbers are the ones
        reported by the tokenizer for the first and the last removed token.
    """

    if starting_token_index is None:
        candidate_indices = get_relevant_token_indices(lines,num_label_tokens=num_label_tokens)
        starting_token_index = random.choice(candidate_indices)

    source_text = "".join(lines)
    all_tokens = list(javalang.tokenizer.tokenize(source_text))

    first_token = all_tokens[starting_token_index]
    last_token = all_tokens[starting_token_index + num_label_tokens - 1]

    # position is (line, column); the code treats both as 1-based, hence the `- 1` below.
    start_line = first_token.position[0]
    end_line = last_token.position[0]
    cut_from = first_token.position[1] - 1
    cut_to = last_token.position[1] - 1 + len(last_token.value)

    # Everything before the label: the full lines above plus the head of the first label line.
    head_of_start_line = lines[start_line - 1][:cut_from]
    prefix_lines = lines[:start_line - 1] + [head_of_start_line]

    # Everything after the label: the tail of the last label line plus the full lines below.
    tail_of_end_line = lines[end_line - 1][cut_to:]
    suffix_lines = [tail_of_end_line] + lines[end_line:]

    prompt = create_prompt(prefix_lines, suffix_lines, infill=infill,  max_lines_above=max_lines_above, max_lines_below=max_lines_below)

    # The label is whatever remains of the source once prefix and suffix are taken away.
    label_begin = len("".join(prefix_lines))
    label_end = len(source_text) - len("".join(suffix_lines))
    label = source_text[label_begin:label_end]

    return [prompt, label, start_line, end_line]

In [7]:
def find_method_in_file(lines, method_name):
    """
    Given a method name, return the first line and last line of that method declaration.

    The file is parsed with ``javalang`` and the first ``MethodDeclaration`` whose name equals
    ``method_name`` is used. The end of the method is found by counting braces in the tokens
    that follow the declaration. A method without a body (abstract / interface method)
    ends at its terminating ``;``.

    Args:
        lines: list of source lines (for example the result of ``readlines()``).
        method_name: name of the method to look for.

    Returns:
        ``(start_line, end_line)`` as 1-based line numbers, or ``(None, None)`` when the method
        is not present or its end cannot be determined.
    """

    code = "".join(lines)

    tree = javalang.parse.parse(code)
    for path, node in tree.filter(javalang.tree.MethodDeclaration):
        if node.name != method_name:
            continue

        start_line = node.position.line
        start_column = node.position[1]
        post_prefix_content = "".join(lines[start_line-1:])

        brace_depth = 0
        paren_depth = 0
        for token in javalang.tokenizer.tokenize(post_prefix_content):
            # Code that sits on the declaration's line but in front of it (for example the
            # '{' of the enclosing class) is not part of the method.
            if token.position.line == 1 and token.position[1] < start_column:
                continue

            value = token.value
            if value == '(':
                paren_depth += 1
            elif value == ')':
                paren_depth -= 1
            elif paren_depth > 0:
                # Braces inside parentheses (annotation arguments, lambdas passed as call
                # arguments) are balanced on their own and never close the method.
                continue
            elif value == '{':
                brace_depth += 1
            elif value == '}':
                if brace_depth == 0:
                    # Closing brace of an enclosing scope - nothing of ours to close.
                    continue
                brace_depth -= 1
                if brace_depth == 0:
                    return (start_line, start_line + token.position.line - 1)
            elif value == ';' and brace_depth == 0:
                # No body was opened: the declaration ends at this ';'.
                return (start_line, start_line + token.position.line - 1)

        # The tokens ran out before the method was closed.
        return (None, None)
    return (None, None)

In [8]:
def method_declaration_prompts(lines,method_names,infill=True,max_lines_above=50,max_lines_below=50):
    """
    Builds one prompt for every name in ``method_names`` that ``find_method_in_file`` locates.

    For a located method the prefix is everything up to and including the line reported as
    its start, the label is the lines after that up to and including the line reported as its
    end, and the suffix is the rest of the file.

    Returns:
        A single flat list: ``prompt, label, start_line, end_line`` repeated once per located
        method, in the order of ``method_names``. The list is empty when nothing is found.
    """

    flat_results = []

    for name in method_names:
        first, last = find_method_in_file(lines, name)

        # Both values are None when the method does not exist in this file.
        if first is None and last is None:
            continue

        body_text = "".join(lines[first:last])
        prompt_text = create_prompt(
            lines[:first],
            lines[last:],
            infill=infill,
            max_lines_above=max_lines_above,
            max_lines_below=max_lines_below,
        )

        flat_results += [prompt_text, body_text, first, last]

    return flat_results

In [59]:
def _read_source_lines(file_path):
    """Read one file and return its lines with the line endings kept."""
    with open(file_path, 'r') as f:
        return f.readlines()


def create_prompts_df(folder_path,num_label_lines,lines_prompts_count,num_label_tokens,tokens_prompts_count,method_names=None):
    """
    Given a folder of java files, returns a dictionary with three dataframes corresponding to three different kinds of prompts.

    Args:
        folder_path: folder that is walked recursively; every file found is read.
        num_label_lines: number of lines per label for the line removal prompts.
        lines_prompts_count: number of line removal prompts to sample over all files.
        num_label_tokens: number of tokens per label for the token removal prompts.
        tokens_prompts_count: number of token removal prompts to sample over all files.
        method_names: method names for the method declaration prompts. Defaults to
            ``['defineProperties', 'defineAdditionalProperties']``.

    Returns:
        A dict with the keys ``'lines_df'``, ``'tokens_df'`` and ``'methods_df'``.

    Raises:
        ValueError: from ``random.sample`` when more prompts are requested than there are
            eligible lines / tokens.
    """

    if method_names is None:
        method_names = ['defineProperties', 'defineAdditionalProperties']

    df3_list = []
    all_relevant_line_indices = []
    all_relevant_token_indices = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            lines = _read_source_lines(file_path)

            # The label sizes are passed on so that every candidate start leaves room for
            # the whole label that will later be cut out at that position.
            all_relevant_line_indices.extend(
                (file_path, num) for num in get_relevant_line_indices(lines, num_label_lines=num_label_lines)
            )
            all_relevant_token_indices.extend(
                (file_path, num) for num in get_relevant_token_indices(lines, num_label_tokens=num_label_tokens)
            )

            df3_row = method_declaration_prompts(lines, method_names)
            if df3_row != []:
                df3_list.append(df3_row)

    # Lines are sampled before tokens, so a seeded `random` yields a reproducible selection.
    line_indices_for_lines_prompt_making = random.sample(all_relevant_line_indices, lines_prompts_count)
    token_indices_for_tokens_prompt_making = random.sample(all_relevant_token_indices, tokens_prompts_count)

    df2_list = []
    for path, token_index in token_indices_for_tokens_prompt_making:
        df2_row = token_removal_prompts(_read_source_lines(path),starting_token_index=token_index,num_label_tokens=num_label_tokens)
        if df2_row != []:
            df2_list.append(df2_row)

    df1_list = []
    for path, line_index in line_indices_for_lines_prompt_making:
        df1_row = line_removal_prompts(_read_source_lines(path),line_to_remove_index=line_index,num_label_lines=num_label_lines)
        if df1_row != []:
            df1_list.append(df1_row)

    df_dict = {'lines_df': pd.DataFrame(df1_list), 'tokens_df': pd.DataFrame(df2_list), 'methods_df': pd.DataFrame(df3_list)}

    # Side effect: changes the pandas display option for the whole session.
    pd.set_option('max_colwidth', 500)

    return df_dict
        