In [None]:
import pickle
from bs4 import BeautifulSoup
from nltk import word_tokenize
import sys
import string
sys.path.append('../../utils/comment_parser/')
import comment_parser
import random
from itertools import groupby
from operator import itemgetter
import re
from tqdm import tqdm
from urlextract import URLExtract
from codetype import identify
import datefinder
# Shared URL detector; maskLinkRef uses it to find the links to be masked.
extractor = URLExtract()

# Marker written in place of content that must be discarded; lines containing it are dropped later on.
PLACEHOLDER = '||_to_remove_||'
# Token that replaces every URL found by `extractor` (see maskLinkRef).
PLACEHOLDER_LINK = '_LINK_'
# Token that replaces the content of javadoc `{@link ...}` references (see maskLinkRef).
PLACEHOLDER_REF = '_REF_'
# Token that replaces date-like strings detected by datefinder (see removeDateTime).
PLACEHOLDER_NUM = '_NUM_'
# Docstring lines containing one of these javadoc tags are removed by preprocessDocstring.
FORBIDDEN_TAGS = ['@see', '@version', '@author',  '@since']
# Upper bound on the number of nltk tokens of an instance (see preprocessFullDataset).
MAX_LEN = 256
# Seed of the `random` module, used by random.sample in datasetSplittingAndPreparation.
SEED = 42

random.seed(SEED)

In [None]:
def allEquals(s):
    """Tell whether a string is made of a single repeated character.

    The string is normalized first: surrounding whitespace is stripped,
    newlines are removed and every run of whitespace collapses to one space.

    Args:
        s: the string to inspect.

    Returns:
        True when the normalized string is empty or contains only one
        distinct character, False otherwise.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    normalized = re.sub(r'\s+', ' ', s.strip().replace('\n', ''))
    return len(set(normalized)) <= 1

In [None]:
def isAscii(s):
    """Return True when `s` is empty or contains ASCII characters only."""
    ascii_only = s.isascii()
    return ascii_only

In [None]:
def removeNonAscii(method):
    """Strip from a Java method every comment that contains non-ASCII text.

    Comments are located with comment_parser. Each time a non-ASCII comment
    is found, the span reported by `indexes()` is cut out of the source
    (presumably start/end character offsets -- they are used as slice bounds)
    and the method is parsed again, because the offsets of the remaining
    comments are no longer valid.

    Args:
        method: source code of the method.

    Returns:
        The method without the comments containing non-ASCII characters.
    """
    java_mime = 'text/x-java-source'
    found = comment_parser.extract_comments_from_str(method, mime=java_mime)

    position = 0
    while position < len(found):
        candidate = found[position]

        if isAscii(candidate.text().strip()):
            position += 1
            continue

        # Cut the offending comment and restart the scan on the new source.
        begin = candidate.indexes()[0]
        finish = candidate.indexes()[1]
        method = method[0:begin] + method[finish:]
        found = comment_parser.extract_comments_from_str(method, mime=java_mime)
        position = 0

    return method

In [None]:
# With the following function we try to manage corner cases like the following one:
# //////// comment /////////
# k represents the threshold, i.e. how many leading characters are compared. It can be tuned!
def stripChar(comment,k):
    """Check whether the first `k` characters of `comment` are all the same.

    Args:
        comment: text of the comment.
        k: number of leading characters to compare.

    Returns:
        True when every pair of adjacent characters in `comment[0:k]` matches
        (also for prefixes of length 0 or 1), False otherwise.
    """
    prefix = comment[:k]
    return all(left == right for left, right in zip(prefix, prefix[1:]))

In [None]:
def removeDateTime(method, javadoc):
    """Replace date-like strings with PLACEHOLDER_NUM.

    Args:
        method: the text to clean. It is a docstring when `javadoc` is True,
            otherwise a method whose comments are wrapped in <sep> tags.
        javadoc: when True the whole text is scanned; when False only the
            content of the <sep>...<sep> spans is scanned.

    Returns:
        The text where every accepted match has been replaced. The
        replacement is always applied to the whole text, also when the match
        was found inside a single <sep> span.
    """
    if javadoc:
        scanned_texts = [method]
    else:
        scanned_texts = re.findall(r"<sep>([\s\S]*?)<sep>", method)

    for text in scanned_texts:
        for token in _dateLikeTokens(text):
            method = method.replace(token, PLACEHOLDER_NUM)

    return method


def _dateLikeTokens(text):
    """Return the source substrings that datefinder recognizes as dates in `text`.

    To limit false positives a match is kept only when, once stripped, it is
    longer than 5 characters and contains no space. Any exception raised while
    searching is treated as "no match" (best effort).
    """
    try:
        found = list(datefinder.find_dates(text, source=True, index=True))
        # Element 1 of each result is the matched source string.
        return [hit[1] for hit in found
                if len(hit[1].strip()) > 5 and len(hit[1].strip().split(' ')) == 1]
    except Exception:
        return []


In [None]:
def filterMultiLineComment(method):
    """Rewrite the multi-line comments of a Java method as <sep> ... <sep> spans.

    For every comment reported as multi-line by comment_parser, the
    `/* ... */` block is replaced with `<sep> ... <sep>` (the first '*' of the
    lines starting with '*' is dropped). Then, depending on what surrounds the
    comment on its first and last line:
      * nothing before and nothing after: the span is kept where it is;
      * code before and nothing after: the span is moved above that code;
      * any other case (code after the comment): the span is removed.

    Args:
        method: source code of the method.

    Returns:
        The method with the multi-line comments rewritten or removed.
    """

    comments = comment_parser.extract_comments_from_str(method, mime='text/x-java-source')

    for c in comments:

        if c.is_multiline():

            # 0-based index of the line where the comment starts.
            # NOTE(review): line numbers come from the initial parse, while `method` is
            # modified inside this loop -- confirm they stay valid for later comments.
            c_line = c.line_number()-1


            comment = c.text()
            # NOTE(review): with javadoc=False removeDateTime only scans <sep>...<sep> spans,
            # and `comment` has no <sep> tag yet, so this call looks like a no-op -- confirm.
            comment = removeDateTime(comment, javadoc=False)
            comment_lines = comment.splitlines()

            # 0-based index of the line where the comment ends.
            final_line = c_line + len(comment_lines)-1

            # Drop the leading '*' decoration of each comment line.
            for (idx,line) in enumerate(comment_lines):
                if line.strip().startswith('*'):
                    comment_lines[idx] = line.replace('*','',1)

            comment_refined_lines = '\n'.join(comment_lines)
            replace_this = '<sep> '+comment_refined_lines+' <sep>'
            # Assumes c.text() is the comment body without the /* */ delimiters -- TODO confirm.
            method = method.replace('/*'+comment+'*/',replace_this)
            function_lines = method.splitlines()
            # NOTE(review): function_lines[final_line] is indexed here, before the bounds
            # check performed a few lines below.
            starting_position_sep = function_lines[c_line].find('<sep>')
            ending_position_sep = function_lines[final_line].rfind('<sep>') + 5

            if final_line < len(function_lines):
                # The comment is alone on its lines: nothing else to do.
                if  function_lines[c_line][0:starting_position_sep].strip() == '' and function_lines[final_line][ending_position_sep:].strip() == '':
                    pass

                else:
                    # Heuristic: code before the comment and nothing after it -> move the comment above the code.
                    if ( len(function_lines[c_line][0:starting_position_sep].strip()) > 0 and len(function_lines[final_line][ending_position_sep:].strip())==0 ):

                        to_replace  = replace_this + '\n' +function_lines[c_line][0:starting_position_sep]
                        to_remove_from_method = function_lines[c_line][0:starting_position_sep]+ replace_this
                        method = method.replace(to_remove_from_method, to_replace)

                    # Code follows the comment: the comment is discarded.
                    elif ( len(function_lines[c_line][0:starting_position_sep].strip()) > 0 or len(function_lines[final_line][ending_position_sep:].strip())>0 ):
                        method = method.replace(replace_this,'')

    return method

In [None]:
#Check for SATD
def checkSATD(comment):
    """Detect self-admitted technical debt (SATD) markers.

    Args:
        comment: text of the comment.

    Returns:
        True when the comment, stripped and lower-cased, starts with one of
        the known TODO/FIXME-like prefixes, False otherwise.
    """
    satd_prefixes = (
        'to-do', 'todo', 'to_do',
        'to-fix', 'to fix', 'tofix', 'to_fix',
        'fixme', 'fix-me',
        'bugfix',
    )
    return comment.strip().lower().startswith(satd_prefixes)

In [None]:
# This function aligns comment with the following format

# E.G cdtTrxTxInf.getPmtId().setEndToEndId(SepaUtil.getProperty(sepaParams, SepaUtil.insertIndex))  // comment
#
# the expected result is:
#
# // comment
# cdtTrxTxInf.getPmtId().setEndToEndId(SepaUtil.getProperty(sepaParams, SepaUtil.insertIndex))

#Here the assumption is the following:
# if we find the following scenario:
# cdtTrxTxInf.getPmtId().setEndToEndId(SepaUtil.getProperty(
#                 sepaParams, //c1 c2 c3
#                 SepaUtil.insertIndex)) //c2 c3 c4
# We assume that c1 c2 c3 are going to be the description of sepaParams and c2 c3 c4 describe sepaUtil.insertIndex

def alignComment(method):
    """Wrap single-line comments in <sep> tags and place them above their code.

    A line holding only a comment becomes `<sep> comment <sep>`. A comment
    trailing some code becomes `<sep> comment <sep>` followed, on the next
    line, by the code that preceded it. Multi-line comments are ignored here.
    The result is finally passed through keepSepTagAligned.

    Args:
        method: source code of the method.

    Returns:
        The method with the single-line comments tagged and aligned.
    """
    parsed = comment_parser.extract_comments_from_str(method,mime='text/x-java-source')

    # Multi-line comments are handled elsewhere.
    single_line = [item for item in parsed if not item.is_multiline()]

    lines = method.splitlines()
    for item in single_line:

        row = item.line_number() -1
        text = item.text().strip()
        bounds = item.indexes()

        # Whatever precedes the comment on its line (e.g: System.println() // comment)
        code_length = len(lines[row]) - (bounds[1]-bounds[0])
        leading_code = lines[row][0:code_length]

        tagged = '<sep> ' + text + ' <sep>'
        if lines[row].replace('//','').strip() == text:
            lines[row] = tagged
        else:
            lines[row] = tagged + '\n' + leading_code

    return keepSepTagAligned('\n'.join(lines))

In [None]:
def maskLinkRef(string, javadoc):
    """Mask URLs and, for docstrings, javadoc `{@link ...}` references.

    Args:
        string: the text to process.
        javadoc: when True the text is treated as a docstring: newlines are
            turned into spaces and `{@link ...}` references are masked too.

    Returns:
        The text where every URL found by `extractor` is replaced with
        PLACEHOLDER_LINK and, for docstrings, every `{@link ...}` reference
        is replaced with `{@link PLACEHOLDER_REF}`.
    """
    if javadoc:
        string = string.replace('\n',' ')

    urls = extractor.find_urls(string)
    for url in urls:
        string = string.replace(url, PLACEHOLDER_LINK)

    if not javadoc:
        return string

    for reference in re.findall(r"{@link .*?}", string):
        if reference in urls:
            continue
        # reference case
        string = string.replace(reference, '{@link %s}' % PLACEHOLDER_REF)

    return string

In [None]:
# Here we have to pass as parameter only the body of the method, without the javadoc
def keepSepTagAligned(method):
    """Merge consecutive <sep>-tagged lines into a single <sep> ... <sep> span.

    Lines whose stripped content starts with '<sep>' are grouped into runs of
    consecutive line numbers. For a run of two lines the closing tag of the
    first and the opening tag of the second are removed; for longer runs only
    the opening tag of the first line is kept and one closing tag is appended
    to the last line.

    Args:
        method: body of the method (without javadoc), one comment per line.

    Returns:
        The method with adjacent tagged lines merged.
    """
    tag = '<sep>'
    method_lines = method.splitlines()

    tagged_rows = [row for row, line in enumerate(method_lines)
                   if line.strip().startswith(tag)]

    # Rows belong to the same run while (position - row) stays constant.
    runs = [list(map(itemgetter(1), group))
            for _, group in groupby(enumerate(tagged_rows), lambda pair: pair[0] - pair[1])]

    for run in runs:

        if len(run) == 1:
            continue

        first, last = run[0], run[-1]

        # Remove exactly one trailing tag. str.rstrip('<sep>') strips any
        # trailing '<', 's', 'e', 'p', '>' character and could eat real text.
        if method_lines[first].endswith(tag):
            method_lines[first] = method_lines[first][:-len(tag)]

        if len(run) == 2:
            # Same reasoning for the opening tag of the second line.
            # NOTE(review): as before, an indented '<sep>' is left untouched here.
            if method_lines[last].startswith(tag):
                method_lines[last] = method_lines[last][len(tag):]

        else:
            for row in run[1:]:
                method_lines[row] = method_lines[row].replace(tag, '')
            method_lines[last] = method_lines[last] + ' ' + tag

    return '\n'.join(method_lines)

In [None]:
def removeOrphan(method):
    """Mark orphan comments, i.e. comments isolated by blank lines, for removal.

    A comment located after the second line of the method is considered an
    orphan when both the line right above it and the line right below it are
    blank. Its text is replaced with PLACEHOLDER.

    Args:
        method: source code of the method.

    Returns:
        The method with the text of the orphan comments replaced.
    """
    parsed = comment_parser.extract_comments_from_str(method, mime='text/x-java-source')
    source_lines = method.splitlines()

    for entry in parsed:

        body = entry.text()
        first_row = entry.line_number()-1

        if first_row <= 1:
            continue

        row_after = first_row + len(body.splitlines())
        blank_above = len(source_lines[first_row-1].strip())==0

        if blank_above and len(source_lines[row_after].strip())==0:
            method = method.replace(body, PLACEHOLDER)

    return method

In [None]:
def filterCodeAndShort(ref_string, discarded_code_comments=None):
    """Drop short, SATD, decorative and code-like comments from a tagged method.

    The function works on the <sep>...<sep> spans of `ref_string`:
      1. spans that are SATD or made of at most two space-separated tokens
         (after masking links) are replaced with PLACEHOLDER, and every line
         containing PLACEHOLDER is then removed;
      2. for each remaining span: it is blanked if one of its lines is SATD;
         decorations made of a repeated character are stripped; it is blanked
         if codetype identifies it as one of the forbidden languages or if it
         has at most two tokens;
      3. <sep> pairs left with an empty content are removed.

    Args:
        ref_string: method whose comments are wrapped in <sep> tags.
        discarded_code_comments: optional object with a `write` method; when
            given, the comments identified as code are logged to it.

    Returns:
        The filtered method.
    """

    comments = []
    forbidden_languages = ['Java', 'JavaScript', 'Objective-C', 'D', 'C++']

    prepared_comment = re.findall("<sep>([\s\S]*?)<sep>", ref_string)

    #Remove short comment
    for prep_comment in prepared_comment:
        refined_comment = maskLinkRef(prep_comment.strip(), javadoc=False)

        if checkSATD(refined_comment) or len(refined_comment.split(' '))<=2:
            ref_string = ref_string.replace(prep_comment,PLACEHOLDER)
            continue

        comments.append(prep_comment)

    # Rebuild the method skipping every line marked for removal.
    res = ''
    for line in ref_string.splitlines():

        if PLACEHOLDER in line:
            continue

        res+= line + '\n'

    ref_string = res
    c_line = 0

    for comment in comments:

        comment_lines = comment.splitlines()
        stripped_comment = comment.strip()
        # NOTE(review): this looks for the whole comment inside each of its own lines;
        # c_line is only used as the bounds of the range below.
        for (idx,line) in enumerate(comment_lines):
            if comment in line:
                c_line = idx
                break

        flag_todo = False
        for line in comment_lines:
            if checkSATD(line):
                flag_todo = True
                break

        if flag_todo:
            # NOTE(review): every iteration performs the same replacement; the first one is enough.
            for index in range(c_line, len(comment_lines) + c_line ):
                ref_string = ref_string.replace(comment, '')

        #print('iterative check: ',ref_string)

        #By putting this check first, we're able to capture corner cases like the following one:
        # <sep> ------------System.out.println()-----------<sep>
        # In this case we want completely discard such comment from the dataset
        if stripped_comment!='' and stripChar(stripped_comment,4):
            # Remove the repeated decoration character from both ends.
            refined_comment = stripped_comment.strip(stripped_comment[0])

            if refined_comment == '':
                ref_string = ref_string.replace(comment, '')
                continue #skip this comment
            else:
                ref_string = ref_string.replace(stripped_comment, refined_comment)

        # if identify(stripped_comment) == 'AppleScript':
        #     print('strange language: ', stripped_comment)

        # NB: To filter out commented code we use https://github.com/jdkato/codetype.
        # We want to discard as much as possible commented code. Since the dataset we're using is pretty big we do not care of false-positive
        # NOTE(review): identify() is evaluated twice on the same text (here and in the next check).
        if discarded_code_comments:
            if identify(stripped_comment) in forbidden_languages:
                  discarded_code_comments.write('--> ' + stripped_comment+'\n')

        if identify(stripped_comment) in forbidden_languages or len(stripped_comment.split())<=2:
            ref_string = ref_string.replace(stripped_comment, '')
            continue

    comments = re.findall('<sep>([\s\S]*?)<sep>',ref_string)
    for item in comments:
        #Empty comment after stripping and preprocessing it
        if len(item.strip())==0:
            ref_string = ref_string.replace('<sep>%s<sep>'% item,'')

    return ref_string


In [None]:
def checkForMultipleSeparator(method):
    """Squash consecutive <sep> ... <sep> comments into a single span.

    The lines are scanned for blocks that open on a line starting with
    '<sep>' and close on a line ending with '<sep>' that is not followed by
    another '<sep>' line. When such a block spans several lines and contains
    more than two tags, all its tags are removed and the block is wrapped in
    one `<sep> ... <sep>` pair.

    Args:
        method: method whose comments are wrapped in <sep> tags.

    Returns:
        The method with adjacent comments merged.
    """
    lines = method.splitlines()
    block_open = False
    block_start = 0

    for (idx, line) in enumerate(lines):
        stripped = line.strip()

        if stripped.startswith('<sep>') and not block_open:
            block_start = idx
            block_open = True

        if not (stripped.endswith('<sep>') and block_open):
            continue

        # The block continues on the next line. The bound must be checked on
        # idx + 1, otherwise a comment on the last line raises IndexError.
        if idx + 1 < len(lines) and lines[idx + 1].strip().startswith('<sep>'):
            continue

        if block_start != idx:
            block = '\n'.join(lines[block_start : idx+1])
            if block.count('<sep>') > 2:
                merged = '<sep> ' + block.replace('<sep>','') + ' <sep>'
                method = method.replace(block, merged)

        block_open = False

    return method

In [None]:
def preprocessInsideComment(string, discarded_code_comments=None):
    """Run the whole cleaning pipeline on the comments inside a method body.

    Lines starting with '@' are dropped, then the following steps are applied
    in order: removeNonAscii, removeOrphan, filterMultiLineComment,
    alignComment, checkForMultipleSeparator, filterCodeAndShort and
    removeDateTime.

    Args:
        string: source code of the method.
        discarded_code_comments: optional log passed to filterCodeAndShort.

    Returns:
        The method with its comments cleaned and wrapped in <sep> tags.
    """
    # Skip the lines starting with '@'; every kept line ends with a newline.
    kept_lines = [line + '\n' for line in string.splitlines()
                  if not line.strip().startswith('@')]
    staged = ''.join(kept_lines)

    staged = removeNonAscii(staged)            # comments with non-ASCII characters
    staged = removeOrphan(staged)              # comments isolated by blank lines
    staged = filterMultiLineComment(staged)    # /* ... */ comments
    staged = alignComment(staged)              # // comments
    staged = checkForMultipleSeparator(staged) # squash subsequent comments into one
    staged = filterCodeAndShort(staged, discarded_code_comments)  # commented code and short comments

    return removeDateTime(staged, javadoc=False)

In [None]:
def preprocessDocstring(javadoc):
    """Clean the javadoc of a method.

    Steps: HTML tags are removed, lines containing one of FORBIDDEN_TAGS are
    dropped, links/references and dates are masked, and the comment
    delimiters are stripped from each line.

    Args:
        javadoc: raw docstring of the method.

    Returns:
        'non-ascii' when the docstring contains non-ASCII characters,
        'invalid' when one of its lines is SATD (see checkSATD), otherwise
        the cleaned docstring with its lines joined by (and ending with) a
        space.
    """
    if not isAscii(javadoc):
        return 'non-ascii'

    # Discard every html tag from the docstring.
    # NOTE(review): no parser is specified, so bs4 picks the best one installed and the
    # extracted text may differ across environments -- consider pinning it.
    soup = BeautifulSoup(javadoc)
    javadoc = soup.get_text(separator=' ')

    lines = javadoc.splitlines()

    for (idx, line) in enumerate(lines):
        if any(tag in line for tag in FORBIDDEN_TAGS):
            lines[idx] = PLACEHOLDER

    # Rebuild the docstring only when at least one line has been marked.
    if PLACEHOLDER in '\n'.join(lines):
        javadoc = ''.join(line + '\n' for line in lines if PLACEHOLDER not in line)

    docstring = maskLinkRef(javadoc, javadoc=True)
    docstring = removeDateTime(docstring, javadoc=True)

    docstring_lines = docstring.splitlines()
    if any(checkSATD(line.strip()) for line in docstring_lines):
        return 'invalid'

    refinedDocstring = ''
    for line in docstring_lines:

        # The strips must be applied to the current `line`: applying them to the
        # whole docstring discarded the result of the previous strip.
        # lstrip/rstrip remove every leading/trailing '/' and '*' character.
        if line.startswith('/**'):
            line = line.lstrip('/**')

        if line.startswith('/*'):
            line = line.lstrip('/*')

        if line.endswith('*/'):
            line = line.rstrip('*/')

        if line.startswith('*'):
            line = line.lstrip('*')

        refinedDocstring += line + ' '

    return refinedDocstring


In [None]:
def preprocessFullDataset(dataset,threshold=0, commented_code_analysis=False, save_pickle=True):
    """Preprocess the dataset and split its instances by kind of comment.

    Args:
        dataset: sequence of dicts with the keys 'function' and 'docstring'.
        threshold: when > 0, processing stops as soon as `threshold` items
            have been visited and the lists built so far are returned as they
            are (no duplicate removal, nothing saved).
        commented_code_analysis: when True the comments discarded as code are
            appended to 'discarded_code_comments.txt'.
        save_pickle: when True the three final lists are pickled in the
            current directory.

    Returns:
        A tuple (only_docstring_list, only_inside_list,
        inside_and_docstring_list) containing, respectively, the methods with
        only a docstring, with only inner comments, and with both.
    """
    only_docstring_list = []
    only_inside_list = []
    inside_and_docstring_list = []

    if commented_code_analysis:
        discarded_code_comments = open('discarded_code_comments.txt','a+')
    else:
        discarded_code_comments = None

    if threshold>0:
        flag_threshold=False
    else:
        threshold = len(dataset)
        flag_threshold=True

    # try/finally: the log file is closed also on the early return and on errors.
    try:
        for (idx,item) in enumerate(tqdm(dataset)):

            function = item['function']
            docstring = item['docstring']

            if idx==threshold and not flag_threshold:
                return only_docstring_list, only_inside_list, inside_and_docstring_list

            #NB: Apparently nltk is much faster than spacy to tokenize a string
            list_doc_function = word_tokenize(function)
            list_doc_docstring = word_tokenize(docstring)

            #NB: We're counting the #of tokens starting from the 'un-refined' item
            if ( len(list_doc_function) + len(list_doc_docstring) ) > MAX_LEN:
                continue

            flag_docstring = True #Assume that each method has its own docstring/javadoc
            flag_inline = True #Assume that each method has at least one inline/multiline comment in its own body

            refinedDocstring = preprocessDocstring(docstring)

            try:
                refinedFunction = preprocessInsideComment(function, discarded_code_comments)
            except Exception:
                # Report the input that failed: refinedFunction is not assigned when
                # the call raises (it is unbound, or stale from a previous item).
                print('Error here!: ',function)
                continue

            # If we find a non ascii character inside the method we skip it
            if not isAscii(refinedFunction):
                continue

            if refinedFunction.count('<sep>')==0:
                flag_inline = False
                if (refinedDocstring=='non-ascii' or refinedDocstring=='invalid'):
                    continue

            if refinedDocstring == 'non-ascii' or refinedDocstring == 'invalid' or len(refinedDocstring.split(' '))<=2:
                flag_docstring = False

            if len(refinedDocstring.split(' '))>1 and refinedDocstring != '' and ('(non-javadoc)' not in refinedDocstring.lower()):

                javadoc_string = refinedDocstring.strip()
                javadoc_string = javadoc_string.replace('\n','')
                javadoc_string = re.sub(r'\s+',' ',javadoc_string)

                # Check the validity of the docstring after the refinements:
                # a docstring opening with a repeated character keeps only its words.
                if javadoc_string != '' and stripChar(javadoc_string,4):
                    javadoc_string = ' '.join(re.compile(r'\w+').findall(javadoc_string))

                if javadoc_string == '' and not flag_inline:
                    continue

                if flag_docstring and not flag_inline and len(javadoc_string.split())>2:

                    ds_sample = refinedFunction + '\n' + '<sep> ' + javadoc_string + ' <sep>'

                    if len(word_tokenize(ds_sample)) <= MAX_LEN:
                        only_docstring_list.append(ds_sample)

                elif flag_inline and flag_docstring:

                    ds_sample = refinedFunction + '\n' + '<sep> ' + javadoc_string + ' <sep>'

                    if len(word_tokenize(ds_sample)) <= MAX_LEN:
                        # Check if the length is ok
                        if len(javadoc_string.split())>2:
                            inside_and_docstring_list.append(ds_sample)
                        else:
                            only_inside_list.append(refinedFunction)

                elif flag_inline and not flag_docstring:
                    if len(word_tokenize(refinedFunction)) <= MAX_LEN:
                        only_inside_list.append(refinedFunction)

            else:
                if flag_inline:
                    only_inside_list.append(refinedFunction)

    finally:
        if discarded_code_comments is not None:
            discarded_code_comments.close()

    #First pass of duplicates dropping (the insertion order is preserved)
    only_inside_list = list(dict.fromkeys(only_inside_list))
    only_docstring_list = list(dict.fromkeys(only_docstring_list))
    inside_and_docstring_list = list(dict.fromkeys(inside_and_docstring_list))

    if save_pickle:
        # The '.pickle' extension is required: datasetSplittingAndPreparation
        # reads 'only_docstring_list.pickle'.
        with open('only_docstring_list.pickle', 'wb') as fp:
            pickle.dump(only_docstring_list, fp, protocol=pickle.HIGHEST_PROTOCOL)

        with open('only_inside_list.pickle', 'wb') as fp:
            pickle.dump(only_inside_list, fp, protocol=pickle.HIGHEST_PROTOCOL)

        with open('inside_and_docstring_list.pickle', 'wb') as fp:
            pickle.dump(inside_and_docstring_list, fp, protocol=pickle.HIGHEST_PROTOCOL)

    return only_docstring_list, only_inside_list, inside_and_docstring_list


# with open('../../data/raw/java_dedupe_definitions_v2.pkl', 'rb') as f:
#     dataset = pickle.load(f, encoding='utf-8')

# Is going to take a while...

In [None]:
# Snippet identification business starts here!
# It extracts multiple instances
def extractSamples(item, keep_comment, withJavaDoc=False):
    """Extract, for each comment of a method, the snippet of code around it.

    For every <sep>...<sep> span the first line of the comment is searched in
    the method; the snippet is then delimited by scanning backward and
    forward from that line until a blank line is met (or, when `keep_comment`
    is False, a line containing another '<sep>'). The forward scan also stops
    at a line starting with '}', in which case a closing brace is appended to
    the snippet.

    Args:
        item: method whose comments are wrapped in <sep> tags.
        keep_comment: when False the snippet is also cut at the other
            comments, so that it contains a single comment.
        withJavaDoc: when True the last <sep> span (the docstring appended
            to the method) is ignored.

    Returns:
        The list of distinct snippets containing at least two '<sep>' tags.
    """

    results = []

    comments = re.findall("<sep>([\s\S]*?)<sep>", item)


    if withJavaDoc:
        comments = comments[0:-1]

    #print(comments)

    splitted_lines = item.splitlines()

    for comment in comments:

        # NOTE(review): splitlines() of an empty span is [], so [0] would raise IndexError.
        matching_line = comment.splitlines()[0]

        for (matching_index, line) in enumerate(splitted_lines):

            if matching_line in line:

                comment_lines = len(comment.splitlines())

                # backward pass

                # Set the upper bound
                back_index = matching_index -1
                lines_from_begin = splitted_lines[0:back_index]
                reversed_list = lines_from_begin[::-1]

                flag_exit = False
                for (idx, line_back) in enumerate(reversed_list):

                    if not keep_comment:

                        # Another comment delimits the snippet.
                        if '<sep>' in line_back:

                            back_index = back_index - idx
                            #print('back_index ', splitted_lines[back_index])
                            #print('idx: ',idx)
                            flag_exit = True
                            break

                        else:
                            pass

                    # A blank line delimits the snippet.
                    if len(line_back.strip())==0:
                        back_index = back_index - idx
                        flag_exit = True
                        break

                    else:
                        pass

                # In this case the method signature is gonna be the upper bound
                # NOTE(review): this assigns len(reversed_list) rather than 0 -- confirm it is intended.
                if not flag_exit:
                    back_index = len(reversed_list)


                # Set the lower bound
                final_index = matching_index+1

                # Start the forward scan after the last line of a multi-line comment.
                if comment_lines > 1:
                    final_index =  (matching_index) + comment_lines
                    # print('starting index: ',final_index)
                    # print('comment: ', comment)

                lines_to_the_end = splitted_lines[final_index:]


                flag_exit = False
                flag_add_brace = False
                for (idx, line_forward) in enumerate(lines_to_the_end):

                    if not keep_comment:

                        if '<sep>' in line_forward:
                            final_index = final_index + idx
                            flag_exit = True
                            break

                        else:
                            pass

                    if len(line_forward.strip())==0:
                        final_index = final_index + idx
                        flag_exit = True
                        break

                    # End of the enclosing block: the brace is re-added after the snippet is cut.
                    if line_forward.strip().startswith('}'):
                        final_index = final_index + idx
                        flag_exit = True
                        flag_add_brace = True
                        break

                    else:
                        pass

                # Push down to the end of the method
                if not flag_exit:
                    final_index = final_index+len(lines_to_the_end)

                sample = '\n'.join(splitted_lines[back_index:final_index])

                if flag_add_brace:
                    sample += '\n}'

                # Handling of duplicated instances due to same comments
                # NOTE(review): membership is tested on the unstripped sample while the stripped one is stored.
                if sample not in results and sample.count('<sep>')>=2:
                    results.append(sample.strip())
                    break

    return results

In [None]:
# Takes as input the results of preprocessFullDataset
def datasetSplittingAndPreparation(only_docstring_list=None, only_inside_list=None, inside_and_docstring_list=None):
    """Split the preprocessed instances into pre-training and fine-tuning sets.

    Two thirds of each list are randomly sampled for pre-training, the rest
    is used for fine-tuning. In the pre-training instances that have a
    docstring, the docstring is moved in front of the method. The resulting
    sets are pickled in the current directory ('pretrain.pickle',
    'finetune.pickle' and 'finetuning_{1,2,3}.pickle').

    Args:
        only_docstring_list: methods with only a docstring.
        only_inside_list: methods with only inner comments.
        inside_and_docstring_list: methods with both.
        Each list left to None is loaded from '<argument name>.pickle'.

    Raises:
        AssertionError: if an instance is shared by pre-training and fine-tuning.
    """
    def _loadPickle(path):
        # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
        with open(path, 'rb') as fp:
            return pickle.load(fp)

    def _dumpPickle(obj, path):
        with open(path, 'wb') as fp:
            pickle.dump(obj, fp, protocol=pickle.HIGHEST_PROTOCOL)

    sep_span = re.compile(r"<sep>([\s\S]*?)<sep>")

    if only_docstring_list is None:
        only_docstring_list = _loadPickle('only_docstring_list.pickle')

    if only_inside_list is None:
        only_inside_list = _loadPickle('only_inside_list.pickle')

    if inside_and_docstring_list is None:
        inside_and_docstring_list = _loadPickle('inside_and_docstring_list.pickle')

    # The order of the three calls matters: they consume the same random generator.
    pretraining_1 = random.sample(only_docstring_list, k=round(len(only_docstring_list) * (2/3)))
    pretraining_2 = random.sample(only_inside_list, k=round(len(only_inside_list) * (2/3)))
    pretraining_3 = random.sample(inside_and_docstring_list, k=round(len(inside_and_docstring_list) * (2/3)))

    pretraining_1_3 = []

    # Only docstring: the first span is the docstring; it is moved before the method.
    for item in pretraining_1:
        comment = sep_span.findall(item)[0]
        sample = item.replace('<sep>','').replace(comment,'')
        pretraining_1_3.append('<sep> ' + comment + ' <sep>\n' + sample)

    # Docstring and inner comments: the docstring is the last span.
    for item in pretraining_3:
        comment = sep_span.findall(item)[-1].strip()
        body = item.replace('<sep> '+comment+' <sep>','')
        pretraining_1_3.append('<sep> ' + comment + ' <sep>\n' + body)

    pretrain = set(pretraining_1_3).union(set(pretraining_2))

    # Whatever has not been sampled for pre-training goes to fine-tuning.
    finetuning_1 = set(only_docstring_list).difference(set(pretraining_1))
    finetuning_2 = set(only_inside_list).difference(set(pretraining_2))
    finetuning_3 = set(inside_and_docstring_list).difference(set(pretraining_3))

    finetune = finetuning_1.union(finetuning_2,finetuning_3)

    #Check for duplicates between pre-training and fine-tuning instances
    assert(len(pretrain.intersection(finetune))==0)

    print('Datasets length: ')
    print('Fine tuning: ', len(finetune))
    print('Pre-training: ',len(pretrain))

    #Saving data
    _dumpPickle(pretrain, 'pretrain.pickle')

    # Complete finetuning dataset
    _dumpPickle(finetune, 'finetune.pickle')

    _dumpPickle(finetuning_1, 'finetuning_1.pickle')
    _dumpPickle(finetuning_2, 'finetuning_2.pickle')
    _dumpPickle(finetuning_3, 'finetuning_3.pickle')

# Alternative call that loads the three lists from the pickles saved in the current directory:
# datasetSplittingAndPreparation(only_docstring_list=None, only_inside_list=None, inside_and_docstring_list=None)
# NOTE(review): the three variables below are not assigned in the visible cells (they are the
# values returned by preprocessFullDataset); on a fresh kernel this line raises NameError
# unless preprocessFullDataset has been run first -- confirm, or use the call above.
datasetSplittingAndPreparation(only_docstring_list=only_docstring_list, only_inside_list=only_inside_list, inside_and_docstring_list=inside_and_docstring_list)

In [None]:
# Set javadoc=True to generate the javadoc task-specific dataset as well
def createDatasetPerTask(javadoc=True, keep_comment=False, save4analysis=False):
    """Build the task-specific fine-tuning datasets from the fine-tuning split.

    The inner-comment dataset is made of the snippets returned by
    extractSamples for the methods of 'finetuning_2.pickle' and
    'finetuning_3.pickle'; it is saved as 'finetuning_multi_comment.pickle'
    or 'finetuning_single_comment.pickle' depending on `keep_comment`.
    The javadoc dataset, saved as 'finetuning_javadoc.pickle', is made of the
    methods of 'finetuning_1.pickle' and 'finetuning_3.pickle'.

    Args:
        javadoc: when True the javadoc dataset is generated as well.
        keep_comment: forwarded to extractSamples; when True the snippets may
            contain comments other than the target one.
        save4analysis: when True the datasets are also appended, one indexed
            entry per instance, to text files for manual inspection.
    """
    def _loadPickle(path):
        # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
        with open(path, 'rb') as fp:
            return pickle.load(fp)

    def _appendForAnalysis(path, elements):
        with open(path, 'a+') as ft_txt:
            for (idx,element) in enumerate(list(elements)):
                ft_txt.write('idx: '+str(idx)+' '+element+'\n')

    finetune_inside = []
    finetune_docstring = []

    # ************** Loading data **************
    # These are the files written by datasetSplittingAndPreparation.
    if javadoc:
        finetune_docstring.extend(_loadPickle('finetuning_1.pickle'))

    ft2 = _loadPickle('finetuning_2.pickle')
    # Needed for the inner comments too, so it is loaded whatever `javadoc` is.
    ft3 = _loadPickle('finetuning_3.pickle')

    for item in list(ft2):
        finetune_inside.extend(extractSamples(item, keep_comment=keep_comment))

    for item in list(ft3):
        finetune_inside.extend(extractSamples(item, keep_comment=keep_comment, withJavaDoc=True))

        if javadoc:
            finetune_docstring.append(item)

    #Save finetuning for the javadoc
    if javadoc:
        set_1_ft_docstring = list(dict.fromkeys(finetune_docstring))

        with open('finetuning_javadoc.pickle', 'wb') as fp:
            pickle.dump(set_1_ft_docstring, fp, protocol=pickle.HIGHEST_PROTOCOL)

    set_2_ft_inside = list(dict.fromkeys(finetune_inside))

    if keep_comment:
        filename_ft = 'finetuning_multi_comment'
    else:
        filename_ft = 'finetuning_single_comment'

    with open('%s.pickle' % filename_ft, 'wb') as fp:
        pickle.dump(set_2_ft_inside, fp, protocol=pickle.HIGHEST_PROTOCOL)

    if save4analysis:
        _appendForAnalysis('%s.txt' % filename_ft, set_2_ft_inside)

        if javadoc:
            _appendForAnalysis('finetuning_docstring.txt', set_1_ft_docstring)

# If keep_comment=True the extracted context is expanded to the non-target comments as well.
# With save4analysis=True the datasets are also appended to text files for inspection.
createDatasetPerTask(javadoc=True, keep_comment=False, save4analysis=True)