In [87]:
import os
import sys
import fnmatch
import pandas as pd
import operator
import shutil
def create_dev_test_train_split_and_vocabulary(root_path, 
                                               train_output, 
                                               vocabFile
                                              ):

    train_file = ''
    dev_file = ''
    test_file = ''

    word_counts = dict()
    
    for root, dirnames, filenames in os.walk(root_path):
        for filename in fnmatch.filter(filenames, '*.csv'):

            path = os.path.join(root, filename)

            if filename.endswith("test.csv"):
                test_file = path

            elif filename.endswith("dev.csv"):
                dev_file = path

            else:
                path = "csv/oneLineCode.csv"
                train_file = path
                dataframe = pd.read_csv(path, na_filter = False)
                for _,data in dataframe.iterrows():
                    s =  data["comment"] +" "+data["code"]
                    add_counts(word_counts, s)
#                 with open(path, 'r', encoding='utf-8') as text:
#                     for line in text:
#                         add_counts(word_counts, line)

    vocabulary = build_vocabulary(word_counts)
    write_vocabulary(vocabulary, vocabFile)

    write_processed_dataset(train_file, train_output,vocabFile)
#     write_processed_dataset(dev_txt_files, dev_output)
#     write_processed_dataset(test_txt_files, test_output)

def write_processed_dataset(input_file, output_file, vocabFile):
    names = [ 'comment', 'code','non-information']
    df = pd.DataFrame()
    word_vocabulary = read_vocabulary(vocabFile)
    dataframe = pd.read_csv(input_file, na_filter = False)
    for i,d in dataframe.iterrows():
        comment = []
        code = []
        label = 1 if d["non-information"] == "yes" else 0
        for token in d["comment"].split():
            comment.append(word_vocabulary.get(token))
        for token in d["code"].split():
            code.append(word_vocabulary.get(token))
        da = [{
            "comment":comment,
            "code":code,
            "label":label,
        }]
        print(da)
        df = df.append(da,ignore_index=True,sort=False)
    print(df)
    df.to_csv(output_file, index=False)

In [88]:
#TODO stopwords
STOPWORDS=["and","or","i"]
def iterable_to_dict(arr):
    return dict((x.strip(), i) for (i, x) in enumerate(arr))

def read_vocabulary(file_name):
    with open(file_name, 'r', encoding='utf-8') as f:
        return iterable_to_dict(f.readlines())
def add_counts(word_counts, line):
    for w in line.split():
        if w in STOPWORDS:
            continue
        word_counts[w] = word_counts.get(w, 0) + 1
END = "</S>"
UNK = "<UNK>"
NUM = "<NUM>"

def dump(d, path):
    with open(path, 'w') as f:
        for s in d:
            f.write("%s\n" % repr(s))
        
def write_vocabulary(vocabulary, file_name):
    if END not in vocabulary:
        vocabulary.append(END)
    if UNK not in vocabulary:
        vocabulary.append(UNK)

    print("Vocabulary size: %d" % len(vocabulary))

    with open(file_name, 'w', encoding='utf-8') as f:
        f.write("\n".join(vocabulary))
MAX_WORD_VOCABULARY_SIZE = 100000
MIN_WORD_COUNT_IN_VOCAB = 2
MAX_SEQUENCE_LEN = 50
def build_vocabulary(word_counts):
    return [wc[0] for wc in reversed(sorted(word_counts.items(), key=operator.itemgetter(1))) if wc[1] >= MIN_WORD_COUNT_IN_VOCAB and wc[0] != UNK][:MAX_WORD_VOCABULARY_SIZE] # Unk will be appended to end

In [89]:
root_path = "csv"
vocabFile = "csv/vocab.txt"
create_dev_test_train_split_and_vocabulary(root_path,"csv/test_out.csv",vocabFile)

Vocabulary size: 5056
[{'comment': [1654, 379, 26, 17, 5053], 'code': [5, 38, 15, 5052, 5051, 709, 1653, 1652, 457, 2], 'label': 1}]
[{'comment': [5050], 'code': [5049, 378], 'label': 1}]
[{'comment': [5048, 74, 6, 0, 968, 173, 80, 32, 0, 5047], 'code': [5046, 5045], 'label': 0}]
[{'comment': [967, 8, 0, 54, 966, 456, 1, 553, 0, 34, 90], 'code': [25, 62, 5044, 5043, 2], 'label': 1}]
[{'comment': [113, 144, 72], 'code': [5042], 'label': 0}]
[{'comment': [112, 8, 76, 4, 3, 58, 51, 32, 0, 335, 965, 71, 455, 19, 4, 3, 1651, 6, 14, 1650, 57, 201, 0, 58, 5041], 'code': [552, 1649, 7, 66], 'label': 0}]
[{'comment': [159, 29, 51], 'code': [10, 5040, 5039, 29, 107, 5038, 2], 'label': 1}]
[{'comment': [49, 5037, 5036, None, 5035, 44, 13, 551, 33, 9, 5034, 1648, 49, 964, 1647, 1646, 14, 23, 4, 24, 551, 9, 5033], 'code': [], 'label': 0}]
[{'comment': [5032, 5031, 377, 143, 376, 5030, 1645, 5029, 5028], 'code': [8, 5027, 292, 5026, 2], 'label': 0}]
[{'comment': [963, 106, 1644, 1, 0, 172, 454, 142]

[{'comment': [84, 186, 80, 8, 52, 896, 4, 87], 'code': [432, 7, 4647], 'label': 0}]
[{'comment': [55, 70, 1, 84, 4646, 10, 0, 895, 73, 57, 14, 1522, 74, 31, 894, 4645], 'code': [4644, 4643, 4642, 4641, 2], 'label': 0}]
[{'comment': [49, 358, 893, 1521, 4640], 'code': [43, 326, 2], 'label': 0}]
[{'comment': [55, 70, 4639, 0, 431, 10, 0, 357, 6, 0, 195, 705, 55, 86, 1, 106, 23, 4638], 'code': [4637], 'label': 0}]
[{'comment': [114, None, 271, 1520], 'code': [4636, 1520, 7, 11, 4635, 4634], 'label': 1}]
[{'comment': [71, 35, 180, 663, 201, 23, 26, 0, 51, 675], 'code': [8, 1519, 194, 124, 2], 'label': 1}]
[{'comment': [4633, 430, 10, 211, 662, 524, 4632], 'code': [25, 12, 4631, 82, 4630, 2], 'label': 0}]
[{'comment': [16, 12, 429, 3, 157, 9, 94, 56, 523, 20, 369, 144, 4, 0, 892, 0, 56, 255, 0, 277, None, 0, 11, 523, 1518, 323, 63, 13, 891], 'code': [5, 12, 4629, 42, 522, 2], 'label': 0}]
[{'comment': [1517, 0, 961, 163], 'code': [4628, 4627, 7, 4626], 'label': 0}]
[{'comment': [661, 308, 8

[{'comment': [426, 269, 819, None, 169, 505, 9, 410], 'code': [25, 15, 4299, 2], 'label': 0}]
[{'comment': [49, 111, 122, 61, 906, 122, 7, 11, 1534, 548, 4298, 702], 'code': [409, 4297, 7, 4296], 'label': 0}]
[{'comment': [408, 36, 671, 21, 1355, None, 708, 169, 184], 'code': [5, 15, 4295, 2], 'label': 0}]
[{'comment': [818, 4294, 270, 817, 4293, 492, 4292, 1354, 1353, 16, 63, 13, 4291, 53, 129, 3, 4290, 1460, 1352, 348, 4289], 'code': [8, 4288, 2], 'label': 0}]
[{'comment': [614, 0, 134, 17, 1351, 22, 0, 155, 240, 190, None, 4287, 10, 4286], 'code': [5, 38, 15, 4285, 1350, 2], 'label': 1}]
[{'comment': [99, 4284], 'code': [5, 12, 4283, 42, 491, 2], 'label': 1}]
[{'comment': [613, 0, 137, 212, 26, 3, 1349, 1348], 'code': [5, 38, 61, 407, 4282, 4281, 4280, 4279, 178, 4278, 2], 'label': 0}]
[{'comment': [8, 4277, 2, 521, 4276, 139, 29, 7, 4275, 237, 4274, 233, 1347, 1, 4273, 366, 1346, 7, 11, 4272, 1345, 138, 123, 22, 4271, 930, 74, 7, 4270, 1344, 10, 4269, 4268, 4267, 2, 4266, 4265, 43,

[{'comment': [113, 3885, 72], 'code': [3884], 'label': 1}]
[{'comment': [3883, 1250, 93, 217], 'code': [25, 38, 61, 37, 3882, 7, 3881], 'label': 1}]
[{'comment': [150, 707], 'code': [3880], 'label': 1}]
[{'comment': [113, 3879], 'code': [3878], 'label': 1}]
[{'comment': [613, 0, 450, 137, 212, 26, 3, 3877, 1349, 1348, 348, 79, 3876], 'code': [5, 38, 61, 407, 3875, 3874, 780, 444, 3873, 178, 227, 2], 'label': 0}]
[{'comment': [113, 3872, 72], 'code': [3871], 'label': 1}]
[{'comment': [3870, None, 595, 393], 'code': [3869, 59, 2], 'label': 0}]
[{'comment': [502, 22, 700, 48, 3868, 1473, 492, 10, 3867, 74, None, 1249, 1497, 9, 110, 76, 31, 530, 74], 'code': [3866, 3865, 7, 11, 3864, 3863], 'label': 0}]
[{'comment': [3862, 3, 34, 80, 1, 3, 34, 56, 10, 36, 39, 9, 3, 1248, 16, 18, 1247, 350, 150, 94, 1246, 98, 162, 3, 369, 931, 14, 44, 13, 451, 53, 0, 1245], 'code': [25, 38, 779, 3861, 268, 234, 510, 37, 3860, 2], 'label': 0}]
[{'comment': [55, 70, 1, 84, 778, 1, 115, 0, 279, 919], 'code': [

[{'comment': [16, 3675, 3674, 4, 3673, 0, 335, 33, 17, 1199, 55, 208, 306, 14, 0, 505, 6, 36, 819, 3672, 146, 1, 0, 3671, 505, 6, 0, 1198, 3670, 9, 959, 1, 17, 1199, 35, 585, 0, 819, 3669, 53, 169, 861, 3668], 'code': [5, 12, 3667, 82, 3666, 3665, 2], 'label': 0}]
[{'comment': [55, 70, 1, 3664, 0, 3663, 26, 0, 1608, 1, 3662], 'code': [3661, 3660, 7, 3659, 3658], 'label': 0}]
[{'comment': [680, 88, 532, 6, 17, 1609, 9, 0, 278, 187, 442, 171, 4, 1567], 'code': [5, 15, 3657, 280, 2], 'label': 0}]
[{'comment': [16, 672, 0, 277, 620, 1371, 57, 14, 374, 1370, 91, 0, 11, 1369, 290, 0, 830, 1368, 19, 12, 44, 13, 829], 'code': [692], 'label': 0}]
[{'comment': [271, 9, 78, 3656], 'code': [3655, 3654], 'label': 0}]
[{'comment': [348, 8, 76, 31, 922, 698, 607, 914, 1, 211, 338, 71, 455, 35, 86, 1, 84, 3653], 'code': [780, 3652, 3651, 7, 11, 868], 'label': 0}]
[{'comment': [49, 3650], 'code': [], 'label': 0}]
[{'comment': [49, 634, 133, 206, 3649, 506, 3648, 47, 170, 284, 47, 3647, 47, 3646, 170, 4

[{'comment': [3311, 1256, 10, 3310, 0, 3309, 91, 0, 3308, 56], 'code': [11, 3307], 'label': 0}]
[{'comment': [237, 49, 661, 3306, 3305, 6, 0, 74, 3304, 3303, 7, 11, 3302, 3301, 47, 3300, 70, 89, 654, 1, 0, 161, 3299, 3298, 47, 3297, 3296, 59, 3295, 3294, 3293], 'code': [28, 11, 3292, 3291], 'label': 0}]
[{'comment': [3290, 37, 3289, 10, 236, 177], 'code': [692], 'label': 0}]
[{'comment': [1145, 635, 3288], 'code': [574, 3287, 7, 11, 3286], 'label': 0}]
[{'comment': [112, 8, 35, 44, 1144, 0, 3285, 3284, 130, 9, 862], 'code': [62, 1144, 7, 3283], 'label': 1}]
[{'comment': [3282, 0, 54, 1, 757, 39, 3281], 'code': [3280], 'label': 1}]
[{'comment': [3279, 0, 251, 16, 18, 4, 1143, 10, 447, 3, 351, 122, 802, 1, 0, 3278, 57, 0, 157, 63, 13, 746], 'code': [5, 135, 15, 3277, 307, 366, 3276], 'label': 0}]
[{'comment': [578, 493], 'code': [3275], 'label': 0}]
[{'comment': [582, 1, 3274, 149, 470, 99], 'code': [3273, 3272, 7, 11, 3271], 'label': 0}]
[{'comment': [113, 3270, 72], 'code': [3269], 'la

[{'comment': [75, 142, 191, 90], 'code': [8, 1299, 131, 124, 2], 'label': 1}]
[{'comment': [2958, 672, 3, 2957, 26, 571, None, 4, 50, 9, 0, 334, 1095, 46, 210, None, 699], 'code': [], 'label': 0}]
[{'comment': [20, 953, 196, 10, 103, 2956], 'code': [12, 2955, 42, 2954, 82, 1493, 2], 'label': 0}]
[{'comment': [301, 0, 1134, 6, 0, 677], 'code': [2953, 340, 340, 784], 'label': 1}]
[{'comment': [55, 77, 1, 1, 203, 2952, 74, 9, 0, 93, 56, 289, 1, 0, 93, 156, 14, 474, 198, 344, 10, 2951, 116, 2950], 'code': [61, 2949, 2948, 7, 2947, 2946, 59, 2], 'label': 0}]
[{'comment': [2945, 1454, 400, 2944, 10, 159, 116], 'code': [574, 2943, 7, 2942], 'label': 0}]
[{'comment': [1094, 0, 404, 21, None, 192, 652, 2941, 305, 20, 652, 1502, 48, 1, 13, 2940, 22, 53, 645, 0, 1093, 539, 823, 287, 256, 2939, 2938, 274, 81, 1093, 4, 1113, 199, 0, 507, 1092, 14, 4, 180, 30, 13, 2937, None, 22, 74, 9, 14, 453, 0, 2936, 30, 13, 2935, None, 74, 31, 948, 1091, 2934, 274, 3, 179, 1, 3, 389, 21, 4, 593, 1, 2933, 199, 0

[{'comment': [2589, 728], 'code': [5, 38, 61, 37, 2588, 7, 2587], 'label': 1}]
[{'comment': [95, 1, 349, 9, 110, 0, 143, 312, 24, 13, 2586, 1645, 8, 0, 21, 4, 24, 180, 95, 1, 349, 9, 110, 0, 2585, 1, 0, 1208, 528, 78, 4, 2584, 151, 24, 1042, 95, 1, 349, 8, 0, 2583, 179, 4, 24, 299, 9, 0, 186], 'code': [219, 62, 2582], 'label': 0}]
[{'comment': [111, 212, 120, 2581, 10, 467, 2580], 'code': [2579], 'label': 0}]
[{'comment': [944, 0, 45, 39, None, 1449, 257, 32, 0, 45, 276, 404, 443, 2578, 199, 23, 4, 1142, 1, 0, 542, 16, 1234, 413, 13, 2577], 'code': [5, 12, 2576, 2], 'label': 0}]
[{'comment': [16, 12, 787, 3, 51, 6, 229, 65, 14, 63, 13, 243, 1, 26, 395, 20, 12, 253, 2575, 144, 1, 1326, 27, 531, 3, 507, 466, None, 3, 314, 1, 64, 240, 651, 65, 6, 19, 580], 'code': [5, 12, 2574, 82, 552, 2], 'label': 0}]
[{'comment': [2573, 2572, 48, 89, 1041, 348, 8, 76, 1040, 94, 2571], 'code': [8, 2570, 2], 'label': 0}]
[{'comment': [290, 110, 23, 1039, 24, 200], 'code': [354, 2569, 2568, 2], 'label': 1

[{'comment': [88, 386, 12, 50, 33, 3, 239, 10, 0, 385, 384, 383, 382, 1, 203, 296, 381], 'code': [5, 12, 2193, 2], 'label': 0}]
[{'comment': [52, 345], 'code': [43], 'label': 0}]
[{'comment': [88, 386, 12, 50, 33, 3, 239, 10, 0, 385, 384, 383, 382, 1, 203, 296, 381], 'code': [5, 147, 2192, 2], 'label': 0}]
[{'comment': [88, 386, 12, 50, 33, 3, 239, 10, 0, 385, 384, 383, 382, 1, 203, 296, 381], 'code': [5, 12, 2191, 2], 'label': 0}]
[{'comment': [105, 365], 'code': [43], 'label': 1}]
[{'comment': [390, 1018, 1017, 423], 'code': [1016, 1015, 7, 11, 1014, 1013, 1012, 567, 762], 'label': 0}]
[{'comment': [390, 1018, 1017, 423], 'code': [1016, 1015, 7, 11, 1014, 1013, 1012, 567, 762], 'label': 0}]
[{'comment': [88, 386, 12, 50, 33, 3, 239, 10, 0, 385, 384, 383, 382, 1, 203, 296, 381], 'code': [5, 12, 2190, 82, 2189, 2], 'label': 0}]
[{'comment': [88, 386, 12, 50, 33, 3, 239, 10, 0, 385, 384, 383, 382, 1, 203, 296, 381], 'code': [5, 12, 2188, 2], 'label': 0}]
[{'comment': [88, 386, 12, 50, 3

[{'comment': [16, 30, 69, None, 100, 3, 11, 188, 127, 85, 30, 294, 3, 380, 462, 176, 22, 0, 245, None, 3, 325, None, 461, 387, 158, 69, 3, 188, 46, 32, 182, 153, 207, 79, 17, 293, 118, 118, 556], 'code': [62, 712, 174, 37, 463], 'label': 0}]
[{'comment': [114, None, 100, 3, 11, 188, 127, 85, 30, 294, 3, 380, 462, 176, 22, 0, 245, None, 3, 325, 465, 34, 337, None, 461, 387, 158, 69, 3, 188, 46, 32, 182, 153, 207, 79, 17, 293, 118, 118, 554], 'code': [62, 712, 174, 37, 295, 37, 1804], 'label': 0}]
[{'comment': [114, None, 100, 3, 11, 188, 127, 85, 30, 294, 3, 380, 462, 176, 22, 0, 245, None, 3, 325, 465, 34, 337, None, 461, 711, 32, 34, 337, 387, 158, 69, 3, 188, 46, 32, 182, 153, 207, 79, 17, 293, 118, 118, 554, 205, 349, 8, 0, 84, 1803, 1802, 238, 1005], 'code': [62, 712, 174, 37, 295, 37, 990, 37, 1801], 'label': 0}]
[{'comment': [114, None, 100, 3, 11, 188, 127, 85, 30, 294, 3, 380, 462, 176, 22, 0, 245, None, 3, 984, 465, 34, 337, None, 461, 711, 32, 34, 337, 387, 158, 69, 3, 188, 4

[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 250], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 250], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 262], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 66], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [], 'label': 1}]
[{'comment': [41, 18, 40], 'code': [28, 262], 'label'

In [65]:
dataframe = pd.read_csv("csv/oneLineCode.csv", na_filter = False)
for _,data in dataframe.iterrows():
    s =  data["comment"] +" "+data["code"]
    print(s)

@implNote taken from {@link com.sun.javafx.scene.control.behavior.TextAreaBehavior#contextMenuRequested(javafx.scene.input.ContextMenuEvent)} public static void showContextMenu(TextArea textArea, ContextMenu contextMenu, ContextMenuEvent e) {
icon.setToolTipText(printedViewModel.getLocalization()); TABLE_ICONS.put(SpecialField.PRINTED, icon);
Synchronize changes of the underlying date value with the temporalAccessorValue BindingsHelper.bindBidirectional(valueProperty(), temporalAccessorValue,
Ask if the user really wants to close the given database private boolean confirmClose(BasePanel panel) {
css: information * INTEGRITY_INFO(MaterialDesignIcon.INFORMATION),
Check if there is a default type with the same name. If so, this is a modification of that type, so remove the default one: ExternalFileType toRemove = null;
each entry type for (Map.Entry<EntryType, TextField> entry : textFields.entrySet()) {
TODO: Username, domain and identity should be included as in .NET version. TODO: Shoul

Generate keys BibtexKeyPatternPreferences prefs = jabRefPreferences.getBibtexKeyPatternPreferences();
This class is similar to {@link GraphicValidationDecoration} but with a different style and font-based icon. public class IconValidationDecorator extends GraphicValidationDecoration {
This method returns a JComponent detailing the nature of the change. public abstract Node description();
only remove explicit groups from the entries, keyword groups should not be deleted if (group.getGroupNode().getGroup() instanceof ExplicitGroup) {
Are there children (or children of children...) that are matched? If yes we also need to show this node return node.children.getSource().stream().anyMatch(this::showNode);
If there are entries to add if (!toAdd.isEmpty()) {
node should be generated for each call, as nodes can be added to the scene graph only once return icon.getGraphicNode();
The user doesn't want to override cite keys if (!overwriteKeys) {
TODO: Add undo panel.getUndoManager().addEdit(new U

boolean options options.addOption("h", "help", false, Localization.lang("Display help on command line options"));
This class provides methods to create default JavaFX dialogs which will also work on top of Swing windows. The created dialogs are instances of the {@link FXDialog} class. The available dialogs in this class are useful for displaying small information graphic dialogs rather than complex windows. For more complex dialogs it is advised to rather create a new sub class of {@link FXDialog}. public class JabRefDialogService implements DialogService {
Need to force the alert to layout in order to grab the graphic as we are replacing the dialog pane with a custom pane alert.getDialogPane().applyCss();
Create a new dialog pane that has a checkbox instead of the hide/show details button Use the supplied callback for the action of the checkbox alert.setDialogPane(new DialogPane() {
Fool the dialog into thinking there is some expandable content; a group won't take up any space if it h