In [None]:
import os
import sys
import fnmatch
import pandas as pd
import operator
import shutil
import re
import javalang
def create_dev_test_train_split_and_vocabulary(root_path,
                                               train_output,
                                               vocabFile,
                                               train_input="csv/oneLineCode.csv"):
    """Build the word vocabulary from the training CSV and export the encoded rows.

    ``root_path`` is walked for ``*.csv`` files. Names ending in ``test.csv``
    or ``dev.csv`` are only recorded; any other CSV signals that training data
    is present, in which case ``train_input`` is used as the training file.

    The training file is tokenized once (comments with ``splitComment``, code
    with ``tokenizeJavaCode``) to fill the word counts. The resulting
    vocabulary is written to ``vocabFile`` and the encoded training rows are
    written to ``train_output`` by ``write_processed_dataset``.

    Args:
        root_path: Directory tree scanned for CSV files.
        train_output: Destination CSV for the encoded training rows.
        vocabFile: Destination text file for the vocabulary, one entry per line.
        train_input: CSV with ``comment``, ``code`` and ``non-information``
            columns. Defaults to the path that used to be hard-coded in the
            function body, so existing three-argument calls are unaffected.
    """
    train_file = ''
    # Located during the walk, but nothing in this function consumes them yet.
    dev_file = ''
    test_file = ''

    for root, _dirnames, filenames in os.walk(root_path):
        for filename in fnmatch.filter(filenames, '*.csv'):
            path = os.path.join(root, filename)

            if filename.endswith("test.csv"):
                test_file = path
            elif filename.endswith("dev.csv"):
                dev_file = path
            else:
                # Only remember that training data exists. The counting is done
                # once, after the walk: doing it here re-read the same file for
                # every extra CSV in the tree (including a previous run's
                # output), which multiplied every count and made re-runs
                # produce a different vocabulary.
                train_file = train_input

    word_counts = dict()
    if train_file:
        dataframe = pd.read_csv(train_file, na_filter=False)
        for i, data in dataframe.iterrows():
            comment_tokens = splitComment(data["comment"])
            code_tokens = tokenizeJavaCode(data["code"])
            print(i) # row number
            print("comment:")
            print(comment_tokens) # comment words
            print("code:")
            print(code_tokens) # code words
            # Count tokens only. The raw file is deliberately not fed to
            # add_counts line by line: add_counts iterates its argument, so a
            # str would be counted character by character (header, commas and
            # newlines included).
            add_counts(word_counts, comment_tokens + code_tokens)

    vocabulary = build_vocabulary(word_counts)
    print("-- VOCABULARY BEGIN --")
    print(vocabulary)
    print("-- VOCABULARY END --")
    write_vocabulary(vocabulary, vocabFile)

    # With no training CSV found, train_file is still '' and the read inside
    # write_processed_dataset fails, exactly as before.
    write_processed_dataset(train_file, train_output,vocabFile)
#     write_processed_dataset(dev_txt_files, dev_output)
#     write_processed_dataset(test_txt_files, test_output)

def write_processed_dataset(input_file, output_file, vocabFile):
    """Encode a labelled comment/code CSV as vocabulary ids and save it as CSV.

    Every row of ``input_file`` is tokenized (``splitComment`` for the
    ``comment`` column, ``tokenizeJavaCode`` for the ``code`` column) and each
    token is replaced by its id from the vocabulary stored in ``vocabFile``.
    Tokens missing from the vocabulary are encoded as ``0``.

    Args:
        input_file: CSV with ``comment``, ``code`` and ``non-information`` columns.
        output_file: Destination CSV with ``comment``, ``code`` and ``label`` columns.
        vocabFile: Vocabulary file read through ``read_vocabulary``.
    """
    word_vocabulary = read_vocabulary(vocabFile)
    dataframe = pd.read_csv(input_file, na_filter = False)

    # Collect plain records and build the frame once: DataFrame.append no
    # longer exists in pandas 2.x and was quadratic when called per row.
    records = []
    for _, row in dataframe.iterrows():
        record = {
            "comment": [word_vocabulary.get(token, 0) for token in splitComment(row["comment"])],
            "code": [word_vocabulary.get(token, 0) for token in tokenizeJavaCode(row["code"])],
            # "yes" marks a non-informative comment; anything else is label 0.
            "label": 1 if row["non-information"] == "yes" else 0,
        }
        print([record])
        records.append(record)

    # Explicit columns keep the header in place even when the input has no rows.
    encoded = pd.DataFrame(records, columns=["comment", "code", "label"])
    encoded.to_csv(output_file, index=False)

In [None]:
# Separators used by splitComment to cut a comment into raw pieces.
delimiters = "#", ".",",","<b>","</b>","-",":","<br>","_","?"," ",";"
def splitComment(string,delimiters = delimiters, maxsplit=0):
    """Split a comment into word tokens.

    Steps, in order:
      1. every whitespace-separated word starting with ``https`` is replaced
         by the literal ``https``;
      2. the text is split on ``delimiters``;
      3. each piece is further split at lower-to-upper case boundaries
         (``getUserName`` -> ``get``, ``User``, ``Name``).

    Args:
        string: Comment text.
        delimiters: Iterable of literal separator strings.
        maxsplit: Maximum number of delimiter splits; ``0`` means no limit.

    Returns:
        List of non-empty tokens in their original order.
    """
    # The word list is taken from the text as passed in; replacements are
    # applied to the running string.
    for word in string.split():
        if word.startswith("https"):
            string = string.replace(word, "https")

    regexPattern = '|'.join(map(re.escape, delimiters))
    # maxsplit is passed by keyword: the positional form is deprecated since
    # Python 3.13.
    pieces = re.split(regexPattern, string, maxsplit=maxsplit)

    tokens = []
    for piece in pieces:
        # Insert a space before an upper-case letter that follows a lower-case
        # one, or that starts a capitalised word inside the piece.
        spaced = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', piece)
        # str.split() never yields empty strings, so no extra filtering is needed.
        tokens.extend(spaced.split())
    return tokens
def tokenizeJavaCode(code):
    """Return the token values of a Java snippet, or its characters on failure.

    The snippet is tokenized with ``javalang.tokenizer.tokenize``. If
    tokenization raises, the function falls back to ``list(code)``, i.e. one
    entry per character, so callers always receive a list.

    Args:
        code: Java source text.

    Returns:
        List of token strings, or list of single characters when the snippet
        cannot be tokenized.
    """
    try:
        return [token.value for token in javalang.tokenizer.tokenize(code)]
    except Exception:
        # Best-effort fallback kept on purpose, but narrowed from a bare
        # ``except`` so KeyboardInterrupt / SystemExit are no longer swallowed.
        return list(code)
# TODO: extend the stop-word list.
# Raw regex patterns (case-insensitive through the inline (?i) flag) that
# add_counts compiles and applies with .match() to decide which tokens are
# skipped when counting.
STOPWORDS_RAW=["(?i)or$", "(?i)and$", "(?i)i$"]

def iterable_to_dict(arr):
    """Map each stripped entry of ``arr`` to its 1-based position.

    Ids start at 1. When two entries strip to the same key, the later
    position wins.

    Args:
        arr: Iterable of strings (a list or a one-shot iterator).

    Returns:
        Dict from stripped string to 1-based index.
    """
    # Build the mapping once and print that same object: the previous debug
    # pass printed 0-based ids that did not match the returned ones, and it
    # consumed one-shot iterables so the returned dict came back empty.
    mapping = {entry.strip(): position for position, entry in enumerate(arr, start=1)}
    print("-- DEBUG BEGIN --")
    print(mapping)
    print("-- DEBUG END --")
    return mapping

def read_vocabulary(file_name):
    """Load a vocabulary file (UTF-8, one entry per line) via ``iterable_to_dict``."""
    with open(file_name, mode='r', encoding='utf-8') as vocab_handle:
        vocab_lines = vocab_handle.readlines()
    return iterable_to_dict(vocab_lines)
def add_counts(word_counts, line):
    """Increment ``word_counts`` for every element of ``line`` that is not a stop word.

    ``line`` is iterated element by element; an element is skipped (and
    reported on stdout) when any pattern from ``STOPWORDS_RAW`` matches it.
    ``word_counts`` is updated in place.
    """
    compiled_stopwords = [re.compile(raw_pattern) for raw_pattern in STOPWORDS_RAW]
    print(compiled_stopwords)

    for token in line:
        is_stopword = False
        for matcher in compiled_stopwords:
            if matcher.match(token):
                is_stopword = True
                break

        if is_stopword:
            print("Stopword:",token)
        else:
            word_counts[token] = word_counts.get(token, 0) + 1
# Special vocabulary entries. write_vocabulary appends END and UNK to the
# vocabulary when they are missing, and build_vocabulary keeps UNK out of the
# counted words. NUM is not referenced by the code visible in this notebook.
END = "</S>"
UNK = "<UNK>"
NUM = "<NUM>"

def dump(d, path):
    """Write ``repr()`` of every item in ``d`` to ``path``, one item per line.

    Args:
        d: Iterable of objects to serialise.
        path: Destination file; it is overwritten.
    """
    # Explicit UTF-8, like the other writers in this notebook: repr() keeps
    # printable non-ASCII characters, which a non-UTF-8 platform default
    # encoding cannot always encode.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % repr(s) for s in d)
        
def write_vocabulary(vocabulary, file_name):
    """Write the vocabulary to ``file_name`` (UTF-8), entries joined by newlines.

    ``END`` and ``UNK`` are appended to ``vocabulary`` first when absent; the
    list passed in is modified in place.
    """
    for marker in (END, UNK):
        if marker not in vocabulary:
            vocabulary.append(marker)

    print("Vocabulary size (write_vocabulary): %d" % len(vocabulary))

    with open(file_name, mode='w', encoding='utf-8') as out:
        out.write("\n".join(vocabulary))
# Upper bound on the number of words build_vocabulary returns.
MAX_WORD_VOCABULARY_SIZE = 100000
# Words counted fewer times than this are left out of the vocabulary.
MIN_WORD_COUNT_IN_VOCAB = 2
# Not referenced by the code visible in this notebook.
MAX_SEQUENCE_LEN = 50
def build_vocabulary(word_counts):
    """Return the words of ``word_counts`` ordered from most to least frequent.

    Words below ``MIN_WORD_COUNT_IN_VOCAB`` and the ``UNK`` marker are
    dropped, and the result is truncated to ``MAX_WORD_VOCABULARY_SIZE``.
    """
    by_count_ascending = sorted(word_counts.items(), key=operator.itemgetter(1))

    frequent_words = []
    # Walk the stable ascending sort backwards, so words with equal counts
    # come out in reverse insertion order.
    for word, count in by_count_ascending[::-1]:
        if count >= MIN_WORD_COUNT_IN_VOCAB and word != UNK:
            frequent_words.append(word)

    return frequent_words[:MAX_WORD_VOCABULARY_SIZE] # Unk will be appended to end

In [None]:
# Run the pipeline on the CSV files under "csv": the vocabulary goes to
# csv/vocab.txt and the encoded training rows to csv/test_out.csv.
# NOTE(review): the second argument is the *training* output path even though
# the file is called test_out.csv - confirm the intended name. The file is
# written inside root_path, so later walks of root_path will also see it.
root_path = "csv"
vocabFile = "csv/vocab.txt"
create_dev_test_train_split_and_vocabulary(root_path,"csv/test_out.csv",vocabFile)

In [None]:
# Inspection cell: print the raw comment and code of every row of the
# training CSV (empty fields stay empty strings because na_filter is off).
dataframe = pd.read_csv("csv/oneLineCode.csv", na_filter = False)
for i,data in dataframe.iterrows():
    s =  data["comment"]  # raw comment text, not tokenized
    c = data["code"]  # raw code text, not tokenized
    print(i)
    print("comment:")
    print(s)
    print("code:")
    print(c)

In [None]:
# Scratch cell for the delimiter / camel-case splitting used by splitComment.
# `delimiters` is re-assigned here with the same value it has where
# splitComment is defined.
delimiters = "#", ".",",","<b>","</b>","-",":","<br>","_","?"," ",";"
example = "stacko#verFlow (c) is a=Wesome... isn't it? D/?DD"
def splitText(delimiters, string, maxsplit=0):
    """Split ``string`` on ``delimiters`` and then at camel-case boundaries.

    Args:
        delimiters: Iterable of literal separator strings.
        string: Text to split.
        maxsplit: Maximum number of delimiter splits; ``0`` means no limit.

    Returns:
        List of non-empty tokens in their original order.
    """
    # Relies on the notebook-level `import re`; the function-local import was
    # redundant.
    regexPattern = '|'.join(map(re.escape, delimiters))
    # maxsplit is passed by keyword: the positional form is deprecated since
    # Python 3.13.
    pieces = re.split(regexPattern, string, maxsplit=maxsplit)

    tokens = []
    for piece in pieces:
        # Insert a space before an upper-case letter that follows a lower-case
        # one, or that starts a capitalised word inside the piece.
        spaced = re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', piece)
        # str.split() never yields empty strings, so no extra filtering is needed.
        tokens.extend(spaced.split())
    return tokens
splitText(delimiters,example)

In [None]:
# javalang smoke test: tokenize one Java statement, feed the token stream to
# the parser and parse it as an expression. The parse result is the value
# displayed by this cell.
tokens = javalang.tokenizer.tokenize('System.out.println("Hello " + "world");')
parser = javalang.parser.Parser(tokens)
parser.parse_expression()

In [None]:
# splitComment demo on a comment that contains punctuation, a Javadoc
# {@link ...} tag and an https URL; the token list is the cell's output.
f = "This class monitors a set of files for changes. Upon detecting a change it notifies the registered {@link FileUpdateListener}s. Implementation based on https://stackoverflow.com/questions/16251273/can-i-watch-for-single-file-change-with-watchservice-not-the-whole-directory"
splitComment(f)