In [None]:
import pandas as pd
import spacy
import time
import re

In [None]:
#Find camelCase words
def camel_case_split(word):
    """Insert a space at every camelCase boundary of ``word``.

    A boundary is either a lowercase letter directly followed by an
    uppercase letter (``fooBar`` -> ``foo Bar``), or the end of an uppercase
    run that is followed by a capitalised word (``HTTPServer`` ->
    ``HTTP Server``).

    Args:
        word: The string to split.

    Returns:
        ``word`` with one space inserted at each boundary. Input without any
        boundary, including the empty string and a newline-only string, is
        returned unchanged.
    """
    # Substituting on the zero-width boundaries (instead of collecting the
    # pieces between them) keeps every character of the input and also works
    # for inputs that contain nothing but a line break or nothing at all.
    splitedWord = re.sub(
        r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', word)

    print('Word after camelCase split:', splitedWord)
    return splitedWord

In [None]:
#Remove symbols which are defined in "seps" from given word
def remove_symbol_from_word(word):
    """Replace every separator symbol in ``word`` with a single space.

    Args:
        word: The string to clean.

    Returns:
        A copy of ``word`` in which each occurrence of one of the symbols
        listed in ``seps`` has been turned into a space. Other characters
        are untouched, and consecutive symbols yield consecutive spaces.
    """
    seps = ("#", ".", '(', ')', '{', '}', '[', ']', '\'', '/', '_', '-', '"', '=')
    # One translation pass maps each symbol to a space.
    symbol_to_space = str.maketrans(dict.fromkeys(seps, ' '))
    cleaned = word.translate(symbol_to_space)

    print('Word after symbol remover:', cleaned)
    return cleaned

In [None]:
#Remove single char word (for example: if "e.g." is modified to "e g", we can remove then "e" and "g")
def remove_single_char_words(comment):
    """Drop every one-character word from ``comment``.

    The comment is split on whitespace, words of length 1 are discarded
    (each one is reported on stdout), and the remaining words are joined
    with single spaces.

    Args:
        comment: The text to filter.

    Returns:
        The remaining words separated by single spaces, with no leading or
        trailing space. An empty string if nothing remains.
    """
    keptWords = []
    for w in comment.split():
        if len(w) == 1:
            print('Remove single char:', w)
        else:
            keptWords.append(w)
    # join() only puts separators between kept words, so a removed first
    # word can no longer leave a leading space in the result.
    return ' '.join(keptWords)

In [None]:
#Remove duplicate white spaces in a sentence
def remove_more_than_one_space_in_sentence(comment):
    """Collapse each run of space characters in ``comment`` into one space.

    Only the literal space character is affected; tabs and line breaks are
    left as they are, and leading/trailing spaces are reduced to one, not
    removed.

    Args:
        comment: The text to normalise.

    Returns:
        The text with every run of spaces replaced by a single space.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', comment)

In [None]:
#Write all words in the comment to lower case
def all_to_lower_case(comment):
    """Return a lower-cased copy of ``comment``.

    Args:
        comment: The text to convert.

    Returns:
        The result of ``comment.lower()``.
    """
    lowered = comment.lower()
    return lowered

In [None]:
#If a url is given in the comment -> remove it
def remove_url_from_comment(word):
    """Remove an http(s) URL that starts a line of ``word``.

    The pattern is anchored to the beginning of a line (multi-line mode), so
    a URL that appears later in a line is left in place. When a line does
    start with ``http://`` or ``https://``, everything up to the end of that
    line is removed together with the line breaks that follow it.

    Args:
        word: The string to clean.

    Returns:
        The string with each such line removed.
    """
    url_at_line_start = re.compile(r'^https?:\/\/.*[\r\n]*', re.MULTILINE)
    return url_at_line_start.sub('', word)

In [None]:
#Replace word with another word (spaCy defines doesn't as two words: does and n't)
def replace_word_with(word):
    """Expand a few special tokens into plain words.

    Only a token that equals one of the keys exactly is replaced:
    ``n't`` -> ``not``, ``'ll`` -> ``will``, ``||`` -> ``or``. Any other
    input is returned unchanged.

    Args:
        word: The token to look up.

    Returns:
        The replacement word, or ``word`` itself when no rule applies.
    """
    substitutes = {
        'n\'t': 'not',
        '\'ll': 'will',
        '||': 'or',
    }
    word = substitutes.get(word, word)
    print('Word after replace with func:', word)
    return word

In [None]:
# Load dataset
url = "csv/newFile3.csv"

# Column labels applied to the CSV
# (alternative label set: ['comment', 'code', 'non-information'])
names = ['comment', 'code', 'Expected']

# skiprows=1: the first row of the file only contains the labels.
# na_filter=False: keep empty cells as empty strings instead of float NaN.
dataframe = pd.read_csv(url, names=names, skiprows=1, na_filter=False)

# English model used to analyse the sentences
nlp = spacy.load("en_core_web_sm")

seps = ["#", ".", '(', ')']

# Per-token cleaning steps, applied in exactly this order
token_steps = (
    replace_word_with,        # expand n't / 'll / ||
    camel_case_split,         # camelCase -> camel Case
    remove_url_from_comment,  # drop URLs
    remove_symbol_from_word,  # symbols -> spaces
)

# Clean both the comment and the code column
for col in ['comment', 'code']:
    start_time = time.time()
    # All values of the current column
    columns = dataframe[col].tolist()

    for idx, val in enumerate(columns):
        print(idx, val)
        print()
        doc = nlp(val)
        pieces = []

        for w in doc:
            # Skip tokens spaCy tags as punctuation or determiner
            if w.pos_ in ('PUNCT', 'DET'):
                print('Removed word via spaCy:', w.text)
                print()
                continue
            print('Word:', w.text)

            word = w.text
            for step in token_steps:
                word = step(word)

            print()
            pieces.append(' ' + word)

        # Whole-value clean-up: collapse spaces, lower-case, then drop
        # one-character words
        trimedValue = ''.join(pieces)
        trimedValue = remove_single_char_words(
            all_to_lower_case(
                remove_more_than_one_space_in_sentence(trimedValue)))
        print()
        print('Finished modified value:', trimedValue)
        print()
        # idx is the position from enumerate(); it is used here as the row
        # label (presumably the default 0..n-1 index from read_csv).
        dataframe.at[idx, col] = trimedValue

    elapsed_time = time.time() - start_time
    print(elapsed_time)

# Save the modified columns to a new file
keep_col = ['comment', 'code', 'Expected']
new_f = dataframe[keep_col]
new_f.to_csv("csv/trimedValues3.csv", index=False)



#Print all rows and not only the first 5 and the last 5
#pd.set_option('display.max_rows', None)

#other option settings if necessary:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', -1)

# Only print comment and non-information column
#print(dataframe[['comment', 'non-information']])
#print(dataframe[['comment']])

# Print all information from csv file in a DataFrame (table object)
#print(dataframe)