In [3]:
import pandas as pd
import spacy
import time
import re

In [4]:
#Split a camelCase / PascalCase word into its space-separated parts
def camel_case_split(word):
    """Insert a blank at every camelCase boundary of ``word``.

    A boundary is either a lowercase letter directly followed by an uppercase
    letter ("fooBar" -> "foo Bar") or the end of an uppercase run that is
    followed by a capitalised word ("HTTPServer" -> "HTTP Server").

    Args:
        word: The token text to split.

    Returns:
        The parts joined by single blanks. When the pattern matches nothing
        (empty string, or a token made up only of line breaks) ``word`` is
        returned unchanged instead of raising ``IndexError``.
    """
    matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
    words = [m.group(0) for m in matches]
    # '.' never matches a line break, so '' and '\n' produce no match at all;
    # fall back to the input rather than indexing into an empty list.
    splitedWord = ' '.join(words) if words else word

    print('Word after camelCase split:', splitedWord)
    return splitedWord

In [5]:
#Replace every separator symbol in the given word with a blank
def remove_symbol_from_word(word):
    """Return ``word`` with each separator symbol swapped for one blank.

    The separators are the single characters listed in ``symbols`` below.
    Symbols are replaced, not deleted, so "foo.bar" becomes "foo bar".

    Args:
        word: The token text to clean.

    Returns:
        The cleaned text; it may contain leading, trailing or repeated blanks.
    """
    symbols = ["#", ".", '(', ')', '{', '}', '[', ']', '\'', '/', '_', '-', '"', '=']
    # One translation table handles all symbols in a single pass.
    blank_out = str.maketrans({symbol: ' ' for symbol in symbols})
    cleaned = word.translate(blank_out)

    print('Word after symbol remover:', cleaned)
    return cleaned

In [6]:
#Remove single char words (for example: if "e.g." was turned into "e g", both "e" and "g" are dropped)
def remove_single_char_words(comment):
    """Drop every one-character word from ``comment``.

    The comment is split on whitespace, words of length 1 are discarded
    (each one is reported via ``print``), and the remaining words are joined
    with single blanks.

    Args:
        comment: The text to filter.

    Returns:
        The filtered text without leading or trailing blanks; an empty string
        when no word survives.
    """
    keptWords = []
    for w in comment.split():
        if len(w) == 1:
            print('Remove single char:', w)
        else:
            keptWords.append(w)
    # Joining the kept words (instead of checking the loop position) avoids a
    # leading blank when the first word of the comment was removed.
    return ' '.join(keptWords)

In [7]:
#Collapse each run of blanks in a sentence into one blank
def remove_more_than_one_space_in_sentence(comment):
    """Return ``comment`` with every run of space characters reduced to one.

    Only the space character is affected; tabs and line breaks are left alone.
    """
    blank_run = re.compile(' +')
    return blank_run.sub(' ', comment)

In [8]:
#Convert the whole comment to lower case
def all_to_lower_case(comment):
    """Return a lower-cased copy of ``comment``."""
    lowered = comment.lower()
    return lowered

In [9]:
#Strip a http(s) url that starts a line of the given text
def remove_url_from_comment(word):
    """Remove http:// and https:// URLs that begin a line of ``word``.

    The match is anchored to the start of a line, runs to the end of that
    line and also swallows the line break(s) that follow. A URL that appears
    later in a line is left untouched.

    Args:
        word: The text (in this notebook: a single token) to clean.

    Returns:
        The text without the matched URL lines.
    """
    url_at_line_start = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
    return url_at_line_start.sub('', word)

In [10]:
#Expand contraction tokens (spaCy splits "doesn't" into the two tokens "does" and "n't")
def replace_word_with(word):
    """Map a contraction token to its full word.

    Only an exact match is replaced: "n't" becomes "not" and "'ll" becomes
    "will". Any other input is returned as it came in.

    Args:
        word: The token text to check.

    Returns:
        The expanded word, or ``word`` itself when it is not a known
        contraction token.
    """
    expansions = {'n\'t': 'not', '\'ll': 'will'}
    word = expansions.get(word, word)
    print('Word after replace with func:', word)
    return word

In [12]:

# Load dataset
#url = "assets/declutter-gold_DevelopmentSet.csv"
url = "csv/newFIle.csv"

#define column labels
names = ['comment', 'code', 'non-information']

#na_filter = False, because pandas would otherwise read empty strings as float (nan)!
#skiprows = 1, because the first row contains only the labels
dataframe = pd.read_csv(url, names=names, skiprows = 1, na_filter = False)



#load EN model to analyse sentences in english language
nlp = spacy.load("en_core_web_sm")


#NOTE: not used anywhere in this cell (remove_symbol_from_word keeps its own list);
#kept only in case another cell still reads it
seps = ["#", ".", '(', ')']

start_time = time.time()
# Get all values from the column "comment"
columns =  dataframe['comment'].tolist()
#Cleaned comments, collected in the same order as the rows of the dataframe
cleanedComments = []
#Loop over each comment entries
for idx, val in enumerate(columns):
    print(idx, val)
    print()
    doc = nlp(val)
    trimedComment = ''

    #Loop over each word
    for w in doc:
        #Skip whitespace-only tokens (e.g. line breaks inside a multi-line comment):
        #they carry no text and camel_case_split cannot handle a pure line break
        if w.is_space:
            continue
        #Remove PUNCT and DET via spaCy (AUX words are kept)
        if w.pos_ == 'PUNCT' or w.pos_ == 'DET':
            print('Removed word via spaCy:', w.text)
            print()
            continue
        print('Word:', w.text)

        #Replace word with
        word = replace_word_with(w.text)
        #CamelCase remover
        word = camel_case_split(word)
        #URL remover
        word = remove_url_from_comment(word)
        #Symbol remover
        word = remove_symbol_from_word(word)

        print()
        trimedComment += ' ' + word

    trimedComment = remove_more_than_one_space_in_sentence(trimedComment)
    trimedComment = all_to_lower_case(trimedComment)
    trimedComment = remove_single_char_words(trimedComment)
    print()
    print('Finished modified comment:', trimedComment)
    #print([(w.text, w.pos_) for w in doc])
    print()
    cleanedComments.append(trimedComment)

#Write the whole column at once; unlike a per-row .at[idx, ...] write this does
#not depend on the dataframe index being 0..n-1
dataframe['comment'] = cleanedComments

elapsed_time = time.time() - start_time
print(elapsed_time)

#Save modified comment into file
keep_col = ['comment','code','non-information']
new_f = dataframe[keep_col]
new_f.to_csv("csv/trimedComments.csv", index=False)



#Print all rows and not only the first 5 and the last 5
#pd.set_option('display.max_rows', None)

#other option settings if necessary:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', None)

# Only print comment and non-information column
#print(dataframe[['comment', 'non-information']])
#print(dataframe[['comment']])

# Print all information from csv file in a DataFrame (table object)
#print(dataframe)

0 


Finished modified comment: 

1 


Finished modified comment: 

2 


Finished modified comment: 

3 


Finished modified comment: 

4 


Finished modified comment: 

5 


Finished modified comment: 

6 


Finished modified comment: 

7 


Finished modified comment: 

8 


Finished modified comment: 

9 


Finished modified comment: 

10 


Finished modified comment: 

11 


Finished modified comment: 

12 


Finished modified comment: 

13 


Finished modified comment: 

14 


Finished modified comment: 

15 


Finished modified comment: 

16 


Finished modified comment: 

17 


Finished modified comment: 

18 


Finished modified comment: 

19 


Finished modified comment: 

20 


Finished modified comment: 

21 


Finished modified comment: 

22 


Finished modified comment: 

23 


Finished modified comment: 

24 


Finished modified comment: 

25 


Finished modified comment: 

26 


Finished modified comment: 

27 


Finished modified comment: 

28 


Finished modified commen

349 


Finished modified comment: 

350 


Finished modified comment: 

351 


Finished modified comment: 

352 


Finished modified comment: 

353 


Finished modified comment: 

354 


Finished modified comment: 

355 


Finished modified comment: 

356 


Finished modified comment: 

357 


Finished modified comment: 

358 


Finished modified comment: 

359 


Finished modified comment: 

360 


Finished modified comment: 

361 


Finished modified comment: 

362 


Finished modified comment: 

363 


Finished modified comment: 

364 


Finished modified comment: 

365 


Finished modified comment: 

366 


Finished modified comment: 

367 


Finished modified comment: 

368 


Finished modified comment: 

369 


Finished modified comment: 

370 


Finished modified comment: 

371 


Finished modified comment: 

372 


Finished modified comment: 

373 


Finished modified comment: 

374 


Finished modified comment: 

375 


Finished modified comment: 

376 


Finished modified com


Finished modified comment: 

694 


Finished modified comment: 

695 


Finished modified comment: 

696 


Finished modified comment: 

697 


Finished modified comment: 

698 


Finished modified comment: 

699 


Finished modified comment: 

700 


Finished modified comment: 

701 


Finished modified comment: 

702 


Finished modified comment: 

703 


Finished modified comment: 

704 


Finished modified comment: 

705 


Finished modified comment: 

706 


Finished modified comment: 

707 


Finished modified comment: 

708 


Finished modified comment: 

709 


Finished modified comment: 

710 


Finished modified comment: 

711 


Finished modified comment: 

712 


Finished modified comment: 

713 


Finished modified comment: 

714 


Finished modified comment: 

715 


Finished modified comment: 

716 


Finished modified comment: 

717 


Finished modified comment: 

718 


Finished modified comment: 

719 


Finished modified comment: 

720 


Finished modified comment: 


995 


Finished modified comment: 

996 


Finished modified comment: 

997 


Finished modified comment: 

998 


Finished modified comment: 

999 


Finished modified comment: 

1000 


Finished modified comment: 

1001 


Finished modified comment: 

1002 


Finished modified comment: 

1003 


Finished modified comment: 

1004 


Finished modified comment: 

1005 


Finished modified comment: 

1006 


Finished modified comment: 

1007 


Finished modified comment: 

1008 


Finished modified comment: 

1009 


Finished modified comment: 

1010 


Finished modified comment: 

1011 


Finished modified comment: 

1012 


Finished modified comment: 

1013 


Finished modified comment: 

1014 


Finished modified comment: 

1015 


Finished modified comment: 

1016 


Finished modified comment: 

1017 


Finished modified comment: 

1018 


Finished modified comment: 

1019 


Finished modified comment: 

1020 


Finished modified comment: 

1021 


Finished modified comment: 

1022 

KeyError: "['code'] not in index"