### Import required libraries

In [1]:
import csv
import pandas as pd
import string
from nltk import corpus
from nltk import word_tokenize

### Load the `description` column of the TSV file into a list

In [50]:
# Base name shared by the input file (<title>.tsv) and the cleaned output file.
title = "AmazonEC2_4"

# The input is tab-separated and must provide a `description` column.
text = pd.read_csv(title + '.tsv', delimiter='\t', encoding='utf-8')

# Keep only the rows that actually have a description. dropna() detects
# missing values directly rather than comparing str(value) against 'nan'.
cleantxt = text.description.dropna().tolist()

print(cleantxt[0:100]) # fraction of the full text



### Step 1) Convert all characters to lower case

In [51]:
# Lower-case every document in place; str() coerces each entry to text first.
cleantxt[:] = [str(entry).lower() for entry in cleantxt]

print(cleantxt[:100])



### Step 2) Normalize whitespace (collapse runs into single spaces)

In [52]:
# split() with no argument breaks on any run of whitespace, so re-joining the
# pieces with one space collapses tabs, newlines and repeated spaces and also
# trims leading/trailing whitespace.
cleantxt[:] = [' '.join(entry.split()) for entry in cleantxt]

print(cleantxt[:100])



### Step 3) Remove punctuation and special characters

In [53]:
# Characters deleted from every document.
punctuation = "?,.\\()!\";[]+-"
# Deletion table: every character listed in `punctuation` maps to None.
deletion_table = str.maketrans("", "", punctuation)

cleantxt[:] = [entry.translate(deletion_table) for entry in cleantxt]

print(cleantxt[:100])



### Step 4) Remove stop words

In [54]:
# NLTK's English stop-word list, extended with extra words specific to this
# data set (greetings, sign-offs, reply prefixes).
stop_words = corpus.stopwords.words('english')
new_stop_words = ['something', "i'm", 'please', 'thank', 'thanks', 'hello', 'hi', 're:', 'hey', "i've", 'regards']
stop_words.extend(new_stop_words)

# Membership is tested once per token, so look words up in a set instead of
# scanning the list every time.
stop_word_set = set(stop_words)

# One token list per document. Splitting on a single space can yield empty
# strings; the len(w) > 1 check drops those along with one-character tokens.
filtered_text = [
    [w for w in document.split(" ") if len(w) > 1 and w not in stop_word_set]
    for document in cleantxt
]

### Now, write the output to a TSV file

In [55]:
# newline='' leaves line endings to csv.writer; without it, platforms that
# translate '\n' on write (e.g. Windows) emit '\r\r\n' and rows appear
# separated by blank lines.
with open(title + '_Cleaned.tsv', 'w', encoding='utf-8', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    # Each row is (title, tokens). csv stringifies the token list with str(),
    # so the second column holds the list's Python repr.
    writer.writerows([title, item] for item in filtered_text)

# Sample Final Output

In [56]:
# Preview the first 100 token lists, each followed by one blank line.
for tokens in filtered_text[:100]:
    print(tokens, end="\n\n")

['cannot', 'ssh', 'instance', 'launched', 'ec2', 'instance', 'beginning', 'could', 'ssh', 'minutes', 'instance', 'accessible', 'anymore', 'instance', 'ec25238240122uswest2computeamazonawscom', 'happened', 'quite', 'times', 'anyone', 'help', 'wonder', 'avoid', 'situation', 'later']

['imagemagick', 'fire', 'cve2016–3714', 'aws', 'ami', '201603', 'imagemagick', 'security', 'alert', 'cve2016–3714', 'see', 'details', 'https://imagetragickcom/', 'seems', 'aws', 'ami', '201603', 'patched', 'think', 'update', 'imagemagick', 'policyxml', 'soon', 'edited', 'by:', 'kkbb2014', 'may', '2016', '1:36', 'pm']

['imagemagick', 'fire', 'cve2016–3714', 'aws', 'ami', '201603', 'aws', 'aware', 'cve2016–3714', 'updating', 'imagemagick', 'package', 'available', 'amazon', 'linux', 'repositories', 'soon', 'possible', 'new', 'package', 'available', 'bulletin', 'posted', 'amazon', 'linux', 'ami', 'security', 'center:', 'https://alasawsamazoncom', 'customers', 'update', 'policyxml', 'file', 'used', 'imagemagick'

### Suggested improvements:
1) Remove HTTP(S) URLs <br>
2) Remove the author name and the "Edited by:" section <br>
3) Remove duplicate tokens within the same word vector

### Issues:
1) SQL `SELECT * FROM` queries are not distinguished uniquely