### Import required libraries

In [1]:
import csv
import pandas as pd
import string
from nltk import corpus
from nltk import word_tokenize

### Load the TSV file and collect the descriptions into a list

In [2]:
# Read the tab-separated export; this cell uses its `description` column.
text = pd.read_csv('Amazon VPCi.tsv', delimiter='\t', encoding='utf-8')

# pandas loads empty cells as NaN (a float), which has no string methods and
# would make the cleaning steps below fail. Replace missing values with an
# empty string and coerce every entry to str, keeping one entry per input row.
cleantxt = text.description.fillna('').astype(str).tolist()

print(cleantxt[0:100]) # fraction of the full text



### Step 1) Convert all characters to lower case

In [3]:
# Overwrite each document with its lower-cased form (the list is updated in place).
for idx, doc in enumerate(cleantxt):
    cleantxt[idx] = doc.lower()

print(cleantxt[:100])



### Step 2) Normalize whitespace (collapse runs of whitespace into single spaces and trim the ends)

In [4]:
# str.split() with no arguments splits on any run of whitespace and drops
# leading/trailing whitespace; re-joining with one space normalizes the text.
for idx, doc in enumerate(cleantxt):
    pieces = doc.split()
    cleantxt[idx] = ' '.join(pieces)

print(cleantxt[:100])



### Step 3) Remove punctuation and special characters

In [5]:
# Characters deleted from every document. Anything not listed here
# (for example ':', '/', and the apostrophe) is left in the text.
punctuation = "?,.\\()!\";[]+-"

# The deletion table depends only on `punctuation`, so build it once instead
# of rebuilding it for every document.
strip_table = str.maketrans("", "", punctuation)

for idx, doc in enumerate(cleantxt):
    cleantxt[idx] = doc.translate(strip_table)

print(cleantxt[0:100])



### Step 4) Remove stop words

In [6]:
# NLTK's English stop words plus a few extra words added by this notebook.
stop_words = corpus.stopwords.words('english')
new_stop_words = ['something','i\'m','please','thank','thanks', 'hello', 'hi', 're:', 'hey', 'i\'ve', 'regards']
stop_words.extend(new_stop_words)

# Membership tests run once per token; a frozenset makes each lookup constant
# time instead of scanning the whole list. `stop_words` itself stays a list.
stop_word_set = frozenset(stop_words)

# One token list per document: split on single spaces, then keep tokens that
# are longer than one character and are not stop words.
filtered_text = [
    [w for w in doc.split(" ") if len(w) > 1 and w not in stop_word_set]
    for doc in cleantxt
]

### Now, write the output to a TSV file

In [7]:
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' on write (Windows) emit
# '\r\r\n' and the file appears to contain a blank row after every record.
with open('Amazon VPC_Cleaned.tsv', 'w', encoding='utf-8', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    # Each row is a fixed label followed by the document's token list.
    # csv.writer stringifies non-string values with str(), so the second
    # column holds the Python list representation of the tokens.
    writer.writerows(['Amazon VPC', item] for item in filtered_text)

### Sample final output

In [8]:
# Show the first 100 token lists, each followed by a blank line.
for tokens in filtered_text[:100]:
    print(tokens, end="\n\n")

['asymmetric', 'traffic', 'vpc', 'issue', 'regarding', 'asymmetric', 'packet', 'passing', 'vpc', 'created', 'vpc:', 'vpc:', '10200/16', 'created', 'subnets', 'vpc:', 'a:', '10210/24', 'b:', '10220/24', 'created', 'ec2instance:', 'r1:', 'cisco', 'csr1000v', 'enis:', 'gi1:', '1021181/24', 'subnet', 'gi2:', '1021254/24', 'subnet', 'r2:', 'cisco', 'csr1000v', 'eni:', 'gi1:', '1021253/24', 'subnet', 'got', 'problem', 'trying', 'ping', 'r2', 'r1:', 'r2:', 'ping', '1021181', 'failed', 'think', 'traffic', 'flow', 'asymmetric:', 'r2gi1', 'vpc', 'router', 'r1g1', 'r1', 'g2', 'r2g1', 'instead', 'symmetric:', 'r2g1', 'vpc', 'router', 'r1g1', 'vpc', 'router', 'r2g1', 'tried', 'trace', 'packet', 'found', 'packet', 'reaching', 'r1g1', "can't", 'get', 'ping', 'reply', 'r2g1', 'anyone', 'idea', 'issue', 'really', 'appreciate', 'help', 'felix']

['createdefaultvpc', 'trying', 'create', 'default', 'vpc', 'receiving', 'following', 'error', 'default', 'region', 'name', 'https://forumsawsamazoncom/:', 'usea

### Suggested improvements:
1) Remove URLs (e.g. `https://...` links) from the text <br>
2) Remove the author name and "Edited by" sections <br>
3) Remove duplicate tokens within the same word vector

### Issues:
1) `SELECT * FROM` queries are not distinguished uniquely