### Import required libraries

In [1]:
import csv
import pandas as pd
import string
from nltk import corpus
from nltk import word_tokenize

### Load the `description` column of the TSV file into a list

In [2]:
# Read the tab-separated export into a DataFrame.
text = pd.read_csv("Amazon S3.tsv", sep="\t", encoding="utf-8")

# Copy the `description` column into a plain Python list for the cleaning steps below.
cleantxt = list(text.description)

print(cleantxt[:100])  # fraction of the full text

["S3 public access can not be set.\r\nI've edited the bucket policy, public access settings, but the bucket is not changed to public and access is denied.\r\n\r\nWhy do you see these symptoms?\r\n\r\nhttps://imgur.com/rV64vwd\r\n(Edit Public Access Settings)\r\n\r\nhttps://imgur.com/Y3pPAQJ\r\n(bucket policy)", "I can't figure out why S3 won't display my website on my domain\r\nError 403 and I am not sure how to troubleshoot the error or resolve it despite reading guides\r\nI've looked at a couple of forums and pages but they are either irrelevant or beyond my current understanding. I cannot figure out why I keep getting a 403. I have a public bucket policy which changed my previous error of not getting a connection to the site to a 403, just forbidding traffic. I am new to AWS, what am I missing?\r\nI only have one HTML file in the bucket and when I hit 'make public' it said access denied. Are there other objects that I have to find?\r\nI was able to get into the HTML file permissions

### Step 1) Convert all characters to lower case

In [3]:
# Lower-case every description, overwriting each list entry in place.
for idx, doc in enumerate(cleantxt):
    cleantxt[idx] = doc.lower()

print(cleantxt[:100])

["s3 public access can not be set.\r\ni've edited the bucket policy, public access settings, but the bucket is not changed to public and access is denied.\r\n\r\nwhy do you see these symptoms?\r\n\r\nhttps://imgur.com/rv64vwd\r\n(edit public access settings)\r\n\r\nhttps://imgur.com/y3ppaqj\r\n(bucket policy)", "i can't figure out why s3 won't display my website on my domain\r\nerror 403 and i am not sure how to troubleshoot the error or resolve it despite reading guides\r\ni've looked at a couple of forums and pages but they are either irrelevant or beyond my current understanding. i cannot figure out why i keep getting a 403. i have a public bucket policy which changed my previous error of not getting a connection to the site to a 403, just forbidding traffic. i am new to aws, what am i missing?\r\ni only have one html file in the bucket and when i hit 'make public' it said access denied. are there other objects that i have to find?\r\ni was able to get into the html file permissions

### Step 2) Normalize whitespace (collapse line breaks and repeated spaces into single spaces)

In [4]:
# str.split() with no argument splits on any run of whitespace (spaces, tabs,
# \r, \n) and ignores leading/trailing whitespace; re-joining the pieces with a
# single space therefore leaves exactly one space between words.
# Slice assignment keeps the same list object, as the index loop did.
cleantxt[:] = [" ".join(doc.split()) for doc in cleantxt]

print(cleantxt[:100])

["s3 public access can not be set. i've edited the bucket policy, public access settings, but the bucket is not changed to public and access is denied. why do you see these symptoms? https://imgur.com/rv64vwd (edit public access settings) https://imgur.com/y3ppaqj (bucket policy)", "i can't figure out why s3 won't display my website on my domain error 403 and i am not sure how to troubleshoot the error or resolve it despite reading guides i've looked at a couple of forums and pages but they are either irrelevant or beyond my current understanding. i cannot figure out why i keep getting a 403. i have a public bucket policy which changed my previous error of not getting a connection to the site to a 403, just forbidding traffic. i am new to aws, what am i missing? i only have one html file in the bucket and when i hit 'make public' it said access denied. are there other objects that i have to find? i was able to get into the html file permissions and when i selected 'public access' 'read

### Step 3) Remove punctuation and special characters

In [5]:
# Characters deleted from every description. Apostrophes, colons and slashes
# are not in this set, so contractions ("can't") stay intact and URLs keep
# their ':' and '/' (but lose their dots). The custom stop words used in
# Step 4 ("i'm", "i've", "re:") only match because those characters survive.
punctuation = "?,.\\()!\";[]+-"

# Build the deletion table once; it does not depend on the document, so there
# is no need to rebuild it on every loop iteration.
strip_table = str.maketrans("", "", punctuation)

for idx, doc in enumerate(cleantxt):
    cleantxt[idx] = doc.translate(strip_table)

print(cleantxt[0:100])

["s3 public access can not be set i've edited the bucket policy public access settings but the bucket is not changed to public and access is denied why do you see these symptoms https://imgurcom/rv64vwd edit public access settings https://imgurcom/y3ppaqj bucket policy", "i can't figure out why s3 won't display my website on my domain error 403 and i am not sure how to troubleshoot the error or resolve it despite reading guides i've looked at a couple of forums and pages but they are either irrelevant or beyond my current understanding i cannot figure out why i keep getting a 403 i have a public bucket policy which changed my previous error of not getting a connection to the site to a 403 just forbidding traffic i am new to aws what am i missing i only have one html file in the bucket and when i hit 'make public' it said access denied are there other objects that i have to find i was able to get into the html file permissions and when i selected 'public access' 'read object' it said ac

### Step 4) Remove stop words

In [6]:
# NLTK's English stop-word list, extended with a few greeting/courtesy words
# and contractions.
# NOTE: this needs the NLTK "stopwords" corpus to be available locally.
stop_words = corpus.stopwords.words('english')
new_stop_words = ['something','i\'m','please','thank','thanks', 'hello', 'hi', 're:', 'hey', 'i\'ve', 'regards']
stop_words.extend(new_stop_words)

# Set copy used only for fast membership tests; `stop_words` itself stays a
# list. Checking each token against the list would scan it once per token.
stop_word_set = set(stop_words)

# One token list per description. Splitting on a single space can yield empty
# strings where punctuation removal left two adjacent spaces; the length check
# drops those together with every other token shorter than two characters.
filtered_text = [
    [w for w in doc.split(" ") if len(w) > 1 and w not in stop_word_set]
    for doc in cleantxt
]

### Now, write the output to a TSV file

In [7]:
# Write one row per description: a constant product label, then its tokens.
#
# newline='' is what the csv module requires of a file object it writes to.
# Without it, platforms that translate "\n" on write (Windows) turn the
# writer's "\r\n" row terminator into "\r\r\n", which shows up as a blank line
# after every row.
#
# NOTE(review): `item` is a list, so csv.writer stores its string form
# ("['s3', 'public', ...]") in the second column - confirm that whatever reads
# this file expects that representation.
with open('Amazon S3_Cleaned.tsv', 'w', encoding='utf-8', newline='') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    writer.writerows(['Amazon S3', item] for item in filtered_text)

# Sample Final Output

In [8]:
# Show the first 100 token lists, each followed by one blank line.
for tokens in filtered_text[:100]:
    print(tokens, end="\n\n")

['s3', 'public', 'access', 'set', 'edited', 'bucket', 'policy', 'public', 'access', 'settings', 'bucket', 'changed', 'public', 'access', 'denied', 'see', 'symptoms', 'https://imgurcom/rv64vwd', 'edit', 'public', 'access', 'settings', 'https://imgurcom/y3ppaqj', 'bucket', 'policy']

["can't", 'figure', 's3', 'display', 'website', 'domain', 'error', '403', 'sure', 'troubleshoot', 'error', 'resolve', 'despite', 'reading', 'guides', 'looked', 'couple', 'forums', 'pages', 'either', 'irrelevant', 'beyond', 'current', 'understanding', 'cannot', 'figure', 'keep', 'getting', '403', 'public', 'bucket', 'policy', 'changed', 'previous', 'error', 'getting', 'connection', 'site', '403', 'forbidding', 'traffic', 'new', 'aws', 'missing', 'one', 'html', 'file', 'bucket', 'hit', "'make", "public'", 'said', 'access', 'denied', 'objects', 'find', 'able', 'get', 'html', 'file', 'permissions', 'selected', "'public", "access'", "'read", "object'", 'said', 'access', 'denied', 'whatever', 'reason', 'accessing'

### Suggested improvements:
1) Remove URLs (`http://` / `https://` links) <br>
2) Remove the name and "edited by" sections <br>
3) Remove duplicate tokens within the same word vector

### Issues:
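1) `SELECT * FROM` queries are not distinguished uniquely (the `*` token is dropped by the `len(w) > 1` filter and `from` is removed as a stop word, so only `select` survives)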