# Preprocessing



### Read data

In [1]:
with open('../data/pg1597.txt') as f:
    lines = f.readlines()
print(lines[:6])

["THE EMPEROR'S NEW CLOTHES\n", '\n', 'Many years ago, there was an Emperor, who was so excessively fond of\n', 'new clothes, that he spent all his money in dress. He did not trouble\n', 'himself in the least about his soldiers; nor did he care to go either to\n', 'the theatre or the chase, except for the opportunities then afforded him\n']


### Tokenization

In [2]:
# import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
data = "".join(lines)
data = word_tokenize(data)
print(data[:51])

['THE', 'EMPEROR', "'S", 'NEW', 'CLOTHES', 'Many', 'years', 'ago', ',', 'there', 'was', 'an', 'Emperor', ',', 'who', 'was', 'so', 'excessively', 'fond', 'of', 'new', 'clothes', ',', 'that', 'he', 'spent', 'all', 'his', 'money', 'in', 'dress', '.', 'He', 'did', 'not', 'trouble', 'himself', 'in', 'the', 'least', 'about', 'his', 'soldiers', ';', 'nor', 'did', 'he', 'care', 'to', 'go', 'either']


### Lower case

In [3]:
words = " ".join(data)
lower_w = words.lower()
lower_w[:50]

"the emperor 's new clothes many years ago , there "

### Stop words removal

In [4]:
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords 
  
stop_words = set(stopwords.words('english')) 
word_tokens = lower_w.split(" ")
  
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
# filtered_sentence[0] = filtered_sentence[0][1:]
print(filtered_sentence[:51])

['emperor', "'s", 'new', 'clothes', 'many', 'years', 'ago', ',', 'emperor', ',', 'excessively', 'fond', 'new', 'clothes', ',', 'spent', 'money', 'dress', '.', 'trouble', 'least', 'soldiers', ';', 'care', 'go', 'either', 'theatre', 'chase', ',', 'except', 'opportunities', 'afforded', 'displaying', 'new', 'clothes', '.', 'different', 'suit', 'hour', 'day', ';', 'king', 'emperor', ',', 'one', 'accustomed', 'say', ',', '“', 'sitting', 'council']


### Stemming

In [5]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

output = []
for word in filtered_sentence:
    output.append((ps.stem(word)))

In [6]:
final_out = " ".join(output)

In [7]:
print(final_out[:500])

emperor 's new cloth mani year ago , emperor , excess fond new cloth , spent money dress . troubl least soldier ; care go either theatr chase , except opportun afford display new cloth . differ suit hour day ; king emperor , one accustom say , “ sit council , ” alway said , “ emperor sit wardrobe. ” time pass merrili larg town capit ; stranger arriv everi day court . one day , two rogu , call weaver , made appear . gave knew weav stuff beauti color elabor pattern , cloth manufactur wonder proper


### Vocabulary

In [8]:
def get_vocab(data):
    data = data.replace('\n', ' ').split(' ')
    words = list(set(data))
    vocabulary = {word:index for index, word in enumerate(words)}
    vocab_size = len(vocabulary)
    
    return vocabulary, vocab_size

In [9]:
vocabulary, vocab_size = get_vocab(final_out)
vocabulary

{'trampl': 0,
 'gutter': 1,
 'destroy.': 2,
 'observ': 3,
 'secret': 4,
 'seeth': 5,
 'bank.': 6,
 'match': 7,
 'knight': 8,
 'trifl': 9,
 'burnt-out': 10,
 'kickery-ki': 11,
 'disput': 12,
 'selenit': 13,
 'mummeri': 14,
 'broad': 15,
 'side': 16,
 'staircas': 17,
 'flaminiu': 18,
 'grapes.': 19,
 'squar': 20,
 'innat': 21,
 'trunk': 22,
 'potato': 23,
 "'thou": 24,
 'asham': 25,
 'encompass': 26,
 'passag': 27,
 'beast': 28,
 'good': 29,
 'audienc': 30,
 'obey': 31,
 'deceiv': 32,
 'land': 33,
 'thitherward': 34,
 'brass': 35,
 "merchant'": 36,
 'trod': 37,
 'frederickshafen': 38,
 'magnific': 39,
 'procur': 40,
 'sallow': 41,
 'congreg': 42,
 'physic': 43,
 'rose-color': 44,
 'thunder': 45,
 'treasur': 46,
 'theatr': 47,
 'idea': 48,
 'girl': 49,
 'augh': 50,
 'year': 51,
 'sailor-boy': 52,
 'quicker': 53,
 'zealand': 54,
 'foremost': 55,
 'coal.': 56,
 'wood-pigeon': 57,
 'deton': 58,
 'result': 59,
 'prospect': 60,
 'plunder': 61,
 'hyacinth': 62,
 'breathing.': 63,
 'tack': 64,
 

In [10]:
vocab_size

4119

In [11]:
train_data = final_out
test_data = final_out

### Train data on RNN

In [13]:
import tensorflow as tf

ModuleNotFoundError: No module named 'tensorflow'