# Data Preprocessing

In [1]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\aashi\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
from nltk.corpus import brown
from collections import Counter
import pandas as pd
import csv

First, map all uppercased words to lowercase. 

Then, create a list of words with frequency less than 3.

In [3]:
word_list = brown.words()
wordlist_lowercased = [i.lower() for i in word_list]  # Mapping uppercased words to lowercase
c = dict(Counter(wordlist_lowercased)) # Dictionary containing frequency of lowercased word list
lf_words_dict = dict((key, value) for key, value in c.items() if value<4)  # Removing words with frequency <=3.

Create word list by removing words with frequency less than 3.

In [4]:
import time

start = time.time()
for key, value in lf_words_dict.items():
    for i in range(value):
        wordlist_lowercased.remove(key)  # Converting to lowercase
processed_wordlist = wordlist_lowercased.copy()
stop = time.time()
print('Time for loop is:' + str(stop - start))

Time for loop is:779.5225172042847


Divide the Brown Corpus such that the first 800,000 words are used for training, the following 200,000 for validation (model selection, weight decay, early stopping) and the remaining 161,192 for testing.

In [5]:
training_words = processed_wordlist[0:800000]
validation_words = processed_wordlist[800000:1000000]
testing_words = processed_wordlist[1000000:len(processed_wordlist)]

print('Number of words in training set are: '+ str(len(training_words)))
print('Number of words in validation set are: '+ str(len(validation_words)))
print('Number of words in testing set are: '+ str(len(testing_words)))

Number of words in training set are: 800000
Number of words in validation set are: 200000
Number of words in testing set are: 113024


Create sequences of N+1 words from training set, validation set and testing set wherein, the first 10 / 50 / 100 / 300 words are input sequences and the 11th word is the output word.

In [6]:
# Creating input sequences of length 'N'

N = 300
training_sequences = []
validation_sequences = []
testing_sequences = []

for i in range(N, len(training_words)):
    training_sequences.append(training_words[i-N:i+1])

for j in range(N, len(validation_words)):
    validation_sequences.append(validation_words[j-N:j+1])

for k in range(N, len(testing_words)):
    testing_sequences.append(testing_words[k-N:k+1])


In [7]:
print('Number of training sequences are: '+ str(len(training_sequences)))
print('Number of validation sequences are: '+ str(len(validation_sequences)))
print('Number of testing sequences are: '+ str(len(testing_sequences)))

Number of training sequences are: 799700
Number of validation sequences are: 199700
Number of testing sequences are: 112724


Writing word sequences into csv files, by name, 'training_sequences.csv', 'validation_sequences.csv', 'testing_sequences.csv'

In [9]:
import os
os.mkdir("word_sequences")

df_train_seq = pd.DataFrame(training_sequences)
df_val_seq = pd.DataFrame(validation_sequences)
df_test_seq = pd.DataFrame(testing_sequences)

df_train_seq.to_csv('word_sequences/training_sequences.csv')
df_val_seq.to_csv('word_sequences/validation_sequences.csv')
df_test_seq.to_csv('word_sequences/testing_sequences.csv')

Writing dictionary with serial number, words and corresponding frequencies.

In [10]:
unique_words = list(set(processed_wordlist))  # List of lowercased words
snum = list(range(len(unique_words))) # Serial Number for unique words
c = dict(Counter(processed_wordlist))
data = {'Serial Number': snum, 'Word' : list(c.keys()), 'Frequency' : list(c.values())}
df = pd.DataFrame(data)
df.to_csv('dictionary.csv')

In [11]:
print(len(unique_words))

16689


We will use an Embedding Layer to learn the representation of words. 

The word embedding layer expects input sequences to be comprised of integers.

We can map each word in our vocabulary to a unique integer and encode our input sequences. Later, when we make predictions, we can convert the prediction to numbers and look up their associated words in the same mapping.

We will write each of these sequences into csv files, by name, 'encoded_training.csv', 'encoded_validation.csv', 'encoded_testing.csv'.

In [12]:
#os.mkdir('encoded_sequences')
word_dictionary = dict(zip(list(c.keys()), snum))
encoded_training = []
encoded_validation = []
encoded_testing = []

for i in range(len(training_sequences)):
    temp1 = []
    for seq1 in range(N + 1):
        word = training_sequences[i][seq1]
        temp1.append(word_dictionary[word])
    encoded_training.append(temp1) 
        
for j in range(len(validation_sequences)):
    temp2 = []
    for seq2 in range(N + 1):
        word = training_sequences[j][seq2]
        temp2.append(word_dictionary[word])
    encoded_validation.append(temp2)
    
for k in range(len(testing_sequences)):
    temp3 = []
    for seq3 in range(N + 1):
        word = training_sequences[k][seq3]
        temp3.append(word_dictionary[word])
    encoded_testing.append(temp3)

df_enc_train = pd.DataFrame(encoded_training)
df_enc_val = pd.DataFrame(encoded_validation)
df_enc_test = pd.DataFrame(encoded_testing)

df_enc_train_20pc = pd.DataFrame(encoded_training).sample(frac=0.2)
df_enc_val_20pc = pd.DataFrame(encoded_validation).sample(frac=0.2)
df_enc_test_20pc = pd.DataFrame(encoded_testing).sample(frac=0.2)

df_enc_train.to_csv('encoded_sequences/encoded_training.csv')
df_enc_val.to_csv('encoded_sequences/encoded_validation.csv')
df_enc_test.to_csv('encoded_sequences/encoded_testing.csv')

df_enc_train_20pc.to_csv('encoded_sequences/encoded_training_20pc.csv')
df_enc_val_20pc.to_csv('encoded_sequences/encoded_validation_20pc.csv')
df_enc_test_20pc.to_csv('encoded_sequences/encoded_testing_20pc.csv')