# Preparing data for the Deep Learning model.
This is the most important step of all the machine learning projects. Our models need data to learn and if we give garbage to the model then we will get garbage from the model

In [1]:
import string
import numpy as np 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,LSTM,Dropout,Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [9]:
def load_file(path:str):
    '''
    Function to load the text file for training the language model.
    params:
        path(str): path to the text file to be used for training.
    returns:
        text(str): The text from the file.
    '''
    file = open(path,'r')
    text = file.read()
    file.close()
    return text

def clean_file(file:str):
    '''
    Function to clean the text read from the file and converting them to tokens
    for training the language model.
    params:
        file(str): variable containing the file contents
    returns:
        tokens(list): The text converted to a list of cleaned tokens.
    '''
    # replace '--' with a space
    file = file.replace('--',' ')
    # split into tokens
    tokens = file.split()
    # remove punctuations from token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # removing remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # convert to lower case
    tokens = [word.lower() for word in tokens]
    return tokens

def convert_to_sequences(tokens:list):
    '''
    Function to convert the sequences to tokens.
    params:
        tokens(list): The text converted to a list of cleaned tokens.
    returns:
        sequnces(list): The list of sequences formed from tokens.abs
    '''
    length = 50 + 1
    sequences = list()
    for i in range(length,len(tokens)):
        seq = tokens[i-length:i]
        line = ' '.join(seq)
        sequences.append(line)
    print('Total Sequences: %d' % len(sequences))
    return sequences

def save_sequences(sequences,file_name):
    '''
    Function to save the sequences to the file for further usage.
    params:
        sequnces(list): The list of sequences 
        file_name(str): file name to store the sequences
    returns:
        none
    '''
    data = '\n'.join(sequences)
    file = open(file_name, 'w')
    file.write(data)
    file.close()

def load_sequences(file_name):
    '''
    Function to load the sequences from the text file for training the language model.
    params:
        file_name(str): path to the text file to be used for training.
    returns:
        text(str): The text from the file.
    '''
    file = open(file_name,'r')
    text = file.read()
    file.close()
    return text

file_name = 'republic_clean.txt'
file = load_file(file_name)
print("first 200 characters from the text: \n"+file[:200])
tokens = clean_file(file)
print("----------------------------------")
print(tokens[:50])
print("-----------------------------------")
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

sequences = convert_to_sequences(tokens)

out_filename = 'republic_sequences.txt'
save_sequences(sequences, out_filename)

first 200 characters from the text: 
BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what
----------------------------------
['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i']
-----------------------------------
Total Tokens: 118684
Unique Tokens: 7409
Total Sequences: 118633


In [10]:
# load doc into memory
def load_file(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load
in_filename = 'republic_sequences.txt'
doc = load_file(in_filename)
lines = doc.split('\n')

In [11]:

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)


In [12]:
vocab_size = len(tokenizer.word_index) + 1

In [13]:

# separate into input and output
sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]


In [14]:
X.shape

(118633, 50)

In [17]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            370500    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 7410)              748410    
Total params: 1,269,810
Trainable params: 1,269,810
Non-trainable params: 0
_________________________________________________________________
None


In [18]:

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)

Train on 118633 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

KeyboardInterrupt: 