# Preparing data for the Deep Learning model.
This is the most important step of all the machine learning projects. Our models need data to learn and if we give garbage to the model then we will get garbage from the model

In [1]:
import string
import numpy as np 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,LSTM,Dropout,Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [2]:
def load_file(path:str):
    '''
    Function to load the text file for training the language model.
    params:
        path(str): path to the text file to be used for training.
    returns:
        text(str): The text from the file.
    '''
    file = open(path,'r')
    text = file.read()
    file.close()
    return text

def clean_file(file:str):
    '''
    Function to clean the text read from the file and converting them to tokens
    for training the language model.
    params:
        file(str): variable containing the file contents
    returns:
        tokens(list): The text converted to a list of cleaned tokens.
    '''
    file = file.replace('--',' ')
    tokens = file.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens]
    tokens = [word.lower() for word in tokens]
    return tokens

def convert_to_sequences(tokens:list):
    '''
    Function to convert the sequences to tokens.
    params:
        tokens(list): The text converted to a list of cleaned tokens.
    returns:
        sequnces(list): The list of sequences formed from tokens.abs
    '''
    length = 50 + 1
    sequences = list()
    for i in range(length,len(tokens)):
        seq = tokens[i-length:i]
        line = ' '.join(seq)
        sequences.append(line)
    print('Total Sequences: %d' % len(sequences))
    return sequences

def save_sequences(sequences,file_name):
    '''
    Function to save the sequences to the file for further usage.
    params:
        sequnces(list): The list of sequences 
        file_name(str): file name to store the sequences
    returns:
        none
    '''
    data = '\n'.join(sequences)
    file = open(file_name, 'w')
    file.write(data)
    file.close()

def load_sequences(file_name):
    '''
    Function to load the sequences from the text file for training the language model.
    params:
        file_name(str): path to the text file to be used for training.
    returns:
        text(str): The text from the file.
    '''
    file = open(file_name,'r')
    text = file.read()
    file.close()
    return text

file_name = 'republic_clean.txt'
file = load_file(file_name)
print("first 200 characters from the text: \n"+file[:200])
tokens = clean_file(file)
print("----------------------------------")
print(tokens[:50])
print("-----------------------------------")
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

sequences = convert_to_sequences(tokens)

out_filename = 'republic_sequences.txt'
save_sequences(sequences, out_filename)

first 200 characters from the text: 
BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what
----------------------------------
['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i']
-----------------------------------
Total Tokens: 118754
Unique Tokens: 7435
Total Sequences: 118703


In [3]:
in_filename = 'republic_sequences.txt'
file = load_sequences(in_filename)
lines = list(file.split('\n'))

In [56]:
lines

['book i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was',
 'i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted',
 'i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with',
 'went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what man

In [89]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
main_sequences = tokenizer.texts_to_sequences(lines)
print("Vocab size: "+str(len(tokenizer.word_index) + 1))

Vocab size: 7435


In [91]:
main_sequences

[[1046,
  11,
  11,
  1045,
  329,
  7434,
  4,
  1,
  2880,
  35,
  213,
  1,
  261,
  3,
  2255,
  9,
  11,
  179,
  817,
  123,
  92,
  2879,
  4,
  1,
  2253,
  7433,
  1,
  7432,
  7431,
  2,
  75,
  120,
  11,
  1267,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7430,
  1,
  1610,
  13,
  57,
  8,
  549,
  151,
  11,
  57],
 [11,
  11,
  1045,
  329,
  7434,
  4,
  1,
  2880,
  35,
  213,
  1,
  261,
  3,
  2255,
  9,
  11,
  179,
  817,
  123,
  92,
  2879,
  4,
  1,
  2253,
  7433,
  1,
  7432,
  7431,
  2,
  75,
  120,
  11,
  1267,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7430,
  1,
  1610,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1148],
 [11,
  1045,
  329,
  7434,
  4,
  1,
  2880,
  35,
  213,
  1,
  261,
  3,
  2255,
  9,
  11,
  179,
  817,
  123,
  92,
  2879,
  4,
  1,
  2253,
  7433,
  1,
  7432,
  7431,
  2,
  75,
  120,
  11,
  1267,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7430,
  1,
  1610,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1148,
  35],
 [1045,

In [120]:
X=[]
y=[]
for i in range(len(enc)):
    X.append(encoded_sequences[i][:-1])
    y.append([encoded_sequences[i][-1]])

In [122]:
y = np.array(y)

In [125]:
y = to_categorical(y)

In [128]:
len(y[0])

7430

In [72]:
lines.shape

(118703,)

In [79]:
z = [[1,2,3],
     [4,5,6],
     [7,8,9]]

In [80]:
z_ = np.array(z)

50