<table style="width:100%">
  <tr>
    <td><center style="font-size:300%;">Modelamento de Linguagem</center></td>
    <td><img src="https://logodownload.org/wp-content/uploads/2015/02/puc-rio-logo.gif" width="100"/></td> 
  </tr>    
</table>

Msc. Cristian Muñoz V.

In [0]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras import Model, Input
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM , GRU, SimpleRNN
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import nltk
# Fetch NLTK's 'punkt' resource via the NLTK downloader (needs network access the first time).
nltk.download('punkt')

In [0]:
%%bash
# Remove any previous extraction and archive so re-running the cell starts clean.
rm -rf PRH PRH.zip

# Download the PRH corpus archive and unpack it in the working directory.
wget -O PRH.zip https://github.com/fabiocorreacordeiro/wordEmbeddingsOG/raw/master/_corpus/PRH.zip
unzip PRH.zip

In [0]:
def preprocess(text):
    """Lower-case ``text`` and split it into tokens.

    Tokenization is delegated to ``nltk.word_tokenize`` with
    ``language='portuguese'``.

    Args:
        text: Raw string to normalize.

    Returns:
        The token sequence produced by ``nltk.word_tokenize`` for the
        lower-cased input.
    """
    lowered = text.lower()
    return nltk.word_tokenize(lowered, language='portuguese')

In [0]:
# Corpus file used for training.
filepath = 'PRH/20140404-MONOGRAFIA_0.txt'

# Read the raw bytes inside a context manager so the file handle is always
# closed (the previous bare open(...).read() left it to the garbage collector).
with open(filepath, 'rb') as corpus_file:
    raw_bytes = corpus_file.read()

# Decode as UTF-8, silently dropping undecodable bytes, then tokenize.
# After this cell `text` is the token sequence returned by preprocess().
text = preprocess(raw_bytes.decode("utf-8", "ignore"))

In [0]:
# Sanity check: show the first 100 items of the preprocessed corpus.
print(text[:100])

In [0]:
# Build the vocabulary in sorted order. Iterating a plain set of strings gives
# a different order on every interpreter start (string hashes are randomized),
# which would assign different ids to the same word on each run and make any
# previously saved weights meaningless. Sorting makes the mapping reproducible.
words = sorted(set(text))
vocab_size = len(words)

# word -> integer id, and the inverse id -> word lookup.
w2i = {word: i for i, word in enumerate(words)}
i2w = {i: word for word, i in w2i.items()}

In [0]:
# Number of tokens in each training window sliced from the corpus.
sequence_length = 10
# Offset, in tokens, between the starts of consecutive windows.
step= 1

In [0]:
# Start index of every training window. The bound is kept exactly as before so
# the number of samples does not change.
window_starts = range(0, len(text) - sequence_length - 1, step)

# Slice each window directly. The previous `text[i:][:sequence_length]` first
# copied the entire tail of the corpus for every window (quadratic in corpus
# length); a single bounded slice yields the same tokens in linear time.
input_words = [text[i:i + sequence_length] for i in window_starts]

# Labels are the inputs shifted one token to the right (next-word targets).
label_words = [text[i + 1:i + 1 + sequence_length] for i in window_starts]

In [0]:
def one_hot(targets, nb_classes, dtype=np.float64):
    """One-hot encode an array of integer class ids.

    Args:
        targets: Integer class ids as an ndarray or any array-like (nested
            lists are accepted). Every id must lie in ``[0, nb_classes)``.
        nb_classes: Size of the one-hot axis appended to the result.
        dtype: Element type of the result. Defaults to ``float64``, which is
            what ``np.eye`` produced before this parameter existed; a smaller
            type (e.g. ``np.uint8``) reduces memory for large vocabularies.

    Returns:
        ndarray of shape ``targets.shape + (nb_classes,)`` holding 1 at each
        target id and 0 elsewhere.

    Raises:
        IndexError: If an id is out of range or ``targets`` is not integer-typed
            (raised by NumPy indexing).
    """
    # Convert once so both the lookup and the final reshape see an ndarray;
    # previously `targets.shape` failed for plain Python lists.
    targets = np.asarray(targets)
    # Row i of the identity matrix is the one-hot vector for class i.
    res = np.eye(nb_classes, dtype=dtype)[targets.reshape(-1)]
    return res.reshape(list(targets.shape) + [nb_classes])

In [0]:
# Translate every token window into its vocabulary ids.
X = np.array([[w2i[token] for token in window] for window in input_words])
Y = np.array([[w2i[token] for token in window] for window in label_words])

# One-hot targets; the result has shape Y.shape + (vocab_size,).
Y_OH = one_hot(Y, vocab_size)

In [0]:
# Number of units in the LSTM layer.
hidden_size = 128
# Samples per gradient update passed to model.fit.
batch_size = 30
# Number of passes over the training data passed to model.fit.
epochs = 25
# Dimensionality of the word embedding vectors.
emb_size = 128

In [0]:
# Input layer: sequences of word ids of unspecified length.
# (Named `inputs` rather than `input` so the Python builtin is not shadowed.)
inputs = Input(shape=[None])

# Embedding layer: maps each word id to a dense vector of size emb_size.
x = Embedding(vocab_size,
              emb_size,
              name="embeddings",
              embeddings_initializer="glorot_uniform")(inputs)

# LSTM layer: return_sequences=True yields one output per timestep, so the
# network predicts a next word at every position of the window.
x = LSTM(hidden_size, return_sequences=True, name="lstm")(x)

# Dropout layer for regularization.
x = Dropout(0.2)(x)

# Fully connected layer projecting each timestep onto the vocabulary,
# followed by a softmax to obtain a probability distribution over words.
x = Dense(vocab_size, name="linear")(x)
output = Activation("softmax", name="softmax")(x)

# Keras 2 expects `inputs=` / `outputs=`; the singular Keras 1 keywords are
# deprecated and rejected by newer releases.
model = Model(inputs=inputs, outputs=output)

model.summary()

In [0]:
# Categorical cross-entropy pairs with the softmax output and the one-hot targets (Y_OH) used for training.
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [0]:
# Train on the id windows X against the one-hot shifted targets Y_OH.
history = model.fit(X,Y_OH,batch_size=batch_size, epochs=epochs, verbose=1)
# Persist only the weights (not the architecture) to model.hdf5.
model.save_weights('model.hdf5')
# Report the training loss recorded for the last epoch.
print("loss=%f" % (history.history["loss"][-1]))

In [0]:
import matplotlib.pyplot as plt
# Training-loss curve, one point per epoch. Uses the explicit Axes interface,
# labels both axes and draws the legend (the "Train" label was previously set
# but never displayed).
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(history.history["loss"], color="g", label="Train")
ax.set_title('Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()

In [0]:
def inputformat(w):
    """Return the vocabulary id of word ``w`` as an array of shape (1, 1).

    Kept for backward compatibility with code that feeds single words to the
    model; ``query_sentences`` no longer needs it.

    Raises:
        KeyError: If ``w`` is not in the vocabulary.
    """
    return np.reshape(w2i[w], [1, 1])


def query_sentences(sentence, length, multinomial=False):
    """Print ``length`` words generated by the model as a continuation of ``sentence``.

    The LSTM is not stateful, so nothing is remembered between ``predict``
    calls. Feeding one word per call (as this function used to do) therefore
    discarded all preceding context, including the priming words. Instead, the
    whole context (seed words plus everything generated so far) is fed on
    every step and the distribution at the last timestep is used.

    Args:
        sentence: Seed text; it is tokenized with ``preprocess``. Words that
            are not in the vocabulary are ignored.
        length: Number of words to generate.
        multinomial: If True, sample the next word from the predicted
            distribution; otherwise pick the most probable word (greedy).

    Returns:
        None. The last seed word followed by the generated words is printed
        on a single line, separated by spaces.

    Raises:
        ValueError: If ``sentence`` contains no in-vocabulary word.
    """
    tokens = preprocess(sentence)
    # Drop out-of-vocabulary words instead of failing with a bare KeyError.
    context = [w2i[token] for token in tokens if token in w2i]
    if not context:
        raise ValueError(
            "sentence must contain at least one word from the vocabulary: %r" % (sentence,)
        )

    print(i2w[context[-1]] + " ", end="")
    for _ in range(length):
        # Shape (1, len(context)) in, (1, len(context), vocab) out; keep only
        # the prediction that follows the last word of the context.
        next_word_probs = model.predict(np.array([context]))[0, -1]

        if multinomial:
            # Renormalize in float64: float32 softmax outputs may sum to
            # slightly more than 1, which np.random.multinomial rejects.
            pvals = np.asarray(next_word_probs, dtype=np.float64)
            pvals = pvals / pvals.sum()
            next_word_id = int(np.random.multinomial(1, pvals).argmax())
        else:
            next_word_id = int(next_word_probs.argmax())

        context.append(next_word_id)
        print(i2w[next_word_id] + " ", end="")

    # Terminate the output line.
    print()

In [0]:
# Generate 30 words starting from the seed 'ou', always taking the most probable next word (multinomial=False).
query_sentences(sentence='ou', length=30, multinomial=False)