In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import tensorflow as tf

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import CategoryEncoding, Input, Dense, StringLookup, Embedding, LSTM

In [6]:
import nltk    
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [7]:
from string import punctuation

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
import matplotlib.pyplot as plt

In [10]:
nltk.download('punkt') # Necessário para word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
df = pd.read_csv("../../../dados/nlp/dialogs.txt", encoding="utf-8", sep="\t", header=None, names=["ConversaA", "ConversaB"])
df.head()

Unnamed: 0,ConversaA,ConversaB
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [12]:
def limpar_texto( texto ):
    translator = str.maketrans('—’', '  ', punctuation)
    texto_limpo = texto.lower().translate( translator )
    return "<BOS> " + texto_limpo + " <EOS>"

In [13]:
df["ConversaALimpa"] = df["ConversaA"].apply(limpar_texto)
df["ConversaBLimpa"] = df["ConversaB"].apply(limpar_texto)

In [14]:
# <BOS> - <Begin Of Sentence>
# <EOS> - <End Of Sentence>
# <OOV> - <Out Of Vocabulary>

lista_final = list(df["ConversaALimpa"]) + list(df["ConversaBLimpa"])
tokenizer = Tokenizer(oov_token="<OOV>", filters="", lower=False) # OOV = Out Of Vocabulary
tokenizer.fit_on_texts( lista_final )
vocabulario = tokenizer.word_index
list(vocabulario.items())[0:5]

[('<OOV>', 1), ('<BOS>', 2), ('<EOS>', 3), ('i', 4), ('you', 5)]

In [15]:
tokens_conversa_a = tokenizer.texts_to_sequences( df["ConversaALimpa"] )
tokens_conversa_b = tokenizer.texts_to_sequences( df["ConversaBLimpa"] )

In [16]:
print(df["ConversaALimpa"][0:5])
print(tokens_conversa_a[0:5])

0                <BOS> hi how are you doing <EOS>
1          <BOS> im fine how about yourself <EOS>
2    <BOS> im pretty good thanks for asking <EOS>
3     <BOS> no problem so how have you been <EOS>
4       <BOS> ive been great what about you <EOS>
Name: ConversaALimpa, dtype: object
[[2, 1515, 39, 17, 5, 177, 3], [2, 34, 606, 39, 36, 556, 3], [2, 34, 159, 48, 250, 26, 491, 3], [2, 31, 173, 23, 39, 16, 5, 102, 3], [2, 103, 102, 108, 12, 36, 5, 3]]


In [17]:
tokens_entrada_a = []
for tokens in tokens_conversa_a:
    tokens_entrada_a.append( tokens[1:-1] )

In [18]:
tokens_entrada_a[0:5]

[[1515, 39, 17, 5, 177],
 [34, 606, 39, 36, 556],
 [34, 159, 48, 250, 26, 491],
 [31, 173, 23, 39, 16, 5, 102],
 [103, 102, 108, 12, 36, 5]]

In [19]:
encoder_entrada_dados = pad_sequences( tokens_entrada_a, padding="pre" )
encoder_entrada_dados.shape

(3725, 19)

In [20]:
decoder_entrada_dados = pad_sequences( [tokens[0:-1] for tokens in tokens_conversa_a], padding="pre" )
decoder_entrada_dados.shape

(3725, 20)

In [21]:
decoder_saida_dados = pad_sequences( [tokens[1:] for tokens in tokens_conversa_b], padding="pre" )
decoder_saida_dados.shape

(3725, 20)

In [22]:
decoder_saida_dados[0:5]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,  34, 606,  39,  36, 556,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         34, 159,  48, 250,  26, 491,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  31,
        173,  23,  39,  16,   5, 102,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        103, 102, 108,  12,  36,   5,   3],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 103, 102,
         48,  34,  18,  96,  70, 115,   3]])

In [23]:
NUM_PALAVRAS = len(vocabulario)
NUM_PALAVRAS

2527

In [24]:
category_encoding_saida = CategoryEncoding(num_tokens=NUM_PALAVRAS, output_mode="one_hot")
decoder_saida_dados_onehot = category_encoding_saida( decoder_saida_dados ).numpy()
decoder_saida_dados_onehot[0:10]

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [25]:
embed_dimens = 50
lstm_nodes = 50

In [27]:
encoder_input_layer = Input( (None, ) )
encoder_embed_layer = Embedding( input_dim=(NUM_PALAVRAS), output_dim=embed_dimens, mask_zero=False)
encoder_embed = encoder_embed_layer( encoder_input_layer )
encoder_lstm_layer = LSTM( lstm_nodes, return_state = True )
encoder_lstm_result, state_h, state_c = encoder_lstm_layer( encoder_embed )
encoder_states = [state_h, state_c]

In [28]:
decoder_input_layer = Input( (None, ) )
decoder_embed_layer = Embedding( input_dim=(NUM_PALAVRAS), output_dim=embed_dimens )
decoder_embed = decoder_embed_layer( decoder_input_layer )
decoder_lstm_layer = LSTM( lstm_nodes, return_state = True, return_sequences = True )
decoder_lstm_result, _, _ = decoder_lstm_layer( decoder_embed, initial_state = encoder_states )
decoder_saida_layer = Dense( NUM_PALAVRAS, activation="softmax" )
decoder_saida = decoder_saida_layer( decoder_lstm_result )

In [30]:
modelo = Model( [encoder_input_layer, decoder_input_layer], decoder_saida )
modelo.compile( optimizer="adam", loss="categorical_crossentropy", metrics=["Accuracy"])
resultado = modelo.fit( [encoder_entrada_dados, decoder_entrada_dados], decoder_saida_dados_onehot, batch_size=32, epochs=50 )

Epoch 1/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - Accuracy: 0.6183 - loss: 4.2983
Epoch 2/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - Accuracy: 0.6239 - loss: 2.6634
Epoch 3/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - Accuracy: 0.6239 - loss: 2.5650
Epoch 4/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - Accuracy: 0.6239 - loss: 2.5126
Epoch 5/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - Accuracy: 0.6498 - loss: 2.4628
Epoch 6/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - Accuracy: 0.6702 - loss: 2.4067
Epoch 7/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - Accuracy: 0.6713 - loss: 2.3595
Epoch 8/50
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - Accuracy: 0.6730 - loss: 2.3184
Epoch 9/50
[1m117/117[0m [32m

In [32]:
modelo_encoder = Model( encoder_input_layer, encoder_states )

In [33]:
decoder_state_h = Input( (lstm_nodes, ) )
decoder_state_c = Input( (lstm_nodes, ) )
decoder_states_input = [decoder_state_h, decoder_state_c]

In [34]:
decoder_output, state_h2, state_c2 = decoder_lstm_layer( decoder_embed, initial_state = decoder_states_input )
decoder_states_output = [state_h2, state_c2]
decoder_saida2 = decoder_saida_layer( decoder_output )

In [35]:
modelo_decoder = Model( [decoder_input_layer] + decoder_states_input, [decoder_saida2] + decoder_states_output)

In [64]:
# texto = "Hi how are you"
def gerador_textos( texto, max_palavras = 10):
    tokens_sequence = tokenizer.texts_to_sequences( [texto] )
    tokens_padded = pad_sequences( tokens_sequence, padding="pre", maxlen=19 )
    print("Tokens padded: ", tokens_padded)
    state_values = modelo_encoder.predict( tokens_padded )
    word_bos = vocabulario["<BOS>"]
    print("Word <BOS>: ", word_bos)
    # <BOS> I am fine
    # for i in range(max_palavras):
    print("State Values: ", state_values )
    sentenca_entrada = np.zeros( (1, 50) )
    sentenca_entrada[0, 0] = word_bos
    print("Sentenca Entrada: ", sentenca_entrada )
    decoder_entrada = [sentenca_entrada] + state_values
    print("Decoder Entrada: ", decoder_entrada )   
    
    decoder_saida, decoder_state_h, decoder_state_c = modelo_decoder.predict( decoder_entrada )
    state_value = [decoder_state_h, decoder_state_c]
    print("Decoder Saida shape: ", decoder_saida.shape )
    print("Decoder Saida: ", decoder_saida )
    indice_palavra = np.argmax( decoder_saida[0], axis=1 )
    print("Indice Palavra: ", indice_palavra)
    palavra = tokenizer.sequences_to_texts( [ indice_palavra] )
    print( palavra )
    

# texto = "Hi how are you"
gerador_textos( texto )