In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import CategoryEncoding, Input, Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
import nltk
from nltk.tokenize import word_tokenize

In [6]:
import matplotlib.pyplot as plt

In [7]:
from string import punctuation

In [8]:
df = pd.read_csv( "D:/git/dados/nlp/dialogs.txt", sep="\t", header=None, names=["Persona_a", "Persona_b"])
df.head()

Unnamed: 0,Persona_a,Persona_b
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [9]:
texto = df["Persona_b"][0]
texto

"i'm fine. how about yourself?"

In [10]:
translator = str.maketrans("", "", punctuation)
texto_novo = texto.translate( translator )
texto_novo

'im fine how about yourself'

In [11]:
contraction_dict = {
    "aren't": "are not", "can't": "can not", "could've": "could have",
    "couldn't": "could not", "daren't": "dare not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he's": "he is",
    "how'd": "how had", "how're": "how are", "how's": "how is",
    "how've": "how have", "i'd": "i had", "i'm": "i am",
    "i've": "i have", "isn't": "is+ not", "it's": "it is",
    "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "oughtn't": "ought not",
    "shan't": "shall not", "she'd": "she had", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "that'd": "that had",
    "thats's": "that is", "there'd": "there had", "there's": "there is",
    "they'd": "they had", "they're": "you are", "they've": "they have",
    "wasn't": "was+ not", "we'd": "we had", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'd": "what had",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "when'd": "when had", "when're": "when are", "when's": "when is",
    "when've": "when have", "where'd": "where had", "where're": "where are",
    "where's": "where is", "where've": "where have", "who'd": "who had",
    "who're": "who are", "who's": "who is", "who've": "who have",
    "why'd": "why had", "why're": "why are", "why's": "why is",
    "why've": "why have", "would've": "would have", "wouldn't": "would not",
    "you're": "you are", "you've": "you have", "'cause": "because", 
    "ain't": "is not", "aren't": "are not", "can't": "cannot", 
    "could've": "could have", "he's": "he is", "how'll": "how will",
    "i'll": "i will", "it'll": "it will", "it's": "it is", 
    "she'll": "she will", "she's": "she is", "that'll": "that will",
    "there'll": "there will", "they'll": "they will", "they're": "they are",
    "we'll": "we will", "we're": "we are", "what'll": "what will",
    "when'll": "when will", "where'll": "where will", "who'll": "who will",
    "yo're": "you are", "you'll": "you will"
}

In [12]:
texto.split(" ")

["i'm", 'fine.', 'how', 'about', 'yourself?']

In [13]:
def limpar_texto( texto ):
    translator = str.maketrans("", "", punctuation)
    texto_minusculo = texto.lower()
    tokens = texto_minusculo.split(" ")
    palavras_limpas = []
    for token in tokens: 
        if token in contraction_dict:
            palavras_limpas.append(contraction_dict[token])
        else:
            palavras_limpas.append(token)
    texto_novo = " ".join(palavras_limpas)
    return "<BOS> " + texto_novo.translate(translator) + " <EOS>"
        

In [14]:
df["Persona_a_limpa"] = df["Persona_a"].apply( limpar_texto )
df["Persona_b_limpa"] = df["Persona_b"].apply( limpar_texto )
df.head()

Unnamed: 0,Persona_a,Persona_b,Persona_a_limpa,Persona_b_limpa
0,"hi, how are you doing?",i'm fine. how about yourself?,<BOS> hi how are you doing <EOS>,<BOS> i am fine how about yourself <EOS>
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,<BOS> i am fine how about yourself <EOS>,<BOS> i am pretty good thanks for asking <EOS>
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,<BOS> i am pretty good thanks for asking <EOS>,<BOS> no problem so how have you been <EOS>
3,no problem. so how have you been?,i've been great. what about you?,<BOS> no problem so how have you been <EOS>,<BOS> i have been great what about you <EOS>
4,i've been great. what about you?,i've been good. i'm in school right now.,<BOS> i have been great what about you <EOS>,<BOS> i have been good i am in school right no...


In [15]:
tokenizer = Tokenizer(oov_token="<UNK>", filters="", lower=False) # OOV = Out Of Vocabulary

In [16]:
lista_textos  = []
lista_textos.extend(df["Persona_a_limpa"])
lista_textos.extend(df["Persona_b_limpa"])

In [17]:
tokenizer.fit_on_texts( lista_textos )

In [18]:
tokenizer.texts_to_sequences( [ "hi i am fine and how about your brother's computer" ] )

[[1488, 4, 28, 585, 18, 39, 37, 44, 1, 818]]

In [19]:
vocabulario = tokenizer.word_index
VOCAB_SIZE = len(vocabulario) + 1  # +1 para acomodar o token de padding
VOCAB_SIZE

2498

In [20]:
conversa_a_sequences = tokenizer.texts_to_sequences( df["Persona_a_limpa"] )
conversa_b_sequences = tokenizer.texts_to_sequences( df["Persona_b_limpa"] )

In [21]:
# conversa_a_sequences[:10]

In [22]:
encoder_input_draft = [sequence[1:] for sequence in conversa_a_sequences] 
decoder_input_draft = [sequence[:-1] for sequence in conversa_b_sequences]
decoder_output_draft = [sequence[1:] for sequence in conversa_b_sequences]

In [23]:
# decoder_output_data[:10]

In [24]:
encoder_input_data = np.array(pad_sequences( encoder_input_draft, padding="post" ))
decoder_input_data = np.array(pad_sequences( decoder_input_draft, padding="post" ))
decoder_output_data = np.array(pad_sequences( decoder_output_draft, padding="post" ))

In [25]:
output_encoder = CategoryEncoding( num_tokens=VOCAB_SIZE, output_mode="one_hot" )
decoder_output_categorized = output_encoder( decoder_output_data ).numpy()
decoder_output_categorized.shape

(3725, 21, 2498)

In [26]:
encoder_input_data.shape

(3725, 21)

In [27]:
ENCODER_ENTRADA_SIZE = encoder_input_data.shape[1]
DECODER_ENTRADA_SIZE = decoder_input_data.shape[1]
DECODER_SAIDA_SIZE = decoder_output_data.shape[1]
EMBED_DIM = 100
LSTM_NODES = 200

In [28]:
# LSTM Encoder
encoder_input_layer = Input( (ENCODER_ENTRADA_SIZE,) )
encoder_embedded_layer = Embedding( VOCAB_SIZE, EMBED_DIM, mask_zero=True )
encoder_embedded = encoder_embedded_layer( encoder_input_layer )
encoder_lstm_layer = LSTM( LSTM_NODES, return_state = True )
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm_layer( encoder_embedded )
encoder_states = [encoder_state_h, encoder_state_c]

In [29]:
#LSTM Decoder
decoder_input_layer = Input( (DECODER_ENTRADA_SIZE,) )
decoder_embedded_layer = Embedding( VOCAB_SIZE, EMBED_DIM, mask_zero=True )
decoder_embedded = decoder_embedded_layer( decoder_input_layer )
decoder_lstm_layer = LSTM( LSTM_NODES, return_state = True, return_sequences = True )
decoder_lstm_output, _, _ = decoder_lstm_layer( decoder_embedded, initial_state = encoder_states )
decoder_output_layer = Dense( VOCAB_SIZE, activation="softmax" )
decoder_output = decoder_output_layer( decoder_lstm_output )

In [30]:
modelo = Model( [encoder_input_layer, decoder_input_layer], decoder_output )
modelo.compile( optimizer=RMSprop(), loss="categorical_crossentropy", metrics=["Accuracy"])
modelo.summary()

In [31]:
resultado = modelo.fit( [encoder_input_data, decoder_input_data], decoder_output_categorized, batch_size=32, epochs=100 )

Epoch 1/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 66ms/step - Accuracy: 0.1458 - loss: 5.9299
Epoch 2/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 65ms/step - Accuracy: 0.1601 - loss: 5.3439
Epoch 3/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 66ms/step - Accuracy: 0.1607 - loss: 5.2749
Epoch 4/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 65ms/step - Accuracy: 0.1612 - loss: 5.2037
Epoch 5/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - Accuracy: 0.1623 - loss: 5.1278
Epoch 6/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - Accuracy: 0.1664 - loss: 5.0400
Epoch 7/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 68ms/step - Accuracy: 0.1765 - loss: 4.9572
Epoch 8/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 69ms/step - Accuracy: 0.1838 - loss: 4.8811
Epoch 9/100
[1m117/117

In [32]:
encoder_model = Model( encoder_input_layer, encoder_states )
decoder_lstm_input_h = Input( (LSTM_NODES, ) )
decoder_lstm_input_c = Input( (LSTM_NODES, ) )
decoder_state_inputs = [decoder_lstm_input_h, decoder_lstm_input_c]
decoder_lstm_output, decoder_state_h, decoder_state_c = decoder_lstm_layer( decoder_embedded, initial_state = decoder_state_inputs )
decoder_state_output = [decoder_state_h, decoder_state_c]
decoder_output = decoder_output_layer( decoder_lstm_output )

decoder_model = Model ( [decoder_input_layer] + decoder_state_inputs, [decoder_output] + decoder_state_output)
decoder_model.summary()

In [33]:
texto = "Hi there ! how are you doing ?"
texto_limpo = limpar_texto( texto ) 

In [34]:
sequencia = [ seq[1:-1] for seq in tokenizer.texts_to_sequences( [texto_limpo] ) ]
sequencia

[[1488, 47, 39, 15, 5, 167]]

In [35]:
sequencia_padded =  np.array(pad_sequences( sequencia, maxlen=ENCODER_ENTRADA_SIZE, padding="post" ))
sequencia_padded

array([[1488,   47,   39,   15,    5,  167,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])

In [36]:
encoder_context = encoder_model.predict( sequencia_padded )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step


In [37]:
bos_token = tokenizer.word_index["<BOS>"]

In [38]:
texto_sequencia = np.zeros( (1,1) )
texto_sequencia

array([[0.]])

In [39]:
texto_sequencia[0, 0] = bos_token
texto_sequencia

array([[2.]])

In [40]:
decoder_entrada_data = [texto_sequencia] + encoder_context
# decoder_entrada_data

In [41]:
decoder_saida, decoder_output_state_h, decoder_output_state_c = decoder_model.predict( decoder_entrada_data )
decoder_saida_context = [decoder_output_state_h, decoder_output_state_c]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step


In [42]:
indice_palavra = np.argmax(decoder_saida[0, 0])

In [43]:
tokenizer.sequences_to_texts( [[indice_palavra]] )

['not']

In [50]:
def gerar_frase( frase_entrada ):
    MAX_PALAVRAS_GERADAS = 20
    texto_limpo = limpar_texto( frase_entrada ) 
    sequence = tokenizer.texts_to_sequences([texto_limpo])
    padded_sequence = pad_sequences(sequence, maxlen=ENCODER_ENTRADA_SIZE, padding='post')
    state_values = encoder_model.predict(padded_sequence)
    palavra_start = tokenizer.word_index["<BOS>"]
    palavra_final = tokenizer.word_index["<EOS>"]
    indice_palavra = palavra_start
    frase_saida = ""
    indice = 0
    while indice < MAX_PALAVRAS_GERADAS:
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = indice_palavra
        dec_output, saida_state_h, saida_state_c = decoder_model.predict([target_sequence] + state_values)
        state_values = [saida_state_h, saida_state_c]
        indice_palavra = np.argmax(dec_output[0, -1, :])
        if indice_palavra == palavra_final:
            break
        palavra_saida = tokenizer.sequences_to_texts([[indice_palavra]])[0]
        frase_saida = frase_saida + " " + palavra_saida
        indice += 1
    return frase_saida

In [51]:
gerar_frase( texto ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


' i got a good nose'

In [52]:
gerar_frase( "Tell more about the nose" ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


' i think they are'

In [53]:
gerar_frase( "they are what ?" ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


' they were in the kitchen drawer'

In [54]:
gerar_frase( "what is a kitchen drawer ?" ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step


' i am not sure'

In [55]:
gerar_frase( "what do you know ?" ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


' the tv news is just a good question'

In [56]:
gerar_frase( "who are you ?" ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step


' i do not know'

In [57]:
gerar_frase( "do you know who am i ?" ) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52

' i think he has a good nose but i have never had her'