In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import tensorflow as tf

In [75]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, GlobalAveragePooling1D

In [69]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical

In [9]:
df = pd.read_csv("./dialogs.txt", sep='\t', header=None, names=["pergunta", "resposta"])

In [10]:
df

Unnamed: 0,pergunta,resposta
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [11]:
contraction_dict = {
    "aren't": "are not", "can't": "can not", "could've": "could have",
    "couldn't": "could not", "daren't": "dare not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he's": "he is",
    "how'd": "how had", "how're": "how are", "how's": "how is",
    "how've": "how have", "i'd": "i had", "i'm": "i am",
    "i've": "i have", "isn't": "is+ not", "it's": "it is",
    "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "oughtn't": "ought not",
    "shan't": "shall not", "she'd": "she had", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "that'd": "that had",
    "thats's": "that is", "there'd": "there had", "there's": "there is",
    "they'd": "they had", "they're": "you are", "they've": "they have",
    "wasn't": "was+ not", "we'd": "we had", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'd": "what had",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "when'd": "when had", "when're": "when are", "when's": "when is",
    "when've": "when have", "where'd": "where had", "where're": "where are",
    "where's": "where is", "where've": "where have", "who'd": "who had",
    "who're": "who are", "who's": "who is", "who've": "who have",
    "why'd": "why had", "why're": "why are", "why's": "why is",
    "why've": "why have", "would've": "would have", "wouldn't": "would not",
    "you're": "you are", "you've": "you have", "'cause": "because", 
    "ain't": "is not", "aren't": "are not", "can't": "cannot", 
    "could've": "could have", "he's": "he is", "how'll": "how will",
    "i'll": "i will", "it'll": "it will", "it's": "it is", 
    "she'll": "she will", "she's": "she is", "that'll": "that will",
    "there'll": "there will", "they'll": "they will", "they're": "they are",
    "we'll": "we will", "we're": "we are", "what'll": "what will",
    "when'll": "when will", "where'll": "where will", "who'll": "who will",
    "yo're": "you are", "you'll": "you will"
}

In [28]:
def limpar_texto( texto ):
    texto = texto.lower()
    texto = texto.replace('?', ' ?')
    texto = texto.replace('.', ' ')
    texto = texto.replace(',', ' ')
    texto = texto.replace('  ', ' ')
    tokens = texto.split(" ")
    lista = []
    for token in tokens:
        palavra = token
        if token in contraction_dict:
            palavra = contraction_dict[token]
        lista.append(palavra)
    return "<START> " + " ".join(lista).strip() + " <END>"

In [29]:
df["pergunta_limpa"] = df["pergunta"].apply(limpar_texto)
df["resposta_limpa"] = df["resposta"].apply(limpar_texto)

In [30]:
df

Unnamed: 0,pergunta,resposta,pergunta_limpa,resposta_limpa
0,"hi, how are you doing?",i'm fine. how about yourself?,<START> hi how are you doing ? <END>,<START> i am fine how about yourself ? <END>
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,<START> i am fine how about yourself ? <END>,<START> i am pretty good thanks for asking <END>
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,<START> i am pretty good thanks for asking <END>,<START> no problem so how have you been ? <END>
3,no problem. so how have you been?,i've been great. what about you?,<START> no problem so how have you been ? <END>,<START> i have been great what about you ? <END>
4,i've been great. what about you?,i've been good. i'm in school right now.,<START> i have been great what about you ? <END>,<START> i have been good i am in school right ...
...,...,...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?,<START> that's a good question maybe it is not...,<START> are you right-handed ? <END>
3721,are you right-handed?,yes. all my life.,<START> are you right-handed ? <END>,<START> yes all my life <END>
3722,yes. all my life.,you're wearing out your right hand. stop using...,<START> yes all my life <END>,<START> you are wearing out your right hand st...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.,<START> you are wearing out your right hand st...,<START> but i do all my writing with my right ...


In [45]:
tokenizer = Tokenizer( oov_token = "<OOV>", filters="", lower=False)

In [49]:
tokenizer.fit_on_texts( df['pergunta_limpa'] + " " + df['resposta_limpa'])

In [74]:
vocab_size = len(tokenizer.word_index)
vocab_size

2613

In [51]:
tokenizer.texts_to_sequences(["<START> are you right-handed ? <END>"])

[[2, 17, 6, 2397, 5, 3]]

In [52]:
todos_textos = df['pergunta_limpa'] + " " + df['resposta_limpa']

In [53]:
todos_textos

0       <START> hi how are you doing ? <END> <START> i...
1       <START> i am fine how about yourself ? <END> <...
2       <START> i am pretty good thanks for asking <EN...
3       <START> no problem so how have you been ? <END...
4       <START> i have been great what about you ? <EN...
                              ...                        
3720    <START> that's a good question maybe it is not...
3721    <START> are you right-handed ? <END> <START> y...
3722    <START> yes all my life <END> <START> you are ...
3723    <START> you are wearing out your right hand st...
3724    <START> but i do all my writing with my right ...
Length: 3725, dtype: object

In [96]:
maior_sequencia = 0
for frase in todos_textos:
    tokens = frase.split(" ")
    if len(tokens) > maior_sequencia:
        maior_sequencia = len(tokens)
maior_sequencia

37

In [97]:
perguntas_sequencia = tokenizer.texts_to_sequences(df["pergunta_limpa"])

In [98]:
respostas_sequencia = tokenizer.texts_to_sequences(df["resposta_limpa"])
respostas_sequencia_nostart = [ elemento[1:] for elemento in respostas_sequencia] 

In [99]:
perguntas_padded = pad_sequences(perguntas_sequencia, padding="post", maxlen=maior_sequencia)
print("Perguntas Shape: ", perguntas_padded.shape)
respostas_padded = pad_sequences(respostas_sequencia_nostart, padding="post", maxlen=maior_sequencia)
print("Respostas Shape: ", respostas_padded.shape)

Perguntas Shape:  (3725, 37)
Respostas Shape:  (3725, 37)


In [100]:
respostas_padded

array([[   4,   31,  587, ...,    0,    0,    0],
       [   4,   31,  161, ...,    0,    0,    0],
       [  35,  165,   27, ...,    0,    0,    0],
       ...,
       [   6,   17, 1367, ...,    0,    0,    0],
       [  34,    4,   14, ...,    0,    0,    0],
       [ 250, 1483,  960, ...,    0,    0,    0]])

In [105]:
decoder_output_data = to_categorical(respostas_padded, num_classes=vocab_size + 1)
decoder_input_data = respostas_padded
encoder_input_data = perguntas_padded

In [106]:
print("Decoder Output Shape:", decoder_output_data.shape)
print("Vocab Size:", vocab_size)

Decoder Output Shape: (3725, 37, 2614)
Vocab Size: 2613


In [109]:
encoder_input = Input(shape=(maior_sequencia, ))
encoder_embedding = Embedding(vocab_size + 1, 150, mask_zero=True)(encoder_input)
encoder_outputs, state_h, state_c = LSTM(150, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_input = Input(shape=(maior_sequencia, ))
decoder_embedding = Embedding(vocab_size + 1, 150, mask_zero=True)(decoder_input)
decoder_lstm = LSTM(150, return_state=True, return_sequences=True)

decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state = encoder_states)
decoder_dense = Dense(vocab_size + 1, activation="softmax")
output = decoder_dense (decoder_output)

rms = RMSprop()

modelo = Model([encoder_input, decoder_input], output)
modelo.compile(optimizer=rms, loss="categorical_crossentropy", metrics=["accuracy"])
modelo.summary()

In [None]:
history = modelo.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=32, epochs=100)

Epoch 1/100




[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 160ms/step - accuracy: 0.0282 - loss: 6.7057
Epoch 2/100
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 155ms/step - accuracy: 0.0356 - loss: 5.0262
Epoch 3/100
[1m 53/117[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m9s[0m 141ms/step - accuracy: 0.0528 - loss: 4.5557