In [None]:
# !pip install unidecode

In [None]:
import re
from unidecode import unidecode
import string
import pickle

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
from tensorflow.keras.utils import pad_sequences, to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM
from tensorflow.keras.models import Model

In [None]:
# Open the tab-separated dialogue corpus. The encoding is given explicitly so
# that decoding does not depend on the platform's locale default.
arquivo = open("/content/nlp/dialogs.txt", "r", encoding="utf-8")

In [None]:
# Read the whole corpus: iterating the file yields one line per entry,
# each still carrying its trailing newline.
textos = list(arquivo)

In [None]:
# Release the file handle now that every line is held in `textos`.
arquivo.close()

In [None]:
def limpar_texto( texto ):
    """Drop newline characters and detach every '?' from the preceding word.

    Args:
        texto: raw line of text (str).

    Returns:
        The text without any "\\n" and with a single space inserted before
        each '?' that is not already preceded by a space.
    """
    sem_quebras = "".join(texto.split("\n"))
    # Negative look-behind: only question marks NOT preceded by a space match.
    return re.sub(r'(?<! )\?', ' ?', sem_quebras)


In [None]:
# Apply the basic clean-up (newline removal, spacing before '?') to every line.
textos_limpos = list(map(limpar_texto, textos))

In [None]:
# Break each cleaned line on the tab character; the pieces become the
# question / answer columns of the DataFrame built from this list.
textos_limpos_splited = [linha.split("\t") for linha in textos_limpos]

In [None]:
# Tabular view of the corpus: one row per dialogue pair
# (first field -> 'perguntas', second field -> 'respostas').
df = pd.DataFrame(data=textos_limpos_splited, columns=['perguntas', 'respostas'])
df

Unnamed: 0,perguntas,respostas
0,"hi, how are you doing ?",i'm fine. how about yourself ?
1,i'm fine. how about yourself ?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been ?
3,no problem. so how have you been ?,i've been great. what about you ?
4,i've been great. what about you ?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed ?
3721,are you right-handed ?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [None]:
# Lower-case English contraction -> expanded form, used during text cleaning.
# Every key appears exactly once; where the previous literal listed a key twice
# with different values, the value that was effective at runtime (the later
# one) is kept: "can't" -> "cannot", "they're" -> "they are".
# "yo're" is kept as a tolerated misspelling of "you're".
# NOTE(review): some common contractions (e.g. "won't", "you'd", "he'll") have
# no entry and will simply lose their apostrophe downstream - confirm whether
# that is intended.
contraction_dict = {
    # negations
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "couldn't": "could not", "daren't": "dare not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "isn't": "is not",
    "mightn't": "might not", "mustn't": "must not", "needn't": "need not",
    "oughtn't": "ought not", "shan't": "shall not", "shouldn't": "should not",
    "wasn't": "was not", "weren't": "were not", "wouldn't": "would not",
    # modal + have
    "could've": "could have", "might've": "might have", "must've": "must have",
    "should've": "should have", "would've": "would have",
    # pronouns
    "he's": "he is",
    "i'd": "i had", "i'll": "i will", "i'm": "i am", "i've": "i have",
    "it'll": "it will", "it's": "it is",
    "she'd": "she had", "she'll": "she will", "she's": "she is",
    "that'd": "that had", "that'll": "that will", "that's": "that is",
    "there'd": "there had", "there'll": "there will", "there's": "there is",
    "they'd": "they had", "they'll": "they will", "they're": "they are",
    "they've": "they have",
    "we'd": "we had", "we'll": "we will", "we're": "we are", "we've": "we have",
    "yo're": "you are", "you'll": "you will", "you're": "you are",
    "you've": "you have",
    # question words
    "how'd": "how had", "how'll": "how will", "how're": "how are",
    "how's": "how is", "how've": "how have",
    "what'd": "what had", "what'll": "what will", "what're": "what are",
    "what's": "what is", "what've": "what have",
    "when'd": "when had", "when'll": "when will", "when're": "when are",
    "when's": "when is", "when've": "when have",
    "where'd": "where had", "where'll": "where will", "where're": "where are",
    "where's": "where is", "where've": "where have",
    "who'd": "who had", "who'll": "who will", "who're": "who are",
    "who's": "who is", "who've": "who have",
    "why'd": "why had", "why're": "why are", "why's": "why is",
    "why've": "why have",
    # misc
    "'cause": "because",
}

In [None]:
# Every ASCII punctuation character except '?', which is kept as its own token.
pontuacoes_remover = string.punctuation
pontuacoes_remover = pontuacoes_remover.replace("?", "")

def limpar_trocar_contracoes_tags( texto ):
    """Normalise one sentence and wrap it in ``<START>`` / ``<END>`` tags.

    Steps, in order: lower-case the text, expand every word found in
    ``contraction_dict``, delete all punctuation listed in
    ``pontuacoes_remover`` (everything but '?'), turn newline, carriage-return
    and tab characters into spaces, transliterate to ASCII with ``unidecode``
    and finally add the sequence tags.

    Args:
        texto: raw sentence (str).

    Returns:
        The cleaned sentence in the form ``"<START> ... <END>"``.
    """
    # Punctuation that may be glued to a word ("i'm.", "don't,"). The
    # apostrophe is excluded because it is part of the contraction keys
    # themselves (including the leading one in "'cause").
    pontuacoes_borda = pontuacoes_remover.replace("'", "")
    mascara = str.maketrans("\n\r\t", "   ", pontuacoes_remover)

    novas_palavras = []
    for palavra in texto.lower().split(" "):
        # Look the word up without its surrounding punctuation; otherwise a
        # contraction followed by '.' or ',' is missed and later collapses to
        # "im" / "dont" when the apostrophe is deleted. The stripped characters
        # would be removed by `mascara` anyway, so nothing is lost by replacing
        # the whole word with the expansion.
        chave = palavra.strip(pontuacoes_borda)
        if chave in contraction_dict:
            palavra = contraction_dict[chave]
        novas_palavras.append(palavra)

    texto_limpo = " ".join(novas_palavras).translate(mascara)
    return "<START> " + unidecode(texto_limpo) + " <END>"

In [None]:
# Add a normalised, <START>/<END>-tagged copy of each text column
# ('perguntas' -> 'perguntas_limpas', 'respostas' -> 'respostas_limpas').
for coluna_origem in ('perguntas', 'respostas'):
    df[coluna_origem + '_limpas'] = df[coluna_origem].apply(limpar_trocar_contracoes_tags)
df

Unnamed: 0,perguntas,respostas,perguntas_limpas,respostas_limpas
0,"hi, how are you doing ?",i'm fine. how about yourself ?,<START> hi how are you doing ? <END>,<START> i am fine how about yourself ? <END>
1,i'm fine. how about yourself ?,i'm pretty good. thanks for asking.,<START> i am fine how about yourself ? <END>,<START> i am pretty good thanks for asking <END>
2,i'm pretty good. thanks for asking.,no problem. so how have you been ?,<START> i am pretty good thanks for asking <END>,<START> no problem so how have you been ? <END>
3,no problem. so how have you been ?,i've been great. what about you ?,<START> no problem so how have you been ? <END>,<START> i have been great what about you ? <END>
4,i've been great. what about you ?,i've been good. i'm in school right now.,<START> i have been great what about you ? <END>,<START> i have been good i am in school right ...
...,...,...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed ?,<START> that is a good question maybe it is no...,<START> are you righthanded ? <END>
3721,are you right-handed ?,yes. all my life.,<START> are you righthanded ? <END>,<START> yes all my life <END>
3722,yes. all my life.,you're wearing out your right hand. stop using...,<START> yes all my life <END>,<START> you are wearing out your right hand st...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.,<START> you are wearing out your right hand st...,<START> but i do all my writing with my right ...


In [None]:
# Word-level tokenizer. `filters=""` turns off the Tokenizer's own character
# filtering, so the '<', '>' of the sequence tags and the stand-alone '?'
# survive as (part of) tokens; words not seen while fitting map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', filters="",)

In [None]:
# Build the vocabulary from both the question and the answer texts.
# FIXME(review): `+` on two string Series concatenates them element-wise, so
# each fitted text is "<START> q <END><START> a <END>" and a spurious
# "<end><start>" token enters the vocabulary. Fitting on the two columns
# stacked (e.g. pd.concat) would avoid that, but it shifts the word indices;
# `gerar_frase` compares against the literal index 4 for the end tag, so both
# places have to be changed together.
tokenizer.fit_on_texts(df['perguntas_limpas'] + df['respostas_limpas'])

In [None]:
# Number of distinct token ids, plus one for id 0, which the padded arrays
# below use as the padding value.
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [None]:
# Questions as lists of integer token ids (encoder side).
perguntas_sequences = tokenizer.texts_to_sequences(df['perguntas_limpas'])

In [None]:
# Answers as lists of integer token ids (decoder side).
respostas_sequences = tokenizer.texts_to_sequences(df['respostas_limpas'])

In [None]:
# Length (in tokens, tags included) of the longest question; used as the
# encoder's padded sequence length.
maximo_palavras_perguntas = max(map(len, perguntas_sequences))
maximo_palavras_perguntas

23

In [None]:
# Length (in tokens, tags included) of the longest answer; used as the
# decoder's padded sequence length.
maximo_palavras_respostas = max(map(len, respostas_sequences))
maximo_palavras_respostas

23

In [None]:
# Save the fitted tokenizer so the same word <-> id mapping can be reloaded
# together with the saved models.
with open("/content/nlp/tokenizer.pkl", "wb") as arquivo_tokenizer:
  pickle.dump(tokenizer, arquivo_tokenizer)

In [None]:
# Show the vocabulary size (also the width of the decoder's softmax layer).
VOCAB_SIZE

2496

In [None]:
# Decoder targets: each answer shifted left by one position (the leading
# <START> id is dropped), so the target at step t is the decoder input at t+1.
decoder_output_sequences = [ sequence[1:] for sequence in respostas_sequences ]

In [None]:
# Questions padded at the end to a common length (encoder input).
encoder_input_data = pad_sequences(perguntas_sequences, maxlen=maximo_palavras_perguntas, padding='post')

In [None]:
# Full answers (starting with the <START> id) padded at the end (decoder input).
decoder_input_data = pad_sequences(respostas_sequences, maxlen=maximo_palavras_respostas, padding='post')

In [None]:
# Shifted answers padded to the same length as the decoder input, so inputs
# and targets line up step by step.
decoder_output_data = pad_sequences(decoder_output_sequences, maxlen=maximo_palavras_respostas, padding='post')

In [None]:
# Inspect the padded integer targets before one-hot encoding.
decoder_output_data

array([[   5,   30,  586, ...,    0,    0,    0],
       [   5,   30,  152, ...,    0,    0,    0],
       [  35,  164,   27, ...,    0,    0,    0],
       ...,
       [   7,   17, 1368, ...,    0,    0,    0],
       [  34,    5,   14, ...,    0,    0,    0],
       [ 248, 1481,  967, ...,    0,    0,    0]], dtype=int32)

In [None]:
# One-hot encode the targets for the "categorical_crossentropy" loss. The
# result has one VOCAB_SIZE-wide vector per time step of every answer.
# NOTE(review): this cell overwrites its own input, so running it twice
# one-hot encodes an already one-hot array; it is also a large dense array -
# integer targets with a sparse categorical loss might avoid both, to be
# confirmed against the model's compile step.
decoder_output_data = to_categorical(decoder_output_data, VOCAB_SIZE)
decoder_output_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [None]:
# Training hyper-parameters.
EPOCAS = 300          # number of training epochs passed to model.fit
EMBEDDING_DIM = 200   # size of each word-embedding vector
LSTM_NODES = 200      # units in the encoder and decoder LSTMs (state size)
BATCH_SIZE = 32       # samples per gradient update

In [None]:
# Build the encoder: embed the padded question ids (id 0 is masked as padding
# via mask_zero=True) and run them through an LSTM. Only the final hidden and
# cell states are kept; `encoder_outputs` itself is not used afterwards.
encoder_input = Input(shape=(maximo_palavras_perguntas, ))
encoder_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(encoder_input)
encoder_outputs, state_h, state_c = LSTM(LSTM_NODES, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [None]:
# Build the decoder input side: its own embedding of the answer ids and an
# LSTM that returns the full output sequence plus its final states. The LSTM
# layer object is kept so the inference decoder can reuse the same weights.
decoder_input = Input(shape=(maximo_palavras_respostas, ))
decoder_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(decoder_input)
decoder_lstm = LSTM(LSTM_NODES, return_state=True, return_sequences=True)

In [None]:
# Build the decoder output side: the decoder LSTM starts from the encoder's
# final states, and a softmax layer turns every time step into a probability
# distribution over the vocabulary. The returned states are discarded here.
decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(VOCAB_SIZE, activation="softmax")
output = decoder_dense(decoder_output)

In [None]:
# Training model: inputs are (question ids, answer ids), output is the
# per-step word distribution. The loss expects one-hot targets.
model = Model([encoder_input, decoder_input], output)
model.compile(optimizer=tf.keras.optimizers.RMSprop(),
              loss="categorical_crossentropy",
             metrics=['accuracy'])
model.summary()

In [None]:
# Sanity check of the target tensor: (samples, answer length, vocabulary size).
decoder_output_data.shape

(3725, 23, 2496)

### Modelo de treinamento

In [None]:
# Train with teacher forcing: the decoder is fed the true answer and learns
# to predict the same answer shifted by one step.
# NOTE(review): no validation data is passed, so the reported accuracy/loss
# are training-set figures only.
history = model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=BATCH_SIZE, epochs=EPOCAS)

Epoch 1/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.5795 - loss: 6.2943
Epoch 2/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.0987 - loss: 4.7431
Epoch 3/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.0997 - loss: 4.5978
Epoch 4/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.1017 - loss: 4.4860
Epoch 5/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.1050 - loss: 4.3989
Epoch 6/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.1088 - loss: 4.2890
Epoch 7/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.1123 - loss: 4.2166
Epoch 8/300
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.1166 - loss: 4.1383
Epoch 9/300
[1m117/117

### Modelo de conversa

In [None]:
# Inference models, built from the layers of the training model so they share
# its weights.
# Encoder: question ids -> [hidden state, cell state].
encoder_model = Model(encoder_input, encoder_states)
# The decoder receives its previous states as explicit inputs so it can be
# called repeatedly while a reply is generated.
decoder_state_input_h = Input(shape=(LSTM_NODES, ))
decoder_state_input_c = Input(shape=(LSTM_NODES, ))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

# NOTE: this rebinds `state_h` / `state_c`, previously the encoder's states.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_state_inputs)

decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
# Decoder: (token ids, h, c) -> (word distribution per step, new h, new c).
decoder_model = Model([decoder_input] + decoder_state_inputs, [decoder_outputs] + decoder_states)
decoder_model.summary()

In [None]:
# Save the training model and the two inference models in the .keras format.
model.save("/content/nlp/modelo_geral.keras")
encoder_model.save("/content/nlp/modelo_encoder.keras")
decoder_model.save("/content/nlp/modelo_decoder.keras")

In [None]:
# Manual step-by-step walkthrough of one decoding step, starting from a
# sample question (the same logic is wrapped in `gerar_frase` below).
testes = ['how are you ?']

In [None]:
# Convert the sample question into token ids.
sequence = tokenizer.texts_to_sequences(testes)

In [None]:
# Pad to the question length the encoder was built with.
padded_sequence = pad_sequences(sequence, maxlen=maximo_palavras_perguntas, padding='post')

In [None]:
# Encode the question; the result holds the hidden and cell states that
# seed the decoder.
state_values = encoder_model.predict(padded_sequence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step


In [None]:
# Decoding starts from the id of the start tag (the tokenizer lower-cases
# text, hence the "<start>" key).
palavra_start = tokenizer.word_index["<start>"]
indice_palavra = palavra_start
# start_sequence = pad_sequences([[palavra_start]], maxlen=maximo_palavras_perguntas, padding='post')


In [None]:
# Decoder input for one step: an array of shape (1, 1) - a batch of one
# sequence holding a single token id.
target_sequence = np.zeros((1, 1))
target_sequence[0, 0] = indice_palavra
target_sequence

array([[2.]])

In [None]:
# Run one decoder step and keep the returned states for the next step.
dec_output, saida_state_h, saida_state_c = decoder_model.predict([target_sequence] + state_values)
state_values = [saida_state_h, saida_state_c]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step


In [None]:
# Predicted word distribution for the single sample in the batch.
dec_output[0]

array([[4.1989935e-07, 5.0052188e-07, 6.1781685e-07, ..., 2.8562224e-06,
        6.5947143e-07, 2.4444162e-07]], dtype=float32)

In [None]:
# Greedy choice: the id with the highest probability at the last time step.
indice_palavra = np.argmax(dec_output[0, -1, :])
indice_palavra

np.int64(5)

In [None]:
# Map the predicted id back to its word.
tokenizer.sequences_to_texts([[indice_palavra]])

['i']

In [None]:
def gerar_frase( frase_entrada ):
    """Generate a reply to ``frase_entrada`` with greedy decoding.

    The question is normalised the same way as the training questions,
    encoded once, and the decoder is then run one token at a time, each step
    feeding back the previously predicted word and LSTM states.

    Args:
        frase_entrada: the user's question as plain text (str).

    Returns:
        The generated reply as a space-separated string, without the
        start/end tags and without leading or trailing spaces. Generation
        stops at the end tag or after ``maximo_palavras_respostas`` words,
        whichever comes first.
    """
    # Use the training-time preprocessing (lower-casing, contraction expansion,
    # punctuation removal, <START>/<END> tags) so the encoder sees input in the
    # format it was trained on.
    frase_limpa = limpar_trocar_contracoes_tags(limpar_texto(frase_entrada))
    sequence = tokenizer.texts_to_sequences([frase_limpa])
    padded_sequence = pad_sequences(sequence, maxlen=maximo_palavras_perguntas, padding='post')
    # verbose=0: no progress bar per call (one would be printed for every word).
    state_values = encoder_model.predict(padded_sequence, verbose=0)

    indice_palavra = tokenizer.word_index["<start>"]
    # Look the end tag up instead of relying on a fixed id, which depends on
    # how the tokenizer happened to be fitted.
    indice_end = tokenizer.word_index["<end>"]

    palavras_saida = []
    # Bounded loop: an answer never needs more steps than the longest training
    # answer, and this prevents an endless loop if <end> is never predicted.
    for _ in range(maximo_palavras_respostas):
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = indice_palavra
        dec_output, saida_state_h, saida_state_c = decoder_model.predict(
            [target_sequence] + state_values, verbose=0
        )
        state_values = [saida_state_h, saida_state_c]
        # Greedy decoding: pick the most probable word of the last time step.
        indice_palavra = int(np.argmax(dec_output[0, -1, :]))
        if indice_palavra == indice_end:
            break
        palavras_saida.append(tokenizer.sequences_to_texts([[indice_palavra]])[0])
    return " ".join(palavras_saida)

In [None]:
# Try the chatbot on a short question.
gerar_frase("how are you ?")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


' i am serious'

In [None]:
# Second example; the contraction is already written out ("do not") in the
# input text.
gerar_frase("it is a beautiful day do not you think ?")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


' it is not my fault you did not mind'