In [28]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [29]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Bidirectional, Embedding, LSTM, Dense, Bidirectional, Input
from tensorflow.keras.optimizers import Adam

In [30]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical

In [31]:
import matplotlib.pyplot as plt

In [32]:
PADDING_POS = "pre"

In [33]:
df_textos = pd.read_csv("../../../dados/nlp/dialogs.txt", sep="\t", encoding="utf-8", names=["charA", "charB"])
df_textos

Unnamed: 0,charA,charB
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [34]:
contraction_dict = {
                    "aren't": "are not",		"can't": "can not",		"could've": "could have",
                    "couldn't": "could not",	"daren't": "dare not",	"didn't": "did not",
                    "doesn't": "does not",		"don't": "do not",		"hadn't": "had not",
                    "hasn't": "has not",		"haven't": "have not",	"he's": "he is",
                    "how'd": "how had",			"how're": "how are",	"how's": "how is",
                    "how've": "how have",		"i'd": "i had",			"i'm": "i am",
                    "i've": "i have",			"isn't": "is+ not",		"it's": "it is",
                    "might've": "might have",	"mightn't": "might not",	"must've": "must have",
                    "mustn't": "must not",		"needn't": "need not",	"oughtn't": "ought not",
                    "shan't": "shall not",		"she'd": "she had",		"she's": "she is",
                    "should've": "should have",	"shouldn't": "should not",	"that'd": "that had",
                    "that's": "that is",		"there'd": "there had",	"there's": "there is",
                    "they'd": "they had",		"they're": "you are",	"they've": "they have",
                    "wasn't": "was+ not",		"we'd": "we had",		"we're": "we are",
                    "we've": "we have",			"weren't": "were not",	"what'd": "what had",
                    "what're": "what are",		"what's": "what is",	"what've": "what have",
                    "when'd": "when had",		"when're": "when are",	"when's": "when is",
                    "when've": "when have",		"where'd": "where had",	"where're": "where are",
                    "where's": "where is",		"where've": "where have",	"who'd": "who had",
                    "who're": "who are",		"who's": "who is",		"who've": "who have",
                    "why'd": "why had",			"why're": "why are",	"why's": "why is",
                    "why've": "why have",		"would've": "would have",	"wouldn't": "would not",
                    "you're": "you are",		"you've": "you have",	"'cause": "because", 
                    "ain't": "is not", 			"aren't": "are not",	"can't": "cannot", 
                    "could've": "could have",	"he's": "he is",		"how'll": "how will",
                    "i'll": "i will",			"it'll": "it will",		"it's": "it is", 
                    "she'll": "she will",		"she's": "she is",		"that'll": "that will",
                    "there'll": "there will",	"they'll": "they will",	"they're": "they are",
                    "we'll": "we will",			"we're": "we are",		"what'll": "what will",
                    "when'll": "when will",		"where'll": "where will",	"who'll": "who will",
                    "yo're": "you are",			"you'll": "you will"
            }

In [35]:
def pre_processa_texto( texto ):
    global contraction_dict
    tokens = texto.lower().split(" ")
    novos_tokens = []
    for token in tokens:
        if token in contraction_dict:
            token = contraction_dict[token]
        novos_tokens.append(token)
    return " ".join(novos_tokens)
        

In [36]:
pre_processa_texto("it's okay. it's a really big campus.")

'it is okay. it is a really big campus.'

In [37]:
df_textos["charA_limpo"] = df_textos["charA"].apply( pre_processa_texto )
df_textos["charB_limpo"] = df_textos["charB"].apply( pre_processa_texto )

In [38]:
df_textos

Unnamed: 0,charA,charB,charA_limpo,charB_limpo
0,"hi, how are you doing?",i'm fine. how about yourself?,"hi, how are you doing?",i am fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.,i am fine. how about yourself?,i am pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?,i am pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?,no problem. so how have you been?,i have been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.,i have been great. what about you?,i have been good. i am in school right now.
...,...,...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?,that is a good question. maybe it is not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...,yes. all my life.,you are wearing out your right hand. stop usin...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.,you are wearing out your right hand. stop usin...,but i do all my writing with my right hand.


In [39]:
entrada_textos = "<bos> " + df_textos["charA_limpo"]
saida_textos = "<bos> " + df_textos["charB_limpo"] + " <eos>"

# <BOS> - Begin of Sentence
# <EOS> - End of Sentence
# dicionario = {"<BOS>": 1, "<EOS>": 2}
# "<BOS> i am pretty good. thanks for asking. <EOS>"
# "bos I am pretty good. thanks for asking eos"

In [40]:
tokenizer = Tokenizer(oov_token="<oov>", filters="")  # Textos do Personagem A - entrada
tokenizer.fit_on_texts(entrada_textos + saida_textos)
vocabulario = tokenizer.word_index

In [41]:
entrada_sequencias = tokenizer.texts_to_sequences( entrada_textos )
saida_sequencias = tokenizer.texts_to_sequences( saida_textos )
vocabulario_tam = len(vocabulario) + 1

In [42]:
maximo_palavras = 0
for seq in entrada_sequencias:
    if len(seq) > maximo_palavras:
        maximo_palavras = len(seq)
maximo_palavras

21

In [43]:
for seq in saida_sequencias:
    if len(seq) > maximo_palavras:
        maximo_palavras = len(seq)
maximo_palavras

22

In [44]:
encoder_entrada_dados = pad_sequences([seq[1:] for seq in entrada_sequencias], maxlen=maximo_palavras, padding=PADDING_POS)
encoder_entrada_dados

array([[   0,    0,    0, ...,   15,    5,  963],
       [   0,    0,    0, ...,   34,   33, 1461],
       [   0,    0,    0, ...,  604,   30, 1463],
       ...,
       [   0,    0,    0, ...,   48,   27,  842],
       [   0,    0,    0, ...,   10,   31,  461],
       [   0,    0,    0, ...,   27,  120, 1340]])

In [45]:
entrada_sequencias[0]

[2, 1791, 34, 15, 5, 963]

In [46]:
encoder_entrada_dados[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 1791,   34,   15,    5,  963])

In [47]:
# outralista = [seq[:-1] for seq in saida_sequencias]
# outralista

In [48]:
decoder_entrada_dados = pad_sequences(entrada_sequencias, maxlen=maximo_palavras, padding=PADDING_POS)
decoder_saida_dados = pad_sequences([seq[1:] for seq in saida_sequencias], maxlen=maximo_palavras, padding=PADDING_POS)

In [49]:
decoder_entrada_dados[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    2, 1791,   34,   15,    5,  963])

In [50]:
vocabulario_tam

5499

In [51]:
decoder_saida_dados[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    4,   26,  928,   34,   33, 1461,    3])

In [52]:
decoder_saida_onehot_dados = to_categorical( decoder_saida_dados, num_classes=vocabulario_tam )
decoder_saida_onehot_dados[0]

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
embedding_dim = 100
lstm_encoder_nodes = 100

In [55]:
encoder_input = Input( shape=(None,) )
encoder_embedding = Embedding(input_dim = vocabulario_tam, output_dim = embedding_dim, mask_zero=True)(encoder_input)
encoder_lstm, state_h, state_c = LSTM(lstm_encoder_nodes, return_state = True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [57]:
decoder_input = Input( shape=(None,) )
decoder_embedding_layer = Embedding(input_dim=vocabulario_tam, output_dim = embedding_dim, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_input)
lstm_decoder_layer = LSTM(lstm_encoder_nodes, return_state=True, return_sequences=True)
decoder_lstm_output, _, _ = lstm_decoder_layer(decoder_embedding, initial_state = encoder_states)
decoder_output_layer = Dense(vocabulario_tam, activation="softmax")
decorder_output = decoder_output_layer( decoder_lstm_output )

In [59]:
model = Model( [encoder_input, decoder_input], decorder_output )

In [61]:
# adam = Adam(learning_rate = 0.001)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
resultado = model.fit([encoder_entrada_dados, decoder_entrada_dados], decoder_saida_onehot_dados, 
          batch_size=4, epochs=100, validation_split=0.2)

In [None]:
loss = np.array(resultado.history["loss"])
loss_max = np.max(loss)
loss_norm = loss/loss_max
# accuracy = np.array(resultado.history["accuracy"])
# accuracy_max = np.max(accuracy)
# accuracy_norm = accuracy / accuracy_max
plt.plot(loss_norm, color="red", label="Loss")
plt.plot(resultado.history["accuracy"], color="green", label="Accuracy")
plt.legend()
plt.show()

In [None]:
modelo_encoder = Model( encoder_input, encoder_states )

In [None]:
decoder_state_h = Input( shape=(lstm_encoder_nodes,) )
decoder_state_c = Input( shape=(lstm_encoder_nodes,) )
decoder_states_input = [decoder_state_h, decoder_state_c]

In [None]:
# decoder_embedding2 = decoder_embedding_layer(decoder_input)

In [None]:
decoder_output2, state_h2, state_c2 = lstm_decoder_layer(decoder_embedding, initial_state = decoder_states_input)
decoder_output_states = [state_h2, state_c2]
decoder_saida2 = decoder_output_layer(decoder_output2)

In [None]:
modelo_decoder = Model( [decoder_input] + decoder_states_input, 
                        [decoder_saida2] + decoder_output_states )
modelo_decoder.summary()

In [None]:
# Origem = "Goog morning"
# Target = "<BOS>"   # Ate que seja gerado <EOS>

In [None]:
def chat_bot( texto, max_words = 10  ):
    global entrada_tokenizer
    texto_processado = pre_processa_texto( texto )
    persona_a_sequencia = tokenizer.texts_to_sequences( [texto_processado.lower()] )
    persona_a_padded = pad_sequences( persona_a_sequencia, maxlen=maximo_palavras, padding="pre")

    state_value = modelo_encoder.predict( persona_a_padded )

    target_text = "<BOS>"
    target_sequence = np.array(tokenizer.texts_to_sequences([target_text]))
    print("target_sequence: ", target_sequence)
    final_sentenca = False
    sentenca = ""
    counter = 0
    while not final_sentenca and counter < max_words:    # texto for diferente de <EOS>
        # target_padded = pad_sequences( target_sequence, maxlen=saida_maximo_palavras, padding="pre")
        # print("Decode: ", [target_sequence] + state_value)
        tokens_ouput, decoder_state_h2, decoder_state_c2 = modelo_decoder.predict([target_sequence] + state_value)
        state_value = [decoder_state_h2, decoder_state_c2]
        token_provavel = np.argmax(tokens_ouput[0, -1, :])
        texto_provavel = tokenizer.sequences_to_texts( [[token_provavel]] )
        # print("Token Provavel: ", token_provavel, "Texto Provavel: ", texto_provavel)
        # print("Target Sequence: ", target_sequence)
        # print("Sentenca: ", sentenca)
        if texto_provavel[0] == "<eos>": 
            final_sentenca = True
        else: 
            if texto_provavel[0] != "<oov>":
                sentenca += (texto_provavel[0] + " ")
            target_text = target_text + " " + texto_provavel[0]
            target_sequence = np.array(tokenizer.texts_to_sequences([target_text]))
            counter += 1
    return sentenca
    

In [None]:
def gerar_texto( texto_entrada, max_palavras = 10 ): 
    textos_sequence = tokenizer.texts_to_sequences( [texto_entrada] )
    textos_padded = pad_sequences( textos_sequence, padding="post", maxlen=maximo_palavras)
    print("Textos padded: ", textos_padded)
    state_values = modelo_encoder.predict( textos_padded )
    palavra_start = tokenizer.word_index["<bos>"]
    palavra_end = tokenizer.word_index["<eos>"]
    index_palavra = palavra_start
    texto_saida = ""
    for i in range(max_palavras):
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = index_palavra
        print("Target Sequence: ", target_sequence)
        dec_output, saida_state_h, saida_state_c = modelo_decoder.predict( [target_sequence] + state_values )
        state_values = [saida_state_h, saida_state_c]
        index_palavra = np.argmax(dec_output[0, 0])
        print("Numero Palavra: ", index_palavra)
        if index_palavra == palavra_end:
            break
        # print("Palavra: ", palavra)
        palavra_saida = tokenizer.sequences_to_texts([[ index_palavra ]])[0]
        texto_saida = texto_saida + " " + palavra_saida
    return texto_saida
        

In [None]:
gerar_texto("are you right-handed?")