In [95]:
import numpy as np
import pandas as pd

import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing, utils
import re

import yaml
import os

# Directory that holds the YAML conversation files read in the next cell.
dir_path = 'Primer'
# Bare entry names (not full paths) found inside that directory.
files_list = os.listdir(f'{dir_path}{os.sep}')

In [96]:
# Collect parallel lists of questions and answers from every YAML file.
questions, answers = [], []

for filepath in files_list:
    # The context manager closes each file handle as soon as it is parsed.
    with open(dir_path + os.sep + filepath, 'rb') as file_:
        docs = yaml.safe_load(file_)
    conversations = docs['conversations']
    for con in conversations:
        if len(con) > 2:
            # First item is the question; every further reply is merged into
            # one answer, each reply preceded by a single space.
            questions.append(con[0])
            replies = con[1:]
            ans = ''.join(' ' + rep for rep in replies)
            answers.append(ans)
        elif len(con) > 1:
            # Plain question/answer pair (the answer is stored as loaded).
            questions.append(con[0])
            answers.append(con[1])
        # Conversations with a single item have no answer and are skipped.

In [97]:
# Display the collected answers (this prints the entire list).
answers

['Безусловно, не существует такого банковского предложения, где вы не будете ничего переплачивать во время погашения долга. Это просто не выгодно для банка. А вот попытка снизить расходы по кредиту вполне возможна. Сравнивая условия займов в нескольких банках в поисках наиболее выгодного для вас варианта, будет достаточно напрямую попросить у сотрудника по кредитам озвучить сумму переплаты.',
 'плачивать кредит можно суммами гораздо больше, чем установленный по договору ежемесячный платеж только тогда, когда за это не берется дополнительная комиссия или штраф. Переплата допустима, если банком не наложен мораторий сроком от шести месяцев до года на досрочное погашение кредита. Поэтому это немаловажный нюанс, о котором тоже стоит уточнить у кредитного специалиста',
 'У любого банка есть полное право потребовать от потенциального клиента застраховать свое здоровье, трудоспособность или жизнь в любой компании, которая является аккредитованной страховой компанией.',
 'Во избежание штрафов з

In [98]:
# Keep only the pairs whose answer is a string. Questions and answers are
# filtered together so the two lists stay aligned; popping from `questions`
# by index while scanning `answers` shifts later indices and removes the
# wrong questions.
# NOTE: this cell rewrites `questions` and `answers`; re-run the loading cell
# first if it has to be executed again, otherwise answers are wrapped twice.
answers_with_tags = []
questions_kept = []
for question, answer in zip(questions, answers):
    if isinstance(answer, str):
        questions_kept.append(question)
        answers_with_tags.append(answer)
questions = questions_kept

# Wrap every answer in start/end markers for the decoder. The decoding cell
# later looks these markers up in the tokenizer as 'start' and 'end'.
answers = ['<START> ' + answer + ' <END>' for answer in answers_with_tags]

# One shared vocabulary for questions and answers; index 0 is left for padding,
# hence the +1.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(questions + answers)
VOCAB_SIZE = len(tokenizer.word_index)+1

In [99]:
from gensim.models import Word2Vec
import re

# Every word known to the fitted tokenizer, in word_index order.
vocab = list(tokenizer.word_index)

def tokenize(sentences):
    """Split sentences into lowercase Cyrillic-only word tokens.

    Each sentence is lowercased, every character outside the Russian
    alphabet is replaced by a space, and the result is split on whitespace.

    Args:
        sentences: iterable of strings.

    Returns:
        A tuple ``(tokens_list, vocabulary)``: ``tokens_list`` holds one token
        list per sentence, and ``vocabulary`` is the flat concatenation of all
        tokens in order (duplicates are kept).
    """
    non_cyrillic = re.compile('[^а-яА-ЯёЁ]')
    tokens_list = [
        non_cyrillic.sub(' ', sentence.lower()).split()
        for sentence in sentences
    ]
    vocabulary = [token for tokens in tokens_list for token in tokens]
    return tokens_list, vocabulary

In [100]:
# encoder_input_data
# Turn each question into a sequence of word ids, then pad at the end so that
# every row is as long as the longest question.
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(len(seq) for seq in tokenized_questions)
padded_questions = preprocessing.sequence.pad_sequences(
    tokenized_questions,
    maxlen=maxlen_questions,
    padding='post',
)
encoder_input_data = np.array(padded_questions)

In [101]:
# (number of questions, maxlen_questions)
encoder_input_data.shape

(149, 13)

In [102]:
# decoder_input_data
# The tagged answers as word-id sequences, padded at the end to the length of
# the longest answer.
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(len(seq) for seq in tokenized_answers)
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)
decoder_input_data = np.array(padded_answers)

In [103]:
# (number of answers, maxlen_answers)
decoder_input_data.shape

(149, 68)

In [104]:
# decoder_output_data
# Targets are the answer sequences with their first token dropped, i.e. the
# decoder input shifted one step to the left, then padded and one-hot encoded
# over the vocabulary.
tokenized_answers = [seq[1:] for seq in tokenizer.texts_to_sequences(answers)]
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=maxlen_answers,
    padding='post',
)
onehot_answers = utils.to_categorical(padded_answers, VOCAB_SIZE)
decoder_output_data = np.array(onehot_answers)

In [105]:
# (number of answers, maxlen_answers, VOCAB_SIZE)
decoder_output_data.shape

(149, 68, 1462)

In [106]:
# Embedding, LSTM and Dense layers
# Encoder: embeds the padded question (id 0 is masked) and keeps only the
# final hidden and cell states of the LSTM.
encoder_inputs = layers.Input(shape=(maxlen_questions,))
encoder_embedding = layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(200, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embeds the answer tokens and runs an LSTM that starts from the
# encoder states and returns an output for every time step.
decoder_inputs = layers.Input(shape=(maxlen_answers,))
decoder_embedding = layers.Embedding(VOCAB_SIZE, 200, mask_zero=True)(decoder_inputs)
decoder_lstm = layers.LSTM(200, return_state=True, return_sequences=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Softmax over the vocabulary at each decoder time step.
decoder_dense = layers.Dense(VOCAB_SIZE, activation=activations.softmax)
output = decoder_dense(decoder_outputs)

model = models.Model([encoder_inputs, decoder_inputs], output)

In [107]:
# Categorical cross-entropy matches the one-hot encoded decoder targets.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [108]:
# Print the layer-by-layer architecture of the compiled model.
model.summary()
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the full model (not only the weights) after every epoch; the path is
# fixed, so each save goes to the same file.
checkpoint_options = dict(
    filepath='model_checkpoint.h5',
    save_weights_only=False,
    save_freq='epoch',
    verbose=1,
)
checkpoint_callback = ModelCheckpoint(**checkpoint_options)

Model: "model_14"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(None, 13)]         0           []                               
                                                                                                  
 input_20 (InputLayer)          [(None, 68)]         0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 13, 200)      292400      ['input_19[0][0]']               
                                                                                                  
 embedding_9 (Embedding)        (None, 68, 200)      292400      ['input_20[0][0]']               
                                                                                           

In [109]:
# Train on (question, answer) inputs against the shifted one-hot answers.
# `callbacks` is passed as a list, which is the form Model.fit documents.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=16,
    epochs=300,
    callbacks=[checkpoint_callback],
)

Epoch 1/300
Epoch 1: saving model to model_checkpoint.h5
Epoch 2/300
Epoch 2: saving model to model_checkpoint.h5
Epoch 3/300
Epoch 3: saving model to model_checkpoint.h5
Epoch 4/300
Epoch 4: saving model to model_checkpoint.h5
Epoch 5/300
Epoch 5: saving model to model_checkpoint.h5
Epoch 6/300
Epoch 6: saving model to model_checkpoint.h5
Epoch 7/300
Epoch 7: saving model to model_checkpoint.h5
Epoch 8/300
Epoch 8: saving model to model_checkpoint.h5
Epoch 9/300
Epoch 9: saving model to model_checkpoint.h5
Epoch 10/300
Epoch 10: saving model to model_checkpoint.h5
Epoch 11/300
Epoch 11: saving model to model_checkpoint.h5
Epoch 12/300
Epoch 12: saving model to model_checkpoint.h5
Epoch 13/300
Epoch 13: saving model to model_checkpoint.h5
Epoch 14/300
Epoch 14: saving model to model_checkpoint.h5
Epoch 15/300
Epoch 15: saving model to model_checkpoint.h5
Epoch 16/300
Epoch 16: saving model to model_checkpoint.h5
Epoch 17/300
Epoch 17: saving model to model_checkpoint.h5
Epoch 18/300
Ep

<keras.callbacks.History at 0x2a605de0fd0>

In [110]:
from tensorflow import keras

# Reload the model written by the checkpoint callback.
# NOTE: the inference models built further down reuse the in-memory layers
# (encoder_inputs, decoder_lstm, decoder_dense), not this reloaded copy.
model = keras.models.load_model('model_checkpoint.h5')

# Needs pydot and graphviz; without them Keras only prints an install hint.
utils.plot_model(model)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [111]:
# Prediction

In [112]:
def inference():
    """Build the standalone encoder and decoder models used for decoding.

    Both models reuse the layers and tensors of the training graph defined
    earlier in the notebook (``encoder_inputs``, ``encoder_states``,
    ``decoder_inputs``, ``decoder_embedding``, ``decoder_lstm``,
    ``decoder_dense``), so they share the trained weights.

    Returns:
        ``(encoder_model, decoder_model)`` where

        * ``encoder_model`` maps a padded question to the encoder LSTM's
          final ``[state_h, state_c]``;
        * ``decoder_model`` maps ``[token_ids, state_h, state_c]`` to
          ``[vocabulary probabilities, new_state_h, new_state_c]``.
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Take the state size from the decoder LSTM itself instead of repeating
    # the literal used in the model definition, so the two cannot diverge.
    state_size = decoder_lstm.units
    decoder_state_input_h = tf.keras.layers.Input(shape=(state_size,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(state_size,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # Same decoder layers as in training, but the initial state now comes
    # from explicit inputs so it can be fed back in at every decoding step.
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs
    )
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states,
    )

    return encoder_model, decoder_model

def preprocess_input(input_sentence):
    """Convert a raw user sentence into a padded encoder input.

    The sentence goes through the fitted ``tokenizer`` itself, so it gets the
    same lowercasing and punctuation filtering as the training questions.
    Words that are not in the vocabulary are skipped instead of raising
    ``KeyError`` (the tokenizer is created without an OOV token).

    Args:
        input_sentence: the user's question as a string.

    Returns:
        Array of shape ``(1, maxlen_questions)`` holding the word ids, padded
        at the end.
    """
    token_ids = tokenizer.texts_to_sequences([input_sentence])
    return preprocessing.sequence.pad_sequences(token_ids, maxlen=maxlen_questions, padding='post')

In [113]:
# Build the encoder/decoder pair used for step-by-step decoding.
enc_model, dec_model = inference()

In [119]:
tests = ['как  вкладывать деньги']

# Id of the start marker that is fed to the decoder as its first token.
start_token_id = tokenizer.word_index['start']

# Decode every sentence in `tests`, not only the first one.
for test_sentence in tests:
    # The encoder's final states become the decoder's initial states.
    states_values = enc_model.predict(preprocess_input(test_sentence))
    empty_target_seq = np.zeros((1 , 1))
    empty_target_seq[0, 0] = start_token_id
    decoded_translation = ''

    # Greedy decoding, one token per step. The step count is bounded, so the
    # loop ends even if the model keeps predicting the padding index 0, which
    # maps to no word.
    for step in range(maxlen_answers + 1):
        dec_outputs , h , c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        # Direct id -> word lookup instead of scanning the whole word_index.
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == 'end':
            # The end marker is never appended, so nothing has to be cut off.
            break
        if sampled_word is not None:
            decoded_translation += f' {sampled_word}'

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros((1 , 1))
        empty_target_seq[0 , 0] = sampled_word_index
        states_values = [h , c]

    print(f'Human: {test_sentence}')
    print()
    print(f'Bot: {decoded_translation}')
    print('-'*25)

Human: как  вкладывать деньги

Bot:  банки предоставляют профессиональную поддержку и консультации по финансовым вопросам вы сможете обратиться в банк для получения советов и рекомендаций от финансовых экспертов которые помогут вам принять решения связанные с вашими финансами и инвестициями
-------------------------
