<a href="https://colab.research.google.com/github/antonkravchenko2001/NLP-Telegram-Bot/blob/master/Transformer_neural_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Transformer Model creation**

In [None]:
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization, SpatialDropout1D, Input, Embedding
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.model_selection import train_test_split
import random as python_random
import json

### Positional Encoding

In [None]:
def get_angles(pos, i, d_model):
    """Compute the raw (pre-sin/cos) positional-encoding angles.

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of indices. Indices ``2k`` and
            ``2k + 1`` share the same rate because of the ``i // 2`` term.
        d_model: Embedding width used to scale the exponent.

    Returns:
        ``pos / 10000 ** (2 * (i // 2) / d_model)``, broadcast over ``pos``
        and ``i``.
    """
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))
def pos_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to encode.
        d_model: Embedding width (columns).

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``; even columns
        hold the sine of the angle, odd columns the cosine.
    """
    row_index = np.arange(position)[:, np.newaxis]
    col_index = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_index, col_index, d_model)
    # Overwrite the angles in place: sine on even columns, cosine on odd ones.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

### Multihead Attention

In [None]:
class MultiheadSelfAttention(tf.keras.Model):
    """Multi-head scaled dot-product attention.

    Despite the name, the same layer is used for encoder-decoder attention:
    the caller decides what to pass as ``query``, ``key`` and ``value``.

    Args:
        d_model: Width of the query/key/value projections and of the output.
            Must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        mask: When true, an additive look-ahead mask is applied so that
            position ``i`` cannot attend to positions ``j > i``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads, mask = False):
        super(MultiheadSelfAttention, self).__init__()
        # Otherwise depth * num_heads != d_model and the head split in call()
        # either fails or silently mixes features across positions.
        if d_model % num_heads != 0:
            raise ValueError(
                "d_model ({}) must be divisible by num_heads ({})".format(d_model, num_heads)
            )
        self.depth = d_model // num_heads
        self.num_heads = num_heads
        self.query_dense = tf.keras.layers.Dense(d_model)
        self.key_dense = tf.keras.layers.Dense(d_model)
        self.value_dense = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)
        self.mask = mask

    def create_mask(self, length, width):
        """Return a ``(length, width)`` float32 additive look-ahead mask.

        Entries whose column index is greater than their row index hold
        ``-1e9`` (effectively zero weight after the softmax); all other
        entries are 0.
        """
        # Strict upper triangle (k=1) in one vectorized call instead of a
        # Python double loop over every cell.
        blocked = np.triu(np.full((length, width), -1e9, dtype='float32'), k=1)
        return tf.cast(blocked, dtype=tf.float32)

    def _split_heads(self, tensor, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, depth)``."""
        tensor = tf.reshape(tensor, [batch_size, -1, self.num_heads, self.depth])
        return tf.transpose(tensor, [0, 2, 1, 3])

    def call(self,query, key, value):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: Tensor of shape ``(batch, query_length, features)``.
            key: Tensor of shape ``(batch, key_length, features)``.
            value: Tensor of shape ``(batch, key_length, features)``.

        Returns:
            Tensor of shape ``(batch, query_length, d_model)``.
        """
        query = self.query_dense(query)
        key = self.key_dense(key)
        value = self.value_dense(value)

        # Prefer the static batch size (unchanged behavior); fall back to the
        # dynamic one when it is unknown, e.g. when traced with a None batch.
        batch_size = query.shape[0]
        if batch_size is None:
            batch_size = tf.shape(query)[0]

        query = self._split_heads(query, batch_size)
        key = self._split_heads(key, batch_size)
        value = self._split_heads(value, batch_size)

        score = tf.matmul(query, key, transpose_b=True)
        score /= tf.math.sqrt(tf.dtypes.cast(self.depth, dtype=tf.float32))
        if self.mask:
            # The mask is built from the static sequence lengths of the scores.
            score += self.create_mask(score.shape[-2], score.shape[-1])
        alignment = tf.nn.softmax(score, axis=-1)

        context = tf.matmul(alignment, value)
        # Merge the heads back: (batch, heads, seq, depth) -> (batch, seq, d_model).
        context = tf.transpose(context, [0, 2, 1, 3])
        context = tf.reshape(context, [batch_size, -1, self.depth * self.num_heads])
        output = self.dense(context)
        return output

### Encoder Layer

In [None]:
class Encoder(tf.keras.Model):
    """Transformer encoder: embedding + positional encoding + N encoder layers.

    Each layer is self-attention followed by a two-layer feed-forward network,
    each wrapped in dropout, a residual connection and layer normalization.

    Args:
        vocab_size: Largest token id; the embedding has ``vocab_size + 1`` rows.
        d_model: Embedding / model width.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate applied after attention and after the
            feed-forward network.

    NOTE(review): no padding mask is passed to the attention layers, so padded
    positions appear to take part in attention like real tokens.
    """

    def __init__(self, vocab_size, d_model, num_heads,  num_layers=1, dropout=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.dropouts1 = [tf.keras.layers.Dropout(dropout) for _ in range(num_layers)]
        self.dropouts2 = [tf.keras.layers.Dropout(dropout) for _ in range(num_layers)]
        self.embedding = tf.keras.layers.Embedding(vocab_size+1, d_model)
        self.mha = [MultiheadSelfAttention(d_model, num_heads) for _ in range(num_layers)]
        # These attributes previously held BatchNormalization despite their
        # names. LayerNormalization normalizes each position over its features
        # and behaves the same in training and inference.
        # Checkpoints saved with the BatchNormalization variant do not match
        # these layers and need to be retrained.
        self.layer_normalization_1 = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.layer_normalization_2 = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dense_1 = [tf.keras.layers.Dense(d_model * 4, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(d_model) for _ in range(num_layers)]

    def call(self, encoder_inputs, training=None):
        """Encode a batch of token-id sequences.

        Args:
            encoder_inputs: Integer tensor of shape ``(batch_size, input_length)``.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the surrounding call context.

        Returns:
            Tensor of shape ``(batch_size, input_length, d_model)``.
        """
        encoder_outputs = self.embedding(encoder_inputs)  # (batch_size, input_length, d_model)
        encoder_outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        encoder_outputs += pos_encoding(encoder_outputs.shape[1], encoder_outputs.shape[2])
        for i in range(self.num_layers):
            # Self-attention sub-layer: (batch_size, input_length, d_model)
            attention = self.mha[i](query=encoder_outputs, key=encoder_outputs, value=encoder_outputs)
            attention = self.dropouts1[i](attention, training=training)
            attention = self.layer_normalization_1[i](encoder_outputs + attention)
            # Feed-forward sub-layer: widen to d_model * 4, project back to d_model.
            encoder_outputs = self.dense_1[i](attention)
            encoder_outputs = self.dense_2[i](encoder_outputs)
            encoder_outputs = self.dropouts2[i](encoder_outputs, training=training)
            encoder_outputs = self.layer_normalization_2[i](attention + encoder_outputs)
        return encoder_outputs  # (batch_size, input_length, d_model)

### Decoder Layer

In [None]:
class Decoder(tf.keras.Model):
    """Transformer decoder: embedding + positional encoding + N decoder layers.

    Each layer is causal self-attention, attention over the encoder outputs,
    and a two-layer feed-forward network, each wrapped in dropout, a residual
    connection and layer normalization. A final softmax layer produces a
    distribution over ``vocab_size + 1`` token ids.

    Args:
        vocab_size: Largest token id; embedding and output layer both use
            ``vocab_size + 1`` entries.
        d_model: Embedding / model width.
        num_heads: Number of attention heads per attention block.
        num_layers: Number of stacked decoder layers.
        dropout: Dropout rate applied after each sub-layer.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers=1, dropout=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(vocab_size+1, d_model)
        self.masked_mha = [MultiheadSelfAttention(d_model, num_heads, mask=True) for _ in range(num_layers)]
        # These attributes previously held BatchNormalization despite their
        # names. LayerNormalization normalizes each position over its features
        # and behaves the same in training and inference.
        # Checkpoints saved with the BatchNormalization variant do not match
        # these layers and need to be retrained.
        self.layer_normalization_1 = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.layer_normalization_2 = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.layer_normalization_3 = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dropouts1 = [tf.keras.layers.Dropout(dropout) for _ in range(num_layers)]
        self.dropouts2 = [tf.keras.layers.Dropout(dropout) for _ in range(num_layers)]
        self.dropouts3 = [tf.keras.layers.Dropout(dropout) for _ in range(num_layers)]
        self.mha = [MultiheadSelfAttention(d_model, num_heads) for _ in range(num_layers)]
        self.dense_1 = [tf.keras.layers.Dense(d_model * 4, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(d_model) for _ in range(num_layers)]
        self.dense = tf.keras.layers.Dense(vocab_size+1, activation = 'softmax')

    def call(self, encoder_outputs, decoder_inputs, training=None):
        """Decode one step of teacher-forced (or partially generated) targets.

        Args:
            encoder_outputs: Tensor of shape ``(batch_size, input_length, d_model)``.
            decoder_inputs: Integer tensor of shape ``(batch_size, target_length)``.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the surrounding call context.

        Returns:
            Tensor of shape ``(batch_size, target_length, vocab_size + 1)``
            holding softmax probabilities.
        """
        decoder_outputs = self.embedding(decoder_inputs)  # (batch_size, target_length, d_model)
        decoder_outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        decoder_outputs += pos_encoding(decoder_outputs.shape[1], decoder_outputs.shape[2])
        for i in range(self.num_layers):
            # Causal self-attention over the target prefix.
            masked_attention = self.masked_mha[i](query=decoder_outputs, key=decoder_outputs, value=decoder_outputs)
            masked_attention = self.dropouts1[i](masked_attention, training=training)
            masked_attention = self.layer_normalization_1[i](masked_attention + decoder_outputs)
            # Attention over the encoder outputs, queried by the decoder state.
            attention = self.mha[i](query=masked_attention, key=encoder_outputs, value=encoder_outputs)
            attention = self.dropouts2[i](attention, training=training)
            attention = self.layer_normalization_2[i](attention+masked_attention)
            # Feed-forward sub-layer: widen to d_model * 4, project back to d_model.
            decoder_outputs = self.dense_1[i](attention)
            decoder_outputs = self.dense_2[i](decoder_outputs)
            decoder_outputs = self.dropouts3[i](decoder_outputs, training=training)
            decoder_outputs = self.layer_normalization_3[i](decoder_outputs + attention)
        decoder_outputs = self.dense(decoder_outputs)
        return decoder_outputs  # (batch_size, target_length, vocab_size + 1)

### Transformer Model

In [None]:
class Transformer(tf.keras.Model):
    """Sequence-to-sequence model that feeds an ``Encoder`` into a ``Decoder``.

    Args:
        input_vocab_size: Vocabulary size handed to the encoder.
        target_vocab_size: Vocabulary size handed to the decoder.
        d_model: Model width shared by encoder and decoder.
        num_heads: Attention heads shared by encoder and decoder.
        num_layers_encoder: Number of encoder layers.
        num_layers_decoder: Number of decoder layers.
    """

    def __init__(self, input_vocab_size, target_vocab_size, d_model, num_heads, num_layers_encoder=1, num_layers_decoder=1):
        super().__init__()
        shared = dict(d_model=d_model, num_heads=num_heads)
        self.encoder = Encoder(vocab_size=input_vocab_size, num_layers=num_layers_encoder, **shared)
        self.decoder = Decoder(vocab_size=target_vocab_size, num_layers=num_layers_decoder, **shared)

    def call(self, inputs):
        """Run the model on ``inputs = [encoder_token_ids, decoder_token_ids]``.

        Returns:
            The decoder output, shape
            ``(batch_size, target_length, target_vocab_size + 1)``.
        """
        source_ids = inputs[0]
        target_ids = inputs[1]
        memory = self.encoder(source_ids)  # (batch_size, input_length, d_model)
        return self.decoder(memory, target_ids)

## **Data Preparation**

In [None]:
# Mount Google Drive at /content/drive; the data, vocabulary and checkpoint
# paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


### Parsing data

In [None]:
def conv_dict(url):
    """Load a ``' +++$+++ '``-separated lines file into an id -> text mapping.

    Args:
        url: Path of the file to read.

    Returns:
        Dict mapping the first field of every line to its fifth field. The
        fifth field is stored exactly as read, including any trailing newline.

    Raises:
        IndexError: If a line has fewer than five fields.
    """
    lines_by_id = {}
    # Context manager so the handle is closed even if a malformed line raises.
    with open(url, mode='r', encoding='unicode_escape') as file:
        for line in file:
            fields = line.split(' +++$+++ ')
            lines_by_id[fields[0]] = fields[4]
    return lines_by_id

In [None]:
def parse(url, lines=None, max_words=25):
    """Turn a conversations file into aligned (input, target) utterance pairs.

    The fourth ``' +++$+++ '``-separated field of every line is read as a
    bracketed, comma-separated list of line ids. Every pair of consecutive ids
    yields one training example: the first utterance is the input, the second
    the target wrapped in ``start_token`` / ``end_token``.

    Args:
        url: Path of the conversations file.
        lines: Mapping from line id to utterance text. Defaults to the
            module-level ``d`` (the original behavior).
        max_words: A pair is kept only if both utterances have fewer than this
            many whitespace-separated words.

    Returns:
        Tuple ``(inputs, targets)`` of equally long lists of strings.

    Raises:
        KeyError: If a referenced line id is missing from ``lines``.
    """
    if lines is None:
        lines = d
    inputs = []
    targets = []
    # Context manager so the handle is closed even if a lookup raises.
    with open(url, mode='r', encoding='unicode_escape') as file:
        for record in file:
            id_field = record.split(' +++$+++ ')[3]
            # Strip the list syntax so only "id,id,id" remains.
            id_field = id_field.replace("[","").replace("]","").replace(", ",",").replace("'","").replace("\n","").replace("\t","")
            ids = id_field.split(",")
            for current_id, reply_id in zip(ids, ids[1:]):
                prompt = lines[current_id]
                if len(prompt.split()) >= max_words:
                    continue
                reply = lines[reply_id]
                if len(reply.split()) >= max_words:
                    continue
                inputs.append(prompt)
                targets.append("start_token " + reply + " end_token")
    return inputs, targets

### Tokenization

In [None]:
def tokenize(text):
    """Fit a Keras ``Tokenizer`` on ``text`` and convert it to id sequences.

    Args:
        text: Iterable of strings.

    Returns:
        Tuple ``(sequences, word_to_id, id_to_word)``: the texts as lists of
        token ids, the tokenizer's ``word_index`` and its inverse. The filter
        set does not contain ``_``, so tokens such as ``start_token`` stay
        intact.
    """
    tokenizer = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^{|}`~\t\n')
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_to_id = tokenizer.word_index
    id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
    return sequences, word_to_id, id_to_word

In [None]:
def max_sequence_length(text):
    """Return the length of the longest sequence in ``text``.

    Args:
        text: Iterable of sized sequences (e.g. lists of token ids).

    Raises:
        ValueError: If ``text`` is empty.
    """
    # len() replaces the element-by-element counting loop and the generator
    # avoids building an intermediate list of lengths.
    return max(len(sentence) for sentence in text)

In [None]:
def min_sequence_length(text):
    """Return the length of the shortest sequence in ``text``.

    Args:
        text: Iterable of sized sequences (e.g. lists of token ids).

    Raises:
        ValueError: If ``text`` is empty.
    """
    # len() replaces the element-by-element counting loop and the generator
    # avoids building an intermediate list of lengths.
    return min(len(sentence) for sentence in text)

### Creating Data Generator for more efficient training

In [None]:
def batch_generator(x, y, batch_size=128, max_input_len=None, max_target_len=None):
    """Yield padded training batches for teacher forcing.

    Args:
        x: List of encoder token-id sequences.
        y: List of target token-id sequences, aligned with ``x``.
        batch_size: Number of examples per batch; the last batch may be
            smaller.
        max_input_len: Padding length for encoder inputs. Defaults to the
            longest sequence in the module-level ``input_text`` (the original
            behavior).
        max_target_len: Length of the longest target. Defaults to the longest
            sequence in the module-level ``target_text``. Decoder inputs and
            outputs are padded to ``max_target_len - 1``.

    Yields:
        ``([encoder_inputs, decoder_inputs], decoder_outputs)`` where the
        decoder inputs are the targets without their last token and the
        decoder outputs are the targets without their first token.
    """
    # Computed once: these lengths used to be recomputed over the whole corpus
    # three times for every batch.
    if max_input_len is None:
        max_input_len = max_sequence_length(input_text)
    if max_target_len is None:
        max_target_len = max_sequence_length(target_text)
    decoder_len = max_target_len - 1

    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        encoder_inputs = pad_sequences(x[start:stop], maxlen=max_input_len, padding='post')
        batch_targets = y[start:stop]
        # Shift by one token: the decoder sees tokens [0, n-1) and predicts [1, n).
        decoder_inputs = pad_sequences([seq[:-1] for seq in batch_targets], maxlen=decoder_len, padding='post')
        decoder_outputs = pad_sequences([seq[1:] for seq in batch_targets], maxlen=decoder_len, padding='post')
        yield ([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Line-id -> utterance lookup. parse() reads it through the global name `d`,
# so it has to be assigned before parse() is called.
d = conv_dict('/content/drive/My Drive/transformer_model/transformer_data/movie_lines.txt')
# Aligned lists of raw input utterances and start/end-token wrapped targets.
input_text, target_text = parse('/content/drive/My Drive/transformer_model/transformer_data/movie_conversations.txt')
# Keep at most the first 102000 pairs.
input_text = input_text[:102000]
target_text = target_text[:102000]

In [None]:
# Inputs and targets get separate tokenizers, hence separate vocabularies.
# From here on input_text / target_text hold lists of token ids, not strings;
# *_dict maps word -> id and *_dict_r maps id -> word.
input_text, input_dict, input_dict_r = tokenize(input_text)
target_text, target_dict, target_dict_r = tokenize(target_text)

In [None]:
# Persist the vocabularies next to the model so they can be reused outside
# this notebook.
with open('/content/drive/My Drive/transformer_model/input_dict.json', 'w') as fp:
    json.dump(input_dict, fp, indent=4)
with open('/content/drive/My Drive/transformer_model/target_dict.json', 'w') as fp:
    json.dump(target_dict, fp, indent=4)
# target_dict_r is keyed by integer ids; JSON object keys are always strings,
# so a consumer loading this file has to convert the keys back to int.
with open('/content/drive/My Drive/transformer_model/target_dict_r.json', 'w') as fp:
    json.dump(target_dict_r, fp, sort_keys=True,indent=4)

In [None]:
# Drop pairs whose input has more than 20 tokens or whose target has more than
# 20 or fewer than 3 tokens.
#
# The pairs are filtered together by position. Removing by value with
# list.remove() deletes the *first* equal element, so with duplicate sequences
# it could delete different indices from the two lists and silently misalign
# inputs and targets (and it rescans the list for every removal).
kept_pairs = [
    (inp, tar)
    for inp, tar in zip(input_text, target_text)
    if len(inp) <= 20 and 3 <= len(tar) <= 20
]
# Slice assignment keeps the same list objects, as the in-place removal did.
input_text[:] = [inp for inp, _ in kept_pairs]
target_text[:] = [tar for _, tar in kept_pairs]

In [None]:
# Cap the dataset at the first 100000 pairs.
input_text =input_text[:100000]
target_text = target_text[:100000]

In [None]:
# Model hyperparameters.
num_heads = 8
d_model = 256
# Vocabulary sizes are the number of entries in each word -> id mapping; the
# Encoder/Decoder add 1 to this for their embedding tables.
# NOTE(review): word_index presumably lists every word seen, not only the
# num_words=20000 the tokenizer emits — confirm whether the tables are
# larger than needed.
input_vocab_size=len(input_dict)
target_vocab_size=len(target_dict)
# NOTE(review): the training loop in this file passes batch_size=400
# explicitly, so this value does not appear to be used there.
batch_size=100

### Instantiating Transformer Model

In [None]:
# Single-layer encoder and decoder; train_step() and predict() use this
# module-level instance.
transformer = Transformer(input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, d_model=d_model, num_heads=num_heads, num_layers_encoder=1, num_layers_decoder=1)

### Custom learning-rate schedule

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root learning-rate schedule.

    ``lr(step) = model_size ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``

    The rate grows linearly for ``warmup_steps`` steps and decays with the
    inverse square root of the step afterwards.

    Args:
        model_size: Model width (``d_model``) used to scale the rate.
        warmup_steps: Number of linear warm-up steps.
    """

    def __init__(self, model_size, warmup_steps=40000):
        super(CustomSchedule, self).__init__()
        # Raw value kept only so get_config() can return something serializable.
        self._model_size_config = model_size
        self.model_size = tf.cast(model_size, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # Optimizers may pass the step as an integer tensor; rsqrt only
        # accepts floating-point input.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.model_size) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "model_size": self._model_size_config,
            "warmup_steps": self.warmup_steps,
        }

In [None]:
# Learning-rate schedule scaled by the model width; warmup_steps is left at
# the class default.
lr = CustomSchedule(d_model)

### Custom training loop

In [None]:
# Objects shared by train_step() and the epoch loop.
optimizer = Adam(lr)
# Default from_logits=False: the loss expects probabilities, which matches the
# softmax activation on the decoder's output layer.
loss_func = tf.keras.losses.SparseCategoricalCrossentropy()
# Running mean of the per-batch loss values.
train_loss = tf.keras.metrics.Mean(name='train_loss')
# Stateful accuracy metric: keeps accumulating until its state is reset.
sparse_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
# Running mean of the accuracy values reported by train_step().
train_accuracy = tf.keras.metrics.Mean()

In [None]:
def train_step(x, y):
    """Run one optimization step and update the epoch metrics.

    Uses the module-level ``transformer``, ``loss_func``, ``optimizer``,
    ``train_loss`` and ``train_accuracy``.

    Args:
        x: ``[encoder_inputs, decoder_inputs]`` integer arrays for one batch.
        y: Integer array of target token ids, aligned with the decoder output.

    NOTE(review): padded positions are not masked, so they appear to
    contribute to both loss and accuracy — confirm whether this is intended.
    """
    # Cast into locals instead of writing back into the caller's list.
    encoder_inputs = tf.cast(x[0], tf.int64)
    decoder_inputs = tf.cast(x[1], tf.int64)

    with tf.GradientTape() as tape:
        # training=True switches the Dropout layers on; without it Keras runs
        # the model in inference mode and the dropout layers do nothing.
        y_prediction = transformer([encoder_inputs, decoder_inputs], training=True)
        loss = loss_func(y, y_prediction)

    variables = transformer.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    train_loss.update_state(loss)
    # Per-token 0/1 matches for this batch only. Feeding the running result of
    # a stateful accuracy metric into a Mean averaged values that were already
    # averages and that were never reset between epochs.
    batch_matches = tf.keras.metrics.sparse_categorical_accuracy(y, y_prediction)
    train_accuracy.update_state(batch_matches)

In [None]:
# Train for 200 epochs, printing the mean loss and accuracy of each epoch.
for epoch in range(200):
    train_loss.reset_states()
    train_accuracy.reset_states()
    # The streaming accuracy metric keeps its state between calls; without a
    # reset, results from earlier epochs leak into the numbers reported here.
    sparse_accuracy.reset_states()
    # A generator is exhausted after one pass, so build a new one every epoch.
    train_data = batch_generator(input_text, target_text, batch_size=400)
    for x_train, y_train in train_data:
        train_step(x_train, y_train)
    print ('Epoch {} train_loss {:.4f} train_accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))

### Save Model's weights

In [None]:
# Save only the weights, in TensorFlow checkpoint format; the model has to be
# re-created from code before they can be loaded.
transformer.save_weights('/content/drive/My Drive/transformer_model/my_model', save_format = 'tf')

In [None]:
# Fresh model with the same hyperparameters as the trained one; it replaces
# the module-level `transformer` and receives the saved weights next.
transformer = Transformer(input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, d_model=d_model, num_heads=num_heads, num_layers_encoder=1, num_layers_decoder=1)

In [None]:
# Restore the weights written by save_weights() above into the new model.
transformer.load_weights('/content/drive/My Drive/transformer_model/my_model')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f75f4300a90>

In [None]:
def predict(sentence):
    """Greedily decode and print a reply to ``sentence``.

    Uses the module-level ``transformer``, ``target_dict`` and
    ``target_dict_r``. Decoding stops at ``end_token``, after 20 words, or
    when the model predicts an id with no vocabulary entry (e.g. padding).

    Args:
        sentence: Input text.

    Returns:
        The generated words joined by spaces (the same string that is printed).
    """
    enc_input = encode(sentence)
    print(sentence)
    # The encoder output does not depend on the decoding step, so compute it
    # once instead of re-running the whole model for every generated word.
    encoder_outputs = transformer.encoder(enc_input)
    out_words = []
    # Look the start id up instead of assuming it is 1: tokenizer ids are
    # assigned by word frequency.
    de_input = tf.constant([[target_dict['start_token']]], dtype=tf.int64)
    while True:
        de_output = transformer.decoder(encoder_outputs, de_input)
        # Most likely id at the last position, shaped (1, 1) for concatenation.
        new_word = tf.expand_dims(tf.argmax(de_output, -1)[:, -1], axis=1)
        de_input = tf.concat((de_input, new_word), axis=-1)
        word = target_dict_r.get(int(new_word.numpy()[0][0]))
        if word is None:
            # Id 0 (padding) has no vocabulary entry; treat it as end of reply
            # rather than raising KeyError.
            break
        out_words.append(word)
        if out_words[-1] == 'end_token' or len(out_words) >= 20:
            break
    print(de_input)
    reply = ' '.join(out_words)
    print(reply)
    return reply

In [None]:
def encode(sentence, num_words=20000):
    """Convert raw text into a padded encoder input.

    The text is preprocessed like the training data in ``tokenize``:
    lower-cased, filter characters replaced by spaces, split on whitespace.
    Words that are unknown or beyond the tokenizer's ``num_words`` cap are
    dropped, as the tokenizer does, instead of raising ``KeyError``.

    Uses the module-level ``input_dict`` and ``input_text``.

    Args:
        sentence: Input text.
        num_words: Ids greater than or equal to this are dropped; should match
            the ``num_words`` used in ``tokenize``.

    Returns:
        Integer array of shape ``(1, max_sequence_length(input_text))``.
    """
    # Same filter set as the Tokenizer in tokenize().
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^{|}`~\t\n'
    cleaned = sentence.lower().translate(str.maketrans(filters, ' ' * len(filters)))
    token_ids = []
    for word in cleaned.split():
        token_id = input_dict.get(word)
        if token_id is not None and token_id < num_words:
            token_ids.append(token_id)
    return pad_sequences([token_ids], maxlen=max_sequence_length(input_text), padding='post')