In [3]:
import tensorflow as tf
import dill
import sys
import pandas as pd
import time
from sklearn.model_selection import train_test_split
sys.path.append("..")

from utils.helper import *

In [4]:
with open('../Models/EN_Tokenizer.pkl', 'rb') as file:
    EN = dill.load(file)

with open('../Models/AR_Tokenizer.pkl', 'rb') as file:
    AR = dill.load(file)

In [4]:
df = pd.read_csv("../Data/data.csv")
df.head()

Unnamed: 0,English_seq_pad,Arabic_seq_pad
0,60_10918_10298_2414_5788_8970_293_4_59_0_0_0_0...,39_35901_25960_1429_14269_41831_43883_1718_5_3...
1,60_19045_16539_22846_24304_4_59_0_0_0_0_0_0_0_...,39_97_43052_2465_6731_38_0_0_0_0_0_0_0_0_0_0_0...
2,60_23099_22846_17059_16132_21481_572_14041_112...,39_13333_28503_23295_3160_52263_9389_7284_5294...
3,60_7050_22917_14981_22846_14724_16132_10344_58...,39_49647_29038_48766_8307_15199_44177_8206_135...
4,60_20709_17685_22917_24379_23075_20709_16832_3...,39_43363_44177_9463_19915_30423_28218_24833_81...


In [17]:
df["English_seq_pad"] = df["English_seq_pad"].apply(lambda x : np.array(x.split("_")).astype(int))
df["Arabic_seq_pad"] = df["Arabic_seq_pad"].apply(lambda x : np.array(x.split("_")).astype(int))

In [18]:
X_train, X_val, y_train, y_val = train_test_split(df["English_seq_pad"].values, df["Arabic_seq_pad"].values, test_size=0.2)

In [None]:
EPOCHS = 10
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

global_step = tf.Variable(0, trainable=False)

encoder = Encoder(EN.dictlength, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(AR.dictlength, embedding_dim, units, BATCH_SIZE)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
dataset = tf.data.Dataset.from_tensor_slices((X_train.tolist(), y_train.tolist())).shuffle(len(X_train)).batch(BATCH_SIZE, drop_remainder=True)

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            dec_hidden = enc_hidden
            
            dec_input = tf.expand_dims([AR.word2idx['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        total_loss += (loss / int(targ.shape[1]))
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
      
        optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, loss.numpy() / int(targ.shape[1])))
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss/df.shape[0]))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
encoder.save_weights('../Models/encoder_en2ar.h5')
decoder.save_weights('../Models/decoder_en2ar.h5')

In [None]:
with open('../Models/encoder_en2ar.json', 'w') as file:
    file.write(encoder.to_json())

with open('../Models/decoder_en2ar.json', 'w') as file:
    file.write(decoder.to_json())

In [None]:
X_train, X_val, y_train, y_val = train_test_split(df["Arabic_seq_pad"].values,df["English_seq_pad"].values, test_size=0.2)

In [None]:
EPOCHS = 10
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

global_step = tf.Variable(0, trainable=False)

encoder = Encoder(AR.dictlength, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(EN.dictlength, embedding_dim, units, BATCH_SIZE)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
dataset = tf.data.Dataset.from_tensor_slices((X_train.tolist(), y_train.tolist())).shuffle(len(X_train)).batch(BATCH_SIZE, drop_remainder=True)

for epoch in range(EPOCHS):
    start = time.time()
    
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            dec_hidden = enc_hidden
            
            dec_input = tf.expand_dims([EN.word2idx['<start>']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        total_loss += (loss / int(targ.shape[1]))
        
        variables = encoder.variables + decoder.variables
        
        gradients = tape.gradient(loss, variables)
      
        optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, loss.numpy() / int(targ.shape[1])))
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss/df.shape[0]))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
encoder.save_weights('../Models/encoder_ar2en.h5')
decoder.save_weights('../Models/decoder_ar2en.h5')

In [None]:
with open('../Models/encoder_ar2en.json', 'w') as file:
    file.write(encoder.to_json())

with open('../Models/decoder_ar2en.json', 'w') as file:
    file.write(decoder.to_json())

In [None]:
def predict(sentence, encoder, decoder ,input_tok , output_tok):
    
    sentence = preprocess_sentence(sentence,"en")

    inputs = input_tok.texts_to_sequences([sentence])[0]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=input_tok.maxlen, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([output_tok.word2idx['<start>']], 0)

    for t in range(output_tok.maxlen):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        predicted_id = tf.random.categorical(tf.exp(predictions), num_samples=1)[0][0].numpy()

        result += output_tok.idx2word[predicted_id] + ' '

        if output_tok.idx2word[predicted_id] == '<end>':
            return result
        
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [None]:
result = predict("hello world!", encoder, decoder, EN, AR)[0]