In [52]:
import numpy as np
from keras_transformer import get_model, decode

In [68]:
with open('../Dataset/English_Hindi_Hinglish.txt', mode = 'r') as f:
    data = f.readlines()

data = data[0:195] # 195Because we have that many labeled data points for Hinglish to English translation.

source_tokens = [i.split(',')[1].strip().split(' ') for i in data]
target_tokens = [i.split('	')[0].strip().split(' ') for i in data]

In [69]:
source_tokens

[['wah!'],
 ['bachao!'],
 ['ucchlo.'],
 ['kudo.'],
 ['chhalang.'],
 ['namaste!'],
 ['namaskar!'],
 ['wah-wah!'],
 ['cheers!'],
 ['samajhe', 'ki', 'nhi?'],
 ['Mai', 'theek', 'hu!'],
 ['bahut', 'badiya!'],
 ['andar', 'aa', 'jao!'],
 ['bahar', 'nikal', 'jao!'],
 ['chale', 'jao!'],
 ['khuda', 'hafiz!'],
 ['uttam!'],
 ['sahi!'],
 ['apka', 'swagat', 'hai.'],
 ['swagatam.'],
 ['maze', 'karna.'],
 ['mauj', 'karna.'],
 ['maze', 'karo.'],
 ['mai', 'bhool', 'gaya.'],
 ['mai', 'bool', 'gayi.'],
 ['mai', 'paise', 'dunga.'],
 ['mai', 'theek', 'hu.'],
 ['mera', 'pet', 'bhar', 'gaya', 'hai.'],
 ['chale', 'jao.'],
 ['mujhe', 'jawab', 'do.'],
 ['pancchi', 'udte', 'hain.'],
 ['maaf', 'kijiye.'],
 ['bahut', 'khoob.'],
 ['mai', 'behosh', 'ho', 'gaya.'],
 ['लेकिन', 'वैसा', 'ही', 'है।'],
 ['mai', 'hansa.'],
 ['mai', 'bore', 'ho', 'rha', 'hu.'],
 ['mera', 'diwaliya', 'ho', 'chuka', 'hai.'],
 ['mai', 'thak', 'gaya', 'hu.'],
 ['thand', 'ho', 'rhi', 'hai.'],
 ['kaun', 'jane?'],
 ['kisko', 'pata', 'hai?'],
 ['kis

In [70]:
# Generate dictionaries
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Padding
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

# Build & fit model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Encoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Encoder-Token-Embedding (Embed  [(None, None, 32),  10528       ['Encoder-Input[0][0]']          
 dingRet)                        (329, 32)]                                                       
                                                                                                  
 Encoder-Embedding (TrigPosEmbe  (None, None, 32)    0           ['Encoder-Token-Embedding[0][0]']
 dding)                                                                                           
                                                                                            

In [71]:
%%time
model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 51min 23s, sys: 3min 52s, total: 55min 16s
Wall time: 25min 39s


<keras.callbacks.History at 0x7f71bd2c5400>

# Validation

In [72]:
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))

Wow!
Help!
Jump.
Jump.
Jump.
Hello!
Hello!
Cheers!
Cheers!
Got it?
I'm OK.
Awesome!
Come in.
Get out!
Go away!
Goodbye!
Perfect!
Perfect!
Welcome.
Welcome.
Have fun.
Have fun.
Have fun.
I forgot.
I forgot.
I'll pay.
I'm fine.
I'm full.
Let's go!
Answer me.
Birds fly.
Excuse me.
Fantastic!
I fainted.
I fear so.
I laughed.
I'm bored.
I'm broke.
I am tired.
It's cold.
Who knows?
Who knows?
Who knows?
Who knows?
Wonderful!
Birds sing.
Come on in.
Definitely!
Don't move.
Fire burns.
Follow him.
I am tired.
I can swim.
I can swim.
I love you.
I love you.
I love you.
I love you.
I love you.
I will try.
I'm coming.
I'm hungry!
Let him in.
Let him in.
Let me out!
Once again.
Please sit.
What's new?
What's new?
Who's that?
Don't shout.
Don't shout.
He stood up.
He's strong.
How are you?
How are you?
How are you?
How are you?
How are you?
How are you?
How are you?
How are you?
I like both.
I like cake.
I like dogs.
I like math.
I'll attend.
Nobody came.
Was I wrong?
What is this?
Are you sick?
Br

# Testing

In [73]:
test_sents = [
    'kaise ho?',
    'kya tum mujhse pyar karte ho?',
    'kya tum mujhe pyar karte ho?'
]

In [74]:
test_tokens = [i.split() for i in test_sents]

In [76]:
test_token_dict = build_token_dict(test_tokens)

In [78]:
test_token_dict_inv = {v: k for k, v in test_token_dict.items()}

In [80]:
test_enc_tokens = [['<START>'] + tokens + ['<END>'] for tokens in test_tokens]
test_enc_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in test_enc_tokens]
test_input = [list(map(lambda x: test_token_dict[x], tokens)) for tokens in test_enc_tokens]

In [81]:
decoded = decode(
    model,
    test_input,
    start_token=test_token_dict['<START>'],
    end_token=test_token_dict['<END>'],
    pad_token=test_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))

Wow!
I can't you?
I can't now.
