In [12]:
import numpy as np
import pandas as pd
from keras_transformer import get_model, decode

In [13]:
with open('../Dataset/English_Hindi_Hinglish.txt', mode = 'r') as f:
    data = f.readlines()

data = data[0:195] # 195Because we have that many labeled data points for Hinglish to English translation.

source_tokens0 = [i.split(',')[1].strip().split(' ') for i in data]
target_tokens0 = [i.split('	')[0].strip().split(' ') for i in data]

In [14]:
df1 = pd.read_csv('../Dataset/Moni.csv')
df1.dropna(inplace = True)
source_tokens1 = [i.split(' ') for i in df1['Hinglish'].values.tolist()]
target_tokens1 = [i.split(' ') for i in df1['English'].values.tolist()]

In [15]:
df2 = pd.read_csv('../Dataset/Ashish.csv')
df2.dropna(inplace = True)
source_tokens2 = [i.split(' ') for i in df2['Hinglish'].values.tolist()]
target_tokens2 = [i.split(' ') for i in df2['English'].values.tolist()]

In [16]:
source_tokens = source_tokens0 + source_tokens1 + source_tokens2
target_tokens = target_tokens0 + target_tokens1 + target_tokens2

In [17]:
# Generate dictionaries
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# Add special tokens
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# Padding
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

# Build & fit model
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # Use different embeddings for different languages
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Encoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Encoder-Token-Embedding (Embed  [(None, None, 32),  21408       ['Encoder-Input[0][0]']          
 dingRet)                        (669, 32)]                                                       
                                                                                                  
 Encoder-Embedding (TrigPosEmbe  (None, None, 32)    0           ['Encoder-Token-Embedding[0][0]']
 dding)                                                                                           
                                                                                            

In [18]:
%%time
model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 50min 9s, sys: 3min 38s, total: 53min 47s
Wall time: 22min 16s


<keras.callbacks.History at 0x7f9bcafc5090>

# Validation

In [19]:
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
for i in decoded:
    pass
    #print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))

Wow!
Help!
Jump.
Jump.
Jump.
Hello!
Hello!
Cheers!
Cheers!
Got it?
I'm OK.
Awesome!
Come in.
Get out!
Go away!
Goodbye!
Perfect!
Perfect!
Welcome.
Welcome.
Have fun.
Have fun.
Have fun.
I forgot.
I forgot.
I'll pay.
I'm fine.
I'm full.
Let's go!
Answer me.
Birds fly.
Excuse me.
Fantastic!
I fainted.
I fear so.
I laughed.
I'm bored.
I'm broke.
I'm tired.
It's cold.
Who knows?
Who knows?
Who knows?
Who knows?
Wonderful!
Birds sing.
Come on in.
Definitely!
Don't move.
Fire burns.
Follow him.
I'm tired.
I can swim.
I can swim.
I love you.
I love you.
I love you.
I love you.
I love you.
I will try.
I'm coming.
I'm hungry!
Let him in.
Let him in.
Let me out!
Once again.
Please sit.
What's new?
What's new?
Who's that?
Don't shout.
Don't shout.
He stood up.
He's strong.
How are you?
How are you?
How are you?
How are you?
How are you?
How are you?
How are you?
How are you?
I like both.
I like cake.
I like dogs.
I like math.
I'll attend.
Nobody came.
Was I wrong?
What is this?
Are you sick?
Brin

# Testing

In [20]:
test_sents = [
    'kaise ho?',
    'kya tum mujhse pyar karte ho?',
    'kya tum mujhe pyar karte ho?'
]

In [21]:
test_tokens = [i.split() for i in test_sents]

In [22]:
test_token_dict = build_token_dict(test_tokens)

In [23]:
test_token_dict_inv = {v: k for k, v in test_token_dict.items()}

In [24]:
test_enc_tokens = [['<START>'] + tokens + ['<END>'] for tokens in test_tokens]
test_enc_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in test_enc_tokens]
test_input = [list(map(lambda x: test_token_dict[x], tokens)) for tokens in test_enc_tokens]

In [25]:
decoded = decode(
    model,
    test_input,
    start_token=test_token_dict['<START>'],
    end_token=test_token_dict['<END>'],
    pad_token=test_token_dict['<PAD>'],
)
for i in decoded:
    print(' '.join(map(lambda x: target_token_dict_inv[x], i[1:-1])))

Wow!
I believe you?
I'm really tired.
