In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model, Sequential


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Any results you write to the current directory are saved as output.

In [4]:
lines=pd.read_csv("./data/spa.txt", delimiter="\t", encoding='utf-8', 
                  names = ['english','spanish'], header=None)

In [5]:
lines

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
141538,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
141539,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
141540,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."
141541,It may be impossible to get a completely error...,Puede que sea imposible obtener un corpus comp...


In [6]:
pd.isnull(lines).sum()

english    0
spanish    0
dtype: int64

In [7]:
lines=lines[~pd.isnull(lines['english'])]

In [8]:
lines.drop_duplicates(inplace=True)

In [9]:
lines=lines.sample(n=25000,random_state=42)
lines.shape

(25000, 2)

* ### Let us pick any 25000 rows from the dataset.

In [10]:
lines.shape

(25000, 2)

In [11]:
# Lowercase all characters
lines['english']=lines['english'].apply(lambda x: x.lower())
lines['spanish']=lines['spanish'].apply(lambda x: x.lower())

In [12]:
# Remove quotes
lines['english']=lines['english'].apply(lambda x: re.sub("'", '', x))
lines['spanish']=lines['spanish'].apply(lambda x: re.sub("'", '', x))

In [13]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english']=lines['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['spanish']=lines['spanish'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [14]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english']=lines['english'].apply(lambda x: x.translate(remove_digits))
lines['spanish']=lines['spanish'].apply(lambda x: x.translate(remove_digits))

# Remove extra spaces
lines['english']=lines['english'].apply(lambda x: x.strip())
lines['spanish']=lines['spanish'].apply(lambda x: x.strip())
lines['english']=lines['english'].apply(lambda x: re.sub(" +", " ", x))
lines['spanish']=lines['spanish'].apply(lambda x: re.sub(" +", " ", x))


In [15]:
# Add start and end tokens to target sequences
lines['spanish'] = lines['spanish'].apply(lambda x : 'START_ '+ x + ' _END')

In [16]:
lines.head()

Unnamed: 0,english,spanish
138916,do you remember the day when we met each other...,START_ ¿recuerdas el día en que nos encontramo...
80065,it would be good if you ate it,START_ sería bueno que lo comierais _END
132496,tom doesnt like women who wear way too much ma...,START_ a tom no le gustan las mujeres que usan...
65965,our water heater is leaking,START_ nuestro calentador pierde agua _END
38472,dont be irresponsible,START_ no seas irresponsable _END


In [17]:
### Get English and Hindi Vocabulary
all_eng_words=set()
for eng in lines['english']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_spanish_words=set()
for hin in lines['spanish']:
    for word in hin.split():
        if word not in all_spanish_words:
            all_spanish_words.add(word)

In [18]:
len(all_eng_words)

7456

In [19]:
len(all_spanish_words)

12968

In [20]:
lines['length_eng_sentence']=lines['english'].apply(lambda x:len(x.split(" ")))
lines['length_spa_sentence']=lines['spanish'].apply(lambda x:len(x.split(" ")))

In [21]:
lines.head()

Unnamed: 0,english,spanish,length_eng_sentence,length_spa_sentence
138916,do you remember the day when we met each other...,START_ ¿recuerdas el día en que nos encontramo...,14,12
80065,it would be good if you ate it,START_ sería bueno que lo comierais _END,8,7
132496,tom doesnt like women who wear way too much ma...,START_ a tom no le gustan las mujeres que usan...,10,13
65965,our water heater is leaking,START_ nuestro calentador pierde agua _END,5,6
38472,dont be irresponsible,START_ no seas irresponsable _END,3,5


In [22]:
lines[lines['length_eng_sentence']>30].shape

(4, 4)

In [23]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_spa_sentence']<=20]

In [24]:
lines.shape

(24936, 4)

In [25]:
print("maximum length of Hindi Sentence ",max(lines['length_spa_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [26]:
max_length_src=max(lines['length_spa_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [27]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_spanish_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_spanish_words)
num_encoder_tokens, num_decoder_tokens

(7456, 12968)

In [28]:
num_decoder_tokens += 1 #for zero padding


In [29]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [30]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [31]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,english,spanish,length_eng_sentence,length_spa_sentence
33213,youre being watched,START_ usted está siendo observado _END,3,6
16604,i always screw up,START_ siempre fracaso _END,4,4
2820,thats over,START_ se acabó _END,2,4
68449,are you going to buy the car,START_ ¿vas a comprar el coche _END,7,7
101249,their plan sounds interesting to me,START_ su plan me parece interesante _END,6,7
98335,tom made some mistakes on the test,START_ tom cometió algunos errores en la prueb...,7,9
29787,i intend to go there,START_ pienso ir allí _END,5,5
73911,he often comes late to school,START_ a menudo llega tarde al colegio _END,6,8
122631,how many eggs were you able to get yesterday,START_ ¿cuántos huevos pudiste conseguir ayer ...,9,7
5828,im tired now,START_ ahora estoy cansado _END,3,5


### Split the data into train and test

In [32]:
X, y = lines['english'], lines['spanish']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((19948,), (4988,))

### Let us save this data

In [33]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')


In [34]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            return [encoder_input_data, decoder_input_data],decoder_target_data

In [35]:
x_tr, y_tr = generate_batch(X_train, y_train, batch_size=len(X_train))

In [36]:
x_ts, y_ts = generate_batch(X_test, y_test, batch_size=len(X_test))


### Encoder-Decoder Architecture

In [37]:
latent_dim=300

In [38]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [39]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [40]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [41]:
model.summary()

In [42]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [43]:
X_train

19447                          wood burns easily
8676                              this is a desk
84314             i wont go down without a fight
110334    the exercises are simple and effective
13505                           i painted my car
                           ...                  
15258                           wait for me here
14515                           swimming is easy
20153                         go down the stairs
103831       jealous people die but not jealousy
68720               do you want to work with tom
Name: english, Length: 19948, dtype: object

In [None]:
"""model.fit(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)"""
model.fit(x=x_tr, y=y_tr,
                    steps_per_epoch = train_samples,
                    epochs=epochs)


In [41]:
model.save_weights('nmt_weights.h5')

In [42]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [43]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [44]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [45]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: in order to understand whether this is true
Actual Hindi Translation:  यह समझने के लिए कि क्या यह सच है 
Predicted Hindi Translation:  यह समझने के लिए कि क्या यह सच है 


In [46]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: to why india is today growing
Actual Hindi Translation:  कि भारत आज आगे बढ़ रहा है 
Predicted Hindi Translation:  कि भारत आज आगे बढ़ रहा है 


In [47]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: then theyll live years longer”
Actual Hindi Translation:  तो वे साल अधिक जियेंगे” 
Predicted Hindi Translation:  तो वे साल अधिक जियेंगे” 


In [48]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: and start thinking about the long long line from b to e
Actual Hindi Translation:  और सोचना होता है उस लम्बी लम्बी रेखा के बारे में जो बी से ई तक जाती है 
Predicted Hindi Translation:  और सोचना होता है और पता चला रहा है 


In [49]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: well have a model for the rest of language
Actual Hindi Translation:  हमारे पास बाकि भषा के लिए मापद्न्ड होगा 
Predicted Hindi Translation:  तो हमारे पास अच्छी माँ का उपयोग हो सकता है 


In [1]:
import numpy as np

def sample_word(probs):
  """Samples a word from a probability distribution.

  Args:
    probs: A numpy array of probabilities for each word in the vocabulary.

  Returns:
    The index of the sampled word.
  """
  # Sample a number from a uniform distribution between 0 and 1.
  sample = np.random.rand()

  # Accumulate probabilities until the sampled value is exceeded.
  cumulative_prob = 0.0
  for i, prob in enumerate(probs):
    cumulative_prob += prob
    if sample <= cumulative_prob:
      return i

  # If no word is selected due to rounding errors, return the last word.
  return len(probs) - 1

# Example usage:
probs = np.array([0.2, 0.3, 0.4, 0.1])
sampled_word_index = sample_word(probs)
print(f"Sampled word index: {sampled_word_index}")


Sampled word index: 0
