In [82]:
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np

In [81]:
batchSize = 64 #Batch Size for Training
epochs = 100 #Number of Epochs to Train for
latentDim = 256 #Latent dimensionality of the encoding space
numSamples = 10000 #Number of samples to train on
dataPath = '/content/hin.txt' #Path to the data txt file

In [80]:
inputText = []
targetText = []
inputChars = set()
targetChars = set()

with open(dataPath, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[: min(numSamples, len(lines) - 1)]:
    parts = line.split('\t')
    if len(parts) >= 2:
        inputTxt = parts[0].strip()
        targetTxt = parts[1].strip()

        inputText.append(inputTxt)
        targetText.append('\t' + targetTxt + '\n')

        inputChars.update(list(inputTxt))
        targetChars.update(list('\t' + targetTxt + '\n'))

# Ensure special tokens are present
targetChars.add('\t')
targetChars.add('\n')

In [83]:
inputText

['Wow!',
 'Duck!',
 'Duck!',
 'Help!',
 'Jump.',
 'Jump.',
 'Jump.',
 'Hello!',
 'Hello!',
 'Cheers!',
 'Cheers!',
 'Exhale.',
 'Exhale.',
 'Got it?',
 "I'm OK.",
 'Inhale.',
 'Inhale.',
 'Thanks!',
 'We won.',
 'Awesome!',
 'Come in.',
 'Get out!',
 'Go away!',
 'Goodbye!',
 'Perfect!',
 'Perfect!',
 'We lost.',
 'Welcome.',
 'Welcome.',
 'Have fun.',
 'Have fun.',
 'Have fun.',
 'I forgot.',
 'I forgot.',
 "I'll pay.",
 "I'm fine.",
 "I'm full.",
 "Let's go!",
 'Pick Tom.',
 'Answer me.',
 'Birds fly.',
 'Excuse me.',
 'Fantastic!',
 'I fainted.',
 'I fear so.',
 'I laughed.',
 "I'm alone.",
 "I'm alone.",
 "I'm bored.",
 "I'm broke.",
 "I'm tired.",
 "It's cold.",
 'Well done!',
 'Who knows?',
 'Who knows?',
 'Who knows?',
 'Who knows?',
 'Wonderful!',
 'Birds sing.',
 'Come on in.',
 'Definitely!',
 "Don't move.",
 'Fire burns.',
 'Follow him.',
 'I can swim.',
 'I can swim.',
 'I love you.',
 'I love you.',
 'I love you.',
 'I love you.',
 'I love you.',
 "I'm coming.",
 "I'm hung

In [84]:
targetText

['\tवाह!\n',
 '\tझुको!\n',
 '\tबतख़!\n',
 '\tबचाओ!\n',
 '\tउछलो.\n',
 '\tकूदो.\n',
 '\tछलांग.\n',
 '\tनमस्ते।\n',
 '\tनमस्कार।\n',
 '\tवाह-वाह!\n',
 '\tचियर्स!\n',
 '\tसांस छोड़।\n',
 '\tसांस छोड़ो।\n',
 '\tसमझे कि नहीं?\n',
 '\tमैं ठीक हूँ।\n',
 '\tसांस ले।\n',
 '\tसांस लो।\n',
 '\tधन्यवाद!\n',
 '\tहम जीते।\n',
 '\tबहुत बढ़िया!\n',
 '\tअंदर आ जाओ।\n',
 '\tबाहर निकल जाओ!\n',
 '\tचले जाओ!\n',
 '\tख़ुदा हाफ़िज़।\n',
 '\tउत्तम!\n',
 '\tसही!\n',
 '\tहम हार गए।\n',
 '\tआपका स्वागत है।\n',
 '\tस्वागतम्।\n',
 '\tमज़े करना।\n',
 '\tमौज करना।\n',
 '\tमज़े करो।\n',
 '\tमैं भूल गया।\n',
 '\tमैं भूल गई।\n',
 '\tमैं पैसे दूंगा।\n',
 '\tमैं ठीक हूँ।\n',
 '\tमेरा पेट भर गया है।\n',
 '\tचलो चलें!\n',
 '\tटॉम उठाओ।\n',
 '\tमुझे जवाब दो।\n',
 '\tपंछी उड़ते हैं।\n',
 '\tमाफ़ कीजिए।\n',
 '\tबहुत ख़ूब!\n',
 '\tमैं बेहोश हो गया।\n',
 '\tखेद की बात है, लेकिन वैसा ही है।\n',
 '\tमैं हँसा।\n',
 '\tमैं अकेला हूँ ।\n',
 '\tमैं अकेली हूँ ।\n',
 '\tमैं बोर हो रहा हूँ।\n',
 '\tमेरा दीवालिया हो चुका है।\n',
 '\tमैं 

In [85]:
inputChars = sorted(list(inputChars))
targetChars = sorted(list(targetChars))
numEncoderTokens = len(inputChars)
numDecoderTokens = len(targetChars)
maxEncoderSequenceLen = max([len(txt) for txt in inputText])
maxDecoderSequenceLen = max([len(txt) for txt in targetText])


In [86]:
print("No of Samples:", len(inputText))
print("No of unique input tokens:", numEncoderTokens)  # Unique Characters in English dataset
print("No of unique output tokens:", numDecoderTokens) # Unique Characters in Hindi Dataset
print("Max Sequence Length for Inputs:", maxEncoderSequenceLen) # Max length of input english Sentence
print("Max Sequence Length for outputs:", maxDecoderSequenceLen) # Max Length of output hindi sentences

No of Samples: 3116
No of unique input tokens: 70
No of unique output tokens: 93
Max Sequence Length for Inputs: 107
Max Sequence Length for outputs: 123


In [87]:
# Assiging Token(indexes) to each unique Characters Both in ip and op
ipTokenIdx = dict([(char, i) for i, char in enumerate(inputChars)])
targetTokenIdx = dict([(char, i) for i, char in enumerate(targetChars)])

In [88]:
ipTokenIdx, targetTokenIdx

({' ': 0,
  '!': 1,
  '"': 2,
  '$': 3,
  "'": 4,
  ',': 5,
  '-': 6,
  '.': 7,
  '0': 8,
  '1': 9,
  '2': 10,
  '3': 11,
  '4': 12,
  '5': 13,
  '6': 14,
  '7': 15,
  '8': 16,
  '9': 17,
  ':': 18,
  '?': 19,
  'A': 20,
  'B': 21,
  'C': 22,
  'D': 23,
  'E': 24,
  'F': 25,
  'G': 26,
  'H': 27,
  'I': 28,
  'J': 29,
  'K': 30,
  'L': 31,
  'M': 32,
  'N': 33,
  'O': 34,
  'P': 35,
  'R': 36,
  'S': 37,
  'T': 38,
  'U': 39,
  'V': 40,
  'W': 41,
  'Y': 42,
  'a': 43,
  'b': 44,
  'c': 45,
  'd': 46,
  'e': 47,
  'f': 48,
  'g': 49,
  'h': 50,
  'i': 51,
  'j': 52,
  'k': 53,
  'l': 54,
  'm': 55,
  'n': 56,
  'o': 57,
  'p': 58,
  'q': 59,
  'r': 60,
  's': 61,
  't': 62,
  'u': 63,
  'v': 64,
  'w': 65,
  'x': 66,
  'y': 67,
  'z': 68,
  '€': 69},
 {'\t': 0,
  '\n': 1,
  ' ': 2,
  '!': 3,
  '"': 4,
  '(': 5,
  ')': 6,
  ',': 7,
  '-': 8,
  '.': 9,
  '0': 10,
  '1': 11,
  '7': 12,
  '9': 13,
  ':': 14,
  '?': 15,
  'A': 16,
  'B': 17,
  'I': 18,
  '|': 19,
  'ँ': 20,
  'ं': 21,
  'ः'

In [89]:
# One Hot Representation(vectorization) Using Numpy
encoderIpData = np.zeros((len(inputText), maxEncoderSequenceLen, numEncoderTokens), dtype='float32')
decoderIpData = np.zeros((len(inputText), maxDecoderSequenceLen, numDecoderTokens), dtype='float32')
decoderTargetData = np.zeros((len(inputText), maxDecoderSequenceLen, numDecoderTokens), dtype='float32')

In [90]:
encoderIpData, decoderIpData, decoderTargetData

(array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],


In [91]:
# One Hot Representation
for i, (inputText, targetText) in enumerate(zip(inputText, targetText)):
    for t, char in enumerate(inputText):
        encoderIpData[i, t, ipTokenIdx[char]] = 1.0
    encoderIpData[i, t + 1:, ipTokenIdx[' ']] = 1.0

    for t, char in enumerate(targetText):
        decoderIpData[i, t, targetTokenIdx[char]] = 1.0
        if t > 0:
            decoderTargetData[i, t - 1, targetTokenIdx[char]] = 1.0
    decoderIpData[i, t + 1:, targetTokenIdx[' ']] = 1.0
    decoderTargetData[i, t:, targetTokenIdx[' ']] = 1.0

In [92]:
encoderIpData[0], decoderIpData[0], encoderIpData[1], decoderIpData[1], encoderIpData[2], decoderIpData[2], encoderIpData[3], decoderIpData[3],

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.]], dtype=float32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],


In [93]:
# Defining Input Sequence and processing it
encoderInputs = Input(shape=(None, numEncoderTokens))
encoder = LSTM(latentDim, return_state=True)
# In Sequence2Sequence We dont need Op of Encoders So we hide that we only need context vector from encoders and this context vector is given as an ip to decoders when EOS is reached
encoderOutputs, stateHidden, stateCell = encoder(encoderInputs)
encoderStates = [stateHidden, stateCell]

In [94]:
# Setting up the decoder , using encoder states as initial state
decoderInputs = Input(shape=(None, numDecoderTokens))
# Setting Up the decoder to return full output sequences,
# and to return internal states as well, we dont use the return states in the
# training model , but will use them in inference
decoder = LSTM(latentDim, return_sequences=True, return_state=True)
decoderOutputs, _, _ = decoder(decoderInputs, initial_state=encoderStates)
decoderDense = Dense(numDecoderTokens, activation='softmax')
decoderOutputs = decoderDense(decoderOutputs)

In [95]:
# Defining the model
# EncoderInputData and DecoderInputData into DecoderOutputData
model = Model([encoderInputs, decoderInputs], decoderOutputs)

# Training the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit([encoderIpData, decoderIpData], decoderTargetData, batch_size=batchSize, epochs=epochs, validation_split=0.2)

Epoch 1/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - accuracy: 0.8011 - loss: 1.8026 - val_accuracy: 0.6843 - val_loss: 1.7656
Epoch 2/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.8075 - loss: 1.0340 - val_accuracy: 0.6867 - val_loss: 1.4711
Epoch 3/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8055 - loss: 0.9338 - val_accuracy: 0.6867 - val_loss: 1.3882
Epoch 4/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.8086 - loss: 0.8967 - val_accuracy: 0.6867 - val_loss: 1.4474
Epoch 5/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.8098 - loss: 0.8131 - val_accuracy: 0.6867 - val_loss: 1.3795
Epoch 6/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.8109 - loss: 0.7959 - val_accuracy: 0.6867 - val_loss: 1.3383
Epoch 7/100
[1m39/39[0m [

<keras.src.callbacks.history.History at 0x7aaf7993cb90>

In [96]:
encoderModel = Model(inputs=encoderInputs, outputs=encoderStates)

decoderStateInput_H = Input(shape=(latentDim,))
decoderStateInput_C = Input(shape=(latentDim,))
decoderStatesInputs = [decoderStateInput_H, decoderStateInput_C]

decoderOutputs, state_h, state_c = decoder(decoderInputs, initial_state=decoderStatesInputs)
decoderOutputs = decoderDense(decoderOutputs)
decoderStates = [state_h, state_c]

decoderModel = Model([decoderInputs] + decoderStatesInputs, [decoderOutputs] + decoderStates)

reverseIpCharIndex = dict((i, char) for char, i in ipTokenIdx.items())
reverseTargetCharIndex = dict((i, char) for char, i in targetTokenIdx.items())

def decodeSequence(inputSeq):
    stateValue = encoderModel.predict(inputSeq)
    targetSeq = np.zeros((1, 1, numDecoderTokens))
    targetSeq[0, 0, targetTokenIdx['\t']] = 1.0
    decodedSentence = ''
    stopCondition = False
    while not stopCondition:
        outputTokens, h, c = decoderModel.predict([targetSeq] + stateValue)
        sampledTokenIdx = np.argmax(outputTokens[0, -1, :])
        sampledChar = reverseTargetCharIndex[sampledTokenIdx]
        decodedSentence += sampledChar

        if sampledChar == '\n' or len(decodedSentence) > maxDecoderSequenceLen:
            stopCondition = True

        targetSeq = np.zeros((1, 1, numDecoderTokens))
        targetSeq[0, 0, sampledTokenIdx] = 1.0

        stateValue = [h, c]
    return decodedSentence

In [None]:
for seqIdx in range(10):  # Reduced for quick testing
    inputSeq = encoderIpData[seqIdx: seqIdx + 1]
    decodedSentence = decodeSequence(inputSeq)
    print('-')
    print('Input Sentence:', inputText[seqIdx])
    print('Decoded Sentence:', decodedSentence)