In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('data/preprocessed_data.csv')
data.head()

Unnamed: 0,Hindi,Telugu
0,कुछ वर्षों पहले मुझे ऐसा लगा जैसे मैं किसी उदा...,"కొన్ని సంవత్సరాల ముందు, నేను బాగా ఆచరానములో ఉన..."
1,यह विचार बहुत ही सरल है,ఈ ఆలోచన చాలా సులభమైనది.
2,ऐसे किसी चीज के बारे में सोचिये जिसे आप हमेशा ...,మీ జీవితములో మీరు చేయాలి అనుకునే పనిని ఆలోచించ...
3,"यह पता चला है कि 30 दिन पर्याप्त समय है, कोई न...",ఫలితము దక్కుతుంది. 30 రోజులనేది మీ జీవితములో ఒ...
4,इस 30 दिन की चुनौतियों के दौरान मैंने कुछ बाते...,ఈ 30 రోజులు పాటించే విధానములో నేను కొన్ని విషయ...


In [3]:
data.count()

Hindi     2648
Telugu    2648
dtype: int64

In [4]:
data = data.head(1000)

In [5]:
data.count()

Hindi     1000
Telugu    1000
dtype: int64

In [6]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

In [7]:
train_data.count(), test_data.count()

(Hindi     900
 Telugu    900
 dtype: int64,
 Hindi     100
 Telugu    100
 dtype: int64)

In [8]:
tokenizer_hindi = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_telugu = Tokenizer(filters='', oov_token='<OOV>')

In [9]:
tokenizer_telugu.fit_on_texts(train_data['Telugu'])
tokenizer_hindi.fit_on_texts(train_data['Hindi'])

In [11]:
print(tokenizer_hindi.word_index)

{'<OOV>': 1, 'के': 2, 'और': 3, 'में': 4, 'है': 5, 'से': 6, 'की': 7, 'एक': 8, 'का': 9, 'को': 10, 'मैं': 11, 'कि': 12, 'यह': 13, 'है।': 14, 'नहीं': 15, 'कर': 16, 'हैं': 17, 'हम': 18, 'पर': 19, 'भी': 20, 'लिए': 21, 'जो': 22, 'तो': 23, 'इस': 24, 'मुझे': 25, '।': 26, 'ये': 27, 'आप': 28, 'करने': 29, 'बहुत': 30, 'ही': 31, 'था': 32, 'हो': 33, 'सकते': 34, 'हैं।': 35, 'हैं,': 36, 'है,': 37, 'कुछ': 38, 'जब': 39, 'अपने': 40, 'मैंने': 41, 'वे': 42, 'किया': 43, 'साथ': 44, 'रही': 45, 'काम': 46, 'हमारे': 47, 'मेरे': 48, 'ने': 49, 'या': 50, 'पास': 51, 'उनके': 52, 'हमने': 53, 'वो': 54, 'लेकिन': 55, 'तक': 56, 'वह': 57, 'क्या': 58, 'थे': 59, 'अधिक': 60, 'करना': 61, 'मे': 62, 'बारे': 63, 'कोई': 64, 'हूँ': 65, 'आपको': 66, 'अपनी': 67, 'गया': 68, 'साल': 69, 'i': 70, 'में,': 71, 'अगर': 72, 'किसी': 73, 'रहे': 74, 'थी': 75, 'समय': 76, 'लोगों': 77, 'रहा': 78, 'सभी': 79, 'सबसे': 80, 'उसे': 81, 'होता': 82, 'मेरा': 83, 'रूप': 84, 'लोग': 85, 'घर': 86, 'जा': 87, 'सकता': 88, 'जीवन': 89, 'करते': 90, 'बात': 91, 'करता': 9

In [10]:
print(tokenizer_telugu.word_index)

{'<OOV>': 1, 'ఒక': 2, 'నేను': 3, 'ఈ': 4, 'మరియు': 5, 'ఇది': 6, 'చాలా': 7, 'మీరు': 8, 'నా': 9, 'మా': 10, 'ఆ': 11, 'లో': 12, 'అది': 13, 'కానీ': 14, 'వారి': 15, 'మేము': 16, 'ఇంకా': 17, 'గురించి': 18, 'కూడా': 19, 'నాకు': 20, 'మన': 21, 'మీ': 22, 'ఆమె': 23, 'వారు': 24, 'మనం': 25, 'అని': 26, 'లేదా': 27, ')': 28, 'నన్ను': 29, '(': 30, 'మీకు': 31, 'నుంచి': 32, 'గా': 33, 'ఇప్పుడు': 34, ',': 35, 'ఎలా': 36, 'ఎక్కువ': 37, 'మనము': 38, 'ఉన్న': 39, 'పెట్టుబడి': 40, 'కొన్ని': 41, 'సౌర': 42, 'పెద్ద': 43, 'కోసం': 44, 'ఎందుకంటే': 45, 'అలా': 46, '--': 47, 'కొత్త': 48, 'మంది': 49, 'అవి': 50, 'పని': 51, 'నుండి': 52, 'మేం': 53, 'అదే': 54, 'యొక్క': 55, 'వాళ్ళు': 56, 'కు': 57, 'మాకు': 58, 'బ్యాంక్': 59, 'మరింత': 60, 'ఉంది.': 61, 'తో': 62, 'ఇక్కడ': 63, 'ప్రింట్': 64, 'ఇతర': 65, 'ఇలా': 66, 'అంటే': 67, 'మనకు': 68, 'చిన్న': 69, 'కాబట్టి': 70, 'అల్జీమర్స్': 71, 'ప్రతి': 72, 'కాదు': 73, 'చేయడానికి': 74, 'మంచి': 75, 'అక్కడ': 76, 'కాని': 77, 'లోని': 78, 'ప్రజలు': 79, 'దాని': 80, 'ద్వారా': 81, 'కాదు.': 82, 'కేవలం': 83, 

In [12]:
text_to_tokenize = "यह विचार बहुत ही सरल है"
tokenized_text = tokenizer_hindi.texts_to_sequences([text_to_tokenize])
print(tokenized_text)

[[13, 322, 30, 31, 1434, 5]]


In [13]:
vocab_size_hindi = len(tokenizer_hindi.word_index) + 1
vocab_size_telugu = len(tokenizer_telugu.word_index) + 1
vocab_size_hindi, vocab_size_telugu

(3522, 5412)

In [14]:
train_input = tokenizer_hindi.texts_to_sequences(train_data['Hindi'])
train_output = tokenizer_telugu.texts_to_sequences(train_data['Telugu'])
test_input = tokenizer_hindi.texts_to_sequences(test_data['Hindi'])
test_output = tokenizer_telugu.texts_to_sequences(test_data['Telugu'])

In [15]:
print(train_input)

[[171, 1470, 536, 1471, 143, 152, 18, 683, 537, 230, 538], [539, 684, 82, 939, 368, 324], [93, 13, 369, 68, 12, 291, 42, 1472, 325, 940, 93, 1473, 17, 153, 31, 93, 172, 89, 941, 231], [13, 540, 9, 8, 84, 14], [24, 154, 1474, 4, 942, 541, 85, 138, 370, 20, 94, 16, 173, 256, 86, 4, 371, 372], [129, 542, 13, 5, 12, 685, 19, 106, 9, 373, 374, 37, 130, 1475, 7, 943, 50, 1476, 232, 944, 84, 6, 543, 14], [1477, 11, 155, 544, 686, 32, 233, 87, 16, 945, 68], [3, 107, 27, 946, 1478, 33, 174, 89, 2, 1479, 175, 62, 174], [326, 57, 1480, 2, 687, 1481, 2, 8, 234, 6, 1482, 230, 947, 235, 545, 948, 2, 375, 30, 688, 689, 26, 57, 8, 108, 1483, 19, 690, 156, 17, 155, 52, 51, 1484, 6, 20, 691, 5, 26], [438, 692, 546, 41, 86, 6, 184, 43, 14], [72, 64, 15, 185, 23, 20, 27, 85, 23, 949, 3, 18, 8, 950, 1485], [129, 327, 47, 51, 8, 1486, 5, 22, 205, 439, 117, 5, 236, 292, 186, 293, 157, 257, 231, 376, 7, 440, 4, 125, 294, 206, 205, 6, 693, 82, 5], [547, 2, 21, 377, 73, 951, 7, 952, 207, 953, 89, 1487, 378, 6, 

In [16]:
max_hindi_sequence_length = max(len(seq) for seq in train_input)
max_telugu_sequence_length = max(len(seq) for seq in train_output)

max_hindi_sequence_length,max_telugu_sequence_length

(90, 53)

In [17]:
max_seq_length = max(max_hindi_sequence_length, max_telugu_sequence_length)
max_seq_length

90

In [18]:
train_input = pad_sequences(train_input, maxlen=max_seq_length, padding='post')
train_output = pad_sequences(train_output, maxlen=max_seq_length, padding='post')
test_input = pad_sequences(test_input, maxlen=max_seq_length, padding='post')
test_output = pad_sequences(test_output, maxlen=max_seq_length, padding='post')

In [19]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention

# Define input and output sequences
encoder_input = Input(shape=(max_seq_length,))
decoder_input = Input(shape=(max_seq_length,))

# Embedding layer for encoder and decoder
embedding_dim = 256  # Adjust as needed
encoder_embedding = Embedding(input_dim=vocab_size_hindi, output_dim=embedding_dim)(encoder_input)
decoder_embedding = Embedding(input_dim=vocab_size_telugu, output_dim=embedding_dim)(decoder_input)

# Encoder LSTM
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Attention layer
attention_layer = Attention()
attention_output = attention_layer([encoder_outputs, decoder_embedding])

# Decoder LSTM
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(attention_output, initial_state=encoder_states)

# Output layer
output_layer = Dense(vocab_size_telugu, activation='softmax')
decoder_outputs = output_layer(decoder_outputs)

# Define the model
model = Model([encoder_input, decoder_input], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [20]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 90)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 90, 256)              901632    ['input_1[0][0]']             
                                                                                                  
 input_2 (InputLayer)        [(None, 90)]                 0         []                            
                                                                                                  
 lstm (LSTM)                 [(None, 90, 256),            525312    ['embedding[0][0]']           
                              (None, 256),                                                    

In [21]:
# Train the model
batch_size = 64
epochs = 5

model.fit([train_input, train_output],train_output,batch_size=batch_size,epochs=epochs,validation_split=0.1,)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x28f4daa3cd0>

In [22]:
accuracy = model.evaluate([test_input, test_output], test_output, verbose=0)
print("Test Loss:", accuracy[0])
print("Accuracy:", accuracy[1])

Test Loss: 1.2601304054260254
Accuracy: 0.8724444508552551


In [23]:
import pickle

# Save the trained model
model.save('models/translation_model2.keras')

# Save the Hindi tokenizer
with open('tokenizers/hindi_tokenizer2.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer_hindi, tokenizer_file)

# Save the Telugu tokenizer
with open('tokenizers/telugu_tokenizer2.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer_telugu, tokenizer_file)