In [1]:
import pandas as pd
df = pd.read_csv('data/preprocessed_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,Hindi,Telugu
0,कुछ वर्षों पहले मुझे ऐसा लगा जैसे मैं किसी उदा...,"కొన్ని సంవత్సరాల ముందు, నేను బాగా ఆచరానములో ఉన..."
1,यह विचार बहुत ही सरल है,ఈ ఆలోచన చాలా సులభమైనది.
2,ऐसे किसी चीज के बारे में सोचिये जिसे आप हमेशा ...,మీ జీవితములో మీరు చేయాలి అనుకునే పనిని ఆలోచించ...
3,"यह पता चला है कि 30 दिन पर्याप्त समय है, कोई न...",ఫలితము దక్కుతుంది. 30 రోజులనేది మీ జీవితములో ఒ...
4,इस 30 दिन की चुनौतियों के दौरान मैंने कुछ बाते...,ఈ 30 రోజులు పాటించే విధానములో నేను కొన్ని విషయ...


In [2]:
df.count()

Hindi     2648
Telugu    2648
dtype: int64

In [3]:
df = df.head(1000)

In [4]:
df.count()

Hindi     1000
Telugu    1000
dtype: int64

In [5]:
df.isna().sum()

Hindi     0
Telugu    0
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

hindi_sentences = df['Hindi']
telugu_sentences = df['Telugu']

In [7]:
hindi_sentences, telugu_sentences

(0      कुछ वर्षों पहले मुझे ऐसा लगा जैसे मैं किसी उदा...
 1                               यह विचार बहुत ही सरल है 
 2      ऐसे किसी चीज के बारे में सोचिये जिसे आप हमेशा ...
 3      यह पता चला है कि 30 दिन पर्याप्त समय है, कोई न...
 4      इस 30 दिन की चुनौतियों के दौरान मैंने कुछ बाते...
                              ...                        
 995    अंत में यह पता चला कि दुसरे व्यक्ति के साथ सुर...
 996    और जो लोग ऐसे संबंधों में होते हैं जहाँ वे उनक...
 997    और ये अच्छे संबंध सभी समय एक से होना आवश्यक नह...
 998    कुछ हमारे जोड़े एक दुसरे से लड़ते रहते थे दिन-रा...
 999    तो यह मेसेज, यह कि हमारे स्वास्थ्य और कुशल के ...
 Name: Hindi, Length: 1000, dtype: object,
 0      కొన్ని సంవత్సరాల ముందు, నేను బాగా ఆచరానములో ఉన...
 1                                ఈ ఆలోచన చాలా సులభమైనది.
 2      మీ జీవితములో మీరు చేయాలి అనుకునే పనిని ఆలోచించ...
 3      ఫలితము దక్కుతుంది. 30 రోజులనేది మీ జీవితములో ఒ...
 4      ఈ 30 రోజులు పాటించే విధానములో నేను కొన్ని విషయ...
                             

In [8]:
hindi_train, hindi_val, telugu_train, telugu_val = train_test_split(hindi_sentences, telugu_sentences, test_size=0.2, random_state=42)

In [9]:
print("Hindi Training set shape:", hindi_train.shape)
print("Hindi Val set shape:", hindi_val.shape)
print("Telugu Training set shape:", telugu_train.shape)
print("Telugu Val set shape:", telugu_val.shape)

Hindi Training set shape: (800,)
Hindi Val set shape: (200,)
Telugu Training set shape: (800,)
Telugu Val set shape: (200,)


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Tokenize Hindi sentences
hindi_tokenizer = Tokenizer(oov_token="<OOV>")
hindi_tokenizer.fit_on_texts(hindi_sentences)

In [12]:
vocabulary = hindi_tokenizer.word_index
print(vocabulary)

{'<OOV>': 1, 'के': 2, 'और': 3, 'है': 4, 'में': 5, 'से': 6, 'की': 7, 'एक': 8, 'हैं': 9, 'का': 10, 'को': 11, 'मैं': 12, 'कि': 13, 'है।': 14, 'नहीं': 15, 'यह': 16, 'कर': 17, 'पर': 18, 'लिए': 19, 'हम': 20, 'भी': 21, 'तो': 22, 'जो': 23, '।': 24, 'इस': 25, 'मुझे': 26, 'था': 27, 'ये': 28, 'आप': 29, 'हो': 30, 'करने': 31, 'बहुत': 32, 'ही': 33, 'हैं।': 34, 'कुछ': 35, 'सकते': 36, 'किया': 37, 'जब': 38, 'मैंने': 39, 'साथ': 40, 'वे': 41, 'अपने': 42, 'रही': 43, 'काम': 44, 'मेरे': 45, 'थे': 46, 'हमारे': 47, 'हूँ': 48, 'या': 49, 'पास': 50, 'ने': 51, 'तक': 52, 'उनके': 53, 'बारे': 54, 'क्या': 55, 'वो': 56, 'हमने': 57, 'लेकिन': 58, 'थी': 59, 'कोई': 60, 'वह': 61, 'करना': 62, 'रहा': 63, 'अधिक': 64, 'मे': 65, 'रहे': 66, 'i': 67, 'आपको': 68, 'गया': 69, 'बात': 70, 'अपनी': 71, 'अगर': 72, 'समय': 73, 'साल': 74, 'किसी': 75, 'सबसे': 76, 'लोग': 77, 'करते': 78, 'घर': 79, 'उसे': 80, 'सकता': 81, 'लोगों': 82, 'जा': 83, 'सभी': 84, 'जीवन': 85, 'मेरा': 86, 'लिये': 87, 'ऊर्जा': 88, 'जैसे': 89, 'होता': 90, 'अब': 91, 'कहा': 9

In [13]:
text_to_tokenize = "यह विचार बहुत ही सरल है"
tokenized_text = hindi_tokenizer.texts_to_sequences([text_to_tokenize])
print(tokenized_text)

[[16, 272, 32, 33, 994, 4]]


In [14]:
oov_token = hindi_tokenizer.oov_token
print(f"Out-of-Vocabulary token: {oov_token}")

Out-of-Vocabulary token: <OOV>


In [15]:
len(hindi_tokenizer.word_index)

3431

In [16]:
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1
hindi_vocab_size

3432

In [17]:
hindi_sequences = hindi_tokenizer.texts_to_sequences(hindi_sentences)
hindi_sequences

[[35,
  572,
  99,
  26,
  143,
  305,
  89,
  12,
  75,
  1563,
  1564,
  5,
  1565,
  132,
  27,
  402,
  39,
  472,
  1566,
  992,
  1567,
  1568,
  2,
  1569,
  473,
  18,
  993,
  10,
  349,
  37,
  3,
  138,
  306,
  52,
  35,
  350,
  31,
  10,
  351,
  37],
 [16, 272, 32, 33, 994, 4],
 [119,
  75,
  573,
  2,
  54,
  5,
  995,
  186,
  29,
  273,
  42,
  85,
  5,
  62,
  187,
  46,
  3,
  80,
  734,
  138,
  306,
  52,
  31,
  10,
  351,
  474],
 [16,
  120,
  307,
  4,
  13,
  138,
  109,
  574,
  73,
  4,
  60,
  475,
  996,
  735,
  49,
  75,
  996,
  11,
  1570,
  2,
  19,
  42,
  85,
  5,
  6,
  89,
  13,
  997,
  736],
 [25, 138, 109, 7, 575, 2, 737, 39, 35, 738, 1571, 9],
 [274,
  27,
  13,
  739,
  219,
  2,
  403,
  998,
  33,
  1572,
  113,
  1573,
  1574,
  61,
  73,
  32,
  188,
  1575,
  27],
 [16, 125, 576, 10, 8, 404, 27, 740, 1576, 39, 8, 1577, 405],
 [3, 26, 245, 4, 12, 125, 109, 577, 27, 3, 55, 17, 63, 27],
 [39,
  16,
  21,
  741,
  13,
  89,
  89,
  12,
  13

In [18]:
# Tokenize Telugu sentences
telugu_tokenizer = Tokenizer(oov_token="<OOV>")
telugu_tokenizer.fit_on_texts(telugu_sentences)

In [19]:
vocabulary = telugu_tokenizer.word_index
print(vocabulary)

{'<OOV>': 1, 'నేను': 2, 'ఒక': 3, 'ఈ': 4, 'మరియు': 5, 'ఇది': 6, 'చాలా': 7, 'మీరు': 8, 'నా': 9, 'ఆ': 10, 'అది': 11, 'కానీ': 12, 'మా': 13, 'లో': 14, 'కూడా': 15, 'వారి': 16, 'మేము': 17, 'గురించి': 18, 'ఇంకా': 19, 'నాకు': 20, 'అని': 21, 'కాదు': 22, 'వారు': 23, 'మన': 24, 'మీ': 25, 'మనం': 26, 'ఇప్పుడు': 27, 'ఆమె': 28, 'మీకు': 29, 'లేదా': 30, 'నన్ను': 31, 'కోసం': 32, 'నుంచి': 33, 'ఉంది': 34, 'ఎందుకంటే': 35, 'ఎక్కువ': 36, 'గా': 37, 'మనము': 38, 'కొన్ని': 39, 'చప్పట్లు': 40, 'ఉన్న': 41, 'పెట్టుబడి': 42, 'ఎలా': 43, 'పెద్ద': 44, 'నుండి': 45, 'సౌర': 46, 'కొత్త': 47, 'పని': 48, 'లేదు': 49, 'నవ్వులు': 50, 'యొక్క': 51, 'అలా': 52, 'మేం': 53, 'ఓ': 54, 'ఆఫ్రికా': 55, 'మాకు': 56, 'అదే': 57, 'అంటే': 58, 'అవి': 59, 'మంది': 60, 'కాబట్టి': 61, 'కు': 62, 'వాళ్ళు': 63, 'ఇక్కడ': 64, 'తో': 65, 'బ్యాంక్': 66, 'అల్జీమర్స్': 67, 'ప్రతి': 68, 'అక్కడ': 69, 'వచ్చింది': 70, 'అందుకే': 71, 'మంచి': 72, 'రోజు': 73, 'మనకు': 74, 'ప్రజలు': 75, 'ఇతర': 76, 'ఇలా': 77, 'మరింత': 78, 'ద్వారా': 79, 'చేయడం': 80, 'ప్రింట్': 81, 'కాని': 

In [20]:
telugu_vocab_size = len(telugu_tokenizer.word_index) + 1
telugu_vocab_size

5412

In [21]:
telugu_sequences = telugu_tokenizer.texts_to_sequences(telugu_sentences)
telugu_sequences

[[39,
  365,
  157,
  2,
  141,
  1430,
  41,
  1431,
  1432,
  1433,
  497,
  750,
  71,
  2,
  142,
  498,
  1434,
  1435,
  1436,
  1437,
  1438,
  1439,
  57,
  93,
  228,
  47,
  158,
  32,
  1440],
 [4, 115, 7, 1441],
 [25, 366, 8, 367, 1442, 751, 499, 1443, 10, 751, 1444, 93, 228, 1445],
 [1446,
  1447,
  93,
  1448,
  25,
  366,
  3,
  72,
  1449,
  4,
  1450,
  8,
  3,
  47,
  1451,
  1452,
  30,
  25,
  500,
  1453,
  752,
  753,
  754,
  1454,
  752,
  25,
  1455,
  1456],
 [4, 93, 755, 756, 1457, 2, 39, 501, 757],
 [758, 502, 1458, 229, 1459, 230, 1460, 1461, 230, 368, 20, 759],
 [6, 15, 10, 1462, 3, 1463, 2, 3, 760, 3, 278, 1464],
 [2, 1465, 5, 10, 73, 105, 1466, 20, 759],
 [2, 1467, 159, 58, 93, 503, 1468, 1469, 9, 761, 762, 1470, 2, 763],
 [2, 231, 1471, 231, 18, 126, 764, 765, 3, 1472],
 [1473, 1474, 765, 3, 1475, 766],
 [1476, 1477, 2, 1478, 1479, 1480, 1481, 1482, 1483, 160],
 [2, 4, 93, 503, 1484, 756, 157, 767, 1485, 49],
 [2, 768, 15, 504, 8, 1486, 1487, 1488, 4, 9

In [22]:
hindi_vocab_size, telugu_vocab_size

(3432, 5412)

In [23]:
max_hindi_sequence_length = max(len(seq) for seq in hindi_sequences)
max_telugu_sequence_length = max(len(seq) for seq in telugu_sequences)

max_hindi_sequence_length, max_telugu_sequence_length

(90, 51)

In [24]:
max_sequence_length = max(max_hindi_sequence_length, max_telugu_sequence_length)
max_sequence_length

90

In [25]:
hindi_sequences = pad_sequences(hindi_sequences, maxlen=max_hindi_sequence_length, padding="post")
telugu_sequences = pad_sequences(telugu_sequences, maxlen=max_hindi_sequence_length, padding="post")

In [26]:
hindi_sequences, telugu_sequences

(array([[  35,  572,   99, ...,    0,    0,    0],
        [  16,  272,   32, ...,    0,    0,    0],
        [ 119,   75,  573, ...,    0,    0,    0],
        ...,
        [   3,   28,  225, ...,    0,    0,    0],
        [  35,   47,  957, ...,    0,    0,    0],
        [  22,   16, 3430, ...,    0,    0,    0]]),
 array([[  39,  365,  157, ...,    0,    0,    0],
        [   4,  115,    7, ...,    0,    0,    0],
        [  25,  366,    8, ...,    0,    0,    0],
        ...,
        [   5,   10,   72, ...,    0,    0,    0],
        [ 266,  154, 5402, ...,    0,    0,    0],
        [  61,   72, 1428, ...,    0,    0,    0]]))

In [27]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(hindi_vocab_size, 256, input_length=max_sequence_length)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(telugu_vocab_size, 256, input_length=max_sequence_length)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(telugu_vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)

# Create the model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [28]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 90)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 90)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 90, 256)              878592    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 90, 256)              1385472   ['input_2[0][0]']             
                                                                                              

In [29]:
import tensorflow as tf
epochs = 5

# Prepare data for training
target_data = tf.keras.utils.to_categorical(telugu_sequences, num_classes=telugu_vocab_size, dtype='float32')

# Train the model
model.fit([hindi_sequences, telugu_sequences], target_data, epochs=epochs, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x21660bf8150>

In [30]:
# Tokenize and pad sequences for test data
hindi_test_sequences = hindi_tokenizer.texts_to_sequences(hindi_val)
telugu_test_sequences = telugu_tokenizer.texts_to_sequences(telugu_val)
hindi_test_sequences = pad_sequences(hindi_test_sequences, maxlen=max_sequence_length, padding="post")
telugu_test_sequences = pad_sequences(telugu_test_sequences, maxlen=max_sequence_length, padding="post")

# Prepare target data for test data
target_test_data = tf.keras.utils.to_categorical(telugu_test_sequences, num_classes=telugu_vocab_size, dtype='float32')

# Evaluate the model on the test data
evaluation = model.evaluate([hindi_test_sequences, telugu_test_sequences], target_test_data)

# Print the evaluation results
print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])

Test Loss: 1.1545511484146118
Test Accuracy: 0.8713333606719971


In [31]:
import pickle

# Save the trained model
model.save('models/translation_model1.keras')

# Save the Hindi tokenizer
with open('tokenizers/hindi_tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(hindi_tokenizer, tokenizer_file)

# Save the Telugu tokenizer
with open('tokenizers/telugu_tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(telugu_tokenizer, tokenizer_file)