In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load your tokenized dataset
data = pd.read_csv('preprocessed_data.csv')
data.head(), data.count()

(                                               Hindi  \
 0  कुछ वर्षों पहले मुझे ऐसा लगा जैसे मैं किसी उदा...   
 1                           यह विचार बहुत ही सरल है    
 2  ऐसे किसी चीज के बारे में सोचिये जिसे आप हमेशा ...   
 3  यह पता चला है कि 30 दिन पर्याप्त समय है, कोई न...   
 4  इस 30 दिन की चुनौतियों के दौरान मैंने कुछ बाते...   
 
                                               Telugu  
 0  కొన్ని సంవత్సరాల ముందు, నేను బాగా ఆచరానములో ఉన...  
 1                            ఈ ఆలోచన చాలా సులభమైనది.  
 2  మీ జీవితములో మీరు చేయాలి అనుకునే పనిని ఆలోచించ...  
 3  ఫలితము దక్కుతుంది. 30 రోజులనేది మీ జీవితములో ఒ...  
 4  ఈ 30 రోజులు పాటించే విధానములో నేను కొన్ని విషయ...  ,
 Hindi     2648
 Telugu    2648
 dtype: int64)

In [3]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

train_data.count(), test_data.count()

(Hindi     2383
 Telugu    2383
 dtype: int64,
 Hindi     265
 Telugu    265
 dtype: int64)

In [4]:
import numpy as np

# Check for NaN or missing values in the 'Hindi' and 'Telugu' columns
train_data = train_data.dropna(subset=['Hindi', 'Telugu'])

# Convert text data to string
train_data['Hindi'] = train_data['Hindi'].astype(str)
train_data['Telugu'] = train_data['Telugu'].astype(str)

# Tokenize the text data
tokenizer_hindi = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_hindi.fit_on_texts(train_data['Hindi'])

tokenizer_telugu = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_telugu.fit_on_texts(train_data['Telugu'])

In [5]:
# Vocabulary size for both languages
vocab_size_hindi = len(tokenizer_hindi.word_index) + 1
vocab_size_telugu = len(tokenizer_telugu.word_index) + 1
print(vocab_size_hindi)
print(vocab_size_telugu)

7828
13588


In [6]:
# Convert text to sequences
train_input = tokenizer_hindi.texts_to_sequences(train_data['Hindi'])
train_output = tokenizer_telugu.texts_to_sequences(train_data['Telugu'])
test_input = tokenizer_hindi.texts_to_sequences(test_data['Hindi'])
test_output = tokenizer_telugu.texts_to_sequences(test_data['Telugu'])

In [18]:
max_seq_length = 50  # Adjust as needed
train_input = pad_sequences(train_input, maxlen=max_seq_length, padding='post')
train_output = pad_sequences(train_output, maxlen=max_seq_length, padding='post')
test_input = pad_sequences(test_input, maxlen=max_seq_length, padding='post')
test_output = pad_sequences(test_output, maxlen=max_seq_length, padding='post')

In [19]:
len(train_input), len(train_output), len(test_input), len(test_output)

(2383, 2383, 265, 265)

In [20]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention

In [21]:
# Define input and output sequences
encoder_input = Input(shape=(max_seq_length,))
decoder_input = Input(shape=(max_seq_length,))

In [22]:
# Embedding layer for encoder and decoder
embedding_dim = 256  # Adjust as needed
encoder_embedding = Embedding(input_dim=vocab_size_hindi, output_dim=embedding_dim)(encoder_input)
decoder_embedding = Embedding(input_dim=vocab_size_telugu, output_dim=embedding_dim)(decoder_input)

In [23]:

# Encoder LSTM
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Attention layer
attention_layer = Attention()
attention_output = attention_layer([encoder_outputs, decoder_embedding])

# Decoder LSTM
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(attention_output, initial_state=encoder_states)

# Output layer
output_layer = Dense(vocab_size_telugu, activation='softmax')
decoder_outputs = output_layer(decoder_outputs)

In [24]:
# Define the model
model = Model([encoder_input, decoder_input], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [27]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 50, 256)              2003968   ['input_3[0][0]']             
                                                                                                  
 input_4 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 lstm_2 (LSTM)               [(None, 50, 256),            525312    ['embedding_2[0][0]']         
                              (None, 256),                                                  

In [26]:
# Train the model
batch_size = 64
epochs = 50

model.fit(
    [train_input, train_output[:, :-1]],
    train_output[:, 1:],
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)


Epoch 1/50


ValueError: in user code:

    File "C:\Users\Bhavin\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Bhavin\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Bhavin\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Bhavin\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Bhavin\AppData\Local\anaconda3\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Bhavin\AppData\Local\anaconda3\Lib\site-packages\keras\src\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 1 of layer "model_1" is incompatible with the layer: expected shape=(None, 50), found shape=(None, 49)
