In [4]:
import pandas as pd

df = pd.read_csv('tokenized_data.csv', encoding='utf-8')

from sklearn.model_selection import train_test_split

# Split your dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)

Training set shape: (2119, 2)
Testing set shape: (530, 2)


In [5]:
hindi_sentences = df['Hindi_Tokens']
telugu_sentences = df['Telugu_Tokens']

# Split the data into train and validation sets
hindi_train, hindi_val, telugu_train, telugu_val = train_test_split(
    hindi_sentences, telugu_sentences, test_size=0.2, random_state=42)

# Now you have the following variables:
# - hindi_train: List of Hindi sentences for training
# - hindi_val: List of Hindi sentences for validation
# - telugu_train: List of corresponding Telugu sentences for training
# - telugu_val: List of corresponding Telugu sentences for validation

In [6]:
print("Hindi Training set shape:", hindi_train.shape)
print("Hindi Val set shape:", hindi_val.shape)
print("Telugu Training set shape:", telugu_train.shape)
print("Telugu Val set shape:", telugu_val.shape)

Hindi Training set shape: (2119,)
Hindi Val set shape: (530,)
Telugu Training set shape: (2119,)
Telugu Val set shape: (530,)


In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

hindi_tokenizer = Tokenizer(oov_token="<OOV>")
hindi_tokenizer.fit_on_texts(hindi_sentences)
hindi_vocab_size = len(hindi_tokenizer.word_index) + 1
hindi_sequences = hindi_tokenizer.texts_to_sequences(hindi_sentences)

# Tokenize Telugu sentences
telugu_tokenizer = Tokenizer(oov_token="<OOV>")
telugu_tokenizer.fit_on_texts(telugu_sentences)
telugu_vocab_size = len(telugu_tokenizer.word_index) + 1
telugu_sequences = telugu_tokenizer.texts_to_sequences(telugu_sentences)

2023-09-05 13:29:19.393723: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-05 13:29:19.597858: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-05 13:29:19.599795: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
max_sequence_length = max(len(seq) for seq in hindi_sequences)
hindi_sequences = pad_sequences(hindi_sequences, maxlen=max_sequence_length, padding="post")
telugu_sequences = pad_sequences(telugu_sequences, maxlen=max_sequence_length, padding="post")

In [6]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input

# Encoder
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(hindi_vocab_size, 256, input_length=max_sequence_length)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(telugu_vocab_size, 256, input_length=max_sequence_length)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(telugu_vocab_size, activation="softmax")
output = decoder_dense(decoder_outputs)

# Create the model
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

2023-09-05 13:24:31.687083: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-05 13:24:31.687667: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-09-05 13:24:31.996673: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gra

In [7]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 190)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 190)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 190, 256)     1855744     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 190, 256)     3436544     ['input_2[0][0]']                
                                                                                              

In [1]:
# Prepare data for training
target_data = tf.keras.utils.to_categorical(telugu_sequences, num_classes=telugu_vocab_size)

# Train the model
model.fit([hindi_sequences, telugu_sequences], target_data, epochs=10, batch_size=64, validation_split=0.2)

NameError: name 'tf' is not defined