In [2]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pickle

# Load the dataset
data = pd.read_csv("french_tamil.csv")

# Tokenize French and Tamil words
french_tokenizer = Tokenizer(char_level=True)  # Tokenize by characters
tamil_tokenizer = Tokenizer(char_level=True)

french_tokenizer.fit_on_texts(data['French'])
tamil_tokenizer.fit_on_texts(data['Tamil'])

# Convert text to sequences
french_sequences = french_tokenizer.texts_to_sequences(data['French'])
tamil_sequences = tamil_tokenizer.texts_to_sequences(data['Tamil'])

# Pad the sequences
french_padded = pad_sequences(french_sequences, maxlen=6, padding='post')
tamil_padded = pad_sequences(tamil_sequences, maxlen=8, padding='post')

# Convert Tamil sequences into class indices
y_data = tamil_padded.argmax(axis=-1)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(french_padded, y_data, test_size=0.2, random_state=42)

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(french_tokenizer.word_index) + 1, output_dim=16, input_length=6),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(tamil_tokenizer.word_index) + 1, activation='softmax')  # Output size matches Tamil vocabulary
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test))

# Save the trained model
model.save("french_to_tamil_model.h5")

# Save the tokenizers for later use in the GUI
with open('french_tokenizer.pkl', 'wb') as f:
    pickle.dump(french_tokenizer, f)

with open('tamil_tokenizer.pkl', 'wb') as f:
    pickle.dump(tamil_tokenizer, f)


Epoch 1/50




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10s/step - accuracy: 0.0000e+00 - loss: 3.0908 - val_accuracy: 0.0000e+00 - val_loss: 3.0862
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step - accuracy: 0.6250 - loss: 3.0818 - val_accuracy: 0.0000e+00 - val_loss: 3.0821
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step - accuracy: 0.6250 - loss: 3.0715 - val_accuracy: 0.0000e+00 - val_loss: 3.0770
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 266ms/step - accuracy: 0.6250 - loss: 3.0597 - val_accuracy: 0.0000e+00 - val_loss: 3.0710
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step - accuracy: 0.6250 - loss: 3.0463 - val_accuracy: 0.0000e+00 - val_loss: 3.0641
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step - accuracy: 0.6250 - loss: 3.0308 - val_accuracy: 0.0000e+00 - val_loss: 3.0561
Epoch 7/50
[1m1/1[0m 

