In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [2]:
# Read the CSV file
df = pd.read_csv('data.csv')

# Clean the dataset
df['English'] = df['English'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['Spanish'] = df['Spanish'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [3]:
# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
# Tokenization and padding
tokenizer = Tokenizer(oov_token='<OOV>')

# Fit the tokenizer on the English sentences
tokenizer.fit_on_texts(train_df['English'])

In [5]:
df.head()

Unnamed: 0,English,Spanish,Attribution
0,Hi,Ciao,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Hi,Ciao,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run,Corri,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
3,Run,Corra,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run,Correte,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [6]:
# Convert text to sequences
train_input_sequences = tokenizer.texts_to_sequences(train_df['English'])
test_input_sequences = tokenizer.texts_to_sequences(test_df['English'])

In [7]:
train_input_sequences

[[4, 17, 34, 120, 4, 12, 6, 68],
 [456, 3, 44, 307, 15],
 [7, 377, 604, 30, 3, 362, 22, 437, 197],
 [14, 534, 52, 46, 472, 14, 6, 427],
 [49, 147, 263, 9, 3, 27, 5, 12, 104, 3, 63, 238],
 [18, 46, 116, 54],
 [2, 8, 10660, 794],
 [45, 862, 5, 996, 72],
 [2, 1415, 92, 14, 843],
 [2, 267, 5, 22, 833],
 [491, 486, 87],
 [87, 39, 1120],
 [4, 155, 1104, 6, 811],
 [2, 80, 192, 32],
 [4, 90, 47, 6, 224, 394],
 [4, 37, 2, 8, 3292],
 [897, 20, 1487, 5, 22, 737, 33, 5, 22, 1277],
 [62, 383, 3, 49, 13, 489],
 [153, 23, 183, 42],
 [541, 162, 495],
 [25, 64, 5, 5535, 237],
 [1910, 1331],
 [6, 627, 16, 415],
 [260, 1256],
 [176, 16, 124, 281, 129, 121],
 [28, 9, 3, 30, 55, 15],
 [4, 17, 30, 3, 239],
 [52, 59, 322, 26, 70],
 [36, 5231],
 [9, 3, 12, 151, 2568],
 [2, 100, 214, 3, 39],
 [4, 192, 7, 195, 494, 877],
 [2, 177, 10, 5, 317, 210],
 [9, 3, 74, 37, 3, 43, 944, 2],
 [11, 8, 122, 1141, 55, 40, 1699, 1760],
 [4, 148, 1479, 164],
 [9, 3, 30, 2808],
 [3293, 1230, 3610, 15],
 [31, 9, 25, 64, 13],
 [9,

In [8]:
# Pad sequences
max_len = max(len(seq) for seq in train_input_sequences)
train_inputs = pad_sequences(train_input_sequences, maxlen=max_len, padding='post')
test_inputs = pad_sequences(test_input_sequences, maxlen=max_len, padding='post')

In [9]:
train_inputs

array([[  4,  17,  34, ...,   0,   0,   0],
       [456,   3,  44, ...,   0,   0,   0],
       [  7, 377, 604, ...,   0,   0,   0],
       ...,
       [  2, 227, 225, ...,   0,   0,   0],
       [  2, 320, 104, ...,   0,   0,   0],
       [114,  15, 111, ...,   0,   0,   0]])

In [10]:
# Tokenize the Spanish sentences using the same tokenizer
train_target_sequences = tokenizer.texts_to_sequences(train_df['Spanish'])
test_target_sequences = tokenizer.texts_to_sequences(test_df['Spanish'])

In [11]:
train_target_sequences

[[1, 61, 1, 1, 7059],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 83, 1, 1, 1, 1, 1, 1],
 [14, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1],
 [2, 1, 1, 1, 1],
 [1, 657, 1, 1],
 [2, 1, 1, 14, 1],
 [1, 1, 2, 1, 1],
 [1, 1, 3785, 1],
 [3785, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1],
 [2, 1, 1, 1, 32],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 2, 1, 1],
 [1, 1, 1, 1, 1, 1, 1],
 [1, 1, 83, 1],
 [1, 1, 1, 1, 1, 1],
 [9318, 1, 1, 1, 1, 1, 1],
 [1, 2275, 9318, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 11663, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 15],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 3785, 1],
 [1, 1, 1],
 [1, 1, 1],
 [2, 1, 1, 1, 1],
 [1, 8200, 1, 1],
 [2, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 2],
 [1, 1, 1, 1, 1, 1, 1],
 [1, 1],
 [7, 1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 4824],
 [1, 1, 1],
 [2, 1, 1],
 [1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1],
 [1, 1, 1],
 [1, 2622, 1, 7, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1],
 [1, 11663, 8031, 1, 1, 1, 1],
 [1, 1, 1, 3785, 15]

In [12]:
# Pad target sequences
train_targets = pad_sequences(train_target_sequences, maxlen=max_len, padding='post')
test_targets = pad_sequences(test_target_sequences, maxlen=max_len, padding='post')

In [13]:
train_targets

array([[ 1, 61,  1, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0],
       ...,
       [ 2,  1,  1, ...,  0,  0,  0],
       [ 2,  1,  1, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0]])

In [14]:
# Define the Seq2Seq model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(tokenizer.word_index) + 1, 256, input_shape=[max_len]),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.Dense(len(tokenizer.word_index) + 1, activation='softmax')
])

In [15]:
# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_inputs, train_targets, epochs=1, batch_size=64)



<keras.callbacks.History at 0x2a339544250>

In [16]:
model.save('best_model.h5')

In [None]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets, batch_size=64)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')


