In [1]:
!pip install datasets tensorflow

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
# Import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.callbacks import EarlyStopping
from datasets import load_dataset

In [3]:
# Load the Quora pairs dataset
dataset = load_dataset('quora')

# Extract the questions pairs
train_data = dataset['train']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

quora.py:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [4]:
# Prepare the data
text_pairs = []
for pair in train_data:
    if pair['is_duplicate']:  # Check if the pair is a duplicate
        text_pairs.append((pair['questions']['text'][0], pair['questions']['text'][1]))

# Split the pairs into source and target sentences
source_texts = [pair[0] for pair in text_pairs]
target_texts = [pair[1] for pair in text_pairs]

# Tokenize the source and target texts
tokenizer = Tokenizer()
tokenizer.fit_on_texts(source_texts + target_texts)

In [5]:
# Convert texts to sequences of integers
source_sequences = tokenizer.texts_to_sequences(source_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# Pad sequences to the same length
max_sequence_length = max(max(len(seq) for seq in source_sequences), max(len(seq) for seq in target_sequences))
source_sequences = pad_sequences(source_sequences, maxlen=max_sequence_length, padding='post')
target_sequences = pad_sequences(target_sequences, maxlen=max_sequence_length, padding='post')

# Prepare the target data for training (shifted by one)
target_input_data = target_sequences[:, :-1]
target_output_data = target_sequences[:, 1:]

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [6]:
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


In [7]:
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True
)

In [8]:
history = model.fit(
    [source_sequences, target_input_data],
    np.expand_dims(target_output_data, -1),
    batch_size=64,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping]  # Add early stopping callback
)

Epoch 1/20
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 293ms/step - accuracy: 0.8909 - loss: 1.1372 - val_accuracy: 0.9120 - val_loss: 0.5775
Epoch 2/20
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m587s[0m 308ms/step - accuracy: 0.9141 - loss: 0.5451 - val_accuracy: 0.9188 - val_loss: 0.5002
Epoch 3/20
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m574s[0m 308ms/step - accuracy: 0.9209 - loss: 0.4664 - val_accuracy: 0.9247 - val_loss: 0.4544
Epoch 4/20
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m601s[0m 296ms/step - accuracy: 0.9278 - loss: 0.4066 - val_accuracy: 0.9307 - val_loss: 0.4109
Epoch 5/20
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 308ms/step - accuracy: 0.9343 - loss: 0.3531 - val_accuracy: 0.9344 - val_loss: 0.3848
Epoch 6/20
[1m1866/1866[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m623s[0m 309ms/step - accuracy: 0.9393 - loss: 0.3123 - val_accuracy: 0.9372 - val_loss: