In [1]:
!pip install tensorflow datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import datasets

In [3]:
# Load the PAWS dataset
dataset = datasets.load_dataset('paws', 'labeled_final')

# Extract sentences and labels
train_data = dataset['train']
test_data = dataset['test']

# Prepare data
def prepare_data(data):
    sentences1 = [example['sentence1'] for example in data]
    sentences2 = [example['sentence2'] for example in data]
    labels = [example['label'] for example in data]
    return sentences1, sentences2, labels

train_sentences1, train_sentences2, train_labels = prepare_data(train_data)
test_sentences1, test_sentences2, test_labels = prepare_data(test_data)

# Tokenize sentences
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_sentences1 + train_sentences2)

# Convert sentences to sequences
train_sequences1 = tokenizer.texts_to_sequences(train_sentences1)
train_sequences2 = tokenizer.texts_to_sequences(train_sentences2)
test_sequences1 = tokenizer.texts_to_sequences(test_sentences1)
test_sequences2 = tokenizer.texts_to_sequences(test_sentences2)

# Pad sequences
max_len = 50
train_encoder_input = pad_sequences(train_sequences1, maxlen=max_len, padding='post')
train_decoder_input = pad_sequences(train_sequences2, maxlen=max_len, padding='post')
test_encoder_input = pad_sequences(test_sequences1, maxlen=max_len, padding='post')
test_decoder_input = pad_sequences(test_sequences2, maxlen=max_len, padding='post')

# Prepare decoder output (shifted by one)
train_decoder_output = pad_sequences([seq[1:] for seq in train_sequences2], maxlen=max_len, padding='post')
test_decoder_output = pad_sequences([seq[1:] for seq in test_sequences2], maxlen=max_len, padding='post')

# Convert labels to numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [4]:
# Hyperparameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 256
lstm_units = 128

# Encoder
encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_len,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

In [6]:
# Train the model
batch_size = 64
epochs = 20

history = model.fit(
    [train_encoder_input, train_decoder_input],
    train_decoder_output,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)

Epoch 1/20
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 134ms/step - accuracy: 0.6850 - loss: 2.4940 - val_accuracy: 0.6984 - val_loss: 2.3377
Epoch 2/20
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 150ms/step - accuracy: 0.7028 - loss: 2.2501 - val_accuracy: 0.7105 - val_loss: 2.1603
Epoch 3/20
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 134ms/step - accuracy: 0.7163 - loss: 2.0554 - val_accuracy: 0.7202 - val_loss: 2.0171
Epoch 4/20
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 149ms/step - accuracy: 0.7278 - loss: 1.8863 - val_accuracy: 0.7294 - val_loss: 1.8888
Epoch 5/20
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 150ms/step - accuracy: 0.7385 - loss: 1.7380 - val_accuracy: 0.7386 - val_loss: 1.7784
Epoch 6/20
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 150ms/step - accuracy: 0.7510 - loss: 1.5961 - val_accuracy: 0.7485 - val_loss: 1.6792
Epoch 

In [7]:
# Evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(
    [test_encoder_input, test_decoder_input],
    test_decoder_output,
    batch_size=batch_size
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 71ms/step - accuracy: 0.7199 - loss: 2.7217
Test Loss: 2.694801092147827
Test Accuracy: 0.7223423719406128


In [23]:
def generate_paraphrase(model, input_text, tokenizer, max_len):
    # Preprocess input text
    input_seq = preprocess_input(input_text, tokenizer, max_len)

    # Initialize decoder input with start token
    start_token = tokenizer.word_index['<start>']  # Replace with your start token
    decoder_input = np.zeros((1, max_len))
    decoder_input[0, 0] = start_token

    # Generate paraphrase
    paraphrase = []
    for i in range(max_len):
        output_tokens = model.predict([input_seq, decoder_input], verbose=0)
        sampled_token_index = np.argmax(output_tokens[0, i, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '<end>':  # Replace with your end token
            break

        paraphrase.append(sampled_word)
        if i + 1 < max_len:
            decoder_input[0, i + 1] = sampled_token_index

    return ' '.join(paraphrase)

In [25]:
def generate_paraphrases_for_test_set(model, test_sentences, tokenizer, max_len):
    paraphrases = []
    for sentence in test_sentences:
        paraphrase = generate_paraphrase(model, sentence, tokenizer, max_len)
        paraphrases.append(paraphrase)
    return paraphrases

In [12]:
# Save the model
model.save("lstm_encoder_decoder_model.h5")



In [18]:
# Load the saved model
loaded_model = tf.keras.models.load_model("lstm_encoder_decoder_model.h5")

