In [36]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

In [47]:
def load_data(filepath, has_target=True):
    df = pd.read_csv(filepath)
    input_texts = df.iloc[:, 0].astype(str).tolist()
    target_texts = df.iloc[:, 1].astype(str).tolist() if has_target and df.shape[1] > 1 else None
    return input_texts, target_texts


train_inputs, train_targets = load_data('/content/train.csv')


In [48]:
# Tokenization and padding
max_vocab_size = 10000
max_length = 50

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_inputs + train_targets)

input_sequences = tokenizer.texts_to_sequences(train_inputs)
target_sequences = tokenizer.texts_to_sequences(train_targets)

input_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')
target_padded = pad_sequences(target_sequences, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1

In [49]:
# LSTM Encoder-Decoder Model
def build_model():
    embedding_dim = 128
    lstm_units = 256

    encoder_inputs = tf.keras.Input(shape=(max_length,))
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)(encoder_inputs)
    encoder_lstm = tf.keras.layers.LSTM(lstm_units, return_state=True)
    _, state_h, state_c = encoder_lstm(embedding)
    encoder_states = [state_h, state_c]

    decoder_inputs = tf.keras.Input(shape=(max_length,))
    decoder_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)(decoder_inputs)
    decoder_lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = tf.keras.layers.Dense(vocab_size, activation='softmax')
    outputs = decoder_dense(decoder_outputs)

    model = tf.keras.Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

model = build_model()
model.summary()

In [50]:
# Training
batch_size = 64
epochs = 20
model.fit([input_padded, target_padded], target_padded, batch_size=batch_size, epochs=epochs, validation_split=0.2)


Epoch 1/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2s/step - accuracy: 0.5774 - loss: 7.2503 - val_accuracy: 0.7490 - val_loss: 2.4955
Epoch 2/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2s/step - accuracy: 0.7326 - loss: 2.3001 - val_accuracy: 0.7490 - val_loss: 1.9229
Epoch 3/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2s/step - accuracy: 0.7356 - loss: 1.9538 - val_accuracy: 0.7490 - val_loss: 1.8431
Epoch 4/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - accuracy: 0.7330 - loss: 1.8749 - val_accuracy: 0.7595 - val_loss: 1.7624
Epoch 5/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 2s/step - accuracy: 0.7503 - loss: 1.7625 - val_accuracy: 0.7636 - val_loss: 1.7187
Epoch 6/20
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - accuracy: 0.7587 - loss: 1.7051 - val_accuracy: 0.7721 - val_loss: 1.6929
Epoch 7/20
[1m14/14[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fb1c8669b50>

In [51]:
model.save("paraphraser_model.h5")




In [52]:
from tensorflow.keras.models import load_model

# Load the trained model
model = load_model("paraphraser_model.h5")




In [57]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import accuracy_score

def evaluate_bleu_accuracy(file_path):
    # Load the CSV file
    df = pd.read_csv(file_path)

    # Compute BLEU scores with smoothing
    smoothie = SmoothingFunction().method1
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
        for ref, pred in zip(df["question1"], df["question2"])
    ]

    # Compute accuracy (exact match)
    accuracy = accuracy_score(df["question1"], df["question2"])

    # Compute average BLEU score
    average_bleu = sum(bleu_scores) / len(bleu_scores)

    return average_bleu, accuracy

# Path to eval.csv
file_path = "/content/eval (1).csv"
bleu, acc = evaluate_bleu_accuracy(file_path)

print(f"Average BLEU Score: {bleu:.4f}")
print(f"Accuracy: {acc:.4f}")

Average BLEU Score: 0.6709
Accuracy: 0.0000
