In [4]:
# Install necessary libraries
!pip install transformers nltk sentence-transformers torch

import pandas as pd
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Download nltk tokenizer
nltk.download('punkt_tab')

# Define paraphraser function
def academic_paraphraser(paragraph, model_name="Vamsi/T5_Paraphrase_Paws", num_return_sequences=1):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    text = "paraphrase: " + paragraph + " </s>"

    encoding = tokenizer.encode_plus(text, return_tensors="pt", padding="max_length", truncation=True)
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=256, do_sample=True, top_k=200, top_p=0.95,
        early_stopping=True, num_return_sequences=num_return_sequences
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Load dataset
file_path = "/content/eval (1).csv"
df = pd.read_csv(file_path)

# Ensure required columns exist
if "question1" not in df.columns or "question2" not in df.columns:
    raise ValueError(f"Expected columns 'question1' and 'question2' not found. Available columns: {df.columns}")

# Apply paraphraser on question1
df["model_paraphrase"] = df["question1"].apply(lambda x: academic_paraphraser(str(x)))

# Initialize Sentence Transformer model for similarity evaluation
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute BLEU score & similarity-based accuracy
def evaluate_bleu(original, paraphrased):
    original_tokens = [nltk.word_tokenize(original.lower())]
    paraphrased_tokens = nltk.word_tokenize(paraphrased.lower())
    return sentence_bleu(original_tokens, paraphrased_tokens)

def compute_accuracy(original, paraphrased):
    emb1 = sbert_model.encode(original, convert_to_tensor=True)
    emb2 = sbert_model.encode(paraphrased, convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(emb1, emb2).item()
    return cosine_sim  # Closer to 1 means high similarity

# Evaluate BLEU and similarity
df["BLEU Score"] = df.apply(lambda row: evaluate_bleu(row["question2"], row["model_paraphrase"]), axis=1)
df["Accuracy (Cosine Similarity)"] = df.apply(lambda row: compute_accuracy(row["question2"], row["model_paraphrase"]), axis=1)

# Save results
output_file = "output.csv"
df.to_csv(output_file, index=False)
print(f"Paraphrased results saved to {output_file}")

# Print Average Scores
avg_bleu = df["BLEU Score"].mean()
avg_accuracy = df["Accuracy (Cosine Similarity)"].mean()

print(f"\n🔹 Average BLEU Score: {avg_bleu:.4f}")
print(f"🔹 Average Accuracy (Cosine Similarity): {avg_accuracy:.4f}")




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Paraphrased results saved to output.csv

🔹 Average BLEU Score: 0.4428
🔹 Average Accuracy (Cosine Similarity): 0.9378
