In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the dataset
df = pd.read_csv("reviews_supplements.csv")

# Inspect the columns
print(df.columns)

# Select a few sample reviews using the correct column name
sample_reviews = df['text'].sample(5).tolist()

# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Generate synthetic reviews for each sample
generated_reviews = []

for review in sample_reviews:
    inputs = tokenizer.encode(review, return_tensors="pt")
    outputs = model.generate(inputs, max_length=150, do_sample=True, top_k=50, temperature=0.7)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_reviews.append(generated_text)

# Print generated reviews
for i, synthetic_review in enumerate(generated_reviews):
    print(f"\nGenerated Review {i+1}: {synthetic_review}")


  from pandas.core import (
  from .autonotebook import tqdm as notebook_tqdm


Index(['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'date', 'time'],
      dtype='object')


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati


Generated Review 1: My wyf loves it! I love the way it's wrapped around my arms! I like that it's made of a very soft base that gets to the body. I love the way it wraps around my wrists, I love how it's made from a softer base that gives it a bit of a matte finish. It's soft and comfortable, it doesn't cling too much to my body and it's the best looking base I've ever gotten. It's perfect to put on my dress, my shoes or your outfit and it's perfect to put on my makeup. I love how it's made from a soft base and it's made of a soft base that gives it a bit of a matte finish. It's soft and comfortable,

Generated Review 2: Cheap price.  Cheap quality.  Ordered two.  One didn't work at all.  The other only registered 10% to 20% of the steps taken.  I took it up to a full-time job, but was told I had to pay extra for an extra month.  But I'm not getting any money and am still getting a refund.  It's not worth the money.  I've been told to pay extra for more weeks. I just can't afford the 

In [2]:
# Save the generated reviews to a CSV file
generated_df = pd.DataFrame(generated_reviews, columns=["syntheticReview"])
generated_df.to_csv("synthetic_reviews.csv", index=False)

print("Synthetic reviews saved to synthetic_reviews.csv")


Synthetic reviews saved to synthetic_reviews.csv


In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import numpy as np

# Load GPT-2 for perplexity calculation
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Sample generated reviews (replace this with your generated reviews)
generated_reviews = ["This supplement really helped with my energy levels.", 
                     "The vitamins were okay, but I didn’t notice any difference.", 
                     "Great product! I feel more energetic and healthier.", 
                     "I don’t recommend this. It gave me stomach pain."]

# Original reviews (replace this with your original dataset reviews)
original_reviews = ["I have been taking these supplements for a week and they are amazing.",
                    "The packaging was damaged, but the product seems fine.",
                    "I don’t think this supplement works for me.",
                    "Excellent product. I will buy again."]

# Function to measure diversity based on length of reviews
def measure_diversity(reviews):
    lengths = [len(review.split()) for review in reviews]
    print(f"Average review length: {np.mean(lengths)} words")
    print(f"Max review length: {np.max(lengths)} words")
    print(f"Min review length: {np.min(lengths)} words")

# Function to measure sentiment diversity using TextBlob
def measure_sentiment_diversity(reviews):
    sentiments = [TextBlob(review).sentiment.polarity for review in reviews]
    print(f"Average sentiment polarity: {np.mean(sentiments)}")
    print(f"Sentiment range: {np.min(sentiments)} to {np.max(sentiments)}")

# Function to calculate perplexity for realism
def calculate_perplexity(review):
    inputs = tokenizer.encode(review, return_tensors="pt")
    with torch.no_grad():
        outputs = model(inputs, labels=inputs)
        loss = outputs.loss
    return torch.exp(loss).item()

# Function to check originality using cosine similarity
def calculate_originality(generated_reviews, original_reviews):
    vectorizer = TfidfVectorizer()
    all_reviews = generated_reviews + original_reviews
    vectors = vectorizer.fit_transform(all_reviews)
    
    # Compute cosine similarity
    similarity_matrix = cosine_similarity(vectors[:len(generated_reviews)], vectors[len(generated_reviews):])
    avg_similarity = np.mean(similarity_matrix)
    
    print(f"Average cosine similarity between generated and original reviews: {avg_similarity}")

# 1. Measure diversity of generated reviews (length and sentiment)
print("Diversity Analysis:")
measure_diversity(generated_reviews)
measure_sentiment_diversity(generated_reviews)

# 2. Measure realism using perplexity
print("\nRealism Analysis:")
for i, review in enumerate(generated_reviews):
    perplexity = calculate_perplexity(review)
    print(f"Review {i+1} perplexity: {perplexity}")

# 3. Measure originality using cosine similarity
print("\nOriginality Analysis:")
calculate_originality(generated_reviews, original_reviews)


Diversity Analysis:
Average review length: 8.75 words
Max review length: 10 words
Min review length: 8 words
Average sentiment polarity: 0.3416666666666667
Sentiment range: 0.0 to 0.6666666666666666

Realism Analysis:
Review 1 perplexity: 78.65267944335938
Review 2 perplexity: 123.70069122314453
Review 3 perplexity: 77.16495513916016
Review 4 perplexity: 164.3584747314453

Originality Analysis:
Average cosine similarity between generated and original reviews: 0.06324736040844718
