In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch




In [2]:
reviews_df = pd.read_csv('C:/Users/HP/Downloads/assignment_reviews_metadata/reviews_supplements.csv')
product_asin_df = pd.read_csv('C:/Users/HP/Downloads/assignment_reviews_metadata/product_asin.csv')

In [3]:
reviews_df.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,date,time
0,4,B Complex in gel cap form,I bought this along with Vit C in gel cap form...,B00012ND5G,B00012ND5G,AGDVFFLJWAQ3ULNNKF4LXID2RVSQ,2009-12-11 00:37:33,1,True,2009-12-11,00:37
1,5,Five Stars,great product,B00013Z0ZQ,B00013Z0ZQ,AG3BSKXHDGP6E3EGQD2SXCK6KFQQ,2015-01-04 03:11:26,0,True,2015-01-04,03:11
2,5,Five Stars,Came as expectedly,B00013Z0ZQ,B00013Z0ZQ,AHG2WKFD4LXPC46WWC6JMQGX52JA,2015-09-27 19:15:33,0,True,2015-09-27,19:15
3,5,Vitamin Shoppe Dry Vitamin A,Excellent Product ..... Fast Delivery ....... ...,B00013Z1KA,B00013Z1KA,AEOF7RT3AC4ACRX5HGIP2V3BNIHA,2019-02-09 19:33:16.911,0,True,2019-02-09,19:33
4,5,Un producto que compro regularmente,Es muy buena vitamina,B00013Z1KA,B00013Z1KA,AGW2WETWQRL2PKUGTL2LU7IJ2BPQ,2022-07-25 14:11:10.936,0,True,2022-07-25,14:11


In [4]:
reviews_df.drop_duplicates(inplace=True)
reviews_df.dropna(inplace=True)
reviews_df['cleaned_review'] = reviews_df['text'].str.replace(r'[^\w\s]', '', regex=True)
reviews_df.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,date,time,cleaned_review
0,4,B Complex in gel cap form,I bought this along with Vit C in gel cap form...,B00012ND5G,B00012ND5G,AGDVFFLJWAQ3ULNNKF4LXID2RVSQ,2009-12-11 00:37:33,1,True,2009-12-11,00:37,I bought this along with Vit C in gel cap form...
1,5,Five Stars,great product,B00013Z0ZQ,B00013Z0ZQ,AG3BSKXHDGP6E3EGQD2SXCK6KFQQ,2015-01-04 03:11:26,0,True,2015-01-04,03:11,great product
2,5,Five Stars,Came as expectedly,B00013Z0ZQ,B00013Z0ZQ,AHG2WKFD4LXPC46WWC6JMQGX52JA,2015-09-27 19:15:33,0,True,2015-09-27,19:15,Came as expectedly
3,5,Vitamin Shoppe Dry Vitamin A,Excellent Product ..... Fast Delivery ....... ...,B00013Z1KA,B00013Z1KA,AEOF7RT3AC4ACRX5HGIP2V3BNIHA,2019-02-09 19:33:16.911,0,True,2019-02-09,19:33,Excellent Product Fast Delivery Will Buy Fro...
4,5,Un producto que compro regularmente,Es muy buena vitamina,B00013Z1KA,B00013Z1KA,AGW2WETWQRL2PKUGTL2LU7IJ2BPQ,2022-07-25 14:11:10.936,0,True,2022-07-25,14:11,Es muy buena vitamina


In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token =tokenizer.eos_token

def tokenize_function(examples):
    tokens =tokenizer(
        examples['cleaned_review'], 
        truncation= True, 
        padding ='max_length', 
        max_length =128,
        return_tensors='pt'
    )
    
    tokens['labels']= tokens['input_ids']
    return tokens
dataset =Dataset.from_pandas(reviews_df[['cleaned_review']])
tokenized_dataset =dataset.map(tokenize_function , batched=True)
tokenized_dataset.set_format(type='torch',columns= ['input_ids', 'attention_mask','labels'])
print(tokenized_dataset)



Map:   0%|          | 0/16660 [00:00<?, ? examples/s]

Dataset({
    features: ['cleaned_review', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 16660
})


In [6]:
train_test_split = tokenized_dataset.train_test_split(test_size= 0.2)
train_dataset= train_test_split['train']
val_dataset =train_test_split['test']
print(f"Training size:{len(train_dataset)}, Validation size: {len(val_dataset)}")

Training size:13328,Validation size: 3332


In [14]:
import random
train_size =int(len(train_dataset) *0.1)
small_train_dataset= train_dataset.shuffle(seed= 42).select(range(train_size))
val_size =int(len(val_dataset) *0.1) 
small_val_dataset= val_dataset.shuffle(seed= 42).select(range(val_size))
print(f"Reduced Training size: {len(small_train_dataset)},Reduced Validation size: {len(small_val_dataset)}")

Reduced Training size: 1332, Reduced Validation size: 333


In [8]:
training_args = TrainingArguments(
    output_dir="C:/Users/HP/OneDrive/Desktop/SHL/gpt2-synthetic-reviews",
    overwrite_output_dir=True,
    num_train_epochs=1, 
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    eval_strategy="no",
    save_steps=500,
    logging_dir="C:/Users/HP/OneDrive/Desktop/SHL/logs",
    logging_steps=100,
    max_steps=1000,
)

In [9]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [10]:
trainer = Trainer(
    model =model,
    args =training_args,
    train_dataset =small_train_dataset,
    eval_dataset= small_val_dataset,
)
trainer.train()

model.save_pretrained("C:/Users/HP/OneDrive/Desktop/SHL/gpt2-synthetic-reviews")
tokenizer.save_pretrained("C:/Users/HP/OneDrive/Desktop/SHL/gpt2-synthetic-reviews")
print("Model and tokenizer saved successfully.")

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
100,1.2822
200,1.0891
300,1.1602
400,0.9619
500,0.9529
600,0.9596
700,1.0088
800,0.9545
900,0.9812
1000,1.0534


Model and tokenizer saved successfully.


In [11]:
def generate_review(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs= model.generate(
        inputs,
        max_length= max_length,
        do_sample= True,
        top_k= 50,
        top_p= 0.95,
        num_return_sequences=1
    )
return tokenizer.decode(outputs[0], skip_special_tokens= True)

In [12]:
synthetic_reviews = []
original_prompts = []
ratings = []
for i in range(10):
    prompt = small_train_dataset['cleaned_review'][i][:50]
    generated_review = generate_review(prompt, model,tokenizer)
    synthetic_reviews.append(generated_review)
    original_prompts.append(prompt)
    ratings.append(random.randint(1,5))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask an

In [15]:
synthetic_df = pd.DataFrame({
    "synthetic_review": synthetic_reviews,
    "original_prompt": original_prompts,
    "rating": ratings
})

synthetic_df.to_csv("synthetic_reviews.csv", index= False)
print("Generated synthetic reviews saved to synthetic_reviews.csv!")

Generated synthetic reviews saved to synthetic_reviews.csv!
