In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

In [2]:
filtered_data = pd.read_csv("data/amazon_reviews_subset.csv")
filtered_data

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,50230169,R23MCAR8GSV3T0,0451526341,380925201,Animal farm: A Fairy Story,Books,4.0,2.0,2.0,N,N,Simple Yet Profound,"A generation ago, the sight of the cover of Ge...",2005-10-14
1,US,50776149,RUCZYTA3MP0MR,038551428X,970964974,"The Traveler (Fourth Realm Trilogy, Book 1)",Books,5.0,2.0,5.0,N,N,Great Marketing for a Pretty Good Book,The most interesting thing about this book is ...,2005-10-14
2,US,12598621,RCL2ARHKWH6RL,059035342X,667539744,Harry Potter and the Sorcerer's Stone,Books,5.0,2.0,2.0,N,N,I Think Part Of The Charm Is You Feel Like You...,Even though this is the shortest book in the H...,2005-10-14
3,US,49770667,R2P4B3STC980QP,1594480001,659516630,The Kite Runner,Books,5.0,4.0,4.0,N,N,Praiseworthy first novel,Well I thoroughly enjoyed this book. Although ...,2005-10-14
4,US,49828549,RM0CSYVWKHW5W,0671027360,141370518,Angels & Demons,Books,1.0,31.0,39.0,N,N,Preposterous,"Early in this novel, our hero finds out that a...",2005-10-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24454,US,53025502,RT0WQLFSIQ73O,089480829X,874075168,What to Expect When You're Expecting,Books,3.0,0.0,0.0,N,N,good thing it's cheap- you'll want more books,"Most women like the developmental information,...",1997-04-09
24455,US,53025502,R2JST5WFT5OVVV,0316779059,647537444,The Baby Book: Everything You Need to Know Abo...,Books,5.0,2.0,2.0,N,N,Was sad when my daughter hit two,I was sad to consult this book one day and fin...,1997-04-09
24456,US,53018444,R2XHHUX67S4FI0,0440221471,692576280,The Runaway Jury,Books,1.0,3.0,6.0,N,N,AWFUL,I can usually find something good in every boo...,1997-03-26
24457,US,53068382,R2PXKB2IZ6EZXA,0451169530,825352881,The Stand: Expanded Edition: For the First Tim...,Books,5.0,2.0,3.0,N,N,This book is excellent. The finest book King ...,"When the movie came out in May of 1994, I love...",1997-03-22


In [3]:
# # Group by product_id and aggregate reviews and calculate avg_star_rating
# grouped_reviews = data.groupby(['product_id', 'product_title']).agg({
#     'review_body': lambda x: ' '.join(x),  # Concatenate all review bodies
#     'star_rating': 'mean'  # Calculate the average star rating
# }).reset_index()

# # Rename the star_rating column to avg_star_rating
# grouped_reviews = grouped_reviews.rename(columns={'star_rating': 'avg_star_rating'})

# # Select a subset of 10 rows from the grouped dataframe
# sampled_reviews = grouped_reviews.sample(n=10, random_state=123)

# sampled_reviews

In [7]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import json

# Load the Hugging Face token from the JSON file
with open("config.json", "r") as config_file:
    config = json.load(config_file)
hf_token = config["hf_token"]

# Define the model name
model_name = "meta-llama/Llama-3.2-1B"

# Load the tokenizer with the authentication token
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)

# Load the configuration and modify the rope_scaling if necessary
config = AutoConfig.from_pretrained(model_name, use_auth_token=hf_token)

# Check if rope_scaling exists in the config and adjust it
if "rope_scaling" in config.to_dict():
    config.rope_scaling = {"type": "linear", "factor": config.rope_scaling["factor"]}

# Load the model with the adjusted configuration
model = AutoModelForCausalLM.from_pretrained(model_name, config=config, use_auth_token=hf_token)

# Check if the tokenizer has a padding token, if not, set eos_token as pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Assume df is your input dataframe with reviews
df = filtered_data  # Make sure filtered_data is defined elsewhere in your code

# Group by product_id and aggregate reviews and calculate avg_star_rating
grouped_reviews = df.groupby(['product_id', 'product_title']).agg({
    'review_body': lambda x: ' '.join(x),  # Concatenate all review bodies
    'star_rating': 'mean'  # Calculate the average star rating
}).reset_index()

# Rename the star_rating column to avg_star_rating
grouped_reviews = grouped_reviews.rename(columns={'star_rating': 'avg_star_rating'})

# Select a subset of 10 rows from the grouped dataframe
sampled_reviews = grouped_reviews.sample(n=10, random_state=42)

# Define a function to summarize reviews using the LLaMA model (shorter, no query in final summary)
def summarize_reviews(reviews):
    # Prepare the input for the model (removing prompt text in final output)
    prompt = f"{reviews}"  # Directly use the reviews as the input
    
    # Tokenize the input and set the attention mask
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True, padding=True)
    attention_mask = inputs['attention_mask']
    
    # Generate the summary using the model
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=attention_mask,  # Add attention mask to avoid padding issues
            max_new_tokens=50,  # Shorter summaries (50 tokens max)
            num_return_sequences=1,
            no_repeat_ngram_size=2,  # Avoid repeated phrases
            pad_token_id=tokenizer.pad_token_id  # Use the explicitly set pad token
        )
    
    # Decode the generated output
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary.strip()

# Apply the summarization function to each product's grouped reviews in the sampled subset
sampled_reviews['summarized_review_body'] = sampled_reviews['review_body'].apply(summarize_reviews)

# Drop the original 'review_body' column
final_df = sampled_reviews.drop(columns=['review_body'])

# Save the result as a CSV
final_df.to_csv('data/summarized_book_reviews_sampled_llama.csv', index=False)

print("Summarization complete. The file 'summarized_book_reviews_sampled_llama.csv' has been saved.")


Summarization complete. The file 'summarized_book_reviews_sampled_llama.csv' has been saved.
