# Select the top 3 products of each category

In [32]:
import pandas as pd

top3_csv = pd.read_csv("../data/reviews.csv")

top3_csv.head()

Unnamed: 0,asins,name,rating,title_text_processed,imageURLs,cluster_name
0,"B00QWO9P0O,B00LH3DMUO",AmazonBasics AAA Performance Alkaline Batterie...,3.0,... 3 one item bad quality missing backup spri...,https://images-na.ssl-images-amazon.com/images...,Smart Home & Amazon devices
1,"B00QWO9P0O,B00LH3DMUO",AmazonBasics AAA Performance Alkaline Batterie...,4.0,... always less expensive way go product like ...,https://images-na.ssl-images-amazon.com/images...,Smart Home & Amazon devices
2,"B00QWO9P0O,B00LH3DMUO",AmazonBasics AAA Performance Alkaline Batterie...,5.0,... duracell price happy well duracell price h...,https://images-na.ssl-images-amazon.com/images...,Smart Home & Amazon devices
3,"B00QWO9P0O,B00LH3DMUO",AmazonBasics AAA Performance Alkaline Batterie...,5.0,... well name brand battery much better seem w...,https://images-na.ssl-images-amazon.com/images...,Smart Home & Amazon devices
4,"B00QWO9P0O,B00LH3DMUO",AmazonBasics AAA Performance Alkaline Batterie...,5.0,... battery long lasting price great battery l...,https://images-na.ssl-images-amazon.com/images...,Smart Home & Amazon devices


In [92]:
# Group by ASIN and aggregate product data
groups = top3_csv.groupby("asins").agg({
    "name": "first",  # Take first product name
    "cluster_name": lambda x: x.mode().iloc[0],  # Most frequent cluster name
    "rating": ["mean", "count"],
    "imageURLs": lambda x: ','.join(set(
        # Combine unique image URLs
        url for urls in x.dropna().astype(str) for url in urls.split(',')
    )),
})

groups.columns = ["name", "cluster_name", "rating_mean", "rating_count", "imageURLs"]
groups = groups.reset_index()

# Round average rating to 2 decimals
groups["rating_mean"] = groups["rating_mean"].round(2)

groups.head()

Unnamed: 0,asins,name,cluster_name,rating_mean,rating_count,imageURLs
0,"B0002LCUZK,B010CEC6MI,B01B25NN64",Expanding Accordion File Folder Plastic Portab...,E-Reader & Office Tablets,5.0,9,https://i.ebayimg.com/thumbs/images/g/UOsAAOSw...
1,B001NIZB5M,Amazon Kindle Replacement Power Adapter (Fits ...,E-Reader & Office Tablets,2.8,5,https://images-na.ssl-images-amazon.com/images...
2,B005OOKNP4,AmazonBasics Bluetooth Keyboard for Android De...,E-Reader & Office Tablets,4.33,6,https://images-na.ssl-images-amazon.com/images...
3,B005PB2T0S,"Amazon Kindle Lighted Leather Cover,,,\r\nAmaz...",E-Reader & Office Tablets,4.0,5,
4,B005PB2T2Q,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",E-Reader & Office Tablets,3.67,6,


In [106]:
# Calculate average
C = groups["rating_count"].mean()
m = groups["rating_mean"].mean()

# Compute weighted score (Bayesian average)
groups["weighted_score"] = (
    (groups["rating_count"] / (groups["rating_count"] + C)) * groups["rating_mean"] 
    + (C / (groups["rating_count"] + C)) * m
)

# Get top 3 products per cluster
top3_per_cluster = (
    groups.sort_values(["cluster_name", "weighted_score"], ascending=[True, False])
    .groupby("cluster_name")
    .head(100)
    .reset_index(drop=True)
)

In [107]:
# Filter reviews for top products
reviews = top3_csv[["asins", "rating", "title_text_processed"]]
filtered_reviews = reviews[reviews["asins"].isin(top3_per_cluster["asins"])]

# Add title length column
filtered_reviews['title_length_chars'] = filtered_reviews['title_text_processed'].str.len()

# Count reviews per product
reviews_per_asin = filtered_reviews.groupby('asins').size().reset_index(name='count')
print(reviews_per_asin)

# Filter for medium length reviews (300-500 chars)
reviews_in_range = filtered_reviews[
    (filtered_reviews['title_length_chars'] >= 300) & 
    (filtered_reviews['title_length_chars'] <= 500)
]

filtered_reviews.head()

                               asins  count
0   B0002LCUZK,B010CEC6MI,B01B25NN64      9
1                         B001NIZB5M      5
2                         B005OOKNP4      6
3                         B005PB2T0S      5
4                         B005PB2T2Q      6
..                               ...    ...
76                        B06XD5YCKX     22
77                        B073SQYXTW      2
78             B074MCBG25,B075357QFB      2
79                        B0751RGYJV      3
80                        B0752151W6      2

[81 rows x 2 columns]


Unnamed: 0,asins,rating,title_text_processed,title_length_chars
0,"B00QWO9P0O,B00LH3DMUO",3.0,... 3 one item bad quality missing backup spri...,137.0
1,"B00QWO9P0O,B00LH3DMUO",4.0,... always less expensive way go product like ...,92.0
2,"B00QWO9P0O,B00LH3DMUO",5.0,... duracell price happy well duracell price h...,50.0
3,"B00QWO9P0O,B00LH3DMUO",5.0,... well name brand battery much better seem w...,91.0
4,"B00QWO9P0O,B00LH3DMUO",5.0,... battery long lasting price great battery l...,69.0


In [108]:
# Categorize review sentiment
def categorize_rating(rating):
    if rating in [1, 2]:
        return "negative"
    elif rating == 3:
        return "neutral"
    return "positive"

# Apply sentiment to both DataFrames
filtered_reviews['sentiment'] = filtered_reviews['rating'].apply(categorize_rating)
reviews_in_range['sentiment'] = reviews_in_range['rating'].apply(categorize_rating)  # Fixed: use own ratings

# Count reviews by sentiment
rating_counts = filtered_reviews.groupby(['asins', 'sentiment']).size().reset_index(name='total_reviews')
range_counts = reviews_in_range.groupby(['asins', 'sentiment']).size().reset_index(name='reviews_title_300_500')

# Merge counts
final_counts = pd.merge(
    rating_counts,
    range_counts,
    on=['asins', 'sentiment'],
    how='left'
).fillna(0).astype({'reviews_title_300_500': int})

print(final_counts)

                                asins sentiment  total_reviews  \
0    B0002LCUZK,B010CEC6MI,B01B25NN64  positive              9   
1                          B001NIZB5M  negative              3   
2                          B001NIZB5M  positive              2   
3                          B005OOKNP4  positive              6   
4                          B005PB2T0S   neutral              1   
..                                ...       ...            ...   
166                        B06XD5YCKX  positive             21   
167                        B073SQYXTW  positive              2   
168             B074MCBG25,B075357QFB  positive              2   
169                        B0751RGYJV  positive              3   
170                        B0752151W6  positive              2   

     reviews_title_300_500  
0                        0  
1                        1  
2                        1  
3                        1  
4                        0  
..                     ...  
166 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_in_range['sentiment'] = reviews_in_range['rating'].apply(categorize_rating)  # Fixed: use own ratings


In [109]:
# Combine top 5 reviews per sentiment as strings
grouped_reviews = (
    reviews_in_range
    .groupby(['asins', 'sentiment'])
    ['title_text_processed']
    .apply(lambda x: ' '.join(x.head(5)))  # Join first 5 reviews
    .unstack()
    .rename(columns={
        'positive': 'positive_reviews',
        'negative': 'negative_reviews'
    })
    .reset_index()
)

# Merge with product data
final_df = pd.merge(
    top3_per_cluster,
    grouped_reviews[['asins', 'positive_reviews', 'negative_reviews']],
    on='asins',
    how='left'
)

In [110]:
final_df.to_csv("../data/top3_products.csv", index=False)