In [1]:
import polars as pl
from sklearn.cluster import KMeans

In [2]:
lf = pl.scan_parquet("data/processed/amazon-2023.parquet")

In [3]:
lf: pl.DataFrame = lf.group_by("parent_asin").agg([
    pl.len().alias("total_reviews"),
    pl.col("rating").mean().alias("mean_rating"),
    pl.col("brand").first().fill_null("Unknown").str.to_lowercase().alias("brand_id"),
    pl.col("main_category").first().fill_null("Unknown").str.to_lowercase().alias("category_id"),
])

In [4]:
lf: pl.LazyFrame = lf.with_columns([
    pl.col("brand_id").cast(pl.Categorical).to_physical().alias("brand_id"),
    pl.col("category_id").cast(pl.Categorical).to_physical().alias("category_id")
])

columns: list[str] = ["mean_rating", "total_reviews", "brand_id", "category_id", "parent_asin"]
lf = lf.select(columns)
lf=lf.drop("parent_asin")

In [5]:
df: pl.DataFrame = lf.collect(engine="streaming")
X: pl.DataFrame = df.to_numpy()

In [6]:
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

df: pl.DataFrame = df.with_columns(pl.Series(name="cluster", values=labels))

In [7]:
summary: pl.DataFrame = (
df.group_by("cluster")
    .agg([
        pl.len().alias("cluster_size"),
        pl.col("mean_rating").mean().alias("avg_mean_rating"),
        pl.col("total_reviews").mean().alias("avg_total_reviews"),
        pl.col("brand_id").mean().alias("avg_brand_id"),
        pl.col("category_id").mean().alias("avg_category_id"),
        pl.col("brand_id").mode().alias("top_brand_id"),
        pl.col("category_id").mode().alias("top_category_id"),
    ])
    .sort("cluster")
)

summary

cluster,cluster_size,avg_mean_rating,avg_total_reviews,avg_brand_id,avg_category_id,top_brand_id,top_category_id
i32,u32,f64,f64,f64,f64,list[u32],list[u32]
0,5427048,4.104922,16.579857,610507.42722,451436.366671,[366564],[487909]
1,1535537,4.323468,6.096428,4056100.0,473927.347637,"[3532404, 3513067]",[487906]
2,23914425,4.074856,15.136763,71162.548929,441133.504817,[86],[487901]
3,1848924,4.280674,7.646283,2712900.0,469303.456396,[2126527],[487906]
4,2639394,4.199917,10.426479,1534700.0,460076.901021,[1091724],[487906]
