In [1]:
from sklearnex import patch_sklearn
patch_sklearn()
import polars as pl
from polars.lazyframe.group_by import LazyGroupBy
from sklearn.cluster import KMeans
import numpy as np

Intel(R) Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [None]:
lf: LazyGroupBy = pl.scan_parquet("data/processed/amazon-2023.parquet").group_by("parent_asin")
lf = lf.agg([
	pl.len().alias("total_reviews"),
	pl.col("rating").mean().alias("mean_rating"),
	pl.col("brand").first().fill_null("Unknown").str.to_lowercase().alias("brand_name"),
	pl.col("main_category").first().fill_null("Unknown").str.to_lowercase().alias("category_name")
])

lf = lf.with_columns([
	pl.col("brand_name").cast(pl.Categorical).to_physical().alias("brand_id"),
	pl.col("category_name").cast(pl.Categorical).to_physical().alias("category_id")
])

columns: list[str] = ["mean_rating", "total_reviews", "brand_id", 
                      "brand_name","category_id","category_name"]

df: pl.DataFrame = lf.select(columns).collect(engine="streaming")

In [None]:
training_columns: list[str]  = ["mean_rating", "total_reviews", 
                     "brand_id","category_id"]

X: np.ndarray = df.select(training_columns).to_numpy()
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
labels: np.ndarray = kmeans.fit_predict(X)

In [None]:
df: pl.DataFrame = df.with_columns(pl.Series("cluster", labels))

summary: pl.DataFrame = df.group_by("cluster").agg([
	pl.len().alias("cluster_size"),
	pl.col("mean_rating").mean().alias("avg_mean_rating"),
	pl.col("total_reviews").mean().alias("avg_total_reviews"),
	pl.col("brand_id").mean().alias("avg_brand_id"),
	pl.col("category_id").mean().alias("avg_category_id"),
	pl.col("brand_id").mode().alias("top_brand_id"),
	pl.col("brand_name").mode().alias("top_brand_name"),
	pl.col("category_name").mode().alias("top_category_name"),
	pl.col("category_id").mode().alias("top_category_id"),
]).sort("cluster")

In [None]:
additional: pl.DataFrame = summary.drop(summary.columns[1:6])
summary: pl.DataFrame = summary.drop(summary.columns[6:])

In [28]:
summary

cluster,cluster_size,avg_mean_rating,avg_total_reviews,avg_brand_id,avg_category_id
i32,u32,f64,f64,f64,f64
0,4900703,4.098899,13.111658,656981.198234,185631.833606
1,1520795,4.324956,6.132356,4062700.0,195285.389854
2,1820326,4.279179,7.70994,2733000.0,193409.32291
3,2569403,4.201071,10.548661,1565700.0,190014.075359
4,24554101,4.077346,15.8126,81107.899496,182308.524842


In [27]:
additional

cluster,top_brand_id,top_brand_name,top_category_name,top_category_id
i32,list[u32],list[str],list[str],list[u32]
0,[708484],"[""abundant earth works""]","[""books""]",[200611]
1,[3541601],"[""vipmvpup""]","[""books""]",[200611]
2,[2163190],"[""kate hoffmann (author) format: kindle edition""]","[""books""]",[200611]
3,[1128601],"[""pennzoni""]","[""books""]",[200611]
4,[0],"[""unknown""]","[""amazon fashion""]",[200620]
