In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [3]:
# Load the pre-processed review data used for the TF-IDF analysis
df = pd.read_parquet('amazon_reviews_for_tfidf.parquet')

# Summarise the size of the dataset and how many distinct users/products it covers
print(
    f"Dataset shape: {df.shape}",
    f"Number of unique users: {df['user_id'].nunique()}",
    f"Number of unique products: {df['product_id'].nunique()}",
    sep="\n",
)

Dataset shape: (893040, 13)
Number of unique users: 113741
Number of unique products: 28113


Aggregate reviews by product_id

In [5]:
# Build one document per product by joining all of its reviews with spaces
product_df = (
    df.groupby('product_id')['processed_text']
    .agg(' '.join)
    .reset_index()
)

print(f"Aggregated dataset shape: {product_df.shape}")

Aggregated dataset shape: (28113, 2)


TF-IDF vectorizer

In [6]:
# Unigram TF-IDF over the per-product documents
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=5,           # drop terms that occur in fewer than 5 documents
    max_df=0.5,         # drop terms that occur in more than half of the documents
    max_features=5000,  # cap the vocabulary size
)

# Learn the vocabulary and build the document-term matrix in one pass
X_tfidf = vectorizer.fit_transform(product_df['processed_text'])

In [7]:
# Sanity check: report the dimensions of the fitted TF-IDF matrix
print(f"TF-IDF matrix shape: {X_tfidf.shape}")

TF-IDF matrix shape: (28113, 5000)


In [8]:
# Number of clusters (found using elbow method with inertia)
k = 10

# Build the K-means model with a fixed seed, then fit it on the TF-IDF matrix
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(X_tfidf)

# Per-document cluster assignments from the fitted model
clusters = kmeans.labels_

# Store the cluster label of every product alongside its text
product_df['cluster'] = clusters

In [9]:
# Number of products per cluster, ordered by cluster label
cluster_counts = (
    product_df['cluster']
    .value_counts()
    .sort_index()
)
print("\nCluster distribution:", cluster_counts, sep="\n")


Cluster distribution:
cluster
0      751
1     1359
2      673
3     1384
4     6861
5    11094
6     1351
7      931
8      824
9     2885
Name: count, dtype: int64


In [12]:
def get_top_terms_per_cluster(kmeans_model, feature_names, n_terms=10):
    """Return the highest-weighted terms of every cluster centre.

    Args:
        kmeans_model: Fitted model exposing ``cluster_centers_``; the centres
            are iterated one at a time and each must support ``argsort()``.
        feature_names: Sequence indexed by feature position, giving the term
            that corresponds to each entry of a cluster centre.
        n_terms: Maximum number of terms to return per cluster. ``0`` yields
            empty lists; a value larger than the number of features returns
            every feature.

    Returns:
        dict mapping each centre's position in ``cluster_centers_`` to a list
        of terms ordered from the largest centre weight to the smallest.

    Raises:
        ValueError: If ``n_terms`` is negative.
    """
    if n_terms < 0:
        raise ValueError(f"n_terms must be non-negative, got {n_terms}")

    top_terms = {}
    for i, center in enumerate(kmeans_model.cluster_centers_):
        # Reverse the ascending argsort first, then truncate. Slicing with
        # [-n_terms:] would return *every* feature when n_terms == 0,
        # because -0 == 0 makes the slice start at the beginning.
        top_indices = center.argsort()[::-1][:n_terms]
        # Translate feature positions into their term names
        top_terms[i] = [feature_names[idx] for idx in top_indices]
    return top_terms

In [14]:
# Feature (term) names from the fitted vectorizer, used to label cluster centres
feature_names = vectorizer.get_feature_names_out()

In [15]:
# Describe every cluster by its size and its most characteristic terms
print("\nTop 10 terms for each cluster:")
top_terms = get_top_terms_per_cluster(kmeans, feature_names, n_terms=10)

# One report entry per cluster, separated by a horizontal rule
for cluster_id in top_terms:
    terms = top_terms[cluster_id]
    cluster_size = cluster_counts.loc[cluster_id]
    print(f"\nCluster {cluster_id}: {cluster_size} products")
    print(f"Top terms: {', '.join(terms)}")
    print("-" * 80)


Top 10 terms for each cluster:

Cluster 0: 751 products
Top terms: ds, xl, case, stylu, fit, charger, nintendo, screen, protect, protector
--------------------------------------------------------------------------------

Cluster 1: 1359 products
Top terms: headset, sound, headphon, mic, ear, comfort, microphon, volum, audio, hear
--------------------------------------------------------------------------------

Cluster 2: 673 products
Top terms: race, car, wheel, track, drive, racer, graphic, kart, speed, mode
--------------------------------------------------------------------------------

Cluster 3: 1384 products
Top terms: case, switch, fit, protect, dock, protector, hold, carri, grip, lite
--------------------------------------------------------------------------------

Cluster 4: 6861 products
Top terms: stori, charact, graphic, level, enemi, puzzl, fight, enjoy, gameplay, weapon
--------------------------------------------------------------------------------

Cluster 5: 11094 pro