References:
* https://www.kaggle.com/datasets/kartikeybartwal/ecommerce-product-recommendation-collaborative
* https://en.wikipedia.org/wiki/Collaborative_filtering

In [1]:
import kagglehub

# Kaggle handle of the dataset used throughout this notebook.
dataset_handle = "kartikeybartwal/ecommerce-product-recommendation-collaborative"

# Download the latest version and keep the directory the files were placed in.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /Users/arie/.cache/kagglehub/datasets/kartikeybartwal/ecommerce-product-recommendation-collaborative/versions/1


In [79]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import minmax_scale

In [3]:
# Load the per-user feature table from the downloaded dataset directory.
csv_path = f"{path}/user_personalized_features.csv"
data = pd.read_csv(csv_path)

# Preview the first rows.
data.head()

Unnamed: 0.1,Unnamed: 0,User_ID,Age,Gender,Location,Income,Interests,Last_Login_Days_Ago,Purchase_Frequency,Average_Order_Value,Total_Spending,Product_Category_Preference,Time_Spent_on_Site_Minutes,Pages_Viewed,Newsletter_Subscription
0,0,#1,56,Male,Suburban,38037,Sports,5,7,18,2546,Books,584,38,True
1,1,#2,46,Female,Rural,103986,Technology,15,7,118,320,Electronics,432,40,False
2,2,#3,32,Female,Suburban,101942,Sports,28,1,146,3766,Apparel,306,1,True
3,3,#4,60,Female,Suburban,71612,Fashion,18,3,163,4377,Apparel,527,29,False
4,4,#5,25,Male,Suburban,49725,Travel,2,5,141,4502,Health & Beauty,53,10,True


In [5]:
# sns.pairplot(data[data.columns[2:]])

In [7]:
# Columns carried forward: the user identifier followed by the attributes
# that will describe each user when similarities are computed.
feature_columns = [
    "User_ID",
    "Age",
    "Gender",
    "Location",
    "Income",
    "Interests",
    "Average_Order_Value",
    "Total_Spending",
]
features = data.loc[:, feature_columns]

In [8]:
# Display the selected feature columns for a visual check.
features

Unnamed: 0,User_ID,Age,Gender,Location,Income,Interests,Average_Order_Value,Total_Spending
0,#1,56,Male,Suburban,38037,Sports,18,2546
1,#2,46,Female,Rural,103986,Technology,118,320
2,#3,32,Female,Suburban,101942,Sports,146,3766
3,#4,60,Female,Suburban,71612,Fashion,163,4377
4,#5,25,Male,Suburban,49725,Travel,141,4502
...,...,...,...,...,...,...,...,...
995,#996,22,Male,Urban,104162,Technology,83,607
996,#997,40,Male,Urban,99003,Travel,180,431
997,#998,27,Female,Urban,72395,Technology,130,650
998,#999,61,Male,Rural,59758,Travel,152,1041


In [13]:
# Sanity check: the number of distinct User_ID values must equal the number
# of rows, i.e. each row describes a different user.
assert len(features) == features["User_ID"].nunique()

In [25]:
# String-valued columns that get integer codes from the OrdinalEncoder.
# NOTE(review): these categories have no natural order, so the integer codes
# impose an arbitrary one on any distance/similarity computed from them.
ordinal_features = [
    "Gender",
    "Location",
    "Interests",
]

In [53]:
# sns.displot(features.Average_Order_Value)
# sns.displot(features.Total_Spending)
# sns.displot(features.Income)
# sns.displot(features.Age)

In [54]:
# Learn one integer code per distinct value of each categorical column ...
oe = OrdinalEncoder().fit(features[ordinal_features])

# ... and preview the encoded matrix (nothing is stored yet).
oe.transform(features[ordinal_features])

array([[1., 1., 2.],
       [0., 0., 3.],
       [0., 1., 2.],
       ...,
       [0., 2., 3.],
       [1., 0., 4.],
       [1., 0., 2.]])

In [None]:
# Category values learned for each encoded column; a value's position in its
# array is the code the encoder emits for it (e.g. 'Female' -> 0, 'Male' -> 1).
oe.categories_

[array(['Female', 'Male'], dtype=object),
 array(['Rural', 'Suburban', 'Urban'], dtype=object),
 array(['Fashion', 'Food', 'Sports', 'Technology', 'Travel'], dtype=object)]

In [61]:
# Replace the string categories in `features` with the integer codes learned
# by `oe`.
#
# The raw strings are read from `data` rather than from `features`, so this
# cell is idempotent: re-running it always encodes the original values instead
# of feeding already-encoded numbers back into the encoder (which fails because
# those numbers are not among the fitted string categories).
#
# Assigning with `features[cols] = ...` replaces the columns outright, so they
# take the encoder's float dtype; `features.loc[:, cols] = ...` writes into the
# existing columns and, depending on the pandas version, can leave them as
# `object` dtype.
features[ordinal_features] = oe.transform(data[ordinal_features])

In [82]:
# Every column after the leading User_ID column is treated as a model feature.
num_features_cols = features.columns[1:]
num_features = features[num_features_cols]

# Rescale each column independently (column-wise is minmax_scale's default axis)
# so that large-valued columns such as Income do not dominate the similarity.
norm_features = minmax_scale(num_features)

In [83]:
# Cosine similarity between every pair of rows (users) of the scaled features.
# With a single argument the matrix is compared against itself.
user_similarities = cosine_similarity(norm_features)

In [90]:
# Scratch check: recover the index label of a user from its User_ID
# (using the id stored at position 3 as the probe).
probe_id = features["User_ID"].iloc[3]
features.index[features["User_ID"] == probe_id].values[0]

3

In [121]:
class Sim:
    """Look up the users most similar to a given user.

    Pairs a precomputed user-by-user similarity matrix with the user ids that
    label its rows and columns.

    Args:
        similarities: Indexable square matrix (e.g. a NumPy array or a list of
            rows) where ``similarities[i][j]`` is the similarity between the
            users at positions ``i`` and ``j``.
        user_ids: pandas Series of user ids. The id at *position* ``i`` labels
            row/column ``i`` of ``similarities``; the Series' own index labels
            play no role in the lookup.

    Raises:
        ValueError: If ``similarities`` and ``user_ids`` differ in length.
    """

    def __init__(self, similarities, user_ids):
        if len(similarities) != len(user_ids):
            raise ValueError(
                f"similarities has {len(similarities)} rows but "
                f"{len(user_ids)} user ids were given"
            )
        self.similarities = similarities
        self.user_ids = user_ids

    def get_similar(self, user_id, n=5):
        """Return the ids of the ``n`` users most similar to ``user_id``.

        Args:
            user_id: Id of the user to find neighbours for. If the id occurs
                more than once, its first occurrence is used.
            n: Maximum number of neighbours to return; values below 1 yield
                an empty result.

        Returns:
            A slice of ``user_ids`` (a pandas Series, original index labels
            kept) ordered from most to least similar. The queried user is
            never part of the result. Ties keep their original row order.

        Raises:
            IndexError: If ``user_id`` is not present in ``user_ids``.
        """
        # Work with *positions*, not index labels: the similarity matrix is
        # positional, and the Series index need not be 0..N-1 (e.g. after
        # filtering rows without reset_index).
        matches = (self.user_ids == user_id).to_numpy().nonzero()[0]
        if len(matches) == 0:
            raise IndexError(f"user_id {user_id!r} not found in user_ids")
        user_pos = int(matches[0])

        scores = self.similarities[user_pos]

        # Drop the queried user explicitly instead of assuming it sorts first:
        # another user with identical features ties at the top score and could
        # otherwise push the queried user into the result.
        others = [pos for pos in range(len(scores)) if pos != user_pos]

        # sorted() is stable, also with reverse=True, so ties keep row order.
        ranked = sorted(others, key=lambda pos: scores[pos], reverse=True)

        return self.user_ids.iloc[ranked[:max(n, 0)]]

In [122]:
# Bind the similarity matrix to the user ids that label its rows and columns.
sim = Sim(
    similarities=user_similarities,
    user_ids=features["User_ID"],
)

In [124]:
# Example query: the users most similar to user "#2" (n defaults to 5).
sim.get_similar("#2")

833    #834
130    #131
518    #519
160    #161
311    #312
Name: User_ID, dtype: object