In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.keras.callbacks import ModelCheckpoint
import os
import random
import contextlib
import io

# Read original dataset from csv files

In [2]:
aisles_df = pd.read_csv('capstone-dataset/aisles.csv')
departments_df = pd.read_csv('capstone-dataset/departments.csv')
products_df = pd.read_csv('capstone-dataset/products.csv')
orders_df = pd.read_csv('capstone-dataset/orders.csv')
order_products_prior_df = pd.read_csv('capstone-dataset/order_products__prior.csv')
order_products_train_df = pd.read_csv('capstone-dataset/order_products__train.csv')

In [3]:
products_df = pd.merge(products_df, aisles_df, on="aisle_id")
products_df = pd.merge(products_df, departments_df, on="department_id")
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,0,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,1,All-Seasons Salt,104,13,spices seasonings,pantry
2,2,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,3,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,4,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry
...,...,...,...,...,...,...
49683,49683,"Vodka, Triple Distilled, Twist of Vanilla",124,5,spirits,alcohol
49684,49684,En Croute Roast Hazelnut Cranberry,42,1,frozen vegan vegetarian,frozen
49685,49685,Artisan Baguette,112,3,bread,bakery
49686,49686,Smartblend Healthy Metabolism Dry Cat Food,41,8,cat food care,pets


In [4]:
order_products_prior_df = pd.merge(order_products_prior_df, products_df, on="product_id")
order_products_train_df = pd.merge(order_products_train_df, products_df, on="product_id")
order_products_train_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1,49301,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
1,1,11108,2,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,108,16,other creams cheeses,dairy eggs
2,1,10245,3,0,Organic Celery Hearts,83,4,fresh vegetables,produce
3,1,49682,4,0,Cucumber Kirby,83,4,fresh vegetables,produce
4,1,43632,5,1,Lightly Smoked Sardines in Olive Oil,95,15,canned meat seafood,canned goods
...,...,...,...,...,...,...,...,...,...
1384612,3421063,14232,3,1,Natural Artesian Water,115,7,water seltzer sparkling water,beverages
1384613,3421063,35547,4,1,Twice Baked Potatoes,13,20,prepared meals,deli
1384614,3421070,35950,1,1,Organic Unsweetened Almond Milk,91,16,soy lactosefree,dairy eggs
1384615,3421070,16952,2,1,Creamy Peanut Butter,88,13,spreads,pantry


# prepare training dataset using data when eval_set='prior' and prepare test dataset using data when eval_set='train'

In [5]:
order_user_prior_df = orders_df[orders_df["eval_set"]=='prior']
order_user_train_df = orders_df[orders_df["eval_set"]=='train']
order_user_train_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,0,train,11,4,8,14.0
25,1492625,1,train,15,1,11,30.0
49,2196797,4,train,5,0,11,6.0
74,525192,6,train,21,2,11,6.0
78,880375,7,train,4,1,14,10.0
...,...,...,...,...,...,...,...
3420838,2585586,206198,train,20,2,16,30.0
3420862,943915,206199,train,24,6,19,6.0
3420924,2371631,206202,train,6,4,19,30.0
3420933,1716008,206204,train,4,1,16,10.0


In [6]:
order_user_product_prior_df = pd.merge(order_user_prior_df, order_products_prior_df, on="order_id")
order_user_product_train_df = pd.merge(order_user_train_df, order_products_train_df, on="order_id")
order_user_product_train_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1187899,0,train,11,4,8,14.0,195,1,1,Soda,77,7,soft drinks,beverages
1,1187899,0,train,11,4,8,14.0,25132,2,1,Organic String Cheese,21,16,packaged cheese,dairy eggs
2,1187899,0,train,11,4,8,14.0,38927,3,1,0% Greek Strained Yogurt,120,16,yogurt,dairy eggs
3,1187899,0,train,11,4,8,14.0,26404,4,1,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household
4,1187899,0,train,11,4,8,14.0,39656,5,1,Milk Chocolate Almonds,45,19,candy chocolate,snacks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1384612,272231,206208,train,14,6,14,30.0,40602,4,0,Fabric Softener Sheets,75,17,laundry,household
1384613,272231,206208,train,14,6,14,30.0,15654,5,0,Dark Chocolate Mint Snacking Chocolate,45,19,candy chocolate,snacks
1384614,272231,206208,train,14,6,14,30.0,42605,6,0,Phish Food Frozen Yogurt,37,1,ice cream ice,frozen
1384615,272231,206208,train,14,6,14,30.0,37965,7,0,French Baguette Bread,112,3,bread,bakery


In [7]:
train_df = order_user_product_prior_df.groupby(['user_id', 'product_id']).size().reset_index(name='purchase_count')
test_df = order_user_product_train_df.groupby(['user_id', 'product_id']).size().reset_index(name='purchase_count')
train_df

Unnamed: 0,user_id,product_id,purchase_count
0,0,195,10
1,0,10257,9
2,0,10325,1
3,0,12426,10
4,0,13031,3
...,...,...,...
13307948,206208,43960,3
13307949,206208,44324,1
13307950,206208,48369,1
13307951,206208,48696,1


### feature engineering

In [8]:
merged_df = pd.merge(train_df, products_df, on="product_id")
aisle_counts = merged_df.groupby(['user_id', 'aisle_id'])['purchase_count'].sum().reset_index()
aisle_counts = aisle_counts.merge(aisles_df, on='aisle_id')
pivot_aisle = aisle_counts.pivot(index='user_id', columns='aisle', values='purchase_count').fillna(0)
department_counts = merged_df.groupby(['user_id', 'department_id'])['purchase_count'].sum().reset_index()
department_counts = department_counts.merge(departments_df, on='department_id')
pivot_department = department_counts.pivot(index='user_id', columns='department', values='purchase_count').fillna(0)
user_features_df = pd.concat([pivot_aisle, pivot_department], axis=1).reset_index()
user_features_df

Unnamed: 0,user_id,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,...,household,international,meat seafood,missing,other,pantry,personal care,pets,produce,snacks
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,22.0
1,1,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,3.0,1.0,0.0,0.0,11.0,1.0,0.0,36.0,42.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,38.0,9.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0
4,4,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,19.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206204,206204,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0
206205,206205,0.0,4.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,...,12.0,4.0,2.0,0.0,1.0,14.0,10.0,0.0,34.0,42.0
206206,206206,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,6.0,0.0,0.0,15.0,0.0,0.0,50.0,23.0
206207,206207,0.0,3.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,...,6.0,3.0,22.0,2.0,0.0,38.0,5.0,0.0,197.0,59.0


In [9]:
#calculate purchase count per product
product_purchase_count = train_df.groupby('product_id')['purchase_count'].sum().reset_index()
#calculate the number of unique users who bought each product
unique_user_count = train_df.groupby('product_id')['user_id'].nunique().reset_index(name='unique_user_count')
purchase_data = pd.merge(product_purchase_count, unique_user_count, on='product_id')
#calculate average purchase count per product (total purchase count / number of unique users)
purchase_data['avg_purchase_count'] = purchase_data['purchase_count'] / purchase_data['unique_user_count']
purchase_data = purchase_data.drop(columns=['unique_user_count', 'purchase_count'])
pivot_aisle = products_df.pivot(index='product_id', columns='aisle', values='aisle_id').notnull().astype(int)
pivot_department = products_df.pivot(index='product_id', columns='department', values='department_id').notnull().astype(int)
product_features_df = pd.concat([purchase_data.set_index('product_id'), pivot_aisle, pivot_department], axis=1).reset_index()
product_features_df.fillna(0, inplace=True)
product_features_df.sort_values(by='product_id', inplace=True)
product_features_df

Unnamed: 0,product_id,avg_purchase_count,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,...,household,international,meat seafood,missing,other,pantry,personal care,pets,produce,snacks
0,0,2.586592,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,1.153846,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2,3.743243,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1.807692,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,2.500000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49672,49683,1.125000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49673,49684,1.139535,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49674,49685,3.333333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49675,49686,1.857143,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### scale user features and product features

In [10]:
#separate user_id
user_ids = user_features_df[['user_id']]
features_to_scale = user_features_df.drop(columns=['user_id'])

#scale user features without user_id
scalerUser = StandardScaler()
scalerUser.fit(features_to_scale)
scaled_features = scalerUser.transform(features_to_scale)

#convert scaled features back to DataFrame
scaled_features_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)

#concatenate user_id back
scaled_user_features_df = pd.concat([user_ids, scaled_features_df], axis=1)

print("Scaled User Features DataFrame:")
print(scaled_user_features_df)

Scaled User Features DataFrame:
        user_id  air fresheners candles  asian foods  baby accessories  \
0             0               -0.132137    -0.331369         -0.074269   
1             1               -0.132137     0.899033         -0.074269   
2             2               -0.132137    -0.331369         -0.074269   
3             3               -0.132137    -0.331369         -0.074269   
4             4               -0.132137     0.488899         -0.074269   
...         ...                     ...          ...               ...   
206204   206204               -0.132137    -0.331369          1.802555   
206205   206205               -0.132137     1.309168         -0.074269   
206206   206206               -0.132137    -0.331369         -0.074269   
206207   206207               -0.132137     0.899033         -0.074269   
206208   206208               -0.132137     0.078765         -0.074269   

        baby bath body care  baby food formula  bakery desserts  \
0           

In [11]:
#separate product_id
product_ids = product_features_df[['product_id']]
features_to_scale = product_features_df.drop(columns=['product_id'])

#scale product features without product_id
scalerProduct = StandardScaler()
scalerProduct.fit(features_to_scale)
scaled_features = scalerProduct.transform(features_to_scale)

#convert scaled features back to DataFrame
scaled_features_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)

# Concatenate product_id back
scaled_product_features_df = pd.concat([product_ids, scaled_features_df], axis=1)

print("Scaled Product Features DataFrame:")
print(scaled_product_features_df)

Scaled Product Features DataFrame:
       product_id  avg_purchase_count  air fresheners candles  asian foods  \
0               0            1.032001               -0.084829    -0.111023   
1               1           -0.845356               -0.084829    -0.111023   
2               2            2.547585               -0.084829    -0.111023   
3               3            0.011392               -0.084829    -0.111023   
4               4            0.918537               -0.084829    -0.111023   
...           ...                 ...                     ...          ...   
49672       49683           -0.086042               -0.084829    -0.111023   
49673       49684            1.760886               -0.084829    -0.111023   
49674       49685            0.298791               -0.084829    -0.111023   
49675       49686           -0.017406               -0.084829    -0.111023   
49676       49687           -0.413768               -0.084829    -0.111023   

       baby accessories  bab

In [12]:
y = train_df["purchase_count"].values
scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y.reshape(-1, 1))
train_df["scaled_y"] = scalerTarget.transform(y.reshape(-1, 1))
train_df
# y = scalerTarget.transform(y.reshape(-1, 1))

Unnamed: 0,user_id,product_id,purchase_count,scaled_y
0,0,195,10,-0.816327
1,0,10257,9,-0.836735
2,0,10325,1,-1.000000
3,0,12426,10,-0.816327
4,0,13031,3,-0.959184
...,...,...,...,...
13307948,206208,43960,3,-0.959184
13307949,206208,44324,1,-1.000000
13307950,206208,48369,1,-1.000000
13307951,206208,48696,1,-1.000000


### create data generator

In [13]:
def prepare_data(batch_df, user_features, product_features, is_prediction=False):
   #merge batch_df with user_features
    merged_user = pd.merge(batch_df, user_features, on='user_id', how='left')    
    #merge the result with product_features
    merged_product = pd.merge(batch_df, product_features, on='product_id', how='left')
    #define feature columns
    user_feature_cols = user_features.columns.difference(['user_id'])
    product_feature_cols = product_features.columns.difference(['product_id'])
    
    #extract feature chunks
    user_features_chunk = merged_user[user_feature_cols].values
    product_features_chunk = merged_product[product_feature_cols].values
    if is_prediction:
        return user_features_chunk, product_features_chunk
    else:
        labels = batch_df['scaled_y'].values
        return user_features_chunk, product_features_chunk, labels

def data_generator(order_df, user_features, product_features, batch_size):
    num_samples = len(order_df)
    for offset in range(0, num_samples, batch_size):
        batch_df = order_df.iloc[offset:offset + batch_size]
        user_features_chunk, product_features_chunk, labels = prepare_data(batch_df, user_features, product_features)
        yield (user_features_chunk, product_features_chunk), labels
        
#create the TensorFlow datasets
batch_size = 300000
train_data_gen = tf.data.Dataset.from_generator(
    lambda: data_generator(train_df, scaled_user_features_df, scaled_product_features_df, batch_size),
    output_signature=(
        (
            tf.TensorSpec(shape=(None, user_features_df.shape[1] - 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, product_features_df.shape[1] - 1), dtype=tf.float32)
        ),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
).repeat().prefetch(tf.data.AUTOTUNE)

### create a directory to save the model checkpoints

In [14]:
checkpoint_dir = './checkpoints/content_based_filterting'
os.makedirs(checkpoint_dir, exist_ok=True)

#define the ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'model_epoch_{epoch:02d}.weights.h5'),
    save_weights_only=True,
    save_freq='epoch',  # Save the model at the end of each epoch
    verbose=1
)

### build model and train model

In [15]:
#define the neural networks for users and items
num_outputs = 32
tf.random.set_seed(1)

#user neural network
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

#item neural network
item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs)
])

#define the custom L2 normalization layer
class L2Normalization(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.linalg.l2_normalize(inputs, axis=1)

#create the user input and point to the base network
num_user_features = (user_features_df.shape[1] - 1,)  # Exclude user_id
input_user = tf.keras.layers.Input(shape=num_user_features)
vu = user_NN(input_user)
vu = L2Normalization()(vu)
user_model = tf.keras.Model(inputs=input_user, outputs=vu)


#create the item input and point to the base network
num_item_features = (product_features_df.shape[1] - 1,)  # Exclude product_id
input_item = tf.keras.layers.Input(shape=num_item_features)
vm = item_NN(input_item)
vm = L2Normalization()(vm)
item_model = tf.keras.Model(inputs=input_item, outputs=vm)



#compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

#specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae','mean_absolute_percentage_error',
        tf.keras.metrics.CosineSimilarity()])

# model.summary()


#check for existing checkpoints and load the latest one if available
checkpoint_files = sorted([f for f in os.listdir(checkpoint_dir) if f.endswith('.weights.h5')])

initial_epoch = 0
if checkpoint_files:
    latest_checkpoint = os.path.join(checkpoint_dir, checkpoint_files[-1])
    print(f"Loading weights from {latest_checkpoint}")
    model.load_weights(latest_checkpoint)
    initial_epoch = int(latest_checkpoint.split('_')[-1].split('.')[0])  # Continue from the last saved epoch

#train the model with the ModelCheckpoint callback
model.fit(
    train_data_gen,
    epochs=15,
    steps_per_epoch=len(train_df) // batch_size,
    initial_epoch=initial_epoch,
    callbacks=[checkpoint_callback]
)

Loading weights from ./checkpoints/content_based_filterting/model_epoch_15.weights.h5


  saveable.load_own_variables(weights_store.get(inner_path))


<keras.src.callbacks.history.History at 0x29baa13d0>

In [16]:
def test_data_generator(df, user_features, product_features, batch_size):
    num_samples = len(df)
    for offset in range(0, num_samples, batch_size):
        batch_df = df.iloc[offset:offset + batch_size]
        user_features_chunk,product_features_chunk = prepare_data(batch_df, user_features, product_features, is_prediction=True)
        yield user_features_chunk, product_features_chunk

In [17]:
batch_size = 300000  
predictions = []

num_samples = test_df.shape[0]
num_batches = (num_samples + batch_size - 1) // batch_size  

# Use tqdm to track progress
for user_features_chunk, product_features_chunk in tqdm(test_data_generator(test_df, scaled_user_features_df, scaled_product_features_df, batch_size), total=num_batches):
    prediction = model.predict([user_features_chunk, product_features_chunk])
    predictions.extend(prediction)

  0%|                                                                  | 0/5 [00:00<?, ?it/s]

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 503us/step


 20%|███████████▌                                              | 1/5 [00:07<00:29,  7.27s/it]

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 486us/step


 40%|███████████████████████▏                                  | 2/5 [00:13<00:20,  6.81s/it]

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 480us/step


 60%|██████████████████████████████████▊                       | 3/5 [00:20<00:13,  6.62s/it]

[1m9375/9375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 489us/step


 80%|██████████████████████████████████████████████▍           | 4/5 [00:26<00:06,  6.58s/it]

[1m5770/5770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 487us/step


100%|██████████████████████████████████████████████████████████| 5/5 [00:30<00:00,  6.18s/it]


In [18]:
# unscale prediction 
unscaled_prediction = scalerTarget.inverse_transform(predictions)

In [19]:
test_df["prediction"] = unscaled_prediction

In [20]:
test_df

Unnamed: 0,user_id,product_id,purchase_count,prediction
0,0,195,1,2.291361
1,0,10257,1,1.751238
2,0,13031,1,1.919101
3,0,25132,1,1.868872
4,0,26087,1,1.794948
...,...,...,...,...
1384612,206208,24851,1,4.914202
1384613,206208,37965,1,2.808537
1384614,206208,39215,1,2.813277
1384615,206208,40602,1,3.224677


In [21]:
user_embeddings = user_model.predict(user_features_df.drop(columns=['user_id']))

[1m6445/6445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 336us/step


In [22]:
user_embeddings

array([[ 0.07107835,  0.16483606, -0.05138971, ...,  0.1546801 ,
        -0.03296753,  0.18778399],
       [ 0.0978613 ,  0.19845726, -0.04803154, ...,  0.13652202,
         0.02243653,  0.17937547],
       [ 0.12264743,  0.18698661, -0.03076056, ...,  0.06401519,
         0.04610597,  0.22387663],
       ...,
       [ 0.08872855,  0.19973665, -0.03359961, ...,  0.1290415 ,
         0.02689221,  0.16769819],
       [ 0.10893   ,  0.18565696, -0.01785273, ...,  0.11577209,
         0.04153543,  0.19096045],
       [ 0.10731832,  0.20065348, -0.02024438, ...,  0.14108798,
         0.02037832,  0.15856023]], dtype=float32)

In [23]:
product_embeddings = item_model.predict(product_features_df.drop(columns=['product_id']))

[1m1553/1553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 338us/step


In [24]:
product_embeddings

array([[ 0.01871588, -0.22787997,  0.04243439, ..., -0.05351888,
         0.00660903, -0.22197123],
       [ 0.02513898, -0.19640447,  0.07497624, ..., -0.03328418,
         0.01145176, -0.2599202 ],
       [ 0.03919547, -0.2541606 ,  0.02730933, ..., -0.02949511,
         0.00149356, -0.24834487],
       ...,
       [ 0.02932753, -0.25106046,  0.04229419, ..., -0.04444833,
         0.01488037, -0.22253811],
       [-0.0117795 , -0.2529584 ,  0.03855423, ..., -0.04509268,
         0.01793488, -0.2409177 ],
       [ 0.01356701, -0.24631916,  0.08805052, ...,  0.00233436,
        -0.02489923, -0.24609122]], dtype=float32)

# find similar items 

In [25]:
 def product_neighbors(product_id, k=100):
    input_embedding = product_embeddings[product_id]
    squared_distances = np.sum((product_embeddings - input_embedding)**2, axis=1)
    sorted_indices = np.argsort(squared_distances)
    top_K_indices = sorted_indices[:k]
    return top_K_indices

In [26]:
top_K_indices = product_neighbors(1)
top_products = products_df[products_df["product_id"].isin(top_K_indices)]
# reorder top_products to match the order of top_K_indices
top_products.set_index('product_id').loc[top_K_indices].reset_index()

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,All-Seasons Salt,104,13,spices seasonings,pantry
1,15559,Get Crackin,104,13,spices seasonings,pantry
2,15690,Organic Orange Peel,104,13,spices seasonings,pantry
3,45400,Organic Vietnamese 5% Oil Ground Cinnamon,104,13,spices seasonings,pantry
4,45141,Organic Pork Chop Seasoning,104,13,spices seasonings,pantry
...,...,...,...,...,...,...
95,29225,Organic Whole Cumin Seed,104,13,spices seasonings,pantry
96,11754,Chili Powder,104,13,spices seasonings,pantry
97,5620,Sea Salt Fine Crystals,104,13,spices seasonings,pantry
98,22840,"Seasoning, Vegit Magic!",104,13,spices seasonings,pantry


In [54]:
def prediction(user_id, is_all=False):
    predictions = []
    df = None
    if is_all:
        data = {
            'user_id': [user_id] * products_df.shape[0],
            'product_id': products_df["product_id"].values
        }
        df = pd.DataFrame(data)
    else:
        df = test_df[test_df["user_id"]==user_id].copy() 
    user_features, product_features = prepare_data(df, scaled_user_features_df, scaled_product_features_df, is_prediction=True)
    with contextlib.redirect_stdout(io.StringIO()):
        prediction = model.predict([user_features, product_features])
    predictions.extend(prediction)
    unscaled_prediction = scalerTarget.inverse_transform(predictions)
    df["predicted_count"] = unscaled_prediction
    return df
    
def recommend(user_id, k=20):
    pred_df = prediction(user_id, is_all=True)
    top_k_df = pred_df.sort_values(["predicted_count"], ascending=False).head(k)
    top_k_df = pd.merge(top_k_df, products_df, on="product_id")
    return top_k_df    

# investigate user_id=1

In [55]:
# the recommendations for user 1
recommend_df = recommend(1)

In [56]:
recommend_df.sort_values(by="aisle_id")

Unnamed: 0,user_id,product_id,predicted_count,product_name,aisle_id,department_id,aisle,department
8,1,46110,4.588082,Pro V Shampoo & Conditioner 2 In 1 Classic Clean,22,11,hair care,personal care
0,1,24851,4.588082,Banana,24,4,fresh fruits,produce
12,1,19319,4.566869,Ginger Lemongrass,31,7,refrigerated,beverages
1,1,16483,4.588082,Alfredo Chicken,38,1,frozen meals,frozen
4,1,26711,4.588082,Ocean Whitefish & Tuna in Sauce Prime Filets W...,41,8,cat food care,pets
16,1,16626,4.552812,Alive! Whole Food Energizer Liquid Multi Vitam...,47,11,vitamins supplements,personal care
7,1,23626,4.588082,Isaac Mizrahi 2-Ply White Facial Tissues,54,17,paper goods,household
19,1,4329,4.547426,RAW Meal Beyond Organic Meal Replacement Formula,65,11,protein meal replacements,personal care
6,1,7975,4.588082,One Plant-Based Chocolate Flavor Nutritional S...,65,11,protein meal replacements,personal care
2,1,25284,4.588082,Chunky Tomato Bisque Soups,69,15,soup broth bouillon,canned goods


In [30]:
def actual_product_df(user_id):
    df = test_df[test_df["user_id"]==user_id]
    pids = df["product_id"]
    return products_df[products_df["product_id"].isin(pids)]

# actual purchased items of user 1

In [31]:
actual_product_df(1).sort_values(by="aisle_id")

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
26351,26351,Chicken Caesar Salad,1,20,prepared soups salads,deli
45644,45644,Dark Chocolate Nuts & Sea Salt Bars,3,19,energy granola bars,snacks
32791,32791,Chipotle Beef & Pork Realstick,23,19,popcorn jerky,snacks
41786,41786,Bartlett Pears,24,4,fresh fruits,produce
5449,5449,Small Hass Avocado,24,4,fresh fruits,produce
24851,24851,Banana,24,4,fresh fruits,produce
9386,9386,Granny Smith Apples,24,4,fresh fruits,produce
22824,22824,Organic D'Anjou Pears,24,4,fresh fruits,produce
13639,13639,Asian Pears,24,4,fresh fruits,produce
45065,45065,Honeycrisp Apple,24,4,fresh fruits,produce


In [44]:
def calculate_baseline_evaluation(train_df, test_df, N=10, percentage=0.2):
    user_precision = []
    user_recall = []
    user_f1_score = []
    #generate recommendations for each user in the test set
    popular_items = train_df.groupby('product_id')['purchase_count'].sum().sort_values(ascending=False).index.tolist()
    all_user_ids = list(test_df['user_id'].unique())
    random_user_ids = random.sample(all_user_ids, int(len(user_ids) * percentage))
    for user_id in random_user_ids:
        #get the actual items for the user in the test set
        test_items = test_df[test_df['user_id'] == user_id]['product_id'].tolist()
        #the actual products the customer purchased in training dataset
        train_items = train_df[train_df['user_id'] == user_id]['product_id'].tolist()
        #find products in the test set that the user never bought in the training set
        actual_items = set(test_items) - set(train_items)
        
        #recommend the top N popular items
        recommended_items = popular_items[:N]
        
        #calculate the number of relevant items recommended
        relevant_items_recommended = len(set(recommended_items) & set(actual_items))

        #calculate precision for the user
        precision = relevant_items_recommended / len(recommended_items) if recommended_items else 0
        user_precision.append(precision)
        
        #calculate recall for the user
        recall = relevant_items_recommended / len(actual_items) if actual_items else 0
        user_recall.append(recall)

        #calculate f1 score for the user
        f1_score = 0
        if precision + recall > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        user_f1_score.append(f1_score)

    #calculate the average precision across all users
    average_precision = np.mean(user_precision)
    
    #calculate the average recall across all users
    average_recall = np.mean(user_recall)

    #calculate the average f1 score across all users
    average_f1_score = np.mean(user_f1_score)

    return average_precision, average_recall, average_f1_score

In [45]:
baseline_precision, baseline_recall, baseline_f1_score = calculate_baseline_evaluation(train_df, test_df)

In [46]:
print(baseline_precision, baseline_recall, baseline_f1_score)

0.0133313935161611 0.026436087848906054 0.015448232495096545


In [63]:
def calculate_model_evaluation(train_df, test_df, N=10, percentage=0.2):
    user_precision = []
    user_recall = []
    user_f1_score = []
    all_user_ids = list(test_df['user_id'].unique())
    random_user_ids = random.sample(all_user_ids, int(len(user_ids) * percentage))
    for user_id in random_user_ids:
        #get the actual items for the user in the test set
        test_items = test_df[test_df['user_id'] == user_id]['product_id'].tolist()
        #the actual products the customer purchased in training dataset
        train_items = train_df[train_df['user_id'] == user_id]['product_id'].tolist()
        #find products in the test set that the user never bought in the training set
        actual_items = set(test_items) - set(train_items)
        
        #get N recommended items from the model
        recommended_items = recommend(user_id, k=N)["product_id"].tolist()
        
        #calculate the number of relevant items recommended
        relevant_items_recommended = len(set(recommended_items) & set(actual_items))

        #calculate precision for the user
        precision = relevant_items_recommended / len(recommended_items) if len(recommended_items) > 0 else 0
        user_precision.append(precision)
        
        #calculate recall for the user
        recall = relevant_items_recommended / len(actual_items) if actual_items else 0
        user_recall.append(recall)

        #calculate f1 score for the user
        f1_score = 0
        if precision + recall > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        user_f1_score.append(f1_score)

    #calculate the average precision across all users
    average_precision = np.mean(user_precision)
    
    #calculate the average recall across all users
    average_recall = np.mean(user_recall)

    #calculate the average f1 score across all users
    average_f1_score = np.mean(user_f1_score)

    return average_precision, average_recall, average_f1_score

In [None]:
model_precision, model_recall, model_f1_score = calculate_model_evaluation(train_df, test_df, percentage=0.02)

In [None]:
print("baseline precision: {:.2f}%, model precision: {:.2f}% ".format(baseline_precision * 100, model_precision * 100))
print("baseline recall: {:.2f}%, model recall: {:.2f}% ".format(baseline_recall * 100, model_recall * 100))
print("baseline f1 score: {:.2f}, model f1 score: {:.2f}% %".format(baseline_f1_score * 100, model_f1_score * 100))

In [66]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def evaluate_user(user_id, train_df, test_df, N):
    test_items = test_df[test_df['user_id'] == user_id]['product_id'].tolist()
    train_items = train_df[train_df['user_id'] == user_id]['product_id'].tolist()
    actual_items = set(test_items) - set(train_items)
    
    recommended_items = recommend(user_id, k=N)["product_id"].tolist()
    relevant_items_recommended = len(set(recommended_items) & set(actual_items))

    precision = relevant_items_recommended / len(recommended_items) if len(recommended_items) > 0 else 0
    recall = relevant_items_recommended / len(actual_items) if len(actual_items) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

def calculate_model_evaluation(train_df, test_df, N=10, percentage=0.01, max_sample_size=1000):
    all_user_ids = list(test_df['user_id'].unique())
    num_users_to_sample = min(int(len(all_user_ids) * percentage), max_sample_size)
    random_user_ids = random.sample(all_user_ids, num_users_to_sample)
    
    user_precision = []
    user_recall = []
    user_f1_score = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        future_to_user = {executor.submit(evaluate_user, user_id, train_df, test_df, N): user_id for user_id in random_user_ids}
        
        for future in as_completed(future_to_user):
            try:
                precision, recall, f1_score = future.result()
                user_precision.append(precision)
                user_recall.append(recall)
                user_f1_score.append(f1_score)
            except Exception as e:
                print(f"Error processing user {future_to_user[future]}: {e}")

    average_precision = np.mean(user_precision)
    average_recall = np.mean(user_recall)
    average_f1_score = np.mean(user_f1_score)

    return average_precision, average_recall, average_f1_score


model_precision, model_recall, model_f1_score = calculate_model_evaluation(train_df, test_df, percentage=0.002, max_sample_size=1000)

print("Model Precision:", model_precision)
print("Model Recall:", model_recall)
print("Model F1 Score:", model_f1_score)