In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import ast

In [2]:
# Remote CSV sources for the raw tables used throughout the notebook
USERS_URL = 'https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/data/users_final.csv'
NAME_EMBEDDINGS_URL = 'https://github.com/ardahk/amex/raw/refs/heads/main/data/name_embeddings.csv'
PRODUCTS_URL = 'https://github.com/ardahk/amex/raw/refs/heads/main/data/products.csv'
ORDER_ITEMS_URL = 'https://github.com/ardahk/amex/raw/refs/heads/main/data/order_items.csv'

# Read each table into its own DataFrame
users = pd.read_csv(USERS_URL)
product_embeddings = pd.read_csv(NAME_EMBEDDINGS_URL)
products = pd.read_csv(PRODUCTS_URL)
order_items = pd.read_csv(ORDER_ITEMS_URL)

In [3]:
product_embeddings.head()

Unnamed: 0,product_id,name_embedding
0,9588,"[-0.4065183401107788, 0.31074419617652893, -0...."
1,1238,"[-0.41240498423576355, 0.5829450488090515, -0...."
2,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."
3,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."
4,5485,"[-0.23880276083946228, 0.3456690013408661, -0...."


### Merge name embeddings and products dataframe

In [4]:
# Inner-join product attributes with their name embeddings (products.id <-> product_embeddings.product_id)
products_final = products.merge(product_embeddings, left_on='id', right_on='product_id')

In [5]:
# Columns that are not used as item features
unused_product_columns = ['id', 'name', 'distribution_center_id', 'sku']
products_final = products_final.drop(columns=unused_product_columns)

In [6]:
products['brand'].nunique()

2756

In [7]:
products['category'].nunique()

26

### There are too many unique brands to one-hot encode, so label encoding will be used with a reference table

In [8]:
# Learn the brand -> integer mapping, then store the integer code for every row
label_encoder = LabelEncoder()
label_encoder.fit(products_final['brand'])

products_final['brand_encoded'] = label_encoder.transform(products_final['brand'])

In [9]:
# Reference table pairing each brand name with the integer code assigned by the encoder
brand_names = label_encoder.classes_
brand_codes = range(len(brand_names))
brands = pd.DataFrame({'brand': brand_names, 'brand_encoded': brand_codes})
print(brands.head())


           brand  brand_encoded
0      !it Jeans              0
1      '47 Brand              1
2    007Lingerie              2
3      106Shades              3
4  12XLShop Inc.              4


In [10]:
# The raw brand string is no longer needed once brand_encoded exists
products_final = products_final.drop(columns=['brand'])

In [11]:
# Persist the brand <-> code lookup table (without the row index) so codes can be mapped back to names
brands.to_csv('brand_reference.csv', index=False)

### Convert department and category to dummy variables

In [12]:
# One-hot encode the low-cardinality categorical product columns as integer indicators
product_categorical_columns = ['department', 'category']
products_final = pd.get_dummies(
    products_final,
    columns=product_categorical_columns,
    dtype=int,
)

In [13]:
products_final.dtypes

cost                                      float64
retail_price                              float64
product_id                                  int64
name_embedding                             object
brand_encoded                               int32
department_Men                              int32
department_Women                            int32
category_Accessories                        int32
category_Active                             int32
category_Blazers & Jackets                  int32
category_Clothing Sets                      int32
category_Dresses                            int32
category_Fashion Hoodies & Sweatshirts      int32
category_Intimates                          int32
category_Jeans                              int32
category_Jumpsuits & Rompers                int32
category_Leggings                           int32
category_Maternity                          int32
category_Outerwear & Coats                  int32
category_Pants                              int32


### Convert name_embedding strings to NumPy arrays of floats

In [14]:
def _to_float32_vector(raw):
    """Parse a stringified list literal and return it as a float32 NumPy array."""
    return np.array(ast.literal_eval(raw), dtype=np.float32)


# Replace each embedding string with its parsed float32 vector in a single pass
products_final['name_embedding'] = products_final['name_embedding'].apply(_to_float32_vector)

In [15]:
products_final.dtypes

cost                                      float64
retail_price                              float64
product_id                                  int64
name_embedding                             object
brand_encoded                               int32
department_Men                              int32
department_Women                            int32
category_Accessories                        int32
category_Active                             int32
category_Blazers & Jackets                  int32
category_Clothing Sets                      int32
category_Dresses                            int32
category_Fashion Hoodies & Sweatshirts      int32
category_Intimates                          int32
category_Jeans                              int32
category_Jumpsuits & Rompers                int32
category_Leggings                           int32
category_Maternity                          int32
category_Outerwear & Coats                  int32
category_Pants                              int32


In [16]:
# Serialise each embedding as a plain Python list before writing. Writing the
# NumPy arrays directly stores their whitespace-separated str() form, which
# ast.literal_eval cannot parse when the file is loaded again; a list is written
# as a comma-separated literal that round-trips. The in-memory frame is unchanged.
products_final.assign(
    name_embedding=products_final['name_embedding'].map(lambda vec: np.asarray(vec).tolist())
).to_csv('products_final_numeric.csv', index=False)

### Convert all users data to numeric

In [17]:
# One-hot encode the categorical user columns as integer indicators
user_categorical_columns = ['gender', 'status', 'traffic_source']
users_final = pd.get_dummies(
    users,
    columns=user_categorical_columns,
    dtype=int,
)

### We need to use 'order_items' in order to extract product id. This is used to make the label for the user and items dataset (engineer a feature that indicates whether the user has interacted with the item or not)

In [18]:
# Inner-join users with their order items (users.id <-> order_items.user_id) to attach product_id
users_final = users_final.merge(order_items, left_on='id', right_on='user_id')

In [19]:
users_final.columns

Index(['id_x', 'age', 'postal_code', 'avg_sequence_number', 'order_id_x',
       'num_of_item', 'month_day', 'gender_F', 'gender_M', 'status_Cancelled',
       'status_Complete', 'status_Processing', 'status_Returned',
       'status_Shipped', 'traffic_source_Display', 'traffic_source_Email',
       'traffic_source_Facebook', 'traffic_source_Organic',
       'traffic_source_Search', 'id_y', 'order_id_y', 'user_id', 'product_id',
       'inventory_item_id', 'status', 'created_at', 'shipped_at',
       'delivered_at', 'returned_at', 'sale_price'],
      dtype='object')

In [20]:
# Key, order-level and timestamp columns that are not kept as user features
merge_columns_to_discard = [
    'id_x', 'order_id_x', 'id_y', 'order_id_y', 'user_id',
    'inventory_item_id', 'status', 'created_at', 'shipped_at',
    'delivered_at', 'returned_at', 'sale_price',
]
users_final = users_final.drop(columns=merge_columns_to_discard)

Dropping `month_day` for now for formatting purposes; we can add it back later if we think it has value.

In [21]:
# Remove the month_day column from the user features
users_final = users_final.drop(columns=['month_day'])

In [22]:
users_final.columns

Index(['age', 'postal_code', 'avg_sequence_number', 'num_of_item', 'gender_F',
       'gender_M', 'status_Cancelled', 'status_Complete', 'status_Processing',
       'status_Returned', 'status_Shipped', 'traffic_source_Display',
       'traffic_source_Email', 'traffic_source_Facebook',
       'traffic_source_Organic', 'traffic_source_Search', 'product_id'],
      dtype='object')

In [23]:
users_final.dtypes

age                          int64
postal_code                  int64
avg_sequence_number        float64
num_of_item                  int64
gender_F                     int32
gender_M                     int32
status_Cancelled             int32
status_Complete              int32
status_Processing            int32
status_Returned              int32
status_Shipped               int32
traffic_source_Display       int32
traffic_source_Email         int32
traffic_source_Facebook      int32
traffic_source_Organic       int32
traffic_source_Search        int32
product_id                   int64
dtype: object

In [24]:
# Persist the all-numeric user/interaction table (without the row index)
users_final.to_csv('users_final_numeric.csv', index=False)

## Building baseline 2 tower model

In [25]:
users_final.shape

(80000, 17)

In [26]:
users_final.isna().sum()

age                        0
postal_code                0
avg_sequence_number        0
num_of_item                0
gender_F                   0
gender_M                   0
status_Cancelled           0
status_Complete            0
status_Processing          0
status_Returned            0
status_Shipped             0
traffic_source_Display     0
traffic_source_Email       0
traffic_source_Facebook    0
traffic_source_Organic     0
traffic_source_Search      0
product_id                 0
dtype: int64

In [27]:
users_final.columns

Index(['age', 'postal_code', 'avg_sequence_number', 'num_of_item', 'gender_F',
       'gender_M', 'status_Cancelled', 'status_Complete', 'status_Processing',
       'status_Returned', 'status_Shipped', 'traffic_source_Display',
       'traffic_source_Email', 'traffic_source_Facebook',
       'traffic_source_Organic', 'traffic_source_Search', 'product_id'],
      dtype='object')

In [28]:
products_final.shape

(19696, 33)

In [29]:
products_final.isna().sum()

cost                                      0
retail_price                              0
product_id                                0
name_embedding                            0
brand_encoded                             0
department_Men                            0
department_Women                          0
category_Accessories                      0
category_Active                           0
category_Blazers & Jackets                0
category_Clothing Sets                    0
category_Dresses                          0
category_Fashion Hoodies & Sweatshirts    0
category_Intimates                        0
category_Jeans                            0
category_Jumpsuits & Rompers              0
category_Leggings                         0
category_Maternity                        0
category_Outerwear & Coats                0
category_Pants                            0
category_Pants & Capris                   0
category_Plus                             0
category_Shorts                 

### The first issue is that for each training batch, we need to have the same amount of user-item pairs as input. This means we need to use some sort of sampling for each batch in order to make sure they're both the same size.

In [33]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dot, Dropout
from tensorflow.keras.models import Model

In [34]:
# Width of each tower's input vector.
# users_final has 17 columns and product_id is removed before training -> 16.
# products_final has 33 columns; product_id and name_embedding are removed -> 31.
user_feature_dim = 16
item_feature_dim = 31

user_input = Input(shape=(user_feature_dim,), name='user_input')
item_input = Input(shape=(item_feature_dim,), name='item_input')

In [39]:
#Changed from baseline
# User tower: one 128-unit ReLU layer followed by 30% dropout
user_hidden = Dense(units=128, activation='relu')(user_input)
user_tower = Dropout(rate=0.3)(user_hidden)

In [45]:
# Item tower: one 128-unit ReLU layer followed by 30% dropout (mirrors the user tower)
item_hidden = Dense(units=128, activation='relu')(item_input)
item_tower = Dropout(rate=0.3)(item_hidden)

In [46]:
# Score each user/item pair as the dot product of the two tower outputs along axis 1
dot_product = Dot(axes=1)([user_tower, item_tower])

In [47]:
# Two-input model: takes [user features, item features] and outputs the dot-product score
model = Model(inputs=[user_input, item_input], outputs=dot_product)

In [48]:
# Train with the Adam optimizer against a mean-squared-error loss on the dot-product score.
# NOTE(review): the labels built in create_labels_and_train are 0/1 while the dot product is
# unbounded — confirm MSE on the raw score is the intended objective.
model.compile(optimizer='adam', loss='mse')

In [49]:
model.summary()

### Formatting inputs

In [50]:
def create_labels_and_train(users_df, products_df, model, batch_size, num_epochs):
    """Train ``model`` on randomly paired user/product rows with 0/1 match labels.

    For each of ``num_epochs`` iterations, ``batch_size`` user rows and
    ``batch_size`` product rows are drawn independently and uniformly at random
    (with replacement) using NumPy's global random state. A pair is labelled 1
    when the user row's ``product_id`` equals the product row's ``product_id``
    and 0 otherwise. ``model.fit`` is then called once on that batch.

    Parameters
    ----------
    users_df : pandas.DataFrame
        Must contain a ``product_id`` column; every other column is passed to
        the model as a user feature.
    products_df : pandas.DataFrame
        Must contain ``product_id`` and ``name_embedding`` columns; both are
        removed and every remaining column is passed as an item feature.
    model : object
        Anything exposing ``fit([user_features, item_features], labels,
        epochs=..., batch_size=...)``.
    batch_size : int
        Number of user/product pairs sampled per iteration.
    num_epochs : int
        Number of iterations; each one samples a fresh batch and fits once.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If either frame has no rows to sample from.
    """
    if len(users_df) == 0 or len(products_df) == 0:
        raise ValueError("users_df and products_df must each contain at least one row")

    # Loop-invariant work: pull out the id columns once and strip the
    # non-feature columns once instead of on every iteration.
    user_product_ids = users_df['product_id'].to_numpy()
    item_product_ids = products_df['product_id'].to_numpy()
    user_features = users_df.drop(columns=['product_id'])
    item_features = products_df.drop(columns=['product_id', 'name_embedding'])

    # NOTE(review): users and products are sampled independently, so matching
    # pairs (label 1) only occur by chance — confirm this sampling is intended.
    for _ in range(num_epochs):
        # generate random user-item pairs through random indices for each batch
        user_indices = np.random.randint(0, len(users_df), size=batch_size)
        product_indices = np.random.randint(0, len(products_df), size=batch_size)

        # 1 where the sampled user row and product row share a product_id
        # (the user purchased the product), 0 otherwise; computed for the
        # whole batch at once rather than with per-row lookups.
        target_similarity = (
            user_product_ids[user_indices] == item_product_ids[product_indices]
        ).astype(int)

        user_batch = user_features.iloc[user_indices].values
        item_batch = item_features.iloc[product_indices].values

        # train the model with the pairs
        model.fit([user_batch, item_batch], target_similarity, epochs=1, batch_size=batch_size)

# parameters
batch_size = 500   # user-item pairs sampled per training step
num_epochs = 25    # number of sampled batches (one model.fit call each)

# create_labels_and_train draws its batches from NumPy's global RNG; seed it so
# the sampled user-item pairs are the same on every Restart & Run All.
np.random.seed(42)

create_labels_and_train(users_final, products_final, model, batch_size, num_epochs)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 705ms/step - loss: 916276645986304.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 942215094337536.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 718133094514688.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 744788533968896.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 686837010006016.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 664993141882880.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 658780404580352.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 772460874039296.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 568043918852096.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 5770663700