In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import ast

In [2]:
users = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/data/users_final.csv')
product_embeddings = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/name_embeddings.csv')
products = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/products.csv')
order_items = pd.read_csv('https://github.com/ardahk/amex/raw/refs/heads/main/data/order_items.csv')

### Merge name embeddings and products dataframe

In [6]:
products_final = pd.merge(left = products, right = product_embeddings, left_on='id', right_on='product_id')
products_final.head()

Unnamed: 0,id,cost,category,name,brand,retail_price,department,sku,distribution_center_id,product_id,name_embedding
0,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0..."
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0..."
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1,14115,"[0.1337980031967163, -0.20477981865406036, 0.0..."
3,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0..."
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0..."


In [4]:
products_final.drop(columns = ['id', 'name', 'distribution_center_id', 'sku'], inplace = True)

In [8]:
products['brand'].nunique()

2756

In [9]:
products['category'].nunique()

26

There are too many unique brands to one-hot encode, so label encoding will be used with a reference table

In [10]:
label_encoder = LabelEncoder()

products_final['brand_encoded'] = label_encoder.fit_transform(products_final['brand'])

In [11]:
brands = pd.DataFrame({
    'brand': label_encoder.classes_,
    'brand_encoded': range(len(label_encoder.classes_))
})
print(brands.head())

           brand  brand_encoded
0      !it Jeans              0
1      '47 Brand              1
2    007Lingerie              2
3      106Shades              3
4  12XLShop Inc.              4


In [12]:
products_final.drop(columns = ['brand'], inplace = True)
products_final.head()

Unnamed: 0,id,cost,category,name,retail_price,department,sku,distribution_center_id,product_id,name_embedding,brand_encoded
0,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...",1158
3,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158


### Convert department and category to dummy variables

In [13]:
products_final = pd.get_dummies(products_final, columns = ['department', 'category'], dtype=int)
products_final.head()

Unnamed: 0,id,cost,name,retail_price,sku,distribution_center_id,product_id,name_embedding,brand_encoded,department_Men,...,category_Skirts,category_Sleep & Lounge,category_Socks,category_Socks & Hosiery,category_Suits,category_Suits & Sport Coats,category_Sweaters,category_Swim,category_Tops & Tees,category_Underwear
0,13928,2.33835,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,5.95,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158,0,...,0,0,0,0,0,0,0,0,0,0
1,13928,2.33835,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,5.95,2EAC42424D12436BDD6A5B8A88480CC3,1,13928,"[-0.5451024770736694, -0.34113600850105286, -0...",1158,0,...,0,0,0,0,0,0,0,0,0,0
2,14115,4.87956,Enzyme Regular Solid Army Caps-Black W35S45D,10.99,EE364229B2791D1EF9355708EFF0BA34,1,14115,"[0.1337980031967163, -0.20477981865406036, 0.0...",1158,0,...,0,0,0,0,0,0,0,0,0,0
3,14273,6.50793,Washed Canvas Ivy Cap - Black W11S64C,15.99,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158,0,...,0,0,0,0,0,0,0,0,0,0
4,14273,6.50793,Washed Canvas Ivy Cap - Black W11S64C,15.99,F531DC20FDE20B7ADF3A73F52B71D0AF,1,14273,"[-0.40627333521842957, 0.14057165384292603, -0...",1158,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
products_final.dtypes

id                                          int64
cost                                      float64
name                                       object
retail_price                              float64
sku                                        object
distribution_center_id                      int64
product_id                                  int64
name_embedding                             object
brand_encoded                               int64
department_Men                              int64
department_Women                            int64
category_Accessories                        int64
category_Active                             int64
category_Blazers & Jackets                  int64
category_Clothing Sets                      int64
category_Dresses                            int64
category_Fashion Hoodies & Sweatshirts      int64
category_Intimates                          int64
category_Jeans                              int64
category_Jumpsuits & Rompers                int64


### Convert name_enbeddings to a list of floats and flattening

In [15]:
products_final['name_embedding'] = products_final['name_embedding'].apply(lambda x: np.array(ast.literal_eval(x), dtype=np.float32) if isinstance(x, str) else x)

# Step 2: Flatten the array so it can be used as a feature
products_final['name_embedding_flat'] = products_final['name_embedding'].apply(lambda x: x.flatten() if isinstance(x, np.ndarray) else x)

In [16]:
products_final.to_csv('products_final_numeric.csv', index=False)

### Convert all users data to numeric

In [21]:
users_final = pd.get_dummies(users, columns = ['gender', 'status', 'traffic_source'], dtype=int)

### Merge with orders_items

In [17]:
order_items.head()

Unnamed: 0,id,order_id,user_id,product_id,inventory_item_id,status,created_at,shipped_at,delivered_at,returned_at,sale_price
0,152013,104663,83582,14235,410368,Cancelled,2023-05-07 06:08:40+00:00,,,,0.02
1,40993,28204,22551,14235,110590,Complete,2023-03-14 03:47:21+00:00,2023-03-15 22:57:00+00:00,2023-03-18 01:08:00+00:00,,0.02
2,51224,35223,28215,14235,138236,Complete,2023-12-05 13:25:30+00:00,2023-12-06 01:20:00+00:00,2023-12-10 10:04:00+00:00,,0.02
3,36717,25278,20165,14235,99072,Shipped,2023-12-22 20:48:19+00:00,2023-12-24 16:44:00+00:00,,,0.02
4,131061,90241,71954,14235,353798,Shipped,2022-06-19 16:57:59+00:00,2022-06-19 19:29:00+00:00,,,0.02


In [22]:
users_final = pd.merge(left = users_final, right = order_items, left_on='id', right_on='user_id')
users_final.columns

Index(['id_x', 'age', 'postal_code', 'avg_sequence_number', 'order_id_x',
       'num_of_item', 'month_day', 'gender_F', 'gender_M', 'status_Cancelled',
       'status_Complete', 'status_Processing', 'status_Returned',
       'status_Shipped', 'traffic_source_Display', 'traffic_source_Email',
       'traffic_source_Facebook', 'traffic_source_Organic',
       'traffic_source_Search', 'id_y', 'order_id_y', 'user_id', 'product_id',
       'inventory_item_id', 'status', 'created_at', 'shipped_at',
       'delivered_at', 'returned_at', 'sale_price'],
      dtype='object')

In [23]:
users_final.drop(columns = ['id_x', 'order_id_x', 'id_y', 'order_id_y', 'user_id', 'inventory_item_id','status', 'created_at', 'shipped_at',
       'delivered_at', 'returned_at', 'sale_price'], inplace = True)

For formatting

In [24]:
users_final.drop(columns = ['month_day'], inplace = True)

In [25]:
users_final.to_csv('users_final_numeric.csv', index=False)

In [26]:
from sklearn.preprocessing import StandardScaler

numeric_features_products = ['retail_price']
numeric_features_users = ['age']

# Initialize scalers
scaler_products = StandardScaler()
scaler_users = StandardScaler()

# Normalize the numeric features in each DataFrame
products_final[numeric_features_products] = scaler_products.fit_transform(products_final[numeric_features_products])
users[numeric_features_users] = scaler_users.fit_transform(users[numeric_features_users])

embeddings_array = np.stack(products_final['name_embedding_flat'].values)
products_final['name_embedding_flat_normalized'] = list(scaler_products.fit_transform(embeddings_array))

### Baseline 2 tower model

In [27]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Dot
from tensorflow.keras.models import Model

Sampling for batch size equality

In [28]:
user_input = Input(shape=(16,), name='user_input')
item_input = Input(shape=(31,), name='item_input')

In [29]:
user_tower = Dense(64, activation='relu')(user_input)
user_tower = Dense(64, activation='relu')(user_tower)

user_tower = Dense(128, activation='relu')(user_input)
user_tower = Dropout(0.3)(user_tower)
user_tower = Dense(64, activation='relu')(user_tower)

In [30]:
item_tower = Dense(64, activation='relu')(item_input)
item_tower = Dense(64, activation='relu')(item_tower)

item_tower = Dense(128, activation='relu')(item_input)
item_tower = Dropout(0.3)(item_tower)
item_tower = Dense(64, activation='relu')(item_tower)

In [31]:
dot_product = Dot(axes=1)([user_tower, item_tower])

In [32]:
model = Model(inputs=[user_input, item_input], outputs=dot_product)

In [34]:
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

### Formatting inputs

In [35]:
def create_labels_and_train(users_df, products_df, model, batch_size, num_epochs):
    for epoch in range(num_epochs):
        # Generate random user-item pairs through random indices for each batch
        user_indices = np.random.randint(0, len(users_df), size=batch_size)
        product_indices = np.random.randint(0, len(products_df), size=batch_size)

        # Extract the data for the batch
        user_data = users_df.iloc[user_indices]
        product_data = products_df.iloc[product_indices]

        # Create target similarity labels (1 if user purchased product, otherwise 0)
        target_similarity = np.array([
            1 if users_df.iloc[u_idx]['product_id'] == products_df.iloc[p_idx]['product_id'] else 0
            for u_idx, p_idx in zip(user_indices, product_indices)
        ])

        # Drop 'product_id' and exclude the raw 'name_embedding' column
        user_data = user_data.drop(columns=['product_id'])
        product_data = product_data.drop(columns=['product_id', 'name_embedding'])

        # Flattened embeddings and other numerical features should be present in product_data
        if 'name_embedding_flat' in products_df.columns:
            product_data = np.hstack(product_data['name_embedding_flat'].values.reshape(-1, 1))

        # Train the model with the prepared batch
        model.fit([user_data.values, product_data.values], target_similarity, epochs=1, batch_size=batch_size)