In [41]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import ast
from util import check_user_exists, lookup_product_name

In [42]:
users_final = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/final/users_final_data.csv')
products_final= pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/final/products_final_data.csv')
original_products = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/data/products.csv')

## Building baseline 2 tower model

### The first issue is that for each training batch, we need to have the same amount of user-item pairs as input. This means we need to use some sort of sampling for each batch in order to make sure they're both the same size.

In [43]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dot, BatchNormalization
from tensorflow.keras.models import Model

In [44]:
user_input = Input(shape=(16,), name='user_input')
item_input = Input(shape=(30,), name='item_input')

In [45]:
#Changed from baseline
user_tower = Dense(128, activation='relu')(user_input)
user_tower = BatchNormalization()(user_tower)

In [46]:
item_tower = Dense(128, activation='relu')(item_input)
item_tower = BatchNormalization()(item_tower)

In [47]:
dot_product = Dot(axes=1)([user_tower, item_tower])

In [48]:
model = Model(inputs=[user_input, item_input], outputs=dot_product)

In [49]:
model.compile(optimizer='adam', loss='mse')

In [50]:
model.summary()

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def create_labels_and_train(users_df, products_df, model, batch_size, num_epochs):
    for epoch in range(num_epochs):
        # initilize the target similarity for the batch
        target_similarity = []

        # we're making the target similarity balanced, so there's an equal number of posivie and negetive indices in each batch
        num_indices = batch_size // 2

        # generating 1/2 batch size of random pairs, where there are positive indices (user and product have the same ID)
        positive_user_indices = np.random.randint(0, len(users_df), size=num_indices)
        # initialize storage of positive indicies
        positive_product_indices = []
        # loop over every user
        for user_idx in positive_user_indices:
            # locating product IDs in the user dataframe for the user we sampled
            user_product_id = users_df.iloc[user_idx]['product_id']
            # finding matching products in the products dataframe
            matching_products = products_df[products_df['product_id'] == user_product_id]
            # append the matching product to the positive product indices
            positive_product_indices.append(matching_products.index[0])

        # Generate random negative pairs (user and product have different product_ids)
        negative_user_indices = np.random.randint(0, len(users_df), size=num_indices)
        #print("NEGATIVE USER INDICES: ", negative_user_indices)
        negative_product_indices = []
        for user_idx in negative_user_indices:
            user_product_id = users_df.iloc[user_idx]['product_id']
            # find a product that doesn't have a matching product id
            non_matching_products = products_df[products_df['product_id'] != user_product_id]
            # append that to the negetive indicies
            negative_product_indices.append(non_matching_products.sample(1).index[0])

        # combining both positive and negetive indicies
        user_indices = np.concatenate([positive_user_indices, negative_user_indices])
        product_indices = np.concatenate([positive_product_indices, negative_product_indices])

        # create target similarity labels for the positive and negetive pairs
        target_similarity.extend([1] * num_indices)  # Positive pairs
        target_similarity.extend([0] * num_indices)  # Negative pairs
        target_similarity = np.array(target_similarity)

        # get the positive & negetive user data
        user_data = users_df.iloc[user_indices]
        user_ids = user_data['user_id'].tolist()
        product_data = products_df.iloc[product_indices]
        item_ids = product_data['product_id'].tolist()

        user_data = user_data.drop(columns=['product_id', 'user_id'])
        product_data = product_data.drop(columns=['product_id', 'flattened_name_embedding', 'flattened_brand_embedding'])

        # Split data into training and testing sets
        X_train_users, X_test_users, X_train_products, X_test_products, y_train, y_test = train_test_split(
            user_data, product_data, target_similarity, test_size=0.2, random_state=42
        )

        # Train the model on the training data
        model.fit([X_train_users, X_train_products], y_train, epochs=1, batch_size=batch_size, verbose=False)

        # Predict on the test data
        predicted_probabilities = model.predict([X_test_users, X_test_products]).flatten()

        # Convert probabilities to binary predictions
        y_pred = (predicted_probabilities > 0.5).astype(int)

        # Evaluate the model on the test data
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, predicted_probabilities)

        print(f"Epoch {epoch + 1}/{num_epochs} - "
              f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, "
              f"F1 Score: {f1:.4f}, ROC AUC: {auc:.4f}")

# Parameters
batch_size = 500
num_epochs = 20

create_labels_and_train(users_final, products_final, model, batch_size, num_epochs)




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step




Epoch 1/20 - Accuracy: 0.4600, Precision: 0.4600, Recall: 1.0000, F1 Score: 0.6301, ROC AUC: 0.4565
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Epoch 2/20 - Accuracy: 0.4600, Precision: 0.4600, Recall: 1.0000, F1 Score: 0.6301, ROC AUC: 0.5894
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Epoch 3/20 - Accuracy: 0.4600, Precision: 0.4600, Recall: 1.0000, F1 Score: 0.6301, ROC AUC: 0.5338
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Epoch 4/20 - Accuracy: 0.4700, Precision: 0.4646, Recall: 1.0000, F1 Score: 0.6345, ROC AUC: 0.5427
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Epoch 5/20 - Accuracy: 0.4700, Precision: 0.4646, Recall: 1.0000, F1 Score: 0.6345, ROC AUC: 0.5616
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Epoch 6/20 - Accuracy: 0.4700, Precision: 0.4646, Recall: 1.0000, F1 Score: 0.6345, ROC AUC: 0.4416
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

The above code splits the data into training and testing data, and trains the model.

## Generating Recommendations

In [52]:
products_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13800 entries, 0 to 13799
Data columns (total 33 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   cost                                    13800 non-null  float64
 1   retail_price                            13800 non-null  float64
 2   product_id                              13800 non-null  int64  
 3   flattened_name_embedding                13800 non-null  object 
 4   flattened_brand_embedding               13800 non-null  object 
 5   department_Men                          13800 non-null  int64  
 6   department_Women                        13800 non-null  int64  
 7   category_Accessories                    13800 non-null  int64  
 8   category_Active                         13800 non-null  int64  
 9   category_Blazers & Jackets              13800 non-null  int64  
 10  category_Clothing Sets                  13800 non-null  in

In [62]:
def generate_recommendations(test_users_df, products_df, model, top_n=10):
    # Randomly select a user
    random_user_row = test_users_df.sample(1)
    random_user_id = random_user_row['user_id'].values[0]
    print(f"Generating recommendations for user ID: {random_user_id}...")

    # Prepare user data for the selected user
    user_data = random_user_row.drop(columns=['product_id', 'user_id']).values
    user_data_repeated = np.repeat(user_data, len(products_df), axis=0)

    # Prepare product data
    product_data = products_df.drop(columns=['product_id', 'flattened_name_embedding', 'flattened_brand_embedding']).values

    print("User data repeated shape:", user_data_repeated.shape)
    print("Product data shape:", product_data.shape)
    # Predict probabilities
    predicted_probabilities = model.predict([user_data_repeated, product_data]).flatten()

    # Sort product recommendations by increasing probability
    sorted_indices = np.argsort(predicted_probabilities)
    sorted_products = products_df.iloc[sorted_indices]

    # Display top N recommendations
    top_recommendations = sorted_products.head(top_n)
    #print("Top recommendations (sorted by increasing probability of interaction):")
    #print(top_recommendations[['product_id']])

    #Returns a list of the top n product IDs
    return top_recommendations[['product_id']]['product_id'].tolist()

In [54]:
original_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29120 entries, 0 to 29119
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      29120 non-null  int64  
 1   cost                    29120 non-null  float64
 2   category                29120 non-null  object 
 3   name                    29118 non-null  object 
 4   brand                   29096 non-null  object 
 5   retail_price            29120 non-null  float64
 6   department              29120 non-null  object 
 7   sku                     29120 non-null  object 
 8   distribution_center_id  29120 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 2.0+ MB


In [57]:
recs = generate_recommendations(users_final, products_final, model, top_n=10)
recs
lookup_product_name(recs, original_products)

Generating recommendations for user ID: 94559...
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 871us/step
ID = 28700, Name = 23027    Wayfarer Style Sunglasses Dark Lens Black Frame
Name: name, dtype: object
ID = 14235, Name = 1401    Indestructable Aluminum Aluma Wallet - RED
Name: name, dtype: object
ID = 14202, Name = 11644    GENUINE LEATHER SNAP ON STUDDED WHITE PIANO BE...
Name: name, dtype: object
ID = 13629, Name = 28484    Solid Color Leather Adjustable Skinny Belt with
Name: name, dtype: object
ID = 12536, Name = 13235    Individual Bra Extenders
Name: name, dtype: object
ID = 14298, Name = 16355    Classic Tear Drop Mirror Lens Aviator Sunglasses
Name: name, dtype: object
ID = 28913, Name = 27838    TopTie Mens Black & White Checkerboard Pre-Tie...
Name: name, dtype: object
ID = 25276, Name = 18410    Wool Arctic Socks
Name: name, dtype: object
ID = 28774, Name = 27836    TopTie Unisex Fashion Leopard Spotted Slim Tan...
Name: name, dtype: object
ID = 289

### Generating Recommendations for a specific user

In [75]:
def generate_recommendations_for_user(test_users_df, products_df, model, user_id, top_n=10):
    # Get the data for the specific user
    user_row = test_users_df[test_users_df['user_id'] == user_id]
    
    if user_row.empty:
        raise ValueError(f"User with ID {user_id} not found in the test_users_df.")
    
    print(f"Generating recommendations for user ID: {user_id}...")

    # Prepare user data for the selected user
    user_data = user_row.drop(columns=['product_id', 'user_id']).values

    # Prepare product data (drop 'product_id' and any embeddings)
    product_data = products_df.drop(columns=['product_id', 'flattened_name_embedding', 'flattened_brand_embedding']).values

    # Repeat user data for each product (model requires this structure)
    user_data_repeated = np.repeat(user_data, product_data.shape[0], axis=0)

    print("User data repeated shape:", user_data_repeated.shape)
    print("Product data shape:", product_data.shape)
    # Predict probabilities (model expects a list of two inputs: user data and product data)
    predicted_probabilities = model.predict([user_data_repeated, product_data]).flatten()

    # Sort product recommendations by predicted probabilities
    sorted_indices = np.argsort(predicted_probabilities)
    sorted_products = products_df.iloc[sorted_indices]

    # Get the top N recommendations
    top_recommendations = sorted_products.head(top_n)

    # Return a list of the top N product IDs
    return top_recommendations[['product_id']]['product_id'].tolist()


In [76]:
check_user_exists(users_final, 84533)

User ID 84533 exists in the DataFrame.


True

In [77]:
recs = generate_recommendations_for_user(users_final, products_final, model, 84533, top_n=10)
lookup_product_name(recs, original_products)

Generating recommendations for user ID: 84533...
User data repeated shape: (55200, 16)
Product data shape: (13800, 30)


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 55200, 13800
