In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import ast

In [3]:
users_final = pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/two-tower/users_final_numeric.csv')
products_final= pd.read_csv('https://raw.githubusercontent.com/ardahk/amex/refs/heads/main/two-tower/products_final_numeric.csv')

## Building baseline 2 tower model

### The first issue is that for each training batch, we need to have the same amount of user-item pairs as input. This means we need to use some sort of sampling for each batch in order to make sure they're both the same size.

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dot, BatchNormalization
from tensorflow.keras.models import Model

2024-11-15 22:35:35.339530: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-15 22:35:35.651802: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-15 22:35:35.835717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731710136.148931    6007 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731710136.225954    6007 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-15 22:35:36.920429: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [5]:
user_input = Input(shape=(16,), name='user_input')
item_input = Input(shape=(31,), name='item_input')

In [6]:
#Changed from baseline
user_tower = Dense(128, activation='relu')(user_input)
user_tower = BatchNormalization()(user_tower)

2024-11-15 22:35:40.229824: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [7]:
item_tower = Dense(128, activation='relu')(item_input)
item_tower = BatchNormalization()(item_tower)

In [8]:
dot_product = Dot(axes=1)([user_tower, item_tower])

In [9]:
model = Model(inputs=[user_input, item_input], outputs=dot_product)

In [10]:
model.compile(optimizer='adam', loss='mse')

In [11]:
model.summary()

### Formatting inputs

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def create_labels_and_train(users_df, products_df, model, batch_size, num_epochs):

    #splitting into training and testing datasets
    #set the random state of both to the same thing
    users_train, users_test = train_test_split(users_df, test_size=0.2, random_state=42)
    products_train, products_test = train_test_split(products_df, test_size=0.2, random_state=42)


    for epoch in range(num_epochs):
        # generate random user-item pairs through random indices for each batch
        user_indices = np.random.randint(0, len(users_train), size=batch_size)
        product_indices = np.random.randint(0, len(products_train), size=batch_size)

        # extract the training data
        user_data = users_df.iloc[user_indices]#.copy()  # copy to avoid SettingWithCopyWarning
        product_data = products_df.iloc[product_indices]#.copy()

        # Create similarity labels
        target_similarity = [1 if users_train.iloc[user_idx]['product_id'] == products_train.iloc[product_idx]['product_id'] else 0
            for user_idx, product_idx in zip(user_indices, product_indices)]

        # convert to a numpy array
        target_similarity = np.array(target_similarity)

        # drop 'product_id' from both dataframes
        user_data = user_data.drop(columns=['product_id'])
        product_data = product_data.drop(columns=['product_id', 'name_embedding'])

        # train the model with the pairs
        model.fit([user_data.values, product_data.values], target_similarity, epochs=1, batch_size=batch_size)
    
    # Evaluate the model on the test set
    # Use the test set to generate predictions
    user_indices_test = np.random.randint(0, len(users_test), size=batch_size)
    product_indices_test = np.random.randint(0, len(products_test), size=batch_size)

    user_data_test = users_test.iloc[user_indices_test]
    product_data_test = products_test.iloc[product_indices_test]

    # Create similarity labels for testing
    target_similarity_test = [
        1 if users_test.iloc[user_idx]['product_id'] == products_test.iloc[product_idx]['product_id'] else 0
        for user_idx, product_idx in zip(user_indices_test, product_indices_test)
    ]
    target_similarity_test = np.array(target_similarity_test)

    user_data_test = user_data_test.drop(columns=['product_id'])
    product_data_test = product_data_test.drop(columns=['product_id', 'name_embedding'])

    # Predict similarities using the model
    predicted_similarity = model.predict([user_data_test.values, product_data_test.values])

    # Convert the predicted similarity into binary (thresholded) values
    predicted_labels = (predicted_similarity >= 0.5).astype(int)  # assuming the model outputs a probability

    # Evaluate the performance of the model
    accuracy = accuracy_score(target_similarity_test, predicted_labels)
    precision = precision_score(target_similarity_test, predicted_labels)
    recall = recall_score(target_similarity_test, predicted_labels)
    f1 = f1_score(target_similarity_test, predicted_labels)
    #roc_auc = roc_auc_score(target_similarity_test, predicted_similarity)

    print(f"Accuracy: {accuracy:.4f}")

# parameters
batch_size = 500
num_epochs = 25

create_labels_and_train(users_final, products_final, model, batch_size, num_epochs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 254.0725
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 265.5094
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 268.1228
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 253.5305
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 259.7726
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 227.4268
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 266.4461
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 241.1143
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 241.2016
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 232.8302
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 247.3018
[1m1/1[0m [32m━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
