In [1]:
!pip install keras-tuner

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import model_from_json
import keras_tuner as kt

# Load data
df_phone_dataset = pd.read_csv('phone_dataset.csv')
df_user_clicks = pd.read_csv('user_clicks_1_brand.csv')
df_user_ratings = pd.read_csv('user_ratings_1_brand.csv')

PHONE_COUNT = len(df_phone_dataset)
MIN_USER_ID = min(df_user_clicks['user_id'].min(), df_user_ratings['user_id'].min())
MAX_USER_ID = max(df_user_clicks['user_id'].max(), df_user_ratings['user_id'].max())
TOTAL_USER = MAX_USER_ID - MIN_USER_ID + 1
TAKE_RATING_PER_USER = 20
TAKE_CLICK_PER_USER = 20

# Preprocess user_clicks
df_user_clicks = df_user_clicks.sort_values(by='visit_time', ascending=False).drop_duplicates(subset=['user_id', 'phone_id'], keep='first').groupby('user_id').head(TAKE_CLICK_PER_USER).reset_index(drop=True)
missing_user_ids = set(range(MIN_USER_ID, MAX_USER_ID + 1)) - set(df_user_clicks['user_id'])
missing_data = pd.DataFrame({'user_id': list(missing_user_ids), 'phone_id': -1, 'visit_time': '2024-01-01 00:00:00'})
df_user_clicks_complete = pd.concat([df_user_clicks, missing_data], ignore_index=True).sort_values(by='user_id').reset_index(drop=True)
train_clicks = df_user_clicks_complete[['user_id', 'phone_id']].pivot_table(index='phone_id', columns='user_id', aggfunc=lambda x: 1 if len(x) > 0 else 0, fill_value=0)
if len(missing_user_ids) > 0:
    train_clicks = train_clicks[1:]

# Preprocess user_ratings
df_user_ratings = df_user_ratings.sort_values(by='rate_time', ascending=False).drop_duplicates(subset=['user_id', 'phone_id'], keep='first').groupby('user_id').head(TAKE_RATING_PER_USER).reset_index(drop=True)
missing_user_ids = set(range(MIN_USER_ID, MAX_USER_ID + 1)) - set(df_user_ratings['user_id'])
missing_data = pd.DataFrame({'user_id': list(missing_user_ids), 'phone_id': -1, 'rating': 0})
df_user_ratings_complete = pd.concat([df_user_ratings, missing_data], ignore_index=True).sort_values(by='user_id').reset_index(drop=True)
train_ratings = df_user_ratings_complete.pivot(index='phone_id', columns='user_id', values='rating').fillna(0)
if len(missing_user_ids) > 0:
    train_ratings = train_ratings[1:]

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [2]:
def build_model(hp):
    click_input = tf.keras.Input(shape=(PHONE_COUNT,), dtype=tf.int32, name='click_input')
    rating_input = tf.keras.Input(shape=(PHONE_COUNT,), dtype=tf.int32, name='rating_input')

    # Tune the number of units in the dense layer
    num_units = hp.Int('units', min_value=32, max_value=512, step=32)
    X = tf.keras.layers.Concatenate(name='concatenated_inputs')([rating_input, click_input])
    X = tf.keras.layers.Dense(units=num_units, activation='relu', kernel_initializer='random_normal', name='X')(X)

    # Tune the activation function
    activation = hp.Choice('activation', values=['relu', 'tanh', 'sigmoid'])
    X = tf.keras.layers.Activation(activation)(X)

    # Add a second layer if chosen
    if hp.Boolean('use_second_layer'):
        num_units_second = hp.Int('units_second', min_value=32, max_value=512, step=32)
        X = tf.keras.layers.Dense(units=num_units_second, activation=activation, kernel_initializer='random_normal', name='X2')(X)

    output_click = tf.keras.layers.Dense(PHONE_COUNT, activation='sigmoid', name='output')(X)
    model = tf.keras.Model(inputs=[rating_input, click_input], outputs=output_click, name='collaborative_model')

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(
                      hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG', default=1e-3)),
                  loss=tf.keras.losses.MeanSquaredError())

    return model

In [3]:
tuner = kt.Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=30,
    hyperband_iterations=2,
    directory='my_dir',
    project_name='hyperparam_tuning')


In [4]:
click_input = train_clicks.to_numpy().T

# Normalize ratings
train_ratings = train_ratings / 5
rating_input = train_ratings.to_numpy().T

tuner.search([rating_input, click_input], [rating_input, click_input], epochs=30, batch_size=32, validation_split=0.2)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

Trial 180 Complete [00h 00m 07s]
val_loss: 0.037413787096738815

Best val_loss So Far: 0.03696547448635101
Total elapsed time: 00h 07m 54s
{'units': 128, 'activation': 'sigmoid', 'use_second_layer': True, 'learning_rate': 0.0014618661689036517, 'units_second': 352, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 3, 'tuner/round': 0}


In [5]:
model = tuner.hypermodel.build(best_hps)
history = model.fit([rating_input, click_input], [rating_input, click_input], epochs=30, batch_size=32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
