In [2]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error, mean_absolute_error
import optuna
import xgboost as xgb
import numpy as np

train_data = pd.read_csv('train_data.csv', dtype={'item_id': str}, sep=';')
test_data = pd.read_csv('test_data.csv', dtype={'item_id': str}, sep=';')
data = pd.read_csv('Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")
all_item_ids = data['item_id'].unique()


def load_embeddings(embedding_path, item_ids):
    embeddings = np.load(embedding_path)
    return {item_id: normalize(embedding.reshape(1, -1)).flatten() for item_id, embedding in zip(item_ids, embeddings)}

train_user_ids = train_data['user_id'].unique()
train_item_ids = train_data['item_id'].unique()
train_user_id_map = {user_id: idx for idx, user_id in enumerate(train_user_ids)}
train_item_id_map = {item_id: idx for idx, item_id in enumerate(train_item_ids)}

interaction_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
svd = TruncatedSVD(n_components=50, random_state=42)
user_embeddings = svd.fit_transform(interaction_matrix)
item_embeddings_mf = svd.components_.T

scaler = StandardScaler()
user_embeddings = scaler.fit_transform(user_embeddings)
item_embeddings_mf = scaler.fit_transform(item_embeddings_mf)

# description embeddings
item_embeddings_dict = load_embeddings('embeddings/bm25_embeddings.npy', all_item_ids)

# default embedding for missing embeddings in test
default_user_embedding = np.mean(user_embeddings, axis=0)
default_item_embedding_svd = np.mean(item_embeddings_mf, axis=0) 

default_user_embedding = normalize(default_user_embedding.reshape(1, -1)).flatten()
default_item_embedding_svd = normalize(default_item_embedding_svd.reshape(1, -1)).flatten()

train_data_samples = []

for idx, row in train_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = normalize(user_embeddings[user_idx].reshape(1, -1)).flatten() if user_idx is not None else default_user_embedding
    item_embedding = normalize(item_embeddings_mf[item_idx].reshape(1, -1)).flatten() if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))
    combined_embedding = np.hstack((user_embedding, item_final))
    train_data_samples.append((combined_embedding, row['rating']))


X_train, y_train = zip(*train_data_samples)
X_train = np.array(X_train)
y_train = np.array(y_train)

test_data_samples = []
for idx, row in test_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = normalize(user_embeddings[user_idx].reshape(1, -1)).flatten() if user_idx is not None else default_user_embedding
    item_embedding = normalize(item_embeddings_mf[item_idx].reshape(1, -1)).flatten() if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))
    combined_embedding = np.hstack((user_embedding, item_final))
    test_data_samples.append((combined_embedding, row['rating']))

X_test, y_test = zip(*test_data_samples)
X_test = np.array(X_test)
y_test = np.array(y_test)

X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

def objective(trial):
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
    }
    
    xgb_model = xgb.XGBRegressor(**param)
    
    X_train_opt, X_valid_opt, y_train_opt, y_valid_opt = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    xgb_model.fit(X_train_opt, y_train_opt, eval_set=[(X_valid_opt, y_valid_opt)], verbose=False)
    
    y_pred = xgb_model.predict(X_valid_opt)
    
    mse = mean_squared_error(y_valid_opt, y_pred)
    
    # minimize mse
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("best hyperparameters found:")
print(study.best_params)

# use best hyperparameters
best_params = study.best_params
xgb_model = xgb.XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'test MSE: {mse}')
print(f'tes MAE: {mae}'

[I 2024-09-10 20:58:59,945] A new study created in memory with name: no-name-70d20309-cbc3-4055-ade0-0051ddcdfae9
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
[I 2024-09-10 21:00:33,899] Trial 0 finished with value: 0.8083847761154175 and parameters: {'n_estimators': 440, 'max_depth': 3, 'learning_rate': 0.12625681495293667, 'subsample': 0.9725951002289295, 'colsample_bytree': 0.7851381579817921, 'reg_alpha': 9.33766750359203, 'reg_lambda': 0.24813364693744763}. Best is trial 0 with value: 0.8083847761154175.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytr

Melhores hiperparâmetros encontrados:
{'n_estimators': 385, 'max_depth': 6, 'learning_rate': 0.01049634982595945, 'subsample': 0.8560760795399655, 'colsample_bytree': 0.6853364680217993, 'reg_alpha': 0.11759859277956448, 'reg_lambda': 0.41140522891590336}
[0]	validation_0-rmse:0.94734
[1]	validation_0-rmse:0.94676
[2]	validation_0-rmse:0.94609
[3]	validation_0-rmse:0.94559
[4]	validation_0-rmse:0.94487
[5]	validation_0-rmse:0.94432
[6]	validation_0-rmse:0.94383
[7]	validation_0-rmse:0.94330
[8]	validation_0-rmse:0.94286
[9]	validation_0-rmse:0.94227
[10]	validation_0-rmse:0.94188
[11]	validation_0-rmse:0.94135
[12]	validation_0-rmse:0.94086
[13]	validation_0-rmse:0.94034
[14]	validation_0-rmse:0.93977
[15]	validation_0-rmse:0.93925
[16]	validation_0-rmse:0.93887
[17]	validation_0-rmse:0.93834
[18]	validation_0-rmse:0.93780
[19]	validation_0-rmse:0.93718
[20]	validation_0-rmse:0.93672
[21]	validation_0-rmse:0.93634
[22]	validation_0-rmse:0.93597
[23]	validation_0-rmse:0.93569
[24]	valid

In [6]:
import numpy as np
import random

def hit_rate_at_k(predicted_items, true_items, k):
    predicted_at_k = predicted_items[:k]
    hits = len(set(predicted_at_k) & set(true_items))
    return 1.0 if hits > 0 else 0.0

def ndcg_at_k(predicted_items, true_items, k):
    predicted_at_k = predicted_items[:k]
    dcg = 0.0
    idcg = sum((1.0 / np.log2(i + 2) for i in range(min(len(true_items), k))))
    
    for i, item in enumerate(predicted_at_k):
        if item in true_items:
            dcg += 1.0 / np.log2(i + 2)  # Log base 2

    return dcg / idcg if idcg > 0 else 0.0

hr_values = []
ndcg_values = []
hr_seren_values = []
ndcg_seren_values = []
rating_threshold = 3
serendipity_items = 0
serendipity_survival_items = 0
all_item_ids = set(all_item_ids)
for k in [5,10]:
    for user_id in test_data['user_id'].unique():
        user_test_items = test_data[test_data['user_id'] == user_id]
        positive_items = user_test_items[user_test_items['rating'] > rating_threshold]['item_id'].tolist()
        if not positive_items:
            continue


        for target_item in positive_items:

            negative_items = user_test_items[user_test_items['rating'] <= rating_threshold]['item_id'].tolist()
            random.shuffle(negative_items)
            candidate_items = [target_item] + negative_items[:99]
            serendipity_value = data[(data['user_id'] == user_id) & (data['item_id'] == target_item)]['serendipity'].values[0]
            if serendipity_value == 1:
                 serendipity_items += 1
            if not len(candidate_items) >  k:
                
                interacted_items = set(data[data['user_id'] == user_id]['item_id'])
                non_interacted_items = all_item_ids - interacted_items
                interacted_items = set(data[data['user_id'] == user_id]['item_id'])
                filtered_items = data[data['item_id'].isin(non_interacted_items)]['item_id'].unique()
                
                num_items_to_select = min(10, len(filtered_items))
                random_items = random.sample(list(filtered_items), num_items_to_select)
                candidate_items = candidate_items + random_items
            
            X_user = []
            items_id_list = []
            for item_id in candidate_items:
                user_idx = train_user_id_map.get(user_id)
                item_idx = train_item_id_map.get(item_id)
                user_embedding = user_embeddings[user_idx]if user_idx is not None else default_user_embedding
                item_embedding = item_embeddings_mf[item_idx] if item_idx is not None else default_item_embedding_svd
                item_embedding_bert = item_embeddings_dict[item_id]
                combined_embedding = np.hstack((user_embedding, item_embedding_bert, item_embedding))
                X_user.append(combined_embedding)
                items_id_list.append(item_id)
        
            X_user = np.array(X_user)
            y_score = xgb_model.predict(X_user)
        
            predicted_ratings = pd.DataFrame({
                'item_id': candidate_items,
                'predicted_rating': y_score
            })
        
            predicted_ratings = predicted_ratings.sort_values(by='predicted_rating', ascending=False)
            ordered_items = predicted_ratings['item_id'].tolist()
            hr = hit_rate_at_k(ordered_items, [target_item], k)
            ndcg = ndcg_at_k(ordered_items, [target_item], k)
            hr_values.append(hr)
            ndcg_values.append(ndcg)

            serendipity_value = data[(data['user_id'] == user_id) & (data['item_id'] == target_item)]['serendipity'].values[0]
            if serendipity_value == 1:
                serendipity_survival_items += 1
                hr = hit_rate_at_k(ordered_items, [target_item], k)
                ndcg = ndcg_at_k(ordered_items, [target_item], k)
                hr_seren_values.append(hr)
                ndcg_seren_values.append(ndcg)

        
    average_hr = np.mean(hr_values)
    average_ndcg = np.mean(ndcg_values)
    average_hr_seren = np.mean(hr_seren_values)
    average_ndcg_seren = np.mean(ndcg_seren_values)
    
    print(f"Average HR@{k}: {average_hr}")
    print(f"Average NDCG@{k}: {average_ndcg}")
    print(f"Average HR_seren@{k}: {average_hr_seren}")
    print(f"Average NDCG_seren@{k}: {average_ndcg_seren}")

Average HR@5: 0.42741116751269037
Average NDCG@5: 0.2635026696497137
Average HR_seren@5: 0.45652173913043476
Average NDCG_seren@5: 0.2677143046203485
Average HR@10: 0.6185279187817259
Average NDCG@10: 0.31413907015906944
Average HR_seren@10: 0.6304347826086957
Average NDCG_seren@10: 0.32943423273231803
