In [6]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import precision_score, recall_score, accuracy_score, mean_squared_error, mean_absolute_error
import optuna
import xgboost as xgb
import numpy as np

train_data = pd.read_csv('train_data.csv', dtype={'item_id': str}, sep=';')
test_data = pd.read_csv('test_data.csv', dtype={'item_id': str}, sep=';')
data = pd.read_csv('Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")
all_item_ids = data['item_id'].unique()

# load serendipity classifier model
predict_serendipity = xgb.XGBClassifier()
predict_serendipity.load_model('predict_serendipity.json')

def load_embeddings(embedding_path, item_ids):
    embeddings = np.load(embedding_path)
    return {item_id: normalize(embedding.reshape(1, -1)).flatten() for item_id, embedding in zip(item_ids, embeddings)}

train_user_ids = train_data['user_id'].unique()
train_item_ids = train_data['item_id'].unique()
train_user_id_map = {user_id: idx for idx, user_id in enumerate(train_user_ids)}
train_item_id_map = {item_id: idx for idx, item_id in enumerate(train_item_ids)}

interaction_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
svd = TruncatedSVD(n_components=50, random_state=42)
user_embeddings = svd.fit_transform(interaction_matrix)
item_embeddings_mf = svd.components_.T

scaler = StandardScaler()
user_embeddings = scaler.fit_transform(user_embeddings)
item_embeddings_mf = scaler.fit_transform(item_embeddings_mf)

# description embeddings
item_embeddings_dict = load_embeddings('embeddings/all_mpnet_base_embeddings.npy', all_item_ids)

# default embedding for missing embeddings in test
default_user_embedding = np.mean(user_embeddings, axis=0)
default_item_embedding_svd = np.mean(item_embeddings_mf, axis=0)

default_user_embedding = normalize(default_user_embedding.reshape(1, -1)).flatten()
default_item_embedding_svd = normalize(default_item_embedding_svd.reshape(1, -1)).flatten()

train_data_samples = []

for idx, row in train_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = normalize(user_embeddings[user_idx].reshape(1, -1)).flatten() if user_idx is not None else default_user_embedding
    item_embedding = normalize(item_embeddings_mf[item_idx].reshape(1, -1)).flatten() if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))

    #calculate serendipity using the serendipity classifier
    serendipity_input = np.array(np.hstack((user_embedding, item_final))).reshape(1, -1)
    serendipit_pred = predict_serendipity.predict_proba(serendipity_input)[:, 1]
    serendipity_class = (serendipit_pred >= 0.5).astype(int)
    
    # favour serenpity interactions
    serendipity_weight = 3.0 if serendipity_class == 1 else 1.0  # Atribua mais peso se o item for serendipitoso
    item_final = item_final * serendipity_weight

    # add serendipity as a feature
    serendipity_feature = np.array([1]) if serendipity_class == 1 else np.array([0])
    
    combined_embedding = np.hstack((user_embedding, item_final, serendipity_feature))
    train_data_samples.append((combined_embedding, row['rating']))


X_train, y_train = zip(*train_data_samples)
X_train = np.array(X_train)
y_train = np.array(y_train)

test_data_samples = []
for idx, row in test_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = normalize(user_embeddings[user_idx].reshape(1, -1)).flatten() if user_idx is not None else default_user_embedding
    item_embedding = normalize(item_embeddings_mf[item_idx].reshape(1, -1)).flatten() if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))

    #calculate serendipity using the serendipity classifier
    #consider in tests as if the real-world example would calculate it in order to integrate new interactions in the system
    serendipity_input = np.array(np.hstack((user_embedding, item_final))).reshape(1, -1)
    serendipit_pred = predict_serendipity.predict_proba(serendipity_input)[:, 1]
    serendipity_class = (serendipit_pred >= 0.5).astype(int)
    
    # favour serenpity interactions
    serendipity_weight = 3.0 if serendipity_class == 1 else 1.0  
    item_final = item_final * serendipity_weight

    # add serendipity as a feature
    serendipity_feature = np.array([1]) if serendipity_class == 1 else np.array([0])  # Feature binária para serendipidade
    
    combined_embedding = np.hstack((user_embedding, item_final, serendipity_feature))
    test_data_samples.append((combined_embedding, row['rating']))

X_test, y_test = zip(*test_data_samples)
X_test = np.array(X_test)
y_test = np.array(y_test)

X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

def objective(trial):
    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
    }
    
    xgb_model = xgb.XGBRegressor(**param)
    
    X_train_opt, X_valid_opt, y_train_opt, y_valid_opt = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    xgb_model.fit(X_train_opt, y_train_opt, eval_set=[(X_valid_opt, y_valid_opt)], verbose=False)
    
    y_pred = xgb_model.predict(X_valid_opt)
    
    mse = mean_squared_error(y_valid_opt, y_pred)
    
    # minimize mse    
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("best hyperparameters found:")
print(study.best_params)

# use best hyperparameters
best_params = study.best_params
xgb_model = xgb.XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'test MSE: {mse}')
print(f'test MAE: {mae}')


[I 2024-09-10 19:41:06,776] A new study created in memory with name: no-name-9d376fbe-6000-4e2d-94b9-124a8415a1d5
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
[I 2024-09-10 19:41:17,684] Trial 0 finished with value: 0.8188788294792175 and parameters: {'n_estimators': 362, 'max_depth': 5, 'learning_rate': 0.11854631632351705, 'subsample': 0.9022953241165619, 'colsample_bytree': 0.9721352949216878, 'reg_alpha': 1.1189534635840073, 'reg_lambda': 4.2863194738531964}. Best is trial 0 with value: 0.8188788294792175.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_byt

Melhores hiperparâmetros encontrados:
{'n_estimators': 156, 'max_depth': 8, 'learning_rate': 0.028477901259371878, 'subsample': 0.8648611591341848, 'colsample_bytree': 0.6990790957155497, 'reg_alpha': 0.19246501303235644, 'reg_lambda': 0.31688342740477526}
[0]	validation_0-rmse:0.94615
[1]	validation_0-rmse:0.94431
[2]	validation_0-rmse:0.94208
[3]	validation_0-rmse:0.94029
[4]	validation_0-rmse:0.93812
[5]	validation_0-rmse:0.93688
[6]	validation_0-rmse:0.93522
[7]	validation_0-rmse:0.93340
[8]	validation_0-rmse:0.93260
[9]	validation_0-rmse:0.93132
[10]	validation_0-rmse:0.92975
[11]	validation_0-rmse:0.92830
[12]	validation_0-rmse:0.92773
[13]	validation_0-rmse:0.92670
[14]	validation_0-rmse:0.92561
[15]	validation_0-rmse:0.92489
[16]	validation_0-rmse:0.92367
[17]	validation_0-rmse:0.92323
[18]	validation_0-rmse:0.92211
[19]	validation_0-rmse:0.92068
[20]	validation_0-rmse:0.92035
[21]	validation_0-rmse:0.91980
[22]	validation_0-rmse:0.91897
[23]	validation_0-rmse:0.91832
[24]	vali

In [7]:
import numpy as np
import random

def hit_rate_at_k(predicted_items, true_items, k):
    predicted_at_k = predicted_items[:k]
    hits = len(set(predicted_at_k) & set(true_items))
    return 1.0 if hits > 0 else 0.0

def ndcg_at_k(predicted_items, true_items, k):
    predicted_at_k = predicted_items[:k]
    dcg = 0.0
    idcg = sum((1.0 / np.log2(i + 2) for i in range(min(len(true_items), k))))
    
    for i, item in enumerate(predicted_at_k):
        if item in true_items:
            dcg += 1.0 / np.log2(i + 2)

    return dcg / idcg if idcg > 0 else 0.0

hr_values = []
ndcg_values = []
hr_seren_values = []
ndcg_seren_values = []
rating_threshold = 3
serendipity_items = 0
serendipity_survival_items = 0
all_item_ids = set(all_item_ids)
for k in [5,10]:
    for user_id in test_data['user_id'].unique():
        user_test_items = test_data[test_data['user_id'] == user_id]
        positive_items = user_test_items[user_test_items['rating'] > rating_threshold]['item_id'].tolist()
        if not positive_items:
            continue


        for target_item in positive_items:

            negative_items = user_test_items[user_test_items['rating'] <= rating_threshold]['item_id'].tolist()
            random.shuffle(negative_items)
            candidate_items = [target_item] + negative_items[:99]  # Limitar a 99 negativos
            serendipity_value = data[(data['user_id'] == user_id) & (data['item_id'] == target_item)]['serendipity'].values[0]
            if serendipity_value == 1:
                 serendipity_items += 1
            if not len(candidate_items) >  k:
                
                interacted_items = set(data[data['user_id'] == user_id]['item_id'])
                non_interacted_items = all_item_ids - interacted_items
                interacted_items = set(data[data['user_id'] == user_id]['item_id'])
                filtered_items = data[data['item_id'].isin(non_interacted_items)]['item_id'].unique()
                
                num_items_to_select = min(10, len(filtered_items))
                random_items = random.sample(list(filtered_items), num_items_to_select)
                candidate_items = candidate_items + random_items
            
            X_user = []
            items_id_list = []
            for item_id in candidate_items:
                user_idx = train_user_id_map.get(user_id)
                item_idx = train_item_id_map.get(item_id)
                user_embedding = user_embeddings[user_idx]if user_idx is not None else default_user_embedding
                item_embedding = item_embeddings_mf[item_idx] if item_idx is not None else default_item_embedding_svd
                item_embedding_bert = item_embeddings_dict[item_id]
                item_final = np.hstack((item_embedding_bert, item_embedding))
                
                #calculate serendipity using the serendipity classifier
                #consider in tests as if the real-world example would calculate it in order to integrate new interactions in the system
                serendipity_input = np.array(np.hstack((user_embedding, item_final))).reshape(1, -1)
                serendipit_pred = predict_serendipity.predict_proba(serendipity_input)[:, 1]
                serendipity_class = (serendipit_pred >= 0.5).astype(int)
                
                serendipity_weight = 3.0 if serendipity_class == 1 else 1.0
                item_final = item_final * serendipity_weight
            
                serendipity_feature = np.array([1]) if serendipity_class == 1 else np.array([0])
                
                combined_embedding = np.hstack((user_embedding, item_final, serendipity_feature))
                X_user.append(combined_embedding)
                items_id_list.append(item_id)
        
            X_user = np.array(X_user)
            y_score = xgb_model.predict(X_user)
        
            predicted_ratings = pd.DataFrame({
                'item_id': candidate_items,
                'predicted_rating': y_score
            })
        
            predicted_ratings = predicted_ratings.sort_values(by='predicted_rating', ascending=False)
            ordered_items = predicted_ratings['item_id'].tolist()
            hr = hit_rate_at_k(ordered_items, [target_item], k)
            ndcg = ndcg_at_k(ordered_items, [target_item], k)
            hr_values.append(hr)
            ndcg_values.append(ndcg)

            serendipity_value = data[(data['user_id'] == user_id) & (data['item_id'] == target_item)]['serendipity'].values[0]
            if serendipity_value == 1:
                serendipity_survival_items += 1
                hr = hit_rate_at_k(ordered_items, [target_item], k)
                ndcg = ndcg_at_k(ordered_items, [target_item], k)
                hr_seren_values.append(hr)
                ndcg_seren_values.append(ndcg)

        
    average_hr = np.mean(hr_values)
    average_ndcg = np.mean(ndcg_values)
    average_hr_seren = np.mean(hr_seren_values)
    average_ndcg_seren = np.mean(ndcg_seren_values)
    
    print(f"Average HR@{k}: {average_hr}")
    print(f"Average NDCG@{k}: {average_ndcg}")
    print(f"Average HR_seren@{k}: {average_hr_seren}")
    print(f"Average NDCG_seren@{k}: {average_ndcg_seren}")

Average HR@5: 0.4558375634517767
Average NDCG@5: 0.2828476322760549
Average HR_seren@5: 0.4782608695652174
Average NDCG_seren@5: 0.288430332293146
Average HR@10: 0.6314720812182741
Average NDCG@10: 0.32858417687327046
Average HR_seren@10: 0.6902173913043478
Average NDCG_seren@10: 0.3435499341304586
