In [10]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import optuna
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler

import numpy as np

data = pd.read_csv('Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")
all_item_ids = data['item_id'].unique()

train_data = pd.read_csv('train_data.csv', dtype={'item_id': str}, sep=';')
test_data = pd.read_csv('test_data.csv', dtype={'item_id': str}, sep=';')

def load_embeddings(embedding_path, item_ids):
    embeddings = np.load(embedding_path)
    return {item_id: normalize(embedding.reshape(1, -1)).flatten() for item_id, embedding in zip(item_ids, embeddings)}


train_user_ids = train_data['user_id'].unique()
train_item_ids = train_data['item_id'].unique()
train_user_id_map = {user_id: idx for idx, user_id in enumerate(train_user_ids)}
train_item_id_map = {item_id: idx for idx, item_id in enumerate(train_item_ids)}

interaction_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
svd = TruncatedSVD(n_components=50, random_state=42)
user_embeddings = svd.fit_transform(interaction_matrix)
item_embeddings_mf = svd.components_.T

scaler = StandardScaler()
user_embeddings = scaler.fit_transform(user_embeddings)
item_embeddings_mf = scaler.fit_transform(item_embeddings_mf)

# description embeddings
item_embeddings_dict = load_embeddings('embeddings/all_mpnet_base_embeddings.npy', all_item_ids)

# default embedding for missing embeddings in test
default_user_embedding = np.mean(user_embeddings, axis=0)
default_item_embedding_svd = np.mean(item_embeddings_mf, axis=0)


train_data_samples = []
sample_weights = []

for idx, row in train_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = user_embeddings[user_idx] if user_idx is not None else default_user_embedding
    item_embedding = item_embeddings_mf[item_idx] if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))

    combined_embedding = np.hstack((user_embedding, item_final))
    train_data_samples.append((combined_embedding, row['serendipity']))



X_train, y_train = zip(*train_data_samples)
X_train = np.array(X_train)
y_train = np.array(y_train)

undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)


test_data_samples = []
for idx, row in test_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = user_embeddings[user_idx] if user_idx is not None else default_user_embedding
    item_embedding = item_embeddings_mf[item_idx] if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))
    combined_embedding = np.hstack((user_embedding, item_final))
    test_data_samples.append((combined_embedding, row['serendipity']))

X_test, y_test = zip(*test_data_samples)
X_test = np.array(X_test)
y_test = np.array(y_test)


X_train = X_resampled.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_resampled.astype(np.float32)
y_test = y_test.astype(np.float32)
scale_pos_weight = len(y_resampled[y_resampled == 0]) / len(y_resampled[y_resampled == 1])

# Função objetivo para o Optuna
def objective(trial):
    # Definir o espaço de busca dos hiperparâmetros
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
        'scale_pos_weight': scale_pos_weight
    }
    
    xgb_model = xgb.XGBClassifier(**param)
    
    X_train_opt, X_valid_opt, y_train_opt, y_valid_opt = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    xgb_model.fit(X_train_opt, y_train_opt, eval_set=[(X_valid_opt, y_valid_opt)], verbose=False)
    
    y_pred = xgb_model.predict(X_valid_opt)
    
    precision = precision_score(y_valid_opt, y_pred)
    recall = recall_score(y_valid_opt, y_pred)
    
    # chose metric to maximaze, F1-score
    return 2 * (precision * recall) / (precision + recall + 1e-10)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("best hyperparameters found:")
print(study.best_params)

# Treinar e avaliar o modelo final com os melhores hiperparâmetros
best_params = study.best_params
xgb_model = xgb.XGBClassifier(**best_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Test accuracy: {accuracy}')
print(f'Test precision: {precision}')
print(f'Test recall: {recall}')

[I 2024-09-11 02:38:20,362] A new study created in memory with name: no-name-90305be8-666c-4539-bd93-29d82a59b2d7
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
[I 2024-09-11 02:38:24,971] Trial 0 finished with value: 0.595238095188159 and parameters: {'n_estimators': 481, 'max_depth': 10, 'learning_rate': 0.09972470312911913, 'subsample': 0.9922412689103293, 'colsample_bytree': 0.7737047876791775, 'reg_alpha': 0.32318217088614054, 'reg_lambda': 0.2227907617122236}. Best is trial 0 with value: 0.595238095188159.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_byt

Melhores hiperparâmetros encontrados:
{'n_estimators': 423, 'max_depth': 10, 'learning_rate': 0.03965457190248209, 'subsample': 0.8845282901420736, 'colsample_bytree': 0.6779361527275303, 'reg_alpha': 2.289019619864856, 'reg_lambda': 1.5475853561905824}
[0]	validation_0-logloss:0.68928
[1]	validation_0-logloss:0.69019
[2]	validation_0-logloss:0.68828
[3]	validation_0-logloss:0.68897
[4]	validation_0-logloss:0.68723
[5]	validation_0-logloss:0.68430
[6]	validation_0-logloss:0.68413
[7]	validation_0-logloss:0.68076
[8]	validation_0-logloss:0.68047
[9]	validation_0-logloss:0.67934
[10]	validation_0-logloss:0.67744
[11]	validation_0-logloss:0.67563
[12]	validation_0-logloss:0.67621
[13]	validation_0-logloss:0.67450
[14]	validation_0-logloss:0.67414
[15]	validation_0-logloss:0.67616
[16]	validation_0-logloss:0.67642
[17]	validation_0-logloss:0.67600
[18]	validation_0-logloss:0.67457
[19]	validation_0-logloss:0.67447
[20]	validation_0-logloss:0.67446
[21]	validation_0-logloss:0.67217
[22]	val

In [11]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1427  870]
 [  41   58]]
              precision    recall  f1-score   support

         0.0       0.97      0.62      0.76      2297
         1.0       0.06      0.59      0.11        99

    accuracy                           0.62      2396
   macro avg       0.52      0.60      0.44      2396
weighted avg       0.93      0.62      0.73      2396



In [12]:
from sklearn.metrics import precision_recall_curve, f1_score, classification_report

y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]  #positive class probability

# Calcular Precision-Recall para diferentes thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# experiment different thresholds
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Melhor limiar para F1-score: {best_threshold}")

Melhor limiar para F1-score: 0.9054989814758301


In [13]:
y_pred_adjusted = (y_pred_prob >= best_threshold).astype(int)

In [7]:
print(confusion_matrix(y_test, y_pred_adjusted))
print(classification_report(y_test, y_pred_adjusted))

[[   0 2297]
 [   0   99]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      2297
         1.0       0.04      1.00      0.08        99

    accuracy                           0.04      2396
   macro avg       0.02      0.50      0.04      2396
weighted avg       0.00      0.04      0.00      2396



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
xgb_model.save_model('serendipity_classifier_mpnet.json')