In [1]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import optuna
import xgboost as xgb
from imblearn.under_sampling import RandomUnderSampler

import numpy as np

data = pd.read_csv('../Data/serenlens_with_features_cleaned_filtered.csv', dtype={'item_id': str}, sep=";")
all_item_ids = data['item_id'].unique()

train_data = pd.read_csv('../train_data.csv', dtype={'item_id': str}, sep=';')
test_data = pd.read_csv('../test_data.csv', dtype={'item_id': str}, sep=';')

def load_embeddings(embedding_path, item_ids):
    embeddings = np.load(embedding_path)
    return {item_id: normalize(embedding.reshape(1, -1)).flatten() for item_id, embedding in zip(item_ids, embeddings)}


train_user_ids = train_data['user_id'].unique()
train_item_ids = train_data['item_id'].unique()
train_user_id_map = {user_id: idx for idx, user_id in enumerate(train_user_ids)}
train_item_id_map = {item_id: idx for idx, item_id in enumerate(train_item_ids)}

interaction_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating').fillna(0)
svd = TruncatedSVD(n_components=50, random_state=42)
user_embeddings = svd.fit_transform(interaction_matrix)
item_embeddings_mf = svd.components_.T

scaler = StandardScaler()
user_embeddings = scaler.fit_transform(user_embeddings)
item_embeddings_mf = scaler.fit_transform(item_embeddings_mf)

# description embeddings
item_embeddings_dict = load_embeddings('../embeddings/all_mpnet_base_embeddings.npy', all_item_ids)

# default embedding for missing embeddings in test
default_user_embedding = np.mean(user_embeddings, axis=0)
default_item_embedding_svd = np.mean(item_embeddings_mf, axis=0)


train_data_samples = []
sample_weights = []

for idx, row in train_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = user_embeddings[user_idx] if user_idx is not None else default_user_embedding
    item_embedding = item_embeddings_mf[item_idx] if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))

    combined_embedding = np.hstack((user_embedding, item_final))
    train_data_samples.append((combined_embedding, row['serendipity']))



X_train, y_train = zip(*train_data_samples)
X_train = np.array(X_train)
y_train = np.array(y_train)

undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)


test_data_samples = []
for idx, row in test_data.iterrows():
    user_idx = train_user_id_map.get(row['user_id'])
    item_idx = train_item_id_map.get(row['item_id'])
    user_embedding = user_embeddings[user_idx] if user_idx is not None else default_user_embedding
    item_embedding = item_embeddings_mf[item_idx] if item_idx is not None else default_item_embedding_svd
    item_embedding_bert = item_embeddings_dict[row['item_id']]
    item_final = np.hstack((item_embedding_bert, item_embedding))
    combined_embedding = np.hstack((user_embedding, item_final))
    test_data_samples.append((combined_embedding, row['serendipity']))

X_test, y_test = zip(*test_data_samples)
X_test = np.array(X_test)
y_test = np.array(y_test)


X_train = X_resampled.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_resampled.astype(np.float32)
y_test = y_test.astype(np.float32)
scale_pos_weight = len(y_resampled[y_resampled == 0]) / len(y_resampled[y_resampled == 1])

# Função objetivo para o Optuna
def objective(trial):
    # Definir o espaço de busca dos hiperparâmetros
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        #'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.1, 10),
        #'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.1, 10),
        'scale_pos_weight': scale_pos_weight
    }
    
    xgb_model = xgb.XGBClassifier(**param)
    
    X_train_opt, X_valid_opt, y_train_opt, y_valid_opt = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    xgb_model.fit(X_train_opt, y_train_opt, eval_set=[(X_valid_opt, y_valid_opt)], verbose=False)
    
    y_pred = xgb_model.predict(X_valid_opt)
    
    precision = precision_score(y_valid_opt, y_pred)
    recall = recall_score(y_valid_opt, y_pred)
    
    # chose metric to maximaze, F1-score
    return 2 * (precision * recall) / (precision + recall + 1e-10)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("best hyperparameters found:")
print(study.best_params)

# Treinar e avaliar o modelo final com os melhores hiperparâmetros
best_params = study.best_params
xgb_model = xgb.XGBClassifier(**best_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Test accuracy: {accuracy}')
print(f'Test precision: {precision}')
print(f'Test recall: {recall}')

[I 2024-09-11 17:07:58,051] A new study created in memory with name: no-name-9bd0b9f1-bd86-4c39-a56f-ecd897b5f90c
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
[I 2024-09-11 17:08:00,879] Trial 0 finished with value: 0.5962732918754676 and parameters: {'n_estimators': 207, 'max_depth': 6, 'learning_rate': 0.21219258358667037, 'subsample': 0.9215567888294792, 'colsample_bytree': 0.6090061984691153}. Best is trial 0 with value: 0.5962732918754676.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
[I 2024-09-11 17:08:05,946] Trial 1 finished with value: 0.6060606060106225 and parameters: {'n_estimators': 387, 'max_depth': 5, 'learning_rate': 0.08675168200000528, 'subsample

best hyperparameters found:
{'n_estimators': 167, 'max_depth': 11, 'learning_rate': 0.010221450073738071, 'subsample': 0.8843872117626365, 'colsample_bytree': 0.7502004994785784}
[0]	validation_0-logloss:0.69193
[1]	validation_0-logloss:0.69140
[2]	validation_0-logloss:0.69047
[3]	validation_0-logloss:0.68975
[4]	validation_0-logloss:0.68956
[5]	validation_0-logloss:0.68846
[6]	validation_0-logloss:0.68833
[7]	validation_0-logloss:0.68750
[8]	validation_0-logloss:0.68685
[9]	validation_0-logloss:0.68615
[10]	validation_0-logloss:0.68531
[11]	validation_0-logloss:0.68396
[12]	validation_0-logloss:0.68321
[13]	validation_0-logloss:0.68255
[14]	validation_0-logloss:0.68167
[15]	validation_0-logloss:0.68164
[16]	validation_0-logloss:0.68155
[17]	validation_0-logloss:0.68137
[18]	validation_0-logloss:0.68036
[19]	validation_0-logloss:0.67979
[20]	validation_0-logloss:0.67908
[21]	validation_0-logloss:0.67830
[22]	validation_0-logloss:0.67826
[23]	validation_0-logloss:0.67821
[24]	validation

In [2]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1401  896]
 [  38   61]]
              precision    recall  f1-score   support

         0.0       0.97      0.61      0.75      2297
         1.0       0.06      0.62      0.12        99

    accuracy                           0.61      2396
   macro avg       0.52      0.61      0.43      2396
weighted avg       0.94      0.61      0.72      2396



In [3]:
from sklearn.metrics import precision_recall_curve, f1_score, classification_report

y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]  #positive class probability

# Calcular Precision-Recall para diferentes thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# experiment different thresholds
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Melhor limiar para F1-score: {best_threshold}")

Melhor limiar para F1-score: 0.6798625588417053


In [4]:
y_pred_adjusted = (y_pred_prob >= best_threshold).astype(int)

In [5]:
print(confusion_matrix(y_test, y_pred_adjusted))
print(classification_report(y_test, y_pred_adjusted))

[[2192  105]
 [  79   20]]
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96      2297
         1.0       0.16      0.20      0.18        99

    accuracy                           0.92      2396
   macro avg       0.56      0.58      0.57      2396
weighted avg       0.93      0.92      0.93      2396



In [6]:
xgb_model.save_model('serendipity_classifier_mpnet.json')