In [1]:
import os
from pathlib import Path

module_path = str(Path().resolve())
os.environ['PYTHONPATH'] = module_path

module_path

'/app/app/src'

In [2]:
!pip install "ray[tune]==2.10.0"

[0m

In [3]:
!pip install bayesian-optimization==1.4.3

[0m

In [4]:
!pip install seaborn

[0m

In [5]:
!pip install xgboost

[0m

In [6]:
import os
import shutil
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt

import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam

import ray
from ray import tune
from ray import train
from ray.train import Checkpoint, CheckpointConfig

from ray.tune import ExperimentAnalysis
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.schedulers.async_hyperband import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
xgb.set_config(verbosity=2)

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

# Recog

In [7]:
df_train = pd.read_csv("data/train_reviews_merged_final.csv")
df_train['text'] = df_train['text'].fillna("")

df_train.sample(5)

Unnamed: 0,stars,useful,funny,cool,text,date,stars_neg,review_count_neg,is_open_neg,categories_neg,review_count_user,average_stars_user,compliment_user
895989,5.0,-0.391093,-0.150312,-0.251644,Wow!! I had heard such wonderful reviews and f...,2020,4.5,0.068337,True,Restaurante,0.138126,4.42,-0.027822
304566,3.0,-0.391093,-0.150312,-0.251644,"As Blake once said: you're not punk, and I'm t...",2009,4.0,0.329729,True,Belleza,1.040289,4.01,0.032703
853497,5.0,-0.391093,-0.150312,-0.251644,I've been coming here for about 2 weeks and I ...,2016,3.5,2.214917,False,Restaurante,-0.116121,4.63,-0.032664
179657,5.0,-0.062341,-0.150312,-0.251644,Costco is fantastic! I don't know how people ...,2009,3.5,0.084179,False,Compras,5.67413,3.44,0.062966
496418,5.0,-0.062341,-0.150312,0.255613,What a great find! The amount of food for eac...,2017,4.0,0.408939,True,Restaurante,-0.165329,3.88,-0.035085


In [8]:
X_numeric = df_train[["useful", "funny", "cool", "stars_neg", "review_count_neg", "categories_neg", "review_count_user", "average_stars_user", 
                     "compliment_user"]].copy()
X_numeric["is_open_neg"] = df_train["is_open_neg"].astype(int)

X_numeric.sample(5)

Unnamed: 0,useful,funny,cool,stars_neg,review_count_neg,categories_neg,review_count_user,average_stars_user,compliment_user,is_open_neg
758893,-0.391093,-0.150312,-0.251644,4.0,19.736078,Restaurante,-0.206337,4.71,-0.033875,1
308976,-0.062341,-0.150312,-0.251644,4.5,0.472306,Restaurante,-0.288352,3.0,-0.035085,0
170938,-0.391093,-0.150312,-0.251644,2.0,-0.280185,Hotelería,1.606192,4.27,-0.012086,1
401714,-0.391093,0.300774,-0.251644,3.5,24.654994,Restaurante,0.121723,3.46,-0.024191,1
319873,-0.391093,-0.150312,-0.251644,4.0,-0.264343,Restaurante,0.581006,4.41,0.163437,1


In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
X_numeric['categories_neg'] = label_encoder.fit_transform(X_numeric['categories_neg'])

X_numeric.sample(5)

Unnamed: 0,useful,funny,cool,stars_neg,review_count_neg,categories_neg,review_count_user,average_stars_user,compliment_user,is_open_neg
948079,0.26641,-0.150312,0.255613,3.5,1.945605,4,1.21252,3.98,0.014545,1
293790,-0.391093,-0.150312,-0.251644,3.5,1.137667,4,-0.230941,3.91,-0.035085,1
933739,-0.391093,-0.150312,-0.251644,3.5,-0.042556,4,-0.263747,4.0,-0.035085,1
680978,0.595161,0.300774,0.255613,4.0,1.644608,4,-0.239143,3.63,-0.033875,0
467035,-0.391093,-0.150312,-0.251644,3.0,0.781224,3,-0.263747,1.8,-0.035085,1


In [10]:
text = df_train["text"]
text

0         Fantastic fresh food. The greek salad is amazi...
1         Been a patient at Largo Med/Diagnostic Clinic ...
2         The location is convenient to my campus so I d...
3         I agree with all the other compliments posted ...
4         Wanting to help out the local economy, I thoug...
                                ...                        
967779    Ordered takeout and this place didn't disappoi...
967780    I took our annual managers meeting there for d...
967781    My favorite local coffee shop! Great drinks (i...
967782    We hit a quiet time here on a very busy weeken...
967783    On-line buyer beware, I had to return a top th...
Name: text, Length: 967784, dtype: object

# Search Space

In [11]:
booster_dict = {
    1: "gbtree",
    2: "gblinear",
    3: "dart"
}

tree_method_dict = {
    1: "auto",
    2: "approx",
    3: "hist"
}


grow_policy_dict = {
    1: "depthwise",
    2: "lossguide"
}

space = {
    # Categóricos indexados
    "booster_idx": tune.randint(1, 4),
    "tree_method_idx": tune.randint(1, 4),
    "grow_policy_idx": tune.randint(1, 3),

    # Numéricos continuos o log-uniformes
    "eta": tune.loguniform(1e-3, 0.7),
    "gamma": tune.uniform(0.0, 10.0),
    "max_depth": tune.qloguniform(3, 12, 1),
    "min_child_weight": tune.uniform(1, 5),
    "max_delta_step": tune.uniform(0, 5),
    "subsample": tune.uniform(0.2, 1.0),
    "lambda": tune.loguniform(1e-3, 10.0),
    "alpha": tune.loguniform(1e-3, 10.0),
    "scale_pos_weight": tune.uniform(0.5, 5),
    "max_leaves": tune.qloguniform(1, 10, 1),
    "max_bin": tune.qloguniform(125, 525, 25),


    # # Embedding
    # "ngram_range_x": tune.randint(1,3),
    # "max_features": tune.qloguniform(5_000, 20_000, 1_000),
    # "n_components": tune.qloguniform(50, 150, 25),
}


#  Objective Function

In [12]:
from functools import lru_cache

def get_data():
    X_text_reduced = np.load("/app/app/src/data/embeddings_train_tfidf_50.npy", mmap_mode='r')
    X_final = np.hstack([X_numeric.values, X_text_reduced])
    y = df_train["stars"]
    return train_test_split(X_final, y, test_size=0.2, random_state=42)

In [13]:
def objective(config, get_data):
    X_train, X_test, y_train, y_test = get_data()

    # Estos sí los acepta XGBRegressor directamente
    regressor_params = {
        "booster": booster_dict[int(config["booster_idx"])],
        "eta": float(config["eta"]),
        "max_depth": int(config["max_depth"]),
        "min_child_weight": int(config["min_child_weight"]),
        "subsample": float(config["subsample"]),
        "reg_lambda": float(config["lambda"]),
        "reg_alpha": float(config["alpha"]),
        "scale_pos_weight": float(config["scale_pos_weight"]),
        "verbosity": 0,
        "eval_metric":"mae",
        "random_state": 42,
        "n_jobs": 16
    }
    
    # Estos deben ir en fit() o no se usan
    unused_params = {
        "gamma": float(config["gamma"]),
        "grow_policy": grow_policy_dict[int(config["grow_policy_idx"])],
        "max_bin": int(config["max_bin"]),
        "max_delta_step": float(config["max_delta_step"]),
        "max_leaves": int(config["max_leaves"]),
        "tree_method": tree_method_dict[int(config["tree_method_idx"])],
        "sampling_method": "uniform"
    }

    model = XGBRegressor(**regressor_params)
    model.set_params(**unused_params)  

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)

    print(f"MAE: {mae}")
        
    train.report({
        "mae": mae
    })

#  Test the Objective Function

In [14]:
def sample(space):
    config = {}
    for key in space:
        config[key] = space[key].sample()
    return config

In [15]:
config = sample(space)

print(f"""Testing config:
  booster: {booster_dict[int(config["booster_idx"])]}
  tree_method: {tree_method_dict[int(config["tree_method_idx"])]}
  grow_policy: {grow_policy_dict[int(config["grow_policy_idx"])]}
  eta: {float(config['eta']):.5f}
  gamma: {float(config['gamma']):.5f}
  max_depth: {int(config['max_depth'])}
  min_child_weight: {int(config['min_child_weight'])}
  max_delta_step: {float(config['max_delta_step']):.3f}
  subsample: {float(config['subsample']):.3f}
  lambda: {float(config['lambda']):.5f}
  alpha: {float(config['alpha']):.5f}
  scale_pos_weight: {int(config['scale_pos_weight'])}
  max_leaves: {int(config['max_leaves'])}
  max_bin: {int(config['max_bin'])}
""")


Testing config:
  booster: gblinear
  tree_method: approx
  grow_policy: depthwise
  eta: 0.07691
  gamma: 7.51414
  max_depth: 8
  min_child_weight: 4
  max_delta_step: 1.207
  subsample: 0.592
  lambda: 0.04062
  alpha: 0.00129
  scale_pos_weight: 0
  max_leaves: 3
  max_bin: 300



In [16]:
objective(config, get_data)

MAE: 1.0645061142728123


# Setting up Training

# Search Algorithm

In [17]:
!pip install optuna

[0m

In [18]:
from optuna.samplers import TPESampler

search_alg = OptunaSearch(
    sampler=TPESampler(
        n_startup_trials=5,         # Número de pruebas aleatorias antes de empezar la exploración bayesiana
        multivariate=True,          # Considera dependencias entre parámetros
        group=True                  # Agrupa parámetros relacionados (mejor para modelos grandes)
    ),
    metric="mae",
    mode="min"
)

# Scheduler

In [19]:
scheduler = ASHAScheduler(
    metric='mae', 
    mode='min',
    max_t=100,             # número máximo de iteraciones (puede ser num_boost_round)
    grace_period=5,       # deja correr las pruebas al menos 10 iteraciones antes de descartar
    reduction_factor=3     # controla cuántos se promueven vs descartan
)

# Tune Config

In [20]:
 tune_config = tune.TuneConfig( 
    search_alg=search_alg,
    num_samples=750,
    scheduler=scheduler,
    max_concurrent_trials=5,    
)

In [21]:
run_config = train.RunConfig(
        name='temaleos_Y',
        storage_path='/app/app/src/tune_Y',
        verbose = 1,
    )

In [22]:
objective = tune.with_resources(
         objective,
         {
             'cpu': 30, 
             'gpu': .5, 
             'memory': 20 * 1024**3
         }
     )

In [23]:
tuner = tune.Tuner(
         tune.with_parameters(objective, get_data=get_data),
         tune_config=tune_config,
         param_space=space,
         run_config=run_config,
     )

In [24]:
!pip install grpcio

[0m

In [25]:
while ray.is_initialized():
          ray.shutdown()
ray.init()

2025-04-12 13:47:55,524	INFO worker.py:1752 -- Started a local Ray instance.


0,1
Python version:,3.11.11
Ray version:,2.10.0


[36m(objective pid=13872)[0m MAE: 1.483037544311673
[36m(objective pid=14679)[0m MAE: 0.7573391741588513
[36m(objective pid=14023)[0m MAE: 0.795553051513933
[36m(objective pid=14837)[0m MAE: 1.2155041296827218
[36m(objective pid=15014)[0m MAE: 1.2378221338910516
[36m(objective pid=15443)[0m MAE: 0.6618751200897325[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(objective pid=15614)[0m MAE: 0.6740758615412902
[36m(objective pid=15766)[0m MAE: 0.9436748820343711
[36m(objective pid=15311)[0m MAE: 1.6370139762408846
[36m(objective pid=16086)[0m MAE: 1.1624366508816701
[36m(objective pid=16239)[0m MAE: 1.0123564328199468
[36m(objective pid=15918)[0m MAE: 0.8478870527193086
[36m(objective pid=16526)[0m MAE: 0.664225234532619
[36m(objective pid=16396)[0m MAE: 1.3694337076080298


In [26]:
results = tuner.fit()

0,1
Current time:,2025-04-12 14:42:11
Running for:,00:54:14.05
Memory:,21.6/251.6 GiB

Trial name,status,loc,alpha,booster_idx,eta,gamma,grow_policy_idx,lambda,max_bin,max_delta_step,max_depth,max_leaves,min_child_weight,scale_pos_weight,subsample,tree_method_idx,iter,total time (s),mae
objective_8d36cea2,TERMINATED,10.1.34.2:13872,0.0787118,1,0.00449267,1.54026,1,0.905476,369.859,0.66335,11.2188,8.66253,1.57831,4.91333,0.970775,3,1,2.97918,1.48304
objective_ea6bbe64,TERMINATED,10.1.34.2:14023,0.00116406,3,0.0308716,8.44242,1,0.00438779,508.541,3.53019,6.7085,5.41413,2.83194,3.35172,0.756663,2,1,48.7937,0.795553
objective_17f1ff25,TERMINATED,10.1.34.2:14679,0.157293,1,0.213703,2.91729,1,1.09289,174.026,4.63123,6.10813,2.44793,4.30746,3.11372,0.329929,1,1,3.44056,0.757339
objective_12f83863,TERMINATED,10.1.34.2:14837,0.00951097,3,0.00358628,9.56679,2,0.00553219,197.297,3.82977,4.08876,7.46728,4.22424,2.26623,0.794724,3,1,45.5143,1.2155
objective_adbf0b12,TERMINATED,10.1.34.2:15014,0.0627547,2,0.0975577,2.19747,1,0.0176342,318.889,3.6877,6.31347,3.68798,2.37266,4.78415,0.691948,3,1,8.57184,1.23782
objective_0b78db48,TERMINATED,10.1.34.2:15162,2.96278,1,0.00401648,0.270851,1,0.00265754,250.949,4.10448,9.91038,2.95467,4.54804,2.5001,0.427263,1,1,3.00144,1.30951
objective_35360037,TERMINATED,10.1.34.2:15311,0.00212199,3,0.00136803,3.19726,2,0.0827468,150.428,4.30859,5.76225,1.09505,2.40387,4.53855,0.404631,2,1,49.5254,1.63701
objective_4b07332d,TERMINATED,10.1.34.2:15443,0.00391939,1,0.282074,3.55861,1,0.920669,134.857,4.25026,6.3957,5.0245,4.48831,2.69779,0.209559,1,1,4.16331,0.661875
objective_a3c752cc,TERMINATED,10.1.34.2:15614,0.0151389,1,0.109617,4.86265,1,0.0322704,198.908,4.032,7.50247,5.07963,4.15189,2.77255,0.539299,1,1,4.34686,0.674076
objective_2410e932,TERMINATED,10.1.34.2:15766,0.0513987,1,0.0166135,5.48451,1,1.31701,142.006,3.21141,3.08261,7.5321,4.81366,4.29922,0.419849,1,1,4.47227,0.943675


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2025-04-12 14:42:11,886	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/app/app/src/tune_Y/temaleos_Y' in 0.1403s.
2025-04-12 14:42:12,090	INFO tune.py:1048 -- Total run time: 3254.27 seconds (3253.90 seconds for the tuning loop).


# Analyzing the Results

In [27]:
best = results.get_best_result(metric='mae', mode='min')
print(f"MAE : {best.metrics['mae']}")

MAE : 0.5961778806288488


In [28]:
best

Result(
  metrics={'mae': 0.5961778806288488},
  path='/app/app/src/tune_Y/temaleos_Y/objective_d3a1ba9f_544_alpha=0.1443,booster_idx=1,eta=0.5380,gamma=9.7562,grow_policy_idx=2,lambda=0.0916,max_bin=421.5688,max_del_2025-04-12_14-23-36',
  filesystem='local',
  checkpoint=None
)

In [30]:
analysis = ExperimentAnalysis("/app/app/src/tune_Y/temaleos_Y")
best_config = analysis.get_best_config(metric="mae", mode="min")
best_config

{'booster_idx': 1,
 'tree_method_idx': 1,
 'grow_policy_idx': 2,
 'eta': 0.5380190178037957,
 'gamma': 9.756247873771159,
 'max_depth': 11.692554890183727,
 'min_child_weight': 1.3388633161160366,
 'max_delta_step': 4.470422887866947,
 'subsample': 0.7430185517276597,
 'lambda': 0.0916242458100257,
 'alpha': 0.14431553002774017,
 'scale_pos_weight': 0.7162672474244075,
 'max_leaves': 9.511271585541758,
 'max_bin': 421.56884049011995}