## Librerías

In [1]:
import pandas as pd
import joblib
import os
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold , train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import numpy as np

# Move the working directory one level up; the later cells load their files
# through relative paths ("datos/...", "modelos/...") resolved against it.
# NOTE(review): not idempotent — running this cell again moves up one more level.
os.chdir('../')

## Datos de entrenamiento y prueba

In [2]:
# Load the persisted train and test splits; both paths are relative to the
# current working directory.
df_train, df_test = (
    joblib.load("datos/data_train.joblib"),
    joblib.load("datos/data_test.joblib"),
)

## Encoder y selector de variables

In [3]:
# Restore the previously saved feature selector and categorical encoder.
# Both objects are only used through their .transform() method below.
fwiz, cat_econder = (
    joblib.load("modelos/fwiz.joblib"),
    joblib.load("modelos/cat_econder.joblib"),
)

Imported 0.3.2 version. Select nrows to a small number when running on huge datasets.
output = featurewiz(dataname, target, corr_limit=0.90, verbose=2, sep=',', 
		header=0, test_data='',feature_engg='', category_encoders='',
		dask_xgboost_flag=False, nrows=None, skip_sulov=False, skip_xgboost=False)
Create new features via 'feature_engg' flag : ['interactions','groupby','target']



## Aplicar encoder y selector de variables

In [4]:
# Encode the predictors of each split. The target column is dropped first so
# that only feature columns are handed to the encoder.
X_train_t, X_test_t = (
    cat_econder.transform(split.drop(columns=['clean_valor_total_avaluo']))
    for split in (df_train, df_test)
)

In [5]:
# Apply the feature selector to the encoded predictors and append the target
# column again, side by side (axis=1).
# NOTE(review): these frames contain the target, and the training code visible
# further down fits on X_train_t / X_test_t rather than on these frames —
# confirm whether the selected features were meant to be used for modelling.
X_train_selected, X_test_selected = (
    pd.concat([fwiz.transform(encoded), source['clean_valor_total_avaluo']], axis=1)
    for encoded, source in ((X_train_t, df_train), (X_test_t, df_test))
)

In [6]:
# Target column of each split, kept as separate variables for fitting (train)
# and for the final evaluation (test).
y_train, y_test = (
    df_train['clean_valor_total_avaluo'],
    df_test['clean_valor_total_avaluo'],
)

In [7]:
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll import scope

## Iterar modelos 

In [8]:
# Hyperopt search space for each model family.

def _int_range(label, low, high):
    """Integer-valued hyperopt dimension: quniform with step 1, cast to int."""
    return scope.int(hp.quniform(label, low, high, 1))


def _boosted_space():
    """Fresh space shared by the boosting models: depth, trees, learning rate."""
    return {
        'max_depth': _int_range('max_depth', 2, 10),
        'n_estimators': _int_range('n_estimators', 5, 500),
        # Log-uniform so that small learning rates are explored as densely as large ones.
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    }


# Random forest: tree depth and number of trees only.
rf_space = {
    'max_depth': _int_range('max_depth', 2, 10),
    'n_estimators': _int_range('n_estimators', 5, 500),
}

# The three boosting models use identical ranges; each gets its own space object.
xgb_space = _boosted_space()
lgbm_space = _boosted_space()
gb_space = _boosted_space()

# Extra trees: only the tree depth is searched.
et_space = {
    'max_depth': _int_range('max_depth', 2, 10),
}


In [9]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from functools import partial

In [10]:
def objective(params, model, run_name):
    """Evaluate one hyperparameter configuration and record it as an MLflow run.

    The estimator is scored with 3-fold shuffled cross-validation on the
    notebook-level ``X_train_t`` / ``y_train``, then refitted on that whole
    training set and scored once on ``X_test_t`` / ``y_test``. Both scores are
    logged to MLflow together with ``params``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``model.set_params``.
    model : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``. It is modified
        in place: its parameters are overwritten and it is left fitted.
    run_name : str
        Name of the MLflow run opened for this evaluation.

    Returns
    -------
    float
        Mean cross-validated MAPE as a positive number (lower is better).

    Notes
    -----
    Reads ``X_train_t``, ``y_train``, ``X_test_t`` and ``y_test`` from the
    notebook namespace, so those cells must have been executed first.
    NOTE(review): the model is trained on the encoded features (``X_train_t``),
    not on the feature-selector output — confirm this is intended.
    """
    with mlflow.start_run(run_name=run_name):
        # Log the configuration before any fitting so that a trial which fails
        # part-way still leaves its parameters attached to the run.
        mlflow.log_params(params)

        model.set_params(**params)

        # Fixed seed: every configuration is scored on the same three folds.
        cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # greater_is_better=False makes the scorer return the negated MAPE,
        # which is why the sign is flipped back below.
        scoring = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
        cv_results = cross_validate(model, X_train_t, y_train, cv=cv, n_jobs=-1, scoring=scoring)

        # Average MAPE over the validation folds (logged as 'MAPE_train').
        mape_train = -cv_results['test_score'].mean()

        # Refit on the full training set and predict the test set.
        model.fit(X_train_t, y_train)
        y_pred_test = model.predict(X_test_t)

        # MAPE on the test set; logged for reference only, it is not returned.
        mape_test = mean_absolute_percentage_error(y_test, y_pred_test)

        mlflow.log_metric('MAPE_train', mape_train)
        mlflow.log_metric('MAPE_test', mape_test)

    return mape_train

In [11]:
def run_experiment(model, space, run_name, n_trials=20):
    """Search ``space`` with hyperopt's TPE algorithm for the given estimator.

    Each evaluation calls ``objective`` with ``model`` and ``run_name`` bound,
    so every trial is recorded as its own MLflow run under the same name.

    Parameters
    ----------
    model : estimator
        Estimator instance passed to ``objective`` on every trial.
    space : dict
        Hyperopt search space describing the parameters to tune.
    run_name : str
        MLflow run name used for all trials of this search.
    n_trials : int, default 20
        Number of evaluations (``max_evals``) given to ``fmin``.

    Returns
    -------
    dict
        The result of ``hyperopt.fmin``: the best point found, keyed by the
        labels used in ``space``. Previously this value was computed and
        discarded, so the function returned ``None``.
    """
    fmin_objective = partial(objective, model=model, run_name=run_name)
    trials = Trials()
    best_result = fmin(
        fn=fmin_objective,
        space=space,
        algo=tpe.suggest,
        max_evals=n_trials,
        trials=trials,
    )
    return best_result

In [12]:
# Make "optimizacion" the active MLflow experiment for the runs started below.
# NOTE(review): despite its name, `exp_id` holds whatever set_experiment returns
# (presumably an Experiment object rather than an id string) — confirm before
# using it as an id. It is not referenced by the cells visible here.
exp_id = mlflow.set_experiment(experiment_name="optimizacion")

In [13]:
# One entry per search, in execution order:
# (estimator class, constructor kwargs, search space, MLflow run name).
experiment_plan = [
    (LGBMRegressor, {'n_jobs': -1, 'verbose': -1}, lgbm_space, 'optimizacion_lgbm'),
    (XGBRegressor, {'n_jobs': -1}, xgb_space, 'optimizacion_xgb'),
    (RandomForestRegressor, {'n_jobs': -1}, rf_space, 'optimizacion_rf'),
    (GradientBoostingRegressor, {}, gb_space, 'optimizacion_gb'),
    (ExtraTreesRegressor, {'n_jobs': -1}, et_space, 'optimizacion_et'),
]

# Each estimator is instantiated right before its own search, 50 trials each.
for estimator_cls, init_kwargs, search_space, name in experiment_plan:
    run_experiment(estimator_cls(**init_kwargs), search_space, run_name=name, n_trials=50)

100%|███████████████████████████████████████████████| 50/50 [02:30<00:00,  3.01s/trial, best loss: 0.15754446568827543]
100%|████████████████████████████████████████████████| 50/50 [16:26<00:00, 19.73s/trial, best loss: 0.1348133784172402]
100%|███████████████████████████████████████████████| 50/50 [19:06<00:00, 22.94s/trial, best loss: 0.16666746128618037]
100%|█████████████████████████████████████████████| 50/50 [1:41:52<00:00, 122.24s/trial, best loss: 0.1373119253304316]
100%|████████████████████████████████████████████████| 50/50 [04:27<00:00,  5.36s/trial, best loss: 0.2095488864922687]
