In [139]:
import os
from pathlib import Path

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

from sqlalchemy import create_engine
from xgboost import XGBRegressor


In [141]:
# Load the training data from PostgreSQL.
#
# Connection parameters. Each value can be overridden through an environment
# variable so the notebook runs on other machines without editing the code;
# the defaults are the original local-development values, so behaviour is
# unchanged when no variable is set.
db_user = os.environ.get('DB_USER', 'agustinrivas')
db_host = os.environ.get('DB_HOST', 'localhost')
db_port = os.environ.get('DB_PORT', '5432')
db_name = os.environ.get('DB_NAME', 'dataset_ml')

# No password is embedded in the URL: authentication is left to the server /
# client configuration, exactly as in the original connection string.
engine = create_engine(f'postgresql://{db_user}@{db_host}:{db_port}/{db_name}')
df = pd.read_sql('SELECT * FROM training_dataset', engine)

In [143]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [145]:
# Separate the target series from the feature matrix
y = df['charges']
X = df.drop(columns=['charges'])

# Categorical and numeric feature names
cat_cols = ['sex', 'smoker', 'region']
num_cols = ['age', 'bmi', 'children']

In [147]:
# One-hot encode the categorical features (one indicator column per category)
df_final = pd.get_dummies(data=X, columns=cat_cols)
df_final.head(n=5)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,True,False,False,True,False,False,False,True
1,18,33.77,1,False,True,True,False,False,False,True,False
2,28,33.0,3,False,True,True,False,False,False,True,False
3,33,22.705,0,False,True,True,False,False,True,False,False
4,32,28.88,0,False,True,True,False,False,True,False,False


In [149]:
# Persist the training column order so scoring code can align its one-hot
# encoded frame with the layout used here.
columns_order_path = 'models/columns_order.pkl'

# joblib.dump does not create missing directories; make sure the target
# folder exists so the cell also works on a fresh checkout.
Path(columns_order_path).parent.mkdir(parents=True, exist_ok=True)

columns_order = df_final.columns.tolist()
joblib.dump(columns_order, columns_order_path)

['models/columns_order.pkl']

In [151]:
# Hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_final,
    y,
    random_state=42,
    test_size=0.2,
)

In [153]:
# Hyper-parameter search space for each candidate regressor:
# RandomForest, GradientBoostingRegressor and XGBoost.
# Each entry maps a display name to the estimator ('model') and the grid of
# values GridSearchCV will try for it ('param').
grid_models = dict(
    RandomForest=dict(
        model=RandomForestRegressor(random_state=42),
        param=dict(
            n_estimators=[100, 200],
            max_depth=[1, 5, 10, 25, 50],
            min_samples_leaf=[2, 25, 50],
            min_samples_split=[2, 25, 50],
        ),
    ),
    GradientBoosting=dict(
        model=GradientBoostingRegressor(random_state=42),
        param=dict(
            n_estimators=[100, 200],
            learning_rate=[0.1, 0.05],
            max_depth=[1, 5, 10, 25, 50],
        ),
    ),
    XGBoost=dict(
        model=XGBRegressor(random_state=42),
        param=dict(
            n_estimators=[100, 200],
            learning_rate=[0.1, 0.05],
            max_depth=[1, 5, 10, 25, 50],
        ),
    ),
)

In [155]:
# Grid-search every candidate model with shuffled 5-fold cross-validation on
# the training split, then score each winning estimator on the held-out test
# split. One (name, best params, RMSE, R2) tuple is collected per model.
results = []
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for modelo, config in grid_models.items():

    grid = GridSearchCV(
        estimator=config['model'],
        param_grid=config['param'],
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=1
    )

    grid.fit(X_train, y_train)

    # Evaluate the best parameter combination found for this model on data
    # the search never saw.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # RMSE computed as sqrt(MSE). The `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, so taking the root explicitly works on every version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    results.append((modelo, grid.best_params_, rmse, r2))




In [156]:
# Save the model comparison as a plain-text report.
df_results = pd.DataFrame(results, columns=['Modelo', 'Mejores_params', 'RMSE', 'R2'])

# The report is written relative to the working directory, like the other
# artefacts of this notebook. The previous absolute path was specific to one
# user's machine and was missing the separator before the file name.
# UTF-8 is set explicitly because the header line contains a non-ASCII
# character and the platform default encoding is not UTF-8 everywhere.
with open("resultados_modelos.txt", "w", encoding="utf-8") as f:
    f.write("Evaluación de modelos\n")
    f.write(df_results.to_string(index=False))


In [165]:
# Execute the training.py script through IPython's %run magic.
# NOTE(review): presumably training.py is the script version of the pipeline
# above — confirm, and confirm it is expected to sit in the working directory.
%run training.py

