In [87]:
# Librerías
import os

import joblib
import pandas as pd
from sklearn.metrics import r2_score
from sqlalchemy import create_engine

In [89]:
# Load the data to score from PostgreSQL.
#
# Connection parameters default to the local development database, but each one
# can be overridden through the matching libpq environment variable (PGUSER,
# PGHOST, PGPORT, PGDATABASE) so the notebook runs on another machine without
# editing this cell.
db_user = os.environ.get('PGUSER', 'agustinrivas')
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')
db_name = os.environ.get('PGDATABASE', 'dataset_ml')

# The URL carries no password; only user, host, port and database are set here.
engine = create_engine(f'postgresql://{db_user}@{db_host}:{db_port}/{db_name}')

# Read the whole table into a DataFrame; later cells sample from it.
df = pd.read_sql('SELECT * FROM training_dataset', engine)

In [91]:
# Draw a reproducible random sample of 10 rows (fixed seed) and renumber it from 0
df_sample = (
    df
    .sample(n=10, random_state=42)
    .reset_index(drop=True)
)

# Independent copy of the sample, later used to report predictions next to the raw values
df_original = df_sample.copy(deep=True)

In [93]:
# Column names grouped by type; cat_cols is the list that gets one-hot encoded
num_cols = [
    'age',
    'bmi',
    'children',
]
cat_cols = [
    'sex',
    'smoker',
    'region',
]

In [95]:
# Categorical feature encoding

# One-hot encode the categorical columns of the sample
df_encoded = pd.get_dummies(df_sample, columns=cat_cols)

# Column layout saved at training time.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
columns_order = joblib.load("models/columns_order.pkl")

# Align the sample with the training layout in one step. reindex puts the
# columns in the order of columns_order, drops any column that is not listed
# there, and creates the listed columns that are absent from this sample
# (e.g. a category level that did not occur in it), filled with False.
df_encoded = df_encoded.reindex(columns=columns_order, fill_value=False)

In [97]:
# Preview the first rows of the encoded frame to check the column layout
df_encoded.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,45,25.175,2,True,False,True,False,True,False,False,False
1,36,30.02,0,True,False,True,False,False,True,False,False
2,64,26.885,0,True,False,False,True,False,True,False,False
3,46,25.745,3,False,True,True,False,False,True,False,False
4,19,31.92,0,False,True,False,True,False,True,False,False


In [99]:
# Load the selected model from its joblib artifact.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load("models/best_model_XGBoost.pkl")

In [101]:
# Score the encoded sample with the loaded model
y_pred = model.predict(df_encoded)

In [117]:
# Show results: attach the predictions to the untouched sample and print
# the actual and predicted charges side by side
df_original = df_original.assign(predicted_charges=y_pred)
print(df_original)

   age     sex     bmi  children smoker     region      charges  \
0   45  female  25.175         2     no  northeast   9095.06825   
1   36  female  30.020         0     no  northwest   5272.17580   
2   64  female  26.885         0    yes  northwest  29330.98315   
3   46    male  25.745         3     no  northwest   9301.89355   
4   19    male  31.920         0    yes  northwest  33750.29180   
5   34    male  42.900         1     no  southwest   4536.25900   
6   19  female  22.515         0     no  northwest   2117.33885   
7   64    male  37.905         0     no  northwest  14210.53595   
8   28  female  17.290         0     no  northeast   3732.62510   
9   49    male  28.690         3     no  northwest  10264.44210   

   predicted_charges  
0       10799.731445  
1        5953.257324  
2       28139.390625  
3        9309.177734  
4       34653.316406  
5        5312.598633  
6        2493.903076  
7       16002.472656  
8        5166.419434  
9       10676.375000  


In [119]:
# R2 between the actual and the predicted charges on the scored sample.
# NOTE(review): the sample is drawn from the training_dataset table; if the model
# was fitted on those rows this is an in-sample score, not an estimate of
# performance on unseen data — confirm.
r2 = r2_score(
    y_true=df_original['charges'],
    y_pred=df_original['predicted_charges'],
)
print(f"R2 sobre muestra de scoring: {r2:.4f}")

R2 sobre muestra de scoring: 0.9889


In [121]:
# Execute the standalone scoring script inside this kernel via the %run magic.
# NOTE(review): presumably scoring.py packages the same steps as the cells above —
# the output it prints matches theirs; confirm against the script.
%run scoring.py

   age     sex     bmi  children smoker     region      charges  \
0   45  female  25.175         2     no  northeast   9095.06825   
1   36  female  30.020         0     no  northwest   5272.17580   
2   64  female  26.885         0    yes  northwest  29330.98315   
3   46    male  25.745         3     no  northwest   9301.89355   
4   19    male  31.920         0    yes  northwest  33750.29180   
5   34    male  42.900         1     no  southwest   4536.25900   
6   19  female  22.515         0     no  northwest   2117.33885   
7   64    male  37.905         0     no  northwest  14210.53595   
8   28  female  17.290         0     no  northeast   3732.62510   
9   49    male  28.690         3     no  northwest  10264.44210   

   predicted_charges  
0       10799.731445  
1        5953.257324  
2       28139.390625  
3        9309.177734  
4       34653.316406  
5        5312.598633  
6        2493.903076  
7       16002.472656  
8        5166.419434  
9       10676.375000  
R2 sobre 