In [345]:
!pip install scikit-learn
!pip install mlflow



In [346]:
# Importo librerías
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Carga de datos

In [347]:
# Cargo los datos
df = pd.read_csv("../data/data_cleaned.csv")

In [348]:
# Reviso las columnas del DataFrame
list(df.columns)

['id',
 'ad_type',
 'start_date',
 'end_date',
 'created_on',
 'lat',
 'lon',
 'l1',
 'l2',
 'l3',
 'rooms',
 'bedrooms',
 'bathrooms',
 'surface_total',
 'surface_covered',
 'price',
 'currency',
 'price_period',
 'title',
 'description',
 'property_type',
 'operation_type',
 'title_clean',
 'price_usd']

In [349]:
# Elimino columnas innecesarias
unique_ads = df['ad_type'].unique()
unique_op_type = df['operation_type'].unique()
print("Unique ad types:", unique_ads)
print("Unique operation types:", unique_op_type)

Unique ad types: ['Propiedad']
Unique operation types: ['Venta']


In [350]:
df = df.drop(columns=['id', 'ad_type','start_date', 'end_date', 'l1', 'l2', 'operation_type','title_clean', 'title', 'description', 'price'])

In [351]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2797 entries, 0 to 2796
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   created_on       2797 non-null   object 
 1   lat              2797 non-null   float64
 2   lon              2797 non-null   float64
 3   l3               2797 non-null   object 
 4   rooms            2797 non-null   float64
 5   bedrooms         2797 non-null   float64
 6   bathrooms        2797 non-null   float64
 7   surface_total    2797 non-null   float64
 8   surface_covered  2797 non-null   float64
 9   currency         2797 non-null   object 
 10  price_period     2797 non-null   object 
 11  property_type    2797 non-null   object 
 12  price_usd        2797 non-null   float64
dtypes: float64(8), object(5)
memory usage: 284.2+ KB


In [352]:
# convertir fechas y extraer datos
df['created_on'] = pd.to_datetime(df['created_on'])

# Año, mes, día, día de la semana
df['created_year'] = df['created_on'].dt.year
df['created_month'] = df['created_on'].dt.month
df['created_day'] = df['created_on'].dt.day

In [353]:
# Separar las variables categóricas y numéricas
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# One-hot encoding para variables categóricas
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Eliminar la columna fecha original
df_encoded = df_encoded.drop(columns=['created_on'])

In [354]:
# Separar variables independientes y dependientes
Y = df_encoded['price_usd']
X = df_encoded.drop(columns=['price_usd'])

In [355]:
# Dividir en conjunto de entrenamiento y prueba
XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.33, random_state=42)

# MLflow

In [356]:
# Apuntar a la carpeta mlruns de la raíz

current_dir = os.getcwd()
mlruns_path = os.path.join(current_dir, "..", "mlruns")  
mlflow.set_tracking_uri(f"file:///{mlruns_path}")
mlflow.set_experiment("Property_forecast_v0")

<Experiment: artifact_location='file:///C:/Users/Lara_/property-forecast/mlruns/399738839022833928', creation_time=1762749402242, experiment_id='399738839022833928', last_update_time=1762749402242, lifecycle_stage='active', name='Property_forecast_v0', tags={'mlflow.experimentKind': 'custom_model_development'}>

# Regresion lineal, PCA y PLS

In [None]:
# Escalamiento
numeric_cols_scaled = [c for c in numeric_cols if c in XTrain.columns]
scaler = StandardScaler()
scaler.fit(XTrain[numeric_cols_scaled])

XTrain_scaled = XTrain.copy()
XTest_scaled  = XTest.copy()

XTrain_scaled[numeric_cols_scaled] = scaler.transform(XTrain[numeric_cols_scaled])
XTest_scaled[numeric_cols_scaled]  = scaler.transform(XTest[numeric_cols_scaled])

In [368]:
XTrain_scaled.shape[1]

65

## PCA

In [358]:
with mlflow.start_run(run_name="PCA_Regression"):
    
    # PCA
    p = XTrain_array.shape[1]
    pca = PCA(n_components=p).fit(XTrain_array)
    A = pca.components_
    varianza_acumulada = np.cumsum(pca.explained_variance_ratio_)
    
    # Selección automática de componentes
    porcentaje_objetivo = 0.95
    nComponentesElegidas = np.argmax(varianza_acumulada >= porcentaje_objetivo) + 1
    print(f"Número de componentes seleccionados: {nComponentesElegidas}")

    # Log de parámetros
    mlflow.log_param("method", "PCA")
    mlflow.log_param("PCA_selected_components", nComponentesElegidas)

    # Transformación
    ZTrain = np.matmul(XTrain_array, A)[:, :nComponentesElegidas]
    ZTest  = np.matmul(XTest_array, A)[:, :nComponentesElegidas]

    # Regresión lineal sobre PCA
    regPCA = LinearRegression()
    regPCA.fit(ZTrain, YTrain)
    prediccionPCA = regPCA.predict(ZTest)

    # --- Métricas ---
    mse_pca = mean_squared_error(YTest, prediccionPCA)
    rmse_pca = np.sqrt(mse_pca)
    mae_pca = mean_absolute_error(YTest, prediccionPCA)
    r2_pca = r2_score(YTest, prediccionPCA)

    print(f"MSE PCA: {mse_pca:.2f}, RMSE: {rmse_pca:.2f}, MAE: {mae_pca:.2f}, R2: {r2_pca:.4f}")

    # Guardar métricas y modelo en MLflow
    mlflow.log_metric("MSE", mse_pca)
    mlflow.log_metric("RMSE", rmse_pca)
    mlflow.log_metric("MAE", mae_pca)
    mlflow.log_metric("R2", r2_pca)
    mlflow.sklearn.log_model(regPCA, name="PCA_LinearRegression_Model")


Número de componentes seleccionados: 4
MSE PCA: 40040059500.37, RMSE: 200100.12, MAE: 119355.48, R2: 0.3810




## PLS

In [None]:
with mlflow.start_run(run_name="PLS_Regression"):

    # Convertir DataFrames a arrays de NumPy
    XTrain_array = XTrain_scaled.drop(columns=['price_usd'], errors='ignore').values
    XTest_array  = XTest_scaled.drop(columns=['price_usd'], errors='ignore').values

    # Número máximo de componentes a evaluar
    max_components = XTrain_array.shape[1]

    # Lista para almacenar el MSE promedio de cada número de componentes
    mse_scores = []

    for n in range(1, max_components + 1):
        pls = PLSRegression(n_components=n)
        score = -np.mean(cross_val_score(pls, XTrain_array, YTrain, cv=5, scoring='neg_mean_squared_error'))
        mse_scores.append(score)

    # Seleccionar el número de componentes que minimiza el MSE
    best_n_components = int(np.argmin(mse_scores) + 1)
    print(f"Mejor número de componentes PLS: {best_n_components}")

    # Log del parámetro en MLflow
    mlflow.log_param("method", "PLS")
    mlflow.log_param("PLS_best_components", best_n_components)

    # Entrenar el modelo final con el número óptimo de componentes
    pls_opt = PLSRegression(n_components=best_n_components)
    pls_opt.fit(XTrain_array, YTrain)
    prediccionPLS = pls_opt.predict(XTest_array).ravel()

    # --- Métricas ---
    mse_pls = mean_squared_error(YTest, prediccionPLS)
    rmse_pls = np.sqrt(mse_pls)
    mae_pls = mean_absolute_error(YTest, prediccionPLS)
    r2_pls = r2_score(YTest, prediccionPLS)

    print(f"MSE PLS: {mse_pls:.2f}, RMSE: {rmse_pls:.2f}, MAE: {mae_pls:.2f}, R2: {r2_pls:.4f}")

    # Guardar métricas y modelo en MLflow
    mlflow.log_metric("MSE", mse_pls)
    mlflow.log_metric("RMSE", rmse_pls)
    mlflow.log_metric("MAE", mae_pls)
    mlflow.log_metric("R2", r2_pls)
    mlflow.sklearn.log_model(
        pls_opt,
        name="PLS_Regression_Model"
    ) 

Mejor número de componentes PLS: 5
MSE PLS: 35328666428.59, RMSE: 187959.21, MAE: 112687.44, R2: 0.4538




## Regresion Lineal

In [360]:
# SelectKBest y LinearRegression
pipeline = Pipeline([
    ('select', SelectKBest(score_func=f_regression)),
    ('reg', LinearRegression())
])

param_grid = {
    'select__k': list(range(1, XTrain_scaled.shape[1] + 1))
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)
grid.fit(XTrain_scaled, YTrain)

best_k = grid.best_params_['select__k']
print("Mejor K:", best_k)

Mejor K: 1


In [361]:
# Entrenar modelo final y registrar en MLflow 
selector_best = SelectKBest(score_func=f_regression, k=best_k)
XTrain_selected = selector_best.fit_transform(XTrain_scaled, YTrain)
XTest_selected  = selector_best.transform(XTest_scaled)
selected_features = XTrain_scaled.columns[selector_best.get_support()]

with mlflow.start_run(run_name="LinearRegression_SelectKBest"):
    model = LinearRegression()
    model.fit(XTrain_selected, YTrain)
    YPred = model.predict(XTest_selected)

    mse_rl = mean_squared_error(YTest, YPred)
    rmse_rl = np.sqrt(mse_rl)
    mae_rl = mean_absolute_error(YTest, YPred)
    r2_rl = r2_score(YTest, YPred)
    print(f"MSE: {mse_rl:.2f}, RMSE: {rmse_rl:.2f}, MAE: {mae_rl:.2f}, R2: {r2_rl:.4f}")

    mlflow.log_metric("MSE", mse_rl)
    mlflow.log_metric("RMSE", rmse_rl)
    mlflow.log_metric("MAE", mae_rl)
    mlflow.log_metric("R2", r2_rl)

    mlflow.log_param("method", "LinearRegression_SelectKBest")
    mlflow.log_param("num_features_selected", best_k)
    mlflow.log_param("selected_features", list(selected_features))

    mlflow.sklearn.log_model(model, name="LinearRegression_SelectKBest")

MSE: 41876618209.97, RMSE: 204637.77, MAE: 117834.01, R2: 0.3526




# Regresion Ridge/Lasso

In [362]:
# Diccionario de alphas a probar
alphas = np.logspace(-3, 3, 13)  # 0.001, 0.01, ..., 1000

## Ridge

In [363]:
# RIDGE REGRESSION 
with mlflow.start_run(run_name="Ridge_Regression"):
    
    ridge = Ridge()
    param_grid = {'alpha': alphas}
    
    grid_ridge = GridSearchCV(
        ridge,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    
    grid_ridge.fit(XTrain_scaled, YTrain)
    
    best_alpha = grid_ridge.best_params_['alpha']
    print("Mejor alpha Ridge:", best_alpha)
    
    # Entrenar modelo final
    ridge_final = Ridge(alpha=best_alpha)
    ridge_final.fit(XTrain_scaled, YTrain)
    YPred_ridge = ridge_final.predict(XTest_scaled)
    
    # Métricas
    mse_ridge = mean_squared_error(YTest, YPred_ridge)
    rmse_ridge = np.sqrt(mse_ridge)
    mae_ridge = mean_absolute_error(YTest, YPred_ridge)
    r2_ridge = r2_score(YTest, YPred_ridge)
    
    print(f"Ridge -> MSE: {mse_ridge:.2f}, RMSE: {rmse_ridge:.2f}, MAE: {mae_ridge:.2f}, R2: {r2_ridge:.4f}")
    
    # Guardar en MLflow
    mlflow.log_param("method", "Ridge")
    mlflow.log_param("alpha", best_alpha)
    mlflow.log_metric("MSE", mse_ridge)
    mlflow.log_metric("RMSE", rmse_ridge)
    mlflow.log_metric("MAE", mae_ridge)
    mlflow.log_metric("R2", r2_ridge)
    mlflow.sklearn.log_model(ridge_final, name="Ridge_Model")

Mejor alpha Ridge: 1000.0
Ridge -> MSE: 46149069859.43, RMSE: 214823.35, MAE: 126716.95, R2: 0.2865




## Lasso

In [364]:
# LASSO REGRESSION
with mlflow.start_run(run_name="Lasso_Regression"):
    
    lasso = Lasso(max_iter=10000)
    param_grid = {'alpha': alphas}
    
    grid_lasso = GridSearchCV(
        lasso,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    
    grid_lasso.fit(XTrain_scaled, YTrain)
    
    best_alpha = grid_lasso.best_params_['alpha']
    print("Mejor alpha Lasso:", best_alpha)
    
    # Entrenar modelo final
    lasso_final = Lasso(alpha=best_alpha, max_iter=10000)
    lasso_final.fit(XTrain_scaled, YTrain)
    YPred_lasso = lasso_final.predict(XTest_scaled)
    
    # Métricas
    mse_lasso = mean_squared_error(YTest, YPred_lasso)
    rmse_lasso = np.sqrt(mse_lasso)
    mae_lasso = mean_absolute_error(YTest, YPred_lasso)
    r2_lasso = r2_score(YTest, YPred_lasso)
    
    print(f"Lasso -> MSE: {mse_lasso:.2f}, RMSE: {rmse_lasso:.2f}, MAE: {mae_lasso:.2f}, R2: {r2_lasso:.4f}")
    
    # Guardar en MLflow
    mlflow.log_param("method", "Lasso")
    mlflow.log_param("alpha", best_alpha)
    mlflow.log_metric("MSE", mse_lasso)
    mlflow.log_metric("RMSE", rmse_lasso)
    mlflow.log_metric("MAE", mae_lasso)
    mlflow.log_metric("R2", r2_lasso)
    mlflow.sklearn.log_model(lasso_final, name="Lasso_Model")

  model = cd_fast.enet_coordinate_descent(


Mejor alpha Lasso: 0.31622776601683794


  model = cd_fast.enet_coordinate_descent(


Lasso -> MSE: 214571831994.18, RMSE: 463218.99, MAE: 129290.38, R2: -2.3173




# Arbol de Decision

In [365]:
# Parámetros a probar
param_grid = {
    'max_depth': list(range(3, 21, 2)),        # profundidad de 3 a 20 de 2 en 2
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV con 5 folds
grid_tree = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

with mlflow.start_run(run_name="DecisionTree_Regression"):
    
    grid_tree.fit(XTrain_scaled, YTrain)
    
    # Mejor modelo encontrado
    best_tree = grid_tree.best_estimator_
    print("Mejores parámetros:", grid_tree.best_params_)
    
    # Predicciones
    YPred_tree = best_tree.predict(XTest_scaled)
    
    # Métricas
    mse_tree = mean_squared_error(YTest, YPred_tree)
    rmse_tree = np.sqrt(mse_tree)
    mae_tree = mean_absolute_error(YTest, YPred_tree)
    r2_tree = r2_score(YTest, YPred_tree)
    
    print(f"Decision Tree -> MSE: {mse_tree:.2f}, RMSE: {rmse_tree:.2f}, MAE: {mae_tree:.2f}, R2: {r2_tree:.4f}")
    
    # Guardar en MLflow
    mlflow.log_param("method", "DecisionTree")
    for param, value in grid_tree.best_params_.items():
        mlflow.log_param(param, value)
    
    mlflow.log_metric("MSE", mse_tree)
    mlflow.log_metric("RMSE", rmse_tree)
    mlflow.log_metric("MAE", mae_tree)
    mlflow.log_metric("R2", r2_tree)
    
    mlflow.sklearn.log_model(best_tree, name="DecisionTree_Model")

Mejores parámetros: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 10}
Decision Tree -> MSE: 18705030580.47, RMSE: 136766.34, MAE: 70128.65, R2: 0.7108




# Random Forest

In [366]:
# Parámetros a probar
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_rf = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

with mlflow.start_run(run_name="RandomForest_Regression"):
    
    # Entrenar GridSearch
    grid_rf.fit(XTrain_scaled, YTrain)
    
    # Mejor modelo
    best_rf = grid_rf.best_estimator_
    print("Mejores parámetros Random Forest:", grid_rf.best_params_)
    
    # Predicciones
    YPred_rf = best_rf.predict(XTest_scaled)
    
    # Métricas
    mse_rf = mean_squared_error(YTest, YPred_rf)
    rmse_rf = np.sqrt(mse_rf)
    mae_rf = mean_absolute_error(YTest, YPred_rf)
    r2_rf = r2_score(YTest, YPred_rf)
    
    print(f"Random Forest -> MSE: {mse_rf:.2f}, RMSE: {rmse_rf:.2f}, MAE: {mae_rf:.2f}, R2: {r2_rf:.4f}")
    
    # Guardar en MLflow
    mlflow.log_param("method", "RandomForest")
    for param, value in grid_rf.best_params_.items():
        mlflow.log_param(param, value)
    
    mlflow.log_metric("MSE", mse_rf)
    mlflow.log_metric("RMSE", rmse_rf)
    mlflow.log_metric("MAE", mae_rf)
    mlflow.log_metric("R2", r2_rf)
    
    mlflow.sklearn.log_model(best_rf, name="RandomForest_Model")

Mejores parámetros Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest -> MSE: 12205418208.53, RMSE: 110478.13, MAE: 52847.04, R2: 0.8113


