In [166]:
!pip install scikit-learn
!pip install mlflow



In [167]:
# Importo librerías
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Carga de datos

In [168]:
# Cargo los datos
df = pd.read_csv("../data/data_cleaned.csv")

In [169]:
# Reviso las columnas del DataFrame
list(df.columns)

['id',
 'ad_type',
 'start_date',
 'end_date',
 'created_on',
 'lat',
 'lon',
 'l1',
 'l2',
 'l3',
 'rooms',
 'bedrooms',
 'bathrooms',
 'surface_total',
 'surface_covered',
 'price',
 'currency',
 'price_period',
 'title',
 'description',
 'property_type',
 'operation_type',
 'title_clean',
 'price_usd']

In [170]:
df["property_type"].unique()

array(['Departamento', 'Local comercial', 'Cochera', 'Casa', 'Lote',
       'Oficina', 'PH', 'Otro', 'Depósito'], dtype=object)

In [171]:
# Elimino columnas innecesarias
unique_ads = df['ad_type'].unique()
unique_op_type = df['operation_type'].unique()
print("Unique ad types:", unique_ads)
print("Unique operation types:", unique_op_type)

Unique ad types: ['Propiedad']
Unique operation types: ['Venta']


In [172]:
df = df.drop(columns=['id', 'ad_type','start_date', 'end_date', 'l1', 'l2', 'operation_type','title_clean', 'title', 'description'])

In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2797 entries, 0 to 2796
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   created_on       2797 non-null   object 
 1   lat              2797 non-null   float64
 2   lon              2797 non-null   float64
 3   l3               2797 non-null   object 
 4   rooms            2797 non-null   float64
 5   bedrooms         2797 non-null   float64
 6   bathrooms        2797 non-null   float64
 7   surface_total    2797 non-null   float64
 8   surface_covered  2797 non-null   float64
 9   price            2797 non-null   float64
 10  currency         2797 non-null   object 
 11  price_period     2797 non-null   object 
 12  property_type    2797 non-null   object 
 13  price_usd        2797 non-null   float64
dtypes: float64(9), object(5)
memory usage: 306.1+ KB


In [174]:
df['created_on'] = pd.to_datetime(df['created_on'])
# Año, mes, día, día de la semana
df['created_year'] = df['created_on'].dt.year
df['created_month'] = df['created_on'].dt.month
df['created_day'] = df['created_on'].dt.day

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2797 entries, 0 to 2796
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   created_on       2797 non-null   datetime64[ns]
 1   lat              2797 non-null   float64       
 2   lon              2797 non-null   float64       
 3   l3               2797 non-null   object        
 4   rooms            2797 non-null   float64       
 5   bedrooms         2797 non-null   float64       
 6   bathrooms        2797 non-null   float64       
 7   surface_total    2797 non-null   float64       
 8   surface_covered  2797 non-null   float64       
 9   price            2797 non-null   float64       
 10  currency         2797 non-null   object        
 11  price_period     2797 non-null   object        
 12  property_type    2797 non-null   object        
 13  price_usd        2797 non-null   float64       
 14  created_year     2797 non-null   int32  

In [176]:
categorical_cols = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols)
df_encoded = df_encoded.drop(columns=['created_on'])

In [177]:
Y = df_encoded['price_usd']
X = df_encoded.drop(columns=['price_usd'])

In [178]:
XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size=0.33, random_state=42)

# MLflow

In [179]:
# Apuntar a la carpeta mlruns de la raíz
mlflow.set_tracking_uri("file:///C:/Users/Lara_/property-forecast/mlruns")
mlflow.set_experiment("Property_forecast_v0")

2025/11/09 23:36:42 INFO mlflow.tracking.fluent: Experiment with name 'Property_forecast_v0' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/Lara_/property-forecast/mlruns/399738839022833928', creation_time=1762749402242, experiment_id='399738839022833928', last_update_time=1762749402242, lifecycle_stage='active', name='Property_forecast_v0', tags={}>

# PCA

In [180]:
with mlflow.start_run(run_name="PCA_Regression"):
    
    # Escalamiento
    scaler = StandardScaler()
    scaler.fit(XTrain)
    XEscaladoTrain = scaler.transform(XTrain)
    XEscaladoTest = scaler.transform(XTest)

    # PCA
    p = XEscaladoTrain.shape[1]
    pca = PCA(n_components=p).fit(XEscaladoTrain)
    A = pca.components_
    varianza_acumulada = np.cumsum(pca.explained_variance_ratio_)
    
    # Selección automática de componentes
    porcentaje_objetivo = 0.95
    nComponentesElegidas = np.argmax(varianza_acumulada >= porcentaje_objetivo) + 1
    print(f"Número de componentes seleccionados: {nComponentesElegidas}")

    # Log de parámetros
    mlflow.log_param("method", "PCA")
    mlflow.log_param("PCA_selected_components", nComponentesElegidas)

    # Transformación
    ZTotalTrain = np.matmul(XEscaladoTrain, A)
    ZTrain = ZTotalTrain[:, :nComponentesElegidas]

    ZTotalTest = np.matmul(XEscaladoTest, A)
    ZTest = ZTotalTest[:, :nComponentesElegidas]

    # Regresión lineal sobre PCA
    regPCA = LinearRegression()
    regPCA.fit(ZTrain, YTrain)
    prediccionPCA = regPCA.predict(ZTest)

    # --- Métricas ---
    mse_pca = mean_squared_error(YTest, prediccionPCA)
    rmse_pca = np.sqrt(mse_pca)
    mae_pca = mean_absolute_error(YTest, prediccionPCA)
    r2_pca = r2_score(YTest, prediccionPCA)

    print(f"MSE PCA: {mse_pca:.2f}, RMSE: {rmse_pca:.2f}, MAE: {mae_pca:.2f}, R2: {r2_pca:.4f}")

    # Guardar métricas y modelo en MLflow
    mlflow.log_metric("MSE", mse_pca)
    mlflow.log_metric("RMSE", rmse_pca)
    mlflow.log_metric("MAE", mae_pca)
    mlflow.log_metric("R2", r2_pca)
    mlflow.sklearn.log_model(regPCA, name="PCA_LinearRegression_Model")


Número de componentes seleccionados: 49
MSE PCA: 18960732855.35, RMSE: 137697.98, MAE: 74909.31, R2: 0.7069




# PLS

In [181]:
with mlflow.start_run(run_name="PLS_Regression"):

    # Número máximo de componentes a evaluar
    max_components = min(XTrain.shape[1], 20)

    # Lista para almacenar el MSE promedio de cada número de componentes
    mse_scores = []

    for n in range(1, max_components + 1):
        pls = PLSRegression(n_components=n)
        # cross_val_score devuelve scores negativos para 'neg_mean_squared_error'
        score = -np.mean(cross_val_score(pls, XTrain, YTrain, cv=5, scoring='neg_mean_squared_error'))
        mse_scores.append(score)

    # Seleccionar el número de componentes que minimiza el MSE
    best_n_components = np.argmin(mse_scores) + 1
    print(f"Mejor número de componentes PLS: {best_n_components}")

    # Log del parámetro en MLflow
    mlflow.log_param("method", "PLS")
    mlflow.log_param("PLS_best_components", best_n_components)

    # Entrenar el modelo final con el número óptimo de componentes
    pls_opt = PLSRegression(n_components=best_n_components)
    pls_opt.fit(XTrain, YTrain)
    prediccionPLS = pls_opt.predict(XTest).ravel()

    # --- Métricas ---
    mse_pls = mean_squared_error(YTest, prediccionPLS)
    rmse_pls = np.sqrt(mse_pls)
    mae_pls = mean_absolute_error(YTest, prediccionPLS)
    r2_pls = r2_score(YTest, prediccionPLS)

    print(f"MSE PLS: {mse_pls:.2f}, RMSE: {rmse_pls:.2f}, MAE: {mae_pls:.2f}, R2: {r2_pls:.4f}")

    # Guardar métricas y modelo en MLflow
    mlflow.log_metric("MSE", mse_pls)
    mlflow.log_metric("RMSE", rmse_pls)
    mlflow.log_metric("MAE", mae_pls)
    mlflow.log_metric("R2", r2_pls)
    mlflow.sklearn.log_model(
        pls_opt,
        name="PLS_Regression_Model",
        input_example=XTest[:5]
    )

Mejor número de componentes PLS: 20
MSE PLS: 849546.75, RMSE: 921.71, MAE: 99.46, R2: 1.0000


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 943.63it/s]  
