In [None]:
!pip install scikit-learn
!pip install mlflow

In [None]:
# Importo librerías
import os
from pathlib import Path

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


# Carga de datos

In [None]:
# Read ../data/data_cleaned.csv into the working DataFrame.
data_path = "../data/data_cleaned.csv"
df = pd.read_csv(data_path)

In [None]:
# Show the DataFrame's column names as a plain list.
df.columns.tolist()

In [None]:
# Inspect the distinct values of the 'ad_type' and 'operation_type' columns.
unique_ads, unique_op_type = (
    df[col].unique() for col in ('ad_type', 'operation_type')
)
print("Unique ad types:", unique_ads)
print("Unique operation types:", unique_op_type)

In [None]:
# Columns removed from the working frame before feature engineering.
cols_to_drop = [
    'id', 'ad_type', 'start_date', 'end_date', 'l1', 'l2',
    'operation_type', 'title_clean', 'title', 'description', 'price',
]
df = df.drop(columns=cols_to_drop)

In [None]:
# Print a summary of the DataFrame (column dtypes and non-null counts).
df.info()

In [None]:
# Parse 'created_on' as a datetime and keep the parsed column in the frame.
created = pd.to_datetime(df['created_on'])
df['created_on'] = created

# Derive year, month and day-of-month features (day of week is not extracted).
for part in ('year', 'month', 'day'):
    df[f'created_{part}'] = getattr(created.dt, part)

In [None]:
# Collect column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

# One-hot encode the categorical columns, then discard the original
# 'created_on' datetime column.
df_encoded = (
    pd.get_dummies(df, columns=categorical_cols)
    .drop(columns=['created_on'])
)

In [None]:
# Split the encoded frame into the target Y ('price_usd') and the features X
# (every other column).
target_col = 'price_usd'
X = df_encoded.drop(columns=[target_col])
Y = df_encoded[target_col]

In [None]:
# Hold out 33% of the rows as the test set; the fixed seed makes the split
# reproducible.
split = train_test_split(X, Y, random_state=42, test_size=0.33)
XTrain, XTest, YTrain, YTest = split

# MLflow

In [None]:
# Point MLflow at the "mlruns" folder one level above the working directory.
# abspath() collapses the ".." segment and Path.as_uri() builds a well-formed
# file:// URI on every OS; the hand-built f"file:///{path}" produced an extra
# slash on POSIX and raw backslashes / unescaped characters on Windows.
current_dir = os.getcwd()
mlruns_path = os.path.abspath(os.path.join(current_dir, "..", "mlruns"))
mlflow.set_tracking_uri(Path(mlruns_path).as_uri())
mlflow.set_experiment("Property_forecast_v0")


def eval_and_log(y_true, y_pred):
    """Compute regression metrics and log them to the active MLflow run.

    The model cells call this helper, so it has to exist before they run.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict: ``{"rmse": ..., "mae": ..., "r2": ...}`` as plain floats.
    """
    metrics = {
        # RMSE is taken as sqrt(MSE) so it works on every scikit-learn version.
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "r2": float(r2_score(y_true, y_pred)),
    }
    mlflow.log_metrics(metrics)
    return metrics

## ElasticNet con GridSearchCV

In [None]:
# Grid-search an ElasticNet regressor and record the best model in MLflow.
with mlflow.start_run(run_name="ElasticNet_Grid"):
    # 10 x 10 grid: alpha log-spaced in [1e-3, 1e2], l1_ratio linear in [0.05, 0.95].
    grid = {
        "alpha":    np.logspace(-3, 2, 10),
        "l1_ratio": np.linspace(0.05, 0.95, 10)
    }
    model = ElasticNet(max_iter=20000, random_state=42)
    gs = GridSearchCV(model, grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
    # Fit on the split produced by train_test_split; the former names
    # Xtr/ytr/Xte/yte were never defined and raised NameError.
    # NOTE(review): the features are not standardized here, and ElasticNet's
    # penalty is scale-sensitive — confirm whether a scaler step was intended.
    gs.fit(XTrain, YTrain)

    # GridSearchCV refits the best configuration on the full training set.
    best = gs.best_estimator_
    mlflow.log_params({"model":"ElasticNet", **gs.best_params_})

    # Evaluate on the held-out test set.
    # eval_and_log is a notebook-level helper; presumably it logs the test
    # metrics to the active run and returns them — verify against its definition.
    ypred = best.predict(XTest)
    metrics = eval_and_log(YTest, ypred)

    mlflow.sklearn.log_model(best, artifact_path="model")
    print("ElasticNet:", gs.best_params_, metrics)

## SVR con kernel RBF

In [None]:
# Grid-search an RBF-kernel SVR and record the best model in MLflow.
with mlflow.start_run(run_name="SVR_RBF_Grid"):
    # 5 x 4 x 2 = 40 candidate configurations, each evaluated with 3-fold CV.
    grid = {
        "C":       [1, 3, 10, 30, 100],
        "epsilon": [0.01, 0.05, 0.1, 0.2],
        "gamma":   ["scale", "auto"]
    }
    model = SVR(kernel="rbf")
    gs = GridSearchCV(model, grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)
    # Fit on the split produced by train_test_split; the former names
    # Xtr/ytr/Xte/yte were never defined and raised NameError.
    # NOTE(review): the features are not standardized here, and RBF kernels are
    # scale-sensitive — confirm whether a scaler step was intended.
    gs.fit(XTrain, YTrain)

    # GridSearchCV refits the best configuration on the full training set.
    best = gs.best_estimator_
    mlflow.log_params({"model":"SVR_RBF", **gs.best_params_})

    # Evaluate on the held-out test set.
    # eval_and_log is a notebook-level helper; presumably it logs the test
    # metrics to the active run and returns them — verify against its definition.
    ypred = best.predict(XTest)
    metrics = eval_and_log(YTest, ypred)

    mlflow.sklearn.log_model(best, artifact_path="model")
    print("SVR RBF:", gs.best_params_, metrics)


## HistGradientBoostingRegressor

In [None]:
# Grid-search a HistGradientBoostingRegressor and record the best model in MLflow.
with mlflow.start_run(run_name="HistGradientBoosting_Grid"):
    # 3 x 3 x 3 = 27 candidate configurations, each evaluated with 3-fold CV.
    grid = {
        "learning_rate": [0.03, 0.05, 0.1],
        "max_depth": [None, 6, 12],
        "max_leaf_nodes": [31, 63, 127]
    }
    # Early stopping holds out 15% of each training fold as a validation set.
    model = HistGradientBoostingRegressor(
        random_state=42,
        early_stopping=True,
        validation_fraction=0.15
    )
    gs = GridSearchCV(model, grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)
    # Fit on the split produced by train_test_split; the former names
    # Xtr/ytr/Xte/yte were never defined and raised NameError.
    gs.fit(XTrain, YTrain)

    # GridSearchCV refits the best configuration on the full training set.
    best = gs.best_estimator_
    mlflow.log_params({"model":"HistGradientBoosting", **gs.best_params_})

    # Evaluate on the held-out test set.
    # eval_and_log is a notebook-level helper; presumably it logs the test
    # metrics to the active run and returns them — verify against its definition.
    ypred = best.predict(XTest)
    metrics = eval_and_log(YTest, ypred)

    mlflow.sklearn.log_model(best, artifact_path="model")
    print("HGBR:", gs.best_params_, metrics)