In [5]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import joblib
import os

In [6]:
# Configure MLflow: point the client at the tracking server and select the
# experiment that subsequent runs are recorded under.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="toyota_project")

2025/05/17 17:23:33 INFO mlflow.tracking.fluent: Experiment with name 'toyota_project' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/534797378180424110', creation_time=1747513413205, experiment_id='534797378180424110', last_update_time=1747513413205, lifecycle_stage='active', name='toyota_project', tags={}>

In [7]:
# Load the dataset straight from its raw GitHub URL into a DataFrame,
# using pandas' Python parsing engine and UTF-8 decoding.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/dodobeatle/dataeng-datos/refs/heads/main/ToyotaCorolla.csv",
    engine="python",
    encoding="utf8",
)

In [8]:
# Separate the target ("Price") from every other column.
# NOTE: both X and y are reassigned by the preprocessing cell that follows,
# which restricts X to a fixed subset of columns.
y = df["Price"]
X = df.drop(columns="Price")

In [9]:
# Predictor columns retained for the regression
columns_to_keep = [
    "Age_08_04", "KM", "Fuel_Type", "HP",
    "cc", "Doors", "Gears", "Weight",
]

# Preprocessing: one-hot encode the selected predictors (dropping the first
# level of each encoded column), then coerce every column to numeric so that
# unparseable values become NaN. The target gets the same coercion.
X = pd.get_dummies(df[columns_to_keep], drop_first=True).apply(
    pd.to_numeric, errors="coerce"
)
y = pd.to_numeric(df["Price"], errors="coerce")

# Join predictors and target so rows with a NaN on either side are dropped
# together, keeping X and y aligned.
combined = pd.concat([X, y], axis=1).dropna()

# Split back apart as floats and add the constant (intercept) column for OLS
y = combined["Price"].astype(float)
X = sm.add_constant(combined.drop(columns="Price").astype(float))

In [10]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.4)

In [11]:
# Fit an OLS model inside an MLflow run, then log its hold-out metrics and
# its artifacts (text summary, pickled results, feature names).
with mlflow.start_run(run_name="ols_model_run"):
    # Fit the model on the training split
    model = sm.OLS(y_train, X_train)
    results = model.fit()

    # Predictions on the held-out split
    y_pred = results.predict(X_test)

    # Metrics. mean_squared_error returns the MSE (squared target units), so
    # the RMSE is its square root. Previously the raw MSE was logged under the
    # "rmse" key; it is now logged under its own name as well.
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2_score", r2)

    # Create the local artifact folder if it does not exist yet
    os.makedirs("mlartifacts", exist_ok=True)

    # Save the regression summary as a .txt file and log it
    summary_path = os.path.join("mlartifacts", "ols_summary.txt")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(results.summary().as_text())
    mlflow.log_artifact(summary_path)

    # Persist the fitted results object with joblib and log it
    model_path = os.path.join("mlartifacts", "ols_model.pkl")
    joblib.dump(results, model_path)
    mlflow.log_artifact(model_path)

    # Save the design-matrix column names, one per line
    features_path = os.path.join("mlartifacts", "features.txt")
    with open(features_path, "w", encoding="utf-8") as f:
        f.write("\n".join(X.columns))
    mlflow.log_artifact(features_path)

print("Modelo OLS registrado con MLflow.")

🏃 View run ols_model_run at: http://localhost:5000/#/experiments/534797378180424110/runs/bc4ac131f2cf42478c1953cd1d7e6aba
🧪 View experiment at: http://localhost:5000/#/experiments/534797378180424110
Modelo OLS registrado con MLflow.
