# Notebook para Análisis de Precios de Toyota Corolla

## Configuración y Carga de Datos



In [None]:
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import os
import joblib

In [None]:
# MLflow setup: send tracking data to the local server and select the
# experiment that the runs of this notebook are recorded under.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="toyota_parcial")

In [None]:
# Load the Toyota Corolla dataset from the raw GitHub URL into `df`,
# decoding it as UTF-8 with pandas' pure-Python CSV parser.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/dodobeatle/dataeng-datos/refs/heads/main/ToyotaCorolla.csv",
    engine="python",
    encoding="utf8",
)

## Limpieza de datos

In [None]:
# Fix the Doors variable and discard observations considered incorrect.
print("\nDistribución original de Doors:")
print(df['Doors'].value_counts())

# Drop 2-door cars (treated here as probably erroneous records).
# The explicit .copy() makes the filtered frame independent of the original,
# so the in-place assignment below does not raise SettingWithCopyWarning and
# is guaranteed to modify `df` itself rather than a temporary.
df = df[df['Doors'] != 2].copy()

# Recode 4-door cars as 5 doors (4 side doors -> 5 when counting the boot lid).
df.loc[df['Doors'] == 4, 'Doors'] = 5

print("\nDistribución corregida de Doors:")
print(df['Doors'].value_counts())








## Exploración de Datos (EDA)



In [None]:
# Dataset dimensions as a (rows, columns) tuple; shown as the cell output.
df.shape

In [None]:
# General overview of the frame (pandas prints the summary itself).
df.info()

In [None]:
# Descriptive statistics; shown as the cell output.
df.describe()

In [None]:
# Print the column names as a plain Python list.
print(df.columns.tolist())

In [None]:
# Missing-value check: number of null entries per column.
df.isnull().sum()

In [None]:
# Categorical variable analysis: bar chart with the number of rows per
# Fuel_Type value. countplot returns the Axes it drew on, so the title is set
# directly on that object.
sns.countplot(x="Fuel_Type", data=df, palette="pastel").set_title(
    "Distribución de Fuel_Type"
)
plt.show()

In [None]:
# Búsqueda de duplicados
def find_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` that repeat an earlier row in every column.

    The first occurrence of each repeated row is kept as the reference and is
    not part of the result; only the later repetitions are returned.

    Args:
        df: Frame to inspect. It is not modified.

    Returns:
        A copy of the repeated rows, carrying their original index labels.
        Empty when ``df`` has no fully duplicated rows.
    """
    is_repeat = df.duplicated(keep='first')
    return df.loc[is_repeat].copy()

# Report how many fully duplicated rows the dataset has and, when there are
# any, print them.
duplicates = find_duplicates(df)
n_duplicates = len(duplicates)
print(f"Número de filas duplicadas: {n_duplicates}")
if n_duplicates:
    print("Filas duplicadas:")
    print(duplicates)

In [None]:
# Correlation matrix of the numeric columns, used for exploratory analysis.
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

# Rank the columns by the absolute value of their correlation with Price and
# keep 11 entries: Price itself plus the 10 strongest.
abs_price_corr = corr['Price'].abs()
top_vars = abs_price_corr.sort_values(ascending=False).head(11)

print("Variables más correlacionadas con Price:")
print(top_vars)

In [None]:
# Heatmap of the correlation sub-matrix restricted to the variables in
# `top_vars` (rows and columns in the same ranked order).
top_vars_idx = top_vars.index
sub_corr = corr.loc[top_vars_idx, top_vars_idx]

# Diverging palette centred on 0; each cell is annotated with two decimals.
heatmap_style = {
    "cmap": "vlag",
    "annot": True,
    "fmt": ".2f",
    "linewidths": .5,
    "center": 0,
}

plt.figure(figsize=(10, 8))
sns.heatmap(sub_corr, **heatmap_style)
plt.xticks(rotation=45, ha="right")
plt.title("Top 10 correlaciones con Price", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Detección de outliers
def detect_outliers(df):
    """Count IQR-rule outliers in every numeric column of ``df``.

    A value counts as an outlier when it is strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR`` of its own column.

    Args:
        df: Frame to analyse. Only the columns selected by
            ``select_dtypes(include=np.number)`` are examined.

    Returns:
        A DataFrame with the columns ``'Feature'`` and ``'Number of Outliers'``,
        one row per numeric column, sorted by descending outlier count. The
        frame is empty (but keeps both columns) when ``df`` has no numeric
        columns.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns

    records = []
    for column in numeric_cols:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        fence_low = q1 - (1.5 * iqr)
        fence_high = q3 + (1.5 * iqr)
        is_outlier = (values < fence_low) | (values > fence_high)
        records.append((column, int(is_outlier.sum())))

    # Build the frame once from the collected rows. Concatenating onto an
    # initially empty frame on every iteration copies all previous rows each
    # time and relies on deprecated empty-frame concat behaviour.
    outliers = pd.DataFrame(records, columns=['Feature', 'Number of Outliers'])
    return outliers.sort_values('Number of Outliers', ascending=False)

# Run the IQR outlier count on the whole dataset; the bare name on the last
# line makes the notebook display the resulting table.
outlier_analysis = detect_outliers(df)
outlier_analysis





















## Preprocesamiento y Feature Engineering



In [None]:
# Target variable.
y = df["Price"]

# Predictors: everything except the target and the columns considered not
# relevant for the model. (An earlier `X = df.drop("Price", axis=1)` was
# overwritten immediately by this assignment, so it has been removed.)
excluded_columns = [
    "Price", "Model", "Cylinders", "Id", "Radio_cassette", "BOVAG_Guarantee",
    "Backseat_Divider", "Mfg_Month", "Mfg_Year", "Automatic", "Central_Lock",
    "Met_Color", "Mfr_Guarantee", "Guarantee_Period", "Gears", "Radio",
    "Power_Steering", "Metallic_Rim", "Tow_Bar", "Sport_Model",
]
X = df.drop(columns=excluded_columns)

print("Variables conservadas:")
print(X.columns.tolist())

In [None]:
# Feature engineering.
# Merge the two airbag indicators into a single count (missing values are
# treated as 0) and remove the original columns.
airbag_count = df["Airbag_1"].fillna(0) + df["Airbag_2"].fillna(0)
X["Airbag_Count"] = airbag_count
X = X.drop(columns=["Airbag_1", "Airbag_2"])

# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)

# Coerce everything to numeric; values that cannot be parsed become NaN.
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Keep only the rows that are complete in both predictors and target, then
# split the joined frame back into X and y as floats.
combined = pd.concat([X, y], axis=1).dropna()
y = combined["Price"].astype(float)
X = combined.drop(columns="Price").astype(float)

# statsmodels' OLS does not add an intercept on its own, so a constant column
# is added to the design matrix.
X = sm.add_constant(X)

# 60/40 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.4,
)

print(f"Dimensiones de X_train: {X_train.shape}, X_test: {X_test.shape}")





## Selección de Características con Lasso



In [None]:
# Lasso regression with the regularisation strength chosen by 5-fold CV.
from sklearn.linear_model import LassoCV

# The Lasso inputs exclude the constant column that was added for OLS
# (presumably because LassoCV handles the intercept itself — confirm).
X_train_lasso = X_train.drop(columns="const")
X_test_lasso = X_test.drop(columns="const")

# fit() returns the estimator, so construction and training are chained.
lasso = LassoCV(cv=5, random_state=1).fit(X_train_lasso, y_train)

# Test-set predictions and metrics.
y_pred_lasso = lasso.predict(X_test_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

for report_line in (
    f"Alpha óptimo: {lasso.alpha_:.6f}",
    f"RMSE: {rmse_lasso:.2f}",
    f"MAE: {mae_lasso:.2f}",
    f"R²: {r2_lasso:.4f}",
):
    print(report_line)

# Features kept by Lasso: those whose coefficient is not exactly zero.
kept_mask = lasso.coef_ != 0
lasso_features = X_train_lasso.columns[kept_mask]
print(f"\nFeatures seleccionadas por Lasso ({len(lasso_features)}/{X_train_lasso.shape[1]}):")
print(lasso_features.tolist())



## Reducción de Dimensionalidad con PCA



In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise the predictors: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train_lasso)
X_train_scaled = scaler.transform(X_train_lasso)
X_test_scaled = scaler.transform(X_test_lasso)

# PCA keeping as many components as needed to explain 95% of the variance;
# fitted on the training split and applied to the test split.
pca = PCA(n_components=0.95, random_state=1)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Número de componentes PCA (95% varianza): {pca.n_components_}")

# Cross-validated Lasso trained on the principal components.
lasso_pca = LassoCV(cv=5, random_state=1).fit(X_train_pca, y_train)
y_pred_pca = lasso_pca.predict(X_test_pca)

# Test-set evaluation.
mse_pca = mean_squared_error(y_test, y_pred_pca)
rmse_pca = np.sqrt(mse_pca)
r2_pca = r2_score(y_test, y_pred_pca)

print(f"RMSE: {rmse_pca:.2f}")
print(f"R²: {r2_pca:.4f}")



## Modelo de Regresión Lineal (OLS)



In [None]:
with mlflow.start_run(run_name="ols_model_run"):
    # Fit ordinary least squares on the training split. X_train already
    # contains the constant column added with sm.add_constant.
    model = sm.OLS(y_train, X_train)
    results = model.fit()

    # Full statsmodels summary of the fit.
    print(results.summary())

    # Predictions on the held-out split.
    y_pred = results.predict(X_test)

    # rmse / mae / r2_sklearn are computed on the test split; rsquared,
    # rsquared_adj, aic and bic are taken from the training fit.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2_sklearn = r2_score(y_test, y_pred)
    r2_sm = results.rsquared
    adj_r2 = results.rsquared_adj
    aic = results.aic
    bic = results.bic

    print("\nMétricas de evaluación:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2_sklearn:.4f}")
    print(f"R² ajustado: {adj_r2:.4f}")
    print(f"AIC: {aic:.2f}")
    print(f"BIC: {bic:.2f}")

    # Log every metric to the active run. Values are cast to built-in numbers
    # up front; "n_features" is the column count of X_train, constant included.
    run_metrics = {
        "rmse": float(rmse),
        "mae": float(mae),
        "r2_sklearn": float(r2_sklearn),
        "r2_statsmodels": float(r2_sm),
        "adj_r2": float(adj_r2),
        "aic": float(aic),
        "bic": float(bic),
        "n_features": int(X_train.shape[1]),
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Local folder where artifacts are written before being logged.
    os.makedirs("mlartifacts", exist_ok=True)

    # Model summary as text. The encoding is explicit so the file does not
    # depend on the platform's default encoding.
    summary_path = os.path.join("mlartifacts", "ols_summary.txt")
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(results.summary().as_text())
    mlflow.log_artifact(summary_path)

    # Serialised fitted results object.
    model_pkl = os.path.join("mlartifacts", "ols_model.pkl")
    joblib.dump(results, model_pkl)
    mlflow.log_artifact(model_pkl)

    # Design-matrix column names, one per line.
    features_path = os.path.join("mlartifacts", "features.txt")
    with open(features_path, "w", encoding="utf-8") as f:
        f.write("\n".join(X.columns))
    mlflow.log_artifact(features_path)



## Análisis de Multicolinealidad



In [None]:
# Variance Inflation Factor (VIF) for every column of the training design
# matrix. The underlying array is extracted once instead of on each iteration.
design_matrix = X_train.values
vif_data = pd.DataFrame({
    "feature": X_train.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, i)
        for i in range(design_matrix.shape[1])
    ],
})

# Highest VIF first.
vif_data = vif_data.sort_values("VIF", ascending=False)

print("Análisis de Factor de Inflación de Varianza (VIF):")
print("Un VIF > 10 indica alta multicolinealidad")
print(vif_data)

# Save the report locally; the folder is created here so this cell does not
# depend on another cell having created it.
os.makedirs("mlartifacts", exist_ok=True)
vif_path = os.path.join("mlartifacts", "vif_report.csv")
vif_data.to_csv(vif_path, index=False)

# No run is active at this point unless one was opened explicitly, and the
# fluent mlflow.log_artifact() would then silently start a brand-new run that
# is never ended. Attach the report to the currently active run or, failing
# that, to the most recently finished run of this session.
target_run = mlflow.active_run() or mlflow.last_active_run()
if target_run is not None:
    mlflow.tracking.MlflowClient().log_artifact(target_run.info.run_id, vif_path)
else:
    mlflow.log_artifact(vif_path)



## Visualización de Resultados



In [None]:
# Scatter plot of actual vs. predicted price on the test split. The dashed
# red diagonal marks where predictions would equal the actual values.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, edgecolors='k')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r', lw=2)
plt.xlabel("Precio Real")
plt.ylabel("Precio Predicho")
plt.title("Precio Real vs. Precio Predicho")
plt.grid(True)
plt.tight_layout()

# Save the figure before plt.show(); the folder is created here so this cell
# does not depend on another cell having created it.
os.makedirs("mlartifacts", exist_ok=True)
real_vs_pred_path = os.path.join("mlartifacts", "actual_vs_predicted.png")
plt.savefig(real_vs_pred_path)

# The fluent mlflow.log_artifact() would start a new, never-ended run when no
# run is active. Attach the figure to the active run or, failing that, to the
# most recently finished run of this session.
target_run = mlflow.active_run() or mlflow.last_active_run()
if target_run is not None:
    mlflow.tracking.MlflowClient().log_artifact(target_run.info.run_id, real_vs_pred_path)
else:
    mlflow.log_artifact(real_vs_pred_path)
plt.show()

In [None]:
# Residual analysis: residual = actual - predicted on the test split.
residuals = y_test - y_pred

# Residuals against predicted price, with a dashed reference line at zero.
plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, edgecolors='k')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("Precio Predicho")
plt.ylabel("Residuos")
plt.title("Gráfico de Residuos")
plt.grid(True)
plt.tight_layout()

# Save the figure before plt.show(); the folder is created here so this cell
# does not depend on another cell having created it.
os.makedirs("mlartifacts", exist_ok=True)
residuals_path = os.path.join("mlartifacts", "residuals_plot.png")
plt.savefig(residuals_path)

# The fluent mlflow.log_artifact() would start a new, never-ended run when no
# run is active. Attach the figure to the active run or, failing that, to the
# most recently finished run of this session.
target_run = mlflow.active_run() or mlflow.last_active_run()
if target_run is not None:
    mlflow.tracking.MlflowClient().log_artifact(target_run.info.run_id, residuals_path)
else:
    mlflow.log_artifact(residuals_path)
plt.show()

In [None]:
# Histogram of the residuals (30 bins) with a dashed reference line at zero,
# drawn through the Axes object instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(residuals, bins=30, alpha=0.7, edgecolor='k')
ax.axvline(x=0, color='r', linestyle='--')
ax.set_xlabel("Residuos")
ax.set_ylabel("Frecuencia")
ax.set_title("Distribución de Residuos")
ax.grid(True)
fig.tight_layout()
plt.show()







## Comparativa de Modelos



In [None]:
# Comparison table: one row per model with its test-set metrics and the
# number of inputs it uses.
# NOTE(review): the OLS count is X_train.shape[1], which includes the constant
# column, whereas the Lasso count only covers non-zero coefficients — confirm
# this is the intended comparison.
mae_pca = mean_absolute_error(y_test, y_pred_pca)
comparison_rows = [
    ("OLS", rmse, mae, r2_sklearn, X_train.shape[1]),
    ("Lasso", rmse_lasso, mae_lasso, r2_lasso, len(lasso_features)),
    ("Lasso+PCA", rmse_pca, mae_pca, r2_pca, pca.n_components_),
]
results_df = pd.DataFrame(
    comparison_rows,
    columns=["Modelo", "RMSE", "MAE", "R²", "Num. Features"],
)

print("Comparativa de modelos:")
print(results_df)

In [None]:
# Conclusions: best model by R² and by RMSE, plus the most relevant OLS
# predictors.
best_r2_row = results_df['R²'].idxmax()
best_rmse_row = results_df['RMSE'].idxmin()

print("\nCONCLUSIONES:")
print("-" * 60)
print(f"• El modelo {results_df.loc[best_r2_row, 'Modelo']} tiene el mejor R² ({results_df['R²'].max():.4f}).")
print(f"• El modelo {results_df.loc[best_rmse_row, 'Modelo']} tiene el RMSE más bajo ({results_df['RMSE'].min():.2f}).")
print(f"• Las variables más importantes para predecir el precio son:")

# The OLS predictors are not standardised, so the raw coefficient size depends
# on each variable's units and is not comparable across variables. The top 5
# are therefore chosen by absolute t-statistic, which does not change when a
# predictor is rescaled; the value printed is still the fitted coefficient.
# The intercept is excluded by name.
coefficients = results.params.drop("const", errors="ignore")
t_strength = results.tvalues.drop("const", errors="ignore").abs()
for feat in t_strength.sort_values(ascending=False).index[:5]:
    print(f"  - {feat}: {coefficients[feat]:.4f}")
print("-" * 60)
print("Modelo registrado con MLflow.")