<a href="https://colab.research.google.com/github/brunobobadilla06/Proyecto-Final---Ciencia-de-Datos-I/blob/main/Proyecto_Final_Ciencia_de_datos_I.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Single seed passed to the train/test split, the KFold splitter and the
# random forest in this notebook, so all of them draw from the same fixed seed.
RANDOM_STATE = 42


In [None]:
# Read the raw dataset and take a first look at it: a preview of the first
# rows, its dimensions and the dtype pandas inferred for each column.
# NOTE(review): this is an absolute, environment-specific path — confirm it
# exists wherever the notebook is executed.
csv_path = r"/mnt/data/dataset_ecommerce_tecnologia (1).csv"
df = pd.read_csv(csv_path)

display(df.head())
print("Shape:", df.shape)
print(df.dtypes)


In [None]:

# Parse the ingestion date once. With errors="coerce", any value that does not
# match the YYYY/MM/DD format becomes NaT instead of raising.
fecha_ingreso = pd.to_datetime(df["fecha_ingreso"], format="%Y/%m/%d", errors="coerce")
df["fecha_ingreso"] = fecha_ingreso

# Calendar components derived from the parsed date (missing where it is NaT).
df["anio_ingreso"] = fecha_ingreso.dt.year
df["mes_ingreso"] = fecha_ingreso.dt.month
df["dia_ingreso"] = fecha_ingreso.dt.day

# Only rows with a known target can be used for modelling; work on a copy so
# later changes to df_model do not touch df.
has_target = df["satisfaccion_cliente"].notna()
df_model = df[has_target].copy()

print("Filas antes:", len(df), "| Filas para modelar:", len(df_model))
display(df_model.head())


In [None]:

# Target column, and the feature frame without the target and without the raw
# date (its year/month/day parts are kept as separate columns).
y = df_model["satisfaccion_cliente"]
excluded_columns = ["satisfaccion_cliente", "fecha_ingreso"]
X = df_model.drop(columns=excluded_columns)

# Hold out 25% of the rows for the final evaluation; the fixed seed keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

print("Train:", X_train.shape, "| Test:", X_test.shape)


In [None]:

# Columns grouped by the preprocessing branch that handles them.
numeric_features = [
    "precio",
    "stock",
    "costo_operativo",
    "anio_ingreso",
    "mes_ingreso",
    "dia_ingreso",
]
categorical_features = ["nombre_producto", "region"]

# Numeric branch: fill missing values with the median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encode; handle_unknown="ignore" lets categories
# not seen during fit pass through transform without raising.
categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

preprocessor


In [None]:

# Number of preprocessed columns the univariate selector keeps.
K_BEST = 10

# Rank columns with the f_regression score and keep the K_BEST highest.
feature_selector = SelectKBest(f_regression, k=K_BEST)
feature_selector


In [None]:

def build_model_pipeline(model):
    """Return an unfitted preprocess -> select_kbest -> model pipeline.

    `preprocessor` and `feature_selector` are treated as templates and cloned,
    so each pipeline owns its own transformer instances. A Pipeline fits its
    steps in place; if two pipelines shared the same transformer objects,
    fitting one would overwrite the fitted state seen through the other.

    Parameters
    ----------
    model : estimator
        Unfitted regressor used as the final "model" step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps named "preprocess", "select_kbest" and "model".
    """
    return Pipeline(steps=[
        ("preprocess", clone(preprocessor)),
        ("select_kbest", clone(feature_selector)),
        ("model", model),
    ])


# Candidate 1: ordinary least squares on the selected features.
pipe_lr = build_model_pipeline(LinearRegression())

# Candidate 2: random forest on the same selected features (seeded).
pipe_rf = build_model_pipeline(
    RandomForestRegressor(
        n_estimators=300,
        random_state=RANDOM_STATE,
    )
)

pipe_lr, pipe_rf


In [None]:
from sklearn.metrics import make_scorer

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Metrics reported per fold, using scikit-learn's predefined scorer names.
# Scorers follow a "greater is better" convention, so the two error metrics
# come back negated (test_MAE / test_RMSE hold negative values).
scoring = {
    "MAE": "neg_mean_absolute_error",
    "RMSE": "neg_root_mean_squared_error",
    "R2": "r2",
}

# Shuffled 5-fold splitter with a fixed seed; the same splitter is passed to
# both candidates.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# error_score="raise" surfaces a failing fold as an exception instead of
# recording a placeholder score.
cv_lr = cross_validate(pipe_lr, X_train, y_train, cv=cv, scoring=scoring, error_score="raise")
cv_rf = cross_validate(pipe_rf, X_train, y_train, cv=cv, scoring=scoring, error_score="raise")

def summarize_cv(cv_result, name):
    """Condense per-fold cross-validation scores into a one-row DataFrame.

    Parameters
    ----------
    cv_result : mapping
        Must provide "test_MAE", "test_RMSE" and "test_R2" entries that
        support unary minus, `.mean()` and `.std()`. The MAE and RMSE entries
        are sign-flipped here, i.e. they are expected to be stored negated.
    name : str
        Label written to the "modelo" column.

    Returns
    -------
    pandas.DataFrame
        A single row with "modelo" plus the mean and standard deviation (as
        returned by the entries' own `.std()`) of MAE, RMSE and R2.
    """
    fold_scores = {
        "MAE": -cv_result["test_MAE"],
        "RMSE": -cv_result["test_RMSE"],
        "R2": cv_result["test_R2"],
    }

    summary = {"modelo": [name]}
    for metric, values in fold_scores.items():
        summary[f"{metric}_mean"] = [values.mean()]
        summary[f"{metric}_std"] = [values.std()]

    return pd.DataFrame(summary)

# One summary row per candidate, stacked into a single comparison table with a
# fresh 0..n-1 index.
cv_summaries = [
    summarize_cv(cv_lr, "LinearRegression + SelectKBest"),
    summarize_cv(cv_rf, "RandomForestRegressor + SelectKBest"),
]
results = pd.concat(cv_summaries, ignore_index=True)

results


In [None]:

# Choose the candidate with the highest mean cross-validated R2.
best_name = results.sort_values("R2_mean", ascending=False).iloc[0]["modelo"]
print("Modelo elegido:", best_name)

# Map the winning label back to its pipeline object.
best_model = pipe_rf if "RandomForest" in best_name else pipe_lr

# Fit the chosen pipeline on the training split and predict on the held-out
# test split.
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

mae_test = mean_absolute_error(y_test, y_pred)
# Reuse the notebook's rmse helper instead of repeating its formula.
rmse_test = rmse(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

# The header previously printed as mojibake ("MÃ©tricas"); emit the intended text.
print("Métricas en TEST")
print("MAE :", mae_test)
print("RMSE:", rmse_test)
print("R2  :", r2_test)


In [None]:



# Rebuild the names of the columns the fitted preprocessor outputs. The list is
# assembled in the order the ColumnTransformer lists its branches: numeric
# columns first, then the one-hot columns.
num_names = list(numeric_features)  # copy, so the original list is never aliased

ohe = best_model.named_steps["preprocess"].named_transformers_["cat"].named_steps["onehot"]
ohe_feature_names = list(ohe.get_feature_names_out(categorical_features))

final_feature_names = num_names + ohe_feature_names

# Boolean mask over the preprocessed columns: True where SelectKBest kept one.
mask = best_model.named_steps["select_kbest"].get_support()

# zip() stops at the shorter input, so a mismatch between the hand-built name
# list and the selector mask would silently mislabel or drop features.
assert len(final_feature_names) == len(mask), (
    f"Feature-name list ({len(final_feature_names)}) does not match the "
    f"selector mask ({len(mask)})"
)

selected_features = [f for f, keep in zip(final_feature_names, mask) if keep]

print(f"Cantidad de features finales: {len(final_feature_names)}")
print(f"Cantidad seleccionadas (K={K_BEST}): {len(selected_features)}")
selected_features
