
# Practica 4 - Base Unificada (Listings)

Objetivo: construir una base unificada con **listings.csv** y aplicar tecnicas vistas en los notebooks `source/##_*.ipynb`.

- Target principal: **price** (regresion)
- Dataset: **local** `data/listings.csv.gz` (`listings.csv` comprimido con gzip)
- Incluye: EDA profundo + diccionario de datos, limpieza, ingenieria de variables, seleccion de variables,
  modelos lineales/regularizados, KNN, SVM/SVR, Kernel Ridge, SGD, arboles, ensambles, redes neuronales,
  y una seccion auxiliar de clasificacion por segmentos de precio para aplicar LDA/Naive Bayes.


In [None]:

# === 1) Imports y configuracion ===
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression

# Modelos regresion
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor,
    VotingRegressor
)
from sklearn.neural_network import MLPRegressor

# Modelos clasificacion (auxiliar)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Metricas
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# === 2) Data loading ===
# Directory holding the local dataset and the gzip-compressed listings file inside it.
DATA_DIR = Path("data")
DATA_PATH = DATA_DIR / "listings.csv.gz"

# Fail early with a clear message instead of a lower-level pandas/gzip error.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"No existe {DATA_PATH} en el directorio actual")

# low_memory=False makes pandas infer each column's dtype from the whole file at once.
raw = pd.read_csv(DATA_PATH, compression="gzip", low_memory=False)
print("Shape:", raw.shape)
raw.head(3)


## 3) EDA profundo y diccionario de datos

In [None]:

# Quick overview: column names, dtypes and non-null counts
raw.info()


In [None]:

# Missing-value summary: percentage of nulls per column, highest first
missing = (raw.isna().mean() * 100).sort_values(ascending=False)
missing.head(20)


In [None]:

# Diccionario de datos

def build_data_dictionary(df):
    """Build a one-row-per-column profile of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the keys ``column``, ``dtype``,
        ``n_missing``, ``pct_missing``, ``n_unique``, ``pct_unique``,
        ``mean_len`` (mean string length of the non-null values, text columns
        only, otherwise ``None``) and ``sample`` (first three non-null values
        rendered as strings).
    """
    n_rows = len(df)
    rows = []
    for col in df.columns:
        s = df[col]
        n_missing = s.isna().sum()
        n_unique = s.nunique(dropna=True)

        # Guard the zero-row case so the percentages are 0 instead of NaN.
        pct_missing = (n_missing / n_rows) * 100 if n_rows else 0.0
        pct_unique = (n_unique / n_rows) * 100 if n_rows else 0.0

        # Treat both classic object columns and pandas string dtypes as text.
        is_text = (
            pd.api.types.is_object_dtype(s.dtype)
            or pd.api.types.is_string_dtype(s.dtype)
        )
        mean_len = None
        if is_text:
            mean_len = s.dropna().astype(str).str.len().mean()
            # An all-null text column has no defined mean length.
            if pd.isna(mean_len):
                mean_len = None

        rows.append({
            'column': col,
            'dtype': str(s.dtype),
            'n_missing': n_missing,
            'pct_missing': round(pct_missing, 2),
            'n_unique': n_unique,
            'pct_unique': round(pct_unique, 2),
            # Explicit None check: a mean length of 0.0 is a real value, not "missing".
            'mean_len': round(mean_len, 2) if mean_len is not None else None,
            'sample': s.dropna().astype(str).head(3).tolist(),
        })
    return pd.DataFrame(rows)


# Profile every column of the raw listings table and preview the first rows
data_dict = build_data_dictionary(raw)
data_dict.head(10)


In [None]:

# Heuristicas para candidatos a eliminar

def detect_drop_candidates(df, missing_threshold=60, high_card_threshold=200, long_text_len=80):
    """Flag columns that look like poor modelling features.

    A column is flagged when at least one heuristic fires:

    * ``constant``          - at most one distinct non-null value.
    * ``missing>X%``        - null percentage above ``missing_threshold``.
    * ``url_like``          - "url" in the name, or more than 20% of the values contain "http".
    * ``scrape_meta``       - "scrape" in the name.
    * ``id_like``           - name is/ends with "id" or contains "id_", and >90% distinct values.
    * ``long_text``         - text column whose mean length exceeds ``long_text_len``.
    * ``high_cardinality``  - text column with more than ``high_card_threshold`` distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    missing_threshold : float
        Null percentage (0-100) above which a column is flagged.
    high_card_threshold : int
        Distinct-value count above which a text column is flagged.
    long_text_len : float
        Mean character length above which a text column is flagged.

    Returns
    -------
    (pandas.DataFrame, list)
        The report (columns ``column``, ``reasons``, ``pct_missing``,
        ``pct_unique``, ``n_unique``; sorted by ``pct_missing`` then
        ``n_unique``, descending) and the flagged column names in that order.
        Both are empty when nothing is flagged.
    """
    report_columns = ['column', 'reasons', 'pct_missing', 'pct_unique', 'n_unique']
    rows = []
    n = len(df)
    for col in df.columns:
        s = df[col]
        # str() keeps the name heuristics usable with non-string column labels.
        name = str(col).lower()
        n_unique = s.nunique(dropna=True)
        pct_unique = (n_unique / n) * 100 if n else 0
        pct_missing = s.isna().mean() * 100
        reasons = []

        # Constant columns
        if n_unique <= 1:
            reasons.append('constant')

        # Mostly-missing columns
        if pct_missing > missing_threshold:
            reasons.append(f'missing>{missing_threshold}%')

        # IDs / URLs / scrape metadata
        if 'url' in name or s.astype(str).str.contains('http', case=False, na=False).mean() > 0.2:
            reasons.append('url_like')
        if 'scrape' in name:
            reasons.append('scrape_meta')
        if name == 'id' or name.endswith('id') or 'id_' in name:
            # The name test alone is loose, so near-unique values are also required.
            if pct_unique > 90:
                reasons.append('id_like')

        # Long free text / high cardinality (text columns only)
        is_text = (
            pd.api.types.is_object_dtype(s.dtype)
            or pd.api.types.is_string_dtype(s.dtype)
        )
        if is_text:
            mean_len = s.dropna().astype(str).str.len().mean()
            if pd.notna(mean_len) and mean_len > long_text_len:
                reasons.append('long_text')
            if n_unique > high_card_threshold:
                reasons.append('high_cardinality')

        if reasons:
            rows.append({
                'column': col,
                'reasons': ', '.join(reasons),
                'pct_missing': round(pct_missing, 2),
                'pct_unique': round(pct_unique, 2),
                'n_unique': n_unique
            })

    # Explicit columns keep the sort keys present even when no column was flagged;
    # without them an empty result raises KeyError in sort_values.
    report = pd.DataFrame(rows, columns=report_columns).sort_values(
        by=['pct_missing', 'n_unique'], ascending=False
    )
    candidates = report['column'].tolist()
    return report, candidates


# Run the drop heuristics on the raw table
drop_report, drop_candidates = detect_drop_candidates(raw)

# Columns we do NOT want to drop even if the heuristics flag them
protect_cols = ['price', 'amenities', 'neighbourhood_cleansed', 'property_type', 'room_type']
drop_candidates = [c for c in drop_candidates if c not in protect_cols]

drop_report.head(20)


In [None]:

# Drop candidates (summary): total count and the first 20 names
print("Total candidatos:", len(drop_candidates))
print(drop_candidates[:20])


In [None]:

# Columns whose name contains "price" (possible leakage if used as features)
price_cols = [c for c in raw.columns if 'price' in c.lower()]
price_cols


In [None]:
# Spearman correlation of every numeric column with the target
eda_df = raw.copy()

# Temporary numeric price for the EDA. Without a 'price' column the correlation
# below cannot be computed, so fail here with an explicit message.
if 'price' not in eda_df.columns:
    raise ValueError("La columna 'price' no existe en listings.csv")

eda_df['price_clean'] = (
    eda_df['price'].astype(str)
    .str.replace(r"[,\$]", "", regex=True)
    .str.strip()
)
eda_df['price_clean'] = pd.to_numeric(eda_df['price_clean'], errors='coerce')

num_cols = eda_df.select_dtypes(include=[np.number]).columns.tolist()
# Exclude the target itself, and the raw 'price' in case it was parsed as numeric
# (it would correlate trivially at 1.0).
num_cols = [c for c in num_cols if c not in ('price', 'price_clean')]

corr = eda_df[num_cols].corrwith(eda_df['price_clean'], method='spearman').sort_values(ascending=False)
print("Top correlaciones positivas:")
print(corr.head(10))
print("")
print("Top correlaciones negativas:")
print(corr.tail(10))


In [None]:

# Bar chart of the 15 largest absolute correlations with the target
if len(corr) > 0:
    top_corr = corr.abs().sort_values(ascending=False).head(15)
    plt.figure(figsize=(8, 5))
    # Sort ascending so the largest bar ends up at the top of the horizontal chart
    top_corr.sort_values().plot(kind='barh')
    plt.title('Top correlaciones (Spearman) con price_clean')
    plt.xlabel('abs(corr)')
    plt.show()


## 4) Limpieza y feature engineering

In [None]:

# Standard-library JSON parser, used below to read the amenities column
import json

# Working copy so the raw table stays untouched
_df = raw.copy()

# Clean price -> numeric (strip "$" and thousands separators)
if 'price' in _df.columns:
    _df['price_clean'] = (
        _df['price'].astype(str)
        .str.replace(r"[,\$]", "", regex=True)
        .str.strip()
    )
    _df['price_clean'] = pd.to_numeric(_df['price_clean'], errors='coerce')
else:
    raise ValueError("La columna 'price' no existe en listings.csv")

# Bathroom count: first number found in bathrooms_text (when the column exists)
if 'bathrooms_text' in _df.columns:
    _df['bathrooms_count'] = (
        _df['bathrooms_text'].astype(str)
        .str.extract(r"(\d+\.?\d*)")[0]
        .astype(float)
    )

# Amenities -> list and count
if 'amenities' in _df.columns:
    def _clean_amenities(text):
        """Parse one raw amenities cell into a list of amenity names.

        The cell is read as a JSON array first, so names that contain commas
        or escaped characters stay intact. When it is not valid JSON the
        previous bracket/quote stripping and comma split is used. Missing or
        non-string cells give an empty list.
        """
        if not isinstance(text, str):
            return []
        try:
            parsed = json.loads(text)
        except ValueError:
            parsed = None
        if isinstance(parsed, list):
            names = (str(item).strip() for item in parsed)
            return [name for name in names if name]
        # Fallback for cells that are not a JSON array
        text = text.replace('[', '').replace(']', '').replace('"', '')
        return [x.strip() for x in text.split(',') if x.strip()]

    _df['amenities_list'] = _df['amenities'].apply(_clean_amenities)
    _df['amenities_count'] = _df['amenities_list'].apply(len)

    # A few hand-picked binary amenity flags.
    # NOTE(review): matching is by case-insensitive substring, so e.g. 'Dryer'
    # also matches "Hair dryer" - confirm this is intended.
    for amenity in [
        'Wifi', 'Air conditioning', 'Kitchen', 'Pool', 'Parking', 'Washer', 'Dryer'
    ]:
        col = f"has_{amenity.lower().replace(' ', '_')}"
        needle = amenity.lower()
        # Bind the search term as a default argument so the lambda does not
        # depend on the loop variable.
        _df[col] = _df['amenities_list'].apply(
            lambda items, needle=needle: 1 if any(needle in item.lower() for item in items) else 0
        )

# Typical binary variables: 't'/'f' -> 1/0 (anything else becomes NaN)
for col in ['host_is_superhost', 'instant_bookable']:
    if col in _df.columns:
        _df[col] = _df[col].astype(str).str.lower().map({'t': 1, 'f': 0})

# Capacity features (create missing source columns as NaN so the formulas still run)
for c in ['accommodates', 'bedrooms', 'beds']:
    if c not in _df.columns:
        _df[c] = np.nan

_df['total_capacity'] = _df['accommodates'].fillna(0) + _df['bedrooms'].fillna(0) + _df['beds'].fillna(0)
# accommodates == 0 is replaced by NaN to avoid dividing by zero
_df['bed_per_person'] = _df['beds'] / (_df['accommodates'].replace(0, np.nan))
_df['bedroom_per_person'] = _df['bedrooms'] / (_df['accommodates'].replace(0, np.nan))

# Reviews and quality
for c in ['review_scores_rating', 'number_of_reviews', 'reviews_per_month']:
    if c not in _df.columns:
        _df[c] = np.nan

_df['reviews_per_month'] = pd.to_numeric(_df['reviews_per_month'], errors='coerce')

# Location: distance to the city centre (approximate CDMX coordinates).
# The distance is Euclidean in raw latitude/longitude degrees, so the 0.05
# threshold below is in degrees as well.
if 'latitude' in _df.columns and 'longitude' in _df.columns:
    center_lat, center_lon = 19.4326, -99.1332
    _df['distance_from_center'] = np.sqrt(
        (_df['latitude'] - center_lat)**2 + (_df['longitude'] - center_lon)**2
    )
    _df['is_central_location'] = (_df['distance_from_center'] < 0.05).astype(int)

# Booking / availability
for c in ['minimum_nights', 'maximum_nights', 'availability_365']:
    if c not in _df.columns:
        _df[c] = np.nan

_df['booking_flexibility'] = _df['maximum_nights'] - _df['minimum_nights']
_df['availability_rate'] = _df['availability_365'] / 365
_df['scarcity_score'] = 1 - _df['availability_rate']

# Log transforms (log1p keeps zeros finite)
for c in ['price_clean', 'number_of_reviews', 'reviews_per_month']:
    if c in _df.columns:
        _df[f'log_{c}'] = np.log1p(_df[c])


# All amenities -> one binary indicator column per distinct amenity
if 'amenities_list' in _df.columns:
    # Full amenity vocabulary, ordered by frequency (value_counts sorts descending)
    flat = [a for row in _df['amenities_list'] for a in row]
    amenity_counts = pd.Series(flat).value_counts()
    all_amenities = amenity_counts.index.tolist()

    def _sanitize_amenity(name: str) -> str:
        """Turn an amenity label into an 'amenity_<slug>' column name (lowercase, non-alphanumerics -> '_')."""
        name = name.strip().lower()
        name = re.sub(r'[^0-9a-zA-Z]+', '_', name)
        return 'amenity_' + name.strip('_')

    # Avoid name collisions: different labels can sanitize to the same slug,
    # so later duplicates get a numeric suffix (_2, _3, ...)
    col_map = {}
    used = set()
    for a in all_amenities:
        base = _sanitize_amenity(a)
        col = base
        k = 2
        while col in used:
            col = f"{base}_{k}"
            k += 1
        col_map[a] = col
        used.add(col)

    # Build the (sparse) binary matrix for all amenities
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer(classes=all_amenities, sparse_output=True)
    amenity_mat = mlb.fit_transform(_df['amenities_list'])

    # Column names follow the order of mlb.classes_
    amenity_cols = [col_map[a] for a in mlb.classes_]
    amenity_df = pd.DataFrame.sparse.from_spmatrix(amenity_mat, index=_df.index, columns=amenity_cols)
    # NOTE(review): whether this cast keeps the columns sparse or densifies them
    # depends on the pandas version - confirm memory use on the full dataset.
    amenity_df = amenity_df.astype('int8')

    # Single concat instead of inserting the indicator columns one by one
    _df = pd.concat([_df, amenity_df], axis=1)

    print('Total amenidades creadas:', len(amenity_cols))

print("Shape despues de FE:", _df.shape)
_df[['price_clean']].describe()


### Correlacion de amenidades con price_clean


In [None]:
# Every amenity indicator: the full 'amenity_*' matrix plus the hand-picked 'has_*' flags
amenity_cols = [c for c in _df.columns if c.startswith('amenity_') or c.startswith('has_')]
if amenity_cols:
    # Spearman correlation of each indicator with the numeric price
    corr_amen = _df[amenity_cols].corrwith(_df['price_clean'], method='spearman').sort_values(ascending=False)
    print('Top correlaciones positivas (amenities):')
    print(corr_amen.head(10))
    print('')
    print('Top correlaciones negativas (amenities):')
    print(corr_amen.tail(10))

    # Plot the 15 largest absolute correlations
    top = corr_amen.abs().sort_values(ascending=False).head(15)
    if len(top) > 0:
        plt.figure(figsize=(8, 5))
        top.sort_values().plot(kind='barh')
        plt.title('Top correlaciones (amenities) con price_clean')
        plt.xlabel('abs(corr)')
        plt.show()
else:
    print('No se generaron columnas de amenidades.')


## 5) Manejo de nulos y outliers

In [None]:

# Drop rows without a target value
_df = _df.dropna(subset=['price_clean']).copy()

# Clip target outliers to the 1st-99th percentile range.
# NOTE(review): the percentiles come from the full table, before the train/test
# split, so test-set targets are clipped too - confirm this is acceptable.
p01, p99 = _df['price_clean'].quantile([0.01, 0.99])
_df['price_clean'] = _df['price_clean'].clip(p01, p99)

# Gentle clipping of the numeric predictors (optional)
num_cols = _df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    if col == 'price_clean':
        continue
    # Skip indicator-like columns (at most two distinct values). For a 0/1 flag
    # that is 1 in under 1% of rows, the 99th percentile is 0, so clipping would
    # overwrite every 1 and leave a constant column.
    if _df[col].nunique(dropna=True) <= 2:
        continue
    q01, q99 = _df[col].quantile([0.01, 0.99])
    # All-NaN columns give NaN percentiles; leave those untouched
    if np.isfinite(q01) and np.isfinite(q99):
        _df[col] = _df[col].clip(q01, q99)

print("Shape final modelado:", _df.shape)


## 6) Preparacion de datos para modelado

In [None]:
# Define the target and the base feature exclusions
TARGET = 'price_clean'

# Base columns to exclude: the target and its variants, raw amenities, identifiers and free text
exclude_cols_base = [
    'price', 'price_clean', 'amenities', 'amenities_list',
    'log_price_clean', 'id', 'listing_url', 'scrape_id', 'last_scraped',
    'name', 'description', 'picture_url', 'host_name'
]

# Also exclude the EDA drop candidates. Only the "EDA cell was not run" case is
# tolerated; any other error should surface instead of being silently ignored.
try:
    exclude_cols_base = list(set(exclude_cols_base + drop_candidates))
except NameError:
    print("Aviso: 'drop_candidates' no esta definido; se omiten los candidatos del EDA")

# Text-like columns, screened next for long text / high cardinality to keep the one-hot small
obj_cols = _df.select_dtypes(include=['object', 'category']).columns

def is_list_like_col(s, sample=200):
    """Report whether any of the first ``sample`` non-null values of ``s`` is a list, dict or set."""
    container_types = (list, dict, set)
    head_values = s.dropna().head(sample)
    flags = head_values.map(lambda value: isinstance(value, container_types))
    return flags.any()

# Columns holding Python containers (e.g. amenities_list) cannot be one-hot encoded
list_like_cols = [c for c in obj_cols if is_list_like_col(_df[c])]
obj_cols_safe = [c for c in obj_cols if c not in list_like_cols]

# More than 200 distinct values -> too many one-hot columns
high_card_cols = [c for c in obj_cols_safe if _df[c].nunique(dropna=True) > 200]

# Mean length above 80 characters -> treated as free text
long_text_cols = []
for c in obj_cols_safe:
    mean_len = _df[c].dropna().astype(str).str.len().mean()
    if mean_len and mean_len > 80:
        long_text_cols.append(c)

exclude_cols = sorted(set(exclude_cols_base + high_card_cols + long_text_cols + list_like_cols))

print("List-like (excluidas):", list_like_cols[:8])
print("Excluidas por alta cardinalidad:", high_card_cols[:8])
print("Excluidas por texto largo:", long_text_cols[:8])

# Everything not excluded becomes a feature
features = [c for c in _df.columns if c not in exclude_cols]

X = _df[features].copy()
y = _df[TARGET].copy()

# Split the features by dtype for the preprocessing pipelines
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numericas:", len(num_features))
print("Categoricas:", len(cat_features))


In [None]:

# Quick univariate screening of the numeric features (SelectKBest with the F-test)
X_num = X[num_features].copy()
# A column with no observed values has a NaN median, so the median fill below
# cannot repair it and f_regression would reject the remaining NaNs. Leave such
# columns out of the screening.
X_num = X_num.dropna(axis=1, how='all')
X_num = X_num.fillna(X_num.median())

k = min(20, X_num.shape[1])
selector = SelectKBest(score_func=f_regression, k=k)

if X_num.shape[1] > 0:
    selector.fit(X_num, y)
    scores = pd.Series(selector.scores_, index=X_num.columns).sort_values(ascending=False)
else:
    # Nothing to score: keep `scores` defined for the print below
    scores = pd.Series(dtype=float)

print("Top features (numericas):")
print(scores.head(10))


In [None]:

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## 7) Preprocesamiento (Pipeline)

In [None]:

# Preprocessing: imputation + scaling for numerics, imputation + one-hot for categoricals
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all zeros instead of raising
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)

# Dense preprocessor (for GaussianNB, which needs dense input).
# The keyword that requests dense one-hot output was renamed from `sparse` to
# `sparse_output` in scikit-learn; try the new name first and fall back to the
# old one so the notebook runs on either side of the rename.
try:
    _dense_onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    _dense_onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer_dense = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', _dense_onehot)
])

preprocessor_dense = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer_dense, cat_features)
    ]
)


## 8) Baseline

In [None]:

# Simple baseline: always predict the training-set mean
baseline_pred = np.full_like(y_test, y_train.mean(), dtype=float)
# RMSE as sqrt(MSE): the `squared=False` shortcut is not accepted by newer
# scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))
mae = mean_absolute_error(y_test, baseline_pred)
r2 = r2_score(y_test, baseline_pred)
print({"RMSE": rmse, "MAE": mae, "R2": r2})


## 9) Evaluacion de modelos de regresion

In [None]:

# Funcion de evaluacion

def eval_regression_models(models, X_train, X_test, y_train, y_test, preprocessor):
    """Fit each model behind ``preprocessor`` and score it on the test split.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name and regressor for each candidate.
    X_train, X_test, y_train, y_test
        Train/test features and targets.
    preprocessor
        Transformer used as the first pipeline step. The same instance is
        passed to every pipeline, so it is refitted in place for each model.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``RMSE``, ``MAE`` and
        ``R2``, sorted by ascending RMSE.
    """
    rows = []
    for name, model in models:
        pipe = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        rows.append({
            'Model': name,
            # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
            # releases no longer accept.
            'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
            'MAE': mean_absolute_error(y_test, preds),
            'R2': r2_score(y_test, preds)
        })
    return pd.DataFrame(rows).sort_values(by='RMSE')

# Ensemble that combines a linear model, a random forest and gradient boosting
voter = VotingRegressor(
    estimators=[
        ('lr', LinearRegression()),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
        ('gbr', GradientBoostingRegressor(random_state=42))
    ]
)

# (display name, estimator) pairs evaluated with default-ish hyperparameters
# NOTE(review): KernelRidge and SVR can be slow or memory-hungry on large
# training sets - confirm they are feasible for this dataset size.
models_reg = [
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge(alpha=1.0)),
    ('Lasso', Lasso(alpha=0.001)),
    ('ElasticNet', ElasticNet(alpha=0.001, l1_ratio=0.5)),
    ('BayesianRidge', BayesianRidge()),
    ('SGDRegressor', SGDRegressor(max_iter=2000, tol=1e-3, random_state=42)),
    ('KernelRidge_RBF', KernelRidge(kernel='rbf', alpha=1.0, gamma=0.1)),
    ('SVR_RBF', SVR(C=10, gamma='scale')),
    ('KNN', KNeighborsRegressor(n_neighbors=10)),
    ('DecisionTree', DecisionTreeRegressor(max_depth=12, random_state=42)),
    ('RandomForest', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
    ('ExtraTrees', ExtraTreesRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
    ('GradientBoosting', GradientBoostingRegressor(random_state=42)),
    ('AdaBoost', AdaBoostRegressor(random_state=42)),
    ('MLPRegressor', MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)),
    ('VotingRegressor', voter)
]

# Fit and score every candidate; the result is sorted by RMSE (best first)
results_reg = eval_regression_models(models_reg, X_train, X_test, y_train, y_test, preprocessor)
results_reg.head(10)


## 10) Ajuste de hiperparametros (subset)

In [None]:

# Hyperparameter search for a few key models (small grids to keep runtime down)

# Keys match the names in models_reg; the 'model__' prefix targets the pipeline's 'model' step
param_grids = {
    'Ridge': {
        'model__alpha': [0.1, 1.0, 10.0]
    },
    'RandomForest': {
        'model__n_estimators': [200, 400],
        'model__max_depth': [10, 20, None],
        'model__min_samples_split': [2, 5]
    },
    'SVR_RBF': {
        'model__C': [1, 10],
        'model__gamma': ['scale', 0.1]
    },
    'MLPRegressor': {
        'model__hidden_layer_sizes': [(64, 32), (128, 64)],
        'model__alpha': [0.0001, 0.001]
    }
}

# name -> best fitted pipeline found by the grid search
best_models = {}

for name, model in models_reg:
    # Only tune the models that have a grid defined above
    if name not in param_grids:
        continue
    pipe = Pipeline(steps=[('preprocess', preprocessor), ('model', model)])
    grid = GridSearchCV(pipe, param_grids[name], cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_
    # The scorer is negated RMSE, so flip the sign for display
    print(name, "best RMSE:", -grid.best_score_)

best_models


## 11) Interpretabilidad (features y coeficientes)

In [None]:

# Interpretabilidad para el mejor modelo arbol y uno lineal

def get_feature_names(preprocessor, num_features, cat_features):
    """Return the output feature names of a fitted preprocessor.

    Parameters
    ----------
    preprocessor
        Fitted column transformer exposing ``named_transformers_`` with a
        ``'num'`` and a ``'cat'`` pipeline; the ``'cat'`` pipeline must have
        an ``'onehot'`` step.
    num_features : list of str
        Numeric input columns, in the order given to the preprocessor.
    cat_features : list of str
        Categorical input columns, in the order given to the preprocessor.

    Returns
    -------
    list of str
        Numeric names followed by the one-hot encoded categorical names.
    """
    def _surviving(transformer, columns):
        # An imputer drops columns that were entirely missing at fit time;
        # those show up as NaN entries in `statistics_`. Filtering them out
        # keeps the names aligned with the transformed matrix.
        steps = getattr(transformer, 'named_steps', None)
        imputer = steps.get('imputer') if steps is not None else None
        stats = getattr(imputer, 'statistics_', None)
        if (
            stats is None
            or len(stats) != len(columns)
            or getattr(imputer, 'strategy', None) == 'constant'
            or getattr(imputer, 'keep_empty_features', False)
        ):
            # Nothing to infer (or nothing is dropped): keep every column
            return list(columns)
        return [c for c, stat in zip(columns, stats) if not pd.isna(stat)]

    transformers = preprocessor.named_transformers_

    # Numeric features
    feat_names = _surviving(transformers.get('num'), num_features)

    # One-hot encoded categorical features
    if cat_features:
        cat_pipe = transformers['cat']
        ohe = cat_pipe.named_steps['onehot']
        cat_names = ohe.get_feature_names_out(_surviving(cat_pipe, cat_features)).tolist()
        feat_names.extend(cat_names)
    return feat_names

# Pick the tuned random forest when the grid search produced one
if 'RandomForest' in best_models:
    model = best_models['RandomForest']
else:
    # Fallback: train a baseline random forest pipeline
    model = Pipeline(steps=[('preprocess', preprocessor), ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))])
    model.fit(X_train, y_train)

# Extract the importances from the fitted pipeline
pre = model.named_steps['preprocess']
rf = model.named_steps['model']

# Pair each importance with its post-preprocessing feature name, largest first
feat_names = get_feature_names(pre, num_features, cat_features)
importances = pd.Series(rf.feature_importances_, index=feat_names).sort_values(ascending=False)
importances.head(15)


In [None]:

# Ridge (linear) coefficients
pipe_ridge = Pipeline(steps=[('preprocess', preprocessor), ('model', Ridge(alpha=1.0))])
pipe_ridge.fit(X_train, y_train)

# Note: `pre` and `model` are rebound here to the Ridge pipeline's steps
pre = pipe_ridge.named_steps['preprocess']
model = pipe_ridge.named_steps['model']
feat_names = get_feature_names(pre, num_features, cat_features)

# Sort by absolute value so the strongest effects (either sign) come first
coef = pd.Series(model.coef_, index=feat_names).sort_values(key=np.abs, ascending=False)
coef.head(15)


## 12) Clasificacion auxiliar por segmentos de precio (aplica LDA/Naive Bayes/SVM)

In [None]:

# Create price segments (low/mid/high, equal-frequency terciles) to apply classification
_df_class = _df.copy()
_df_class['price_segment'] = pd.qcut(_df_class['price_clean'], q=3, labels=['low', 'mid', 'high'])

# Same feature set as the regression task; the target is the segment label
Xc = _df_class[features].copy()
yc = _df_class['price_segment'].copy()

# Stratified split keeps the three segments balanced in train and test
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

# (display name, estimator) pairs to compare
models_clf = [
    ('LogisticRegression', LogisticRegression(max_iter=1000)),
    ('KNN', KNeighborsClassifier(n_neighbors=7)),
    ('SVM_RBF', SVC(C=5, gamma='scale', probability=True)),
    ('NaiveBayes', GaussianNB()),
    ('DecisionTree', DecisionTreeClassifier(max_depth=10, random_state=42)),
    ('RandomForest', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)),
    ('GradientBoosting', GradientBoostingClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42))
]

rows = []
for name, model in models_clf:
    # GaussianNB requires dense data, so it gets the dense preprocessor
    use_pre = preprocessor_dense if name == 'NaiveBayes' else preprocessor
    pipe = Pipeline(steps=[('preprocess', use_pre), ('model', model)])
    pipe.fit(Xc_train, yc_train)
    preds = pipe.predict(Xc_test)
    acc = accuracy_score(yc_test, preds)
    # Macro average weights the three segments equally
    f1 = f1_score(yc_test, preds, average='macro')
    rows.append({'Model': name, 'Accuracy': acc, 'F1_macro': f1})

# Best macro-F1 first
results_clf = pd.DataFrame(rows).sort_values(by='F1_macro', ascending=False)
results_clf.head(10)


In [None]:

# LDA for dimensionality reduction (visualization)
# Note: LDA needs numeric input without NaNs -> use only the imputed numeric columns

num_only = _df_class[num_features].copy()
# A column with no observed values has a NaN median, so the median fill cannot
# repair it and LDA would reject the remaining NaNs. Drop such columns first.
num_only = num_only.dropna(axis=1, how='all')
num_only = num_only.fillna(num_only.median())

# Three classes allow at most two discriminant axes
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(num_only, _df_class['price_segment'])

plt.figure(figsize=(7, 5))
for label in ['low', 'mid', 'high']:
    # Plain boolean array: X_lda is a NumPy array and must be indexed by
    # position, not by the DataFrame's index labels.
    mask = (_df_class['price_segment'] == label).to_numpy()
    plt.scatter(X_lda[mask, 0], X_lda[mask, 1], s=10, alpha=0.5, label=label)
plt.title('LDA - Price Segments')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend()
plt.show()


## 13) Conclusiones y siguiente paso


- Selecciona el mejor modelo de regresion segun RMSE/R2.
- Reporta variables mas influyentes.
- Prepara el caso de uso (usuario final) con el modelo ganador.
