# 02_modelado.ipynb

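Modelado y clasificación: se entrena un RandomForestClassifier para clasificar cada local en valoración baja, media o alta (terciles de `valoracion_norm`), se evalúa con accuracy y ROC-AUC, y se guarda el modelo entrenado.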

In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, label_binarize

# Load the processed dataset (path is relative to the working directory).
PROC_CSV = 'googlemaps/data/MAPS_locales_procesado.csv'
df = pd.read_csv(PROC_CSV)

# Prepare data
# Target: quantile-bin the normalized rating into three classes (terciles),
# labelled from lowest to highest bin.
df['clase_val'] = pd.qcut(df['valoracion_norm'], q=3, labels=['baja','media','alta'])

# Feature matrix. The selection previously appended `list(names)`, but `names`
# is never defined in this notebook, so the cell failed with NameError. The
# preprocessing step only consumes the nine columns listed here (its
# ColumnTransformer uses the default remainder, which drops everything else),
# so selecting exactly these columns yields the same model inputs.
X = df[[
    'latitud', 'longitud', 'categoria_negocio', 'dist_city_center_km',
    'density_500m', 'density_1000m', 'density_2000m', 'ratio_500m_2km',
    'cluster_zone',
]]
y = df['clase_val']

# 80/20 split, stratified on the target so each class keeps its proportion
# in both partitions; fixed seed for reproducibility.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Pipeline
# Numeric columns are standardized; categorical columns are one-hot encoded.
num_feats = ['latitud','longitud','dist_city_center_km','density_500m','density_1000m','density_2000m','ratio_500m_2km']
cat_feats = ['categoria_negocio','cluster_zone']
num_pipe = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore': a category that was not seen during fit (e.g. a rare
# business category that only lands in the test split) is encoded as all zeros
# instead of raising ValueError at predict time. With drop='first' that is the
# same encoding as the dropped reference category, and scikit-learn emits a
# warning when it happens.
cat_pipe = Pipeline([
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])
# Columns not listed in num_feats/cat_feats are dropped (default remainder).
preprocessor = ColumnTransformer([('num', num_pipe, num_feats),('cat', cat_pipe, cat_feats)])
# Full model: preprocessing followed by a 100-tree random forest with a fixed seed.
clf = Pipeline([('prep', preprocessor),('rf', RandomForestClassifier(n_estimators=100, random_state=42))])

# Training and evaluation
# Fit the full pipeline (preprocessing + classifier) on the training split.
clf.fit(X_tr, y_tr)

# Per-class probabilities and hard label predictions for the held-out split.
y_proba = clf.predict_proba(X_te)
y_pred = clf.predict(X_te)

# Fraction of held-out samples whose predicted class matches the true class.
test_accuracy = accuracy_score(y_te, y_pred)
print("Accuracy:", test_accuracy)

# Multiclass ROC-AUC in one-vs-rest mode, computed from the class probabilities.
test_roc_auc = roc_auc_score(y_te, y_proba, multi_class='ovr')
print("ROC-AUC:", test_roc_auc)

# Save model
# Persist the fitted pipeline (preprocessing + classifier) with joblib.
MODEL_PATH = 'models/rf_classifier.pkl'
# Make sure the target directory exists. This used os.makedirs, but `os` is
# never imported in this notebook, so the cell failed with NameError. The
# pathlib form also works when MODEL_PATH has no directory part (parent is
# '.'), where os.makedirs('') would raise.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, MODEL_PATH)
print("Modelo guardado en:", MODEL_PATH)