# 01_preprocesado.ipynb

ETL y feature engineering.

In [None]:
import os

import numpy as np
import pandas as pd
from haversine import haversine
from sklearn.cluster import DBSCAN
from sklearn.neighbors import BallTree
from sklearn.preprocessing import PolynomialFeatures

# Locations of the raw input CSV and of the processed CSV written at the end.
RAW_CSV = 'googlemaps/data/locales_todos.csv'
PROC_CSV = 'googlemaps/data/MAPS_locales_procesado.csv'

df = pd.read_csv(RAW_CSV)

# Basic cleaning: keep a row only if it has both coordinates, a mean rating,
# and at least 3 reviews (a missing review count compares False and is dropped).
rows_to_keep = (
    df['latitud'].notna()
    & df['longitud'].notna()
    & df['puntuacion_media'].notna()
    & (df['numero_reviews'] >= 3)
)
df = df[rows_to_keep]

# Basic feature engineering
# Rating damped by review count: the factor 1 - exp(-n / 10) rises towards 1
# as the number of reviews n grows, so ratings backed by few reviews are
# pulled down.
df['valoracion'] = df['puntuacion_media'] * (1 - np.exp(-df['numero_reviews'] / 10))

# Min-max scale 'valoracion' within each business category.
_valoracion_by_cat = df.groupby('categoria_negocio')['valoracion']
_cat_min = _valoracion_by_cat.transform('min')
_cat_span = _valoracion_by_cat.transform('max') - _cat_min
# A category with a single row (or identical values) has a zero span, which
# would turn the division into 0/0 = NaN. Treat a zero span as 1 so those rows
# get 0.0 instead.
_cat_span = _cat_span.where(_cat_span != 0, 1)
df['valoracion_norm'] = (df['valoracion'] - _cat_min) / _cat_span

# Advanced feature engineering
# Reference point (latitude, longitude) treated as the city centre.
city_center = (40.4168, -3.7038)


def _distance_to_city_center(row):
    """Return the haversine distance between the row's coordinates and city_center."""
    return haversine((row.latitud, row.longitud), city_center)


# One haversine call per row; the column name assumes the library's default
# unit is kilometres.
df['dist_city_center_km'] = df.apply(_distance_to_city_center, axis=1)
# The haversine metric works on (lat, lon) in radians and measures distances
# in radians, so every kilometre radius below is divided by the Earth radius.
earth_radius_km = 6371.0
coords_rad = np.deg2rad(df[['latitud', 'longitud']].values)
tree = BallTree(coords_rad, metric='haversine')

# Neighbour counts within 500 m, 1 km and 2 km. The tree is queried with the
# same points it was built from, so each count includes the point itself.
for radius_m in (500, 1000, 2000):
    radius_rad = (radius_m / 1000) / earth_radius_km
    neighbour_counts = tree.query_radius(coords_rad, r=radius_rad, count_only=True)
    df[f'density_{radius_m}m'] = neighbour_counts

# Share of the 2 km neighbourhood that sits within 500 m (denominator + 1).
density_2km_plus_one = df['density_2000m'] + 1
df['ratio_500m_2km'] = df['density_500m'] / density_2km_plus_one

# Density-based zones: 500 m neighbourhood radius, at least 10 points per core
# sample. Labels are stored as strings.
db = DBSCAN(eps=0.5 / earth_radius_km, min_samples=10, metric='haversine')
zone_labels = db.fit_predict(coords_rad)
df['cluster_zone'] = zone_labels.astype(str)
# Degree-2 polynomial expansion of distance-to-centre and 1 km density.
poly_feats = ['dist_city_center_km', 'density_1000m']
poly = PolynomialFeatures(degree=2, include_bias=False)
arr = poly.fit_transform(df[poly_feats])
names = poly.get_feature_names_out(poly_feats)
poly_df = pd.DataFrame(arr, columns=names, index=df.index)
# The expansion also returns the degree-1 terms, i.e. the input columns
# themselves. Drop them so the concat does not create duplicate
# 'dist_city_center_km' / 'density_1000m' columns in df and in the saved CSV;
# only the squared and interaction terms are added.
poly_df = poly_df.drop(columns=poly_feats)
df = pd.concat([df, poly_df], axis=1)

# Save the processed CSV, creating the output directory first if needed.
out_dir = os.path.dirname(PROC_CSV)
# dirname is '' when PROC_CSV has no directory part, and os.makedirs('')
# raises, so only create the directory when there is one.
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(PROC_CSV, index=False)
print("CSV procesado guardado en:", PROC_CSV)