# EDA Unificado (Práctica 4)

Objetivo: limpiar y estandarizar variables clave, documentar el tratamiento de nulos/outliers y dejar una base consistente para modelado posterior.

In [None]:
import csv
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Raise pandas' display limits (up to 200 columns / 120 rows) so wide frames
# are shown in full in cell output instead of being truncated.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 120)

In [None]:
# === 1) Data loading (prefers data/listings.csv.gz) ===
# Candidate locations, listed from most to least preferred.
DATA_CANDIDATES = [
    Path('data/listings.csv.gz'),
    Path('listings.csv.gz'),
    Path('listings.csv'),
]

# First candidate that exists on disk, or None when none of them does.
data_path = next(
    (candidate for candidate in DATA_CANDIDATES if candidate.exists()),
    None,
)
if data_path is None:
    raise FileNotFoundError('No se encontro listings.csv.gz ni listings.csv')

# Only files with a .gz suffix are decompressed; anything else is read as-is.
if data_path.suffix == '.gz':
    compression = 'gzip'
else:
    compression = None
df = pd.read_csv(data_path, compression=compression, low_memory=False)

print('Data path:', data_path)
print('Shape:', df.shape)
df.head(3)

In [None]:
# === 2) Quick summary ===
# Print the frame's columns with their dtypes and non-null counts.
df.info()

In [None]:
# === 3) Nulos y cardinalidad (idea de practica4_dmc) ===

def detect_drop_candidates(df, missing_threshold=60, high_card_threshold=200, long_text_len=80):
    """Profile each column of *df* and flag likely drop candidates.

    Args:
        df: DataFrame to profile.
        missing_threshold: Flag ``missing>{threshold}%`` when the percentage
            of missing values is strictly greater than this.
        high_card_threshold: Flag ``high_card`` when the number of distinct
            non-null values is strictly greater than this. The check applies
            to every dtype, numeric columns included.
        long_text_len: Flag ``long_text`` when a text column's mean string
            length is strictly greater than this.

    Returns:
        DataFrame with one row per input column and the fields ``col``,
        ``pct_missing``, ``n_unique``, ``mean_len`` (text columns only),
        ``sample`` (up to three non-null values as strings) and ``reasons``
        (comma-separated flags), sorted by ``pct_missing`` then ``n_unique``,
        both descending. An input without columns yields an empty report that
        still carries these fields.
    """
    report_columns = ['col', 'pct_missing', 'n_unique', 'mean_len', 'sample', 'reasons']
    n = len(df)
    rows = []
    for col in df.columns:
        s = df[col]
        non_null = s.dropna()
        # A zero-row frame has nothing missing; avoid dividing by zero.
        pct_missing = (s.isna().sum() / n) * 100 if n else 0.0
        n_unique = s.nunique(dropna=True)
        sample = non_null.head(3).astype(str).tolist()

        # Treat both object columns and pandas' dedicated string dtypes as text;
        # comparing the dtype to 'object' alone would skip the latter.
        is_text = (
            pd.api.types.is_object_dtype(s.dtype)
            or pd.api.types.is_string_dtype(s.dtype)
        )
        mean_len = None
        url_share = 0.0
        if is_text:
            # Convert once and reuse for both the length and the URL checks.
            text = non_null.astype(str)
            mean_len = text.str.len().mean()
            if n:
                # Share of ALL rows (missing ones included) that contain "http".
                url_share = text.str.contains('http', case=False).sum() / n

        reasons = []
        if pct_missing > missing_threshold:
            reasons.append(f'missing>{missing_threshold}%')
        if n_unique > high_card_threshold:
            reasons.append('high_card')
        # mean_len is NaN for an all-missing text column; NaN > x is False.
        if mean_len is not None and mean_len > long_text_len:
            reasons.append('long_text')
        if url_share > 0.2:
            reasons.append('url_like')

        rows.append({
            'col': col,
            'pct_missing': round(pct_missing, 2),
            'n_unique': n_unique,
            'mean_len': None if mean_len is None else round(mean_len, 1),
            'sample': sample,
            'reasons': ', '.join(reasons),
        })

    # Passing the column list keeps the sort keys present even with no rows.
    report = pd.DataFrame(rows, columns=report_columns)
    return report.sort_values(by=['pct_missing', 'n_unique'], ascending=False)

# Profile the loaded frame with the default thresholds and show the 20 rows
# that sort first (highest missing percentage, then highest unique count).
missing_report = detect_drop_candidates(df)
missing_report.head(20)

In [None]:
# === 4) Price cleaning + log transform (dmc + main + jimena) ===
if 'price' in df.columns:
    # Remove "$" and "," (e.g. "$1,234.00" -> "1234.00"); whatever is still
    # non-numeric afterwards, missing prices included, is coerced to NaN.
    df['price_clean'] = pd.to_numeric(
        df['price'].astype(str).str.replace(r'[,$]', '', regex=True),
        errors='coerce',
    )
    # log1p keeps a price of 0 finite (log1p(0) == 0).
    df['log_price_clean'] = np.log1p(df['price_clean'])

# Summarise only when the price columns exist, so a frame without `price`
# does not raise KeyError here.
df[['price', 'price_clean', 'log_price_clean']].describe() if 'price' in df.columns else 'price not found'

In [None]:
# === 5) Rate normalisation (jjvv) ===
# Work only with the rate columns that are actually present.
rate_cols = [
    c for c in ['host_response_rate', 'host_acceptance_rate'] if c in df.columns
]
for col in rate_cols:
    # Strip the "%" sign and parse as a number; values that are still not
    # numeric become NaN. The result stays on the original percentage scale.
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace('%', '', regex=False),
        errors='coerce',
    )

# Describe whichever rate columns exist; checking only one of them would
# raise KeyError when the other is missing.
df[rate_cols].describe() if rate_cols else 'rates not found'

In [None]:
# === 6) Amenities: lista, conteo, grupos, y amenities clave ===

def parse_amenities(text):
    """Parse a raw amenities cell into a list of amenity names.

    Two serialisations are handled:

    * a JSON array, e.g. ``["Wifi", "Shampoo, conditioner"]``;
    * a brace-delimited list with optional quoting, e.g.
      ``{TV,"Cable TV",Wifi}``.

    Quoted items keep any commas they contain, and surrounding brackets,
    braces and quotes never end up in the returned names.

    Args:
        text: Raw cell value; missing values (NaN/None) are accepted.

    Returns:
        List of non-empty, whitespace-stripped amenity names; ``[]`` for
        missing or empty input.
    """
    if pd.isna(text):
        return []
    raw = str(text).strip()

    if raw.startswith('['):
        try:
            items = json.loads(raw)
        except ValueError:
            # Not valid JSON (e.g. single-quoted items): use the lenient parser.
            items = None
        if isinstance(items, list):
            cleaned = (str(item).strip() for item in items)
            return [item for item in cleaned if item]

    inner = raw.strip('[]{}')
    if not inner:
        return []
    try:
        # csv honours double-quoted fields, so commas inside quotes survive.
        fields = next(csv.reader([inner], skipinitialspace=True), [])
    except csv.Error:
        # Malformed quoting or embedded newlines: fall back to a plain split.
        fields = inner.split(',')
    parts = [p.strip().strip('"').strip("'") for p in fields]
    return [p for p in parts if p]

if 'amenities' in df.columns:
    # Parsed amenity names per listing, plus how many each listing has.
    parsed = df['amenities'].apply(parse_amenities)
    df['amenities_list'] = parsed
    df['amenities_count'] = parsed.apply(len)

    # Key amenities (main): one 0/1 flag per amenity, set when any listed
    # amenity contains the name (case-insensitive substring match).
    key_amenities = ['Wifi', 'Air conditioning', 'Pool', 'Kitchen', 'Parking']
    for amenity in key_amenities:
        needle = amenity.lower()
        flag_col = 'has_' + needle.replace(' ', '_')
        df[flag_col] = df['amenities_list'].apply(
            lambda items, needle=needle: int(
                any(needle in item.lower() for item in items)
            )
        )

    # Amenity groups (jimena): one 0/1 flag per group, set when any listed
    # amenity contains any of the group's keywords.
    amenity_groups = {
        'comfort': ['air conditioning', 'heating', 'washer', 'dryer', 'tv'],
        'kitchen': ['kitchen', 'microwave', 'refrigerator', 'oven', 'coffee'],
        'laundry': ['washer', 'dryer', 'iron'],
        'leisure': ['pool', 'gym', 'hot tub', 'balcony', 'patio'],
        'business': ['wifi', 'workspace', 'desk'],
        'parking': ['parking', 'garage']
    }
    for group, kws in amenity_groups.items():
        df['amenities_' + group] = df['amenities_list'].apply(
            lambda items, keywords=kws: int(
                any(kw in item.lower() for item in items for kw in keywords)
            )
        )

df[['amenities_count']].describe() if 'amenities_count' in df.columns else 'amenities not found'

In [None]:
# === 7) Capacity / layout features (dmc + zg) ===
capacity_inputs = ['accommodates', 'bedrooms', 'beds']
capacity_features = [
    'total_capacity', 'bed_per_person', 'bedroom_per_person', 'space_per_person',
]

for col in capacity_inputs:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# The derived features need all three inputs; skip them (instead of raising
# KeyError) when any input column is absent.
has_capacity_inputs = all(col in df.columns for col in capacity_inputs)
if has_capacity_inputs:
    # A listing that accommodates 0 guests would divide by zero: use NaN.
    guests = df['accommodates'].replace(0, np.nan)
    # Missing inputs count as 0 in the total.
    df['total_capacity'] = (
        df['accommodates'].fillna(0) + df['bedrooms'].fillna(0) + df['beds'].fillna(0)
    )
    df['bed_per_person'] = df['beds'] / guests
    df['bedroom_per_person'] = df['bedrooms'] / guests
    df['space_per_person'] = df['total_capacity'] / guests

df[capacity_features].describe() if has_capacity_inputs else 'capacity columns not found'

In [None]:
# === 8) Geografia: distancias (dmc + jjvv) ===

def haversine(lat, lon, lat0, lon0):
    """Return the great-circle distance between (lat, lon) and (lat0, lon0).

    Uses the haversine formula on a sphere of radius 6371.0, so the result is
    in the same unit as that radius (kilometres for Earth's mean radius).

    Args:
        lat, lon: Point coordinates in decimal degrees; scalars, NumPy arrays
            or pandas Series (evaluated element-wise).
        lat0, lon0: Reference point coordinates in decimal degrees.

    Returns:
        Distance(s) with the broadcast shape of the inputs; NaN coordinates
        give NaN distances.
    """
    R = 6371.0
    lat1 = np.radians(lat)
    lon1 = np.radians(lon)
    lat2 = np.radians(lat0)
    lon2 = np.radians(lon0)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would make sqrt(1 - a) NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

if 'latitude' in df.columns and 'longitude' in df.columns:
    # Reference points as (latitude, longitude) pairs.
    ZOCALO = (19.4326, -99.1332)  # Mexico City centre (Zocalo)
    AICM = (19.4361, -99.0719)  # presumably Mexico City's airport; confirm
    listing_lat = df['latitude']
    listing_lon = df['longitude']
    df['dist_zocalo_km'] = haversine(listing_lat, listing_lon, ZOCALO[0], ZOCALO[1])
    df['dist_aicm_km'] = haversine(listing_lat, listing_lon, AICM[0], AICM[1])
    # The "centre" distance is an alias of the Zocalo distance.
    df['distance_from_center_km'] = df['dist_zocalo_km']
    # 1 when the listing lies strictly within 5 of the centre, else 0.
    df['is_central_location'] = df['distance_from_center_km'].lt(5).astype(int)

df[['dist_zocalo_km','dist_aicm_km','distance_from_center_km']].describe() if 'dist_zocalo_km' in df.columns else 'geo not found'

In [None]:
# === 9) Temporal features and recency (jimena) ===
for col in ['last_scraped', 'last_review', 'first_review', 'host_since']:
    if col in df.columns:
        # Unparseable dates become NaT instead of raising.
        df[col] = pd.to_datetime(df[col], errors='coerce')

if 'last_scraped' in df.columns and 'last_review' in df.columns:
    df['days_since_last_review'] = (df['last_scraped'] - df['last_review']).dt.days

if 'last_scraped' in df.columns and 'host_since' in df.columns:
    df['host_tenure_days'] = (df['last_scraped'] - df['host_since']).dt.days

# Recency groups
if 'days_since_last_review' in df.columns:
    # Right-closed bins: (-1, 30], (30, 90], ... The last bin is open-ended so
    # no finite value can fall past the top edge.
    bins = [-1, 30, 90, 180, 365, np.inf]
    labels = ['<=30', '31-90', '91-180', '181-365', '>365']
    df['recency_group'] = pd.cut(df['days_since_last_review'], bins=bins, labels=labels)

# Describe only the derived columns that were actually created, so a frame
# lacking the source dates does not raise KeyError.
temporal_cols = [
    c for c in ['days_since_last_review', 'host_tenure_days'] if c in df.columns
]
df[temporal_cols].describe() if temporal_cols else 'temporal columns not found'

In [None]:
# === 10) Availability / activity (dmc + zg) ===
if 'availability_365' in df.columns:
    # Fraction of the 365-day window that is available, and its complement.
    df['availability_rate'] = df['availability_365'] / 365
    df['scarcity_score'] = 1 - df['availability_rate']

if 'maximum_nights' in df.columns and 'minimum_nights' in df.columns:
    # Width of the allowed stay-length range, in nights.
    df['booking_flexibility'] = df['maximum_nights'] - df['minimum_nights']

# Describe only the derived columns that were actually created, so a frame
# lacking the source columns does not raise KeyError.
activity_cols = [
    c for c in ['availability_rate', 'scarcity_score', 'booking_flexibility']
    if c in df.columns
]
df[activity_cols].describe() if activity_cols else 'availability columns not found'

In [None]:
# === 11) Simple keyword NLP for luxury (equipo_5) ===
# No nltk required: counts how many distinct keywords occur in the description.
luxury_keywords = [
    'luxury','lujo','premium','exclusivo','exclusive','elegante','elegant',
    'boutique','vista','panoramica','private','privado','spacious','amplio'
]

if 'description' in df.columns:
    # Lower-case and fold accents to plain ASCII so the unaccented keywords
    # also match accented spellings (e.g. "panorámica" -> "panoramica").
    # NFKD splits a letter from its accent; the ASCII round-trip drops the
    # accent along with any other non-ASCII character.
    desc = (
        df['description'].fillna('').astype(str).str.lower()
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('ascii')
    )
    # Substring match: each keyword counts at most once per description.
    df['luxury_keyword_count'] = desc.apply(
        lambda x: sum(1 for kw in luxury_keywords if kw in x)
    )
    df['is_luxury_property'] = (df['luxury_keyword_count'] > 0).astype(int)

df[['luxury_keyword_count','is_luxury_property']].describe() if 'description' in df.columns else 'description not found'

In [None]:
# === 12) Outliers (IQR / winsor / clip) ===
if 'price_clean' in df.columns:
    # Soft clip to the 1st-99th percentile range.
    q01, q99 = df['price_clean'].quantile([0.01, 0.99])
    df['price_clean_clip'] = df['price_clean'].clip(q01, q99)

    # IQR fences: values beyond 1.5 * IQR from the quartiles are pulled back
    # to the fence rather than dropped.
    q1 = df['price_clean'].quantile(0.25)
    q3 = df['price_clean'].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    df['price_clean_iqr'] = df['price_clean'].clip(lower, upper)

# Summarise only when price_clean exists, so the cell does not raise KeyError
# on a frame without a cleaned price.
df[['price_clean', 'price_clean_clip', 'price_clean_iqr']].describe() if 'price_clean' in df.columns else 'price_clean not found'

In [None]:
# === 13) Leakage candidates (main) ===
# Substrings that mark a column as a possible source of target leakage.
LEAKAGE_KEYWORDS = [
    'price', 'review', 'revenue', 'availability', 'occupancy', 'estimated',
    'calculated_host_listings_count'
]

# Keep every column whose lower-cased name contains at least one keyword.
leakage_cols = []
for column in df.columns:
    lowered = column.lower()
    if any(keyword in lowered for keyword in LEAKAGE_KEYWORDS):
        leakage_cols.append(column)
leakage_cols

In [None]:
# === 14) Spearman correlations with price_clean (dmc) ===
if 'price_clean' in df.columns:
    # Leave out the target and its own transforms: a monotone transform such
    # as log1p has rank correlation 1 with the target and the clipped
    # versions are nearly as high, so they would crowd out real features.
    target_derived = {
        'price_clean', 'log_price_clean', 'price_clean_clip', 'price_clean_iqr',
    }
    num_cols = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c not in target_derived
    ]
    corr = df[num_cols].corrwith(df['price_clean'], method='spearman').sort_values(ascending=False)

# Only a cell's last top-level expression is displayed by the notebook, so
# the preview must live outside the `if` body.
corr.head(20) if 'price_clean' in df.columns else 'price_clean not found'

## Siguientes pasos
- Definir el target final (price_clean vs log_price_clean).
- Seleccionar features base (y excluir leakage).
- Crear pipeline de preprocesamiento y baseline de modelos.