In [1]:
%pip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [3]:
! npm install localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K
up to date, audited 23 packages in 935ms
[1G[0K⠧[1G[0K
[1G[0K⠧[1G[0K3 packages are looking for funding
[1G[0K⠧[1G[0K  run `npm fund` for details
[1G[0K⠧[1G[0K
2 [31m[1mhigh[22m[39m severity vulnerabilities

To address all issues (including breaking changes), run:
  npm audit fix --force

Run `npm audit` for details.
[1G[0K⠧[1G[0K

In [12]:
%%writefile airbnb.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm   

# Page-level settings: browser tab title, wide layout, sidebar open on load.
st.set_page_config(
    page_title="Airbnb Analytics Dashboard",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ---------------------------
# Airbnb-style CSS
# ---------------------------
# Raw stylesheet injected into the page: card look for charts/dataframes,
# coral accents for tabs and download buttons, and the `.desc` caption class
# used under every chart.
_AIRBNB_CSS = """
<style>
    .stApp { background-color: #FAFAFA; }
    div.stPlotlyChart, div.stDataFrame {
        border: 1px solid #E0E0E0;
        border-radius: 12px;
        padding: 12px;
        margin-bottom: 24px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.05);
        background-color: white;
    }
    .stTabs [data-baseweb="tab-list"] { gap: 16px; border-bottom: 2px solid #E0E0E0; }
    .stTabs [data-baseweb="tab"] { font-weight: 500; color: #484848; padding: 12px 20px; border-radius: 8px 8px 0 0; }
    .stTabs [data-baseweb="tab"]:hover { background-color: #FF5A5F10; color: #FF5A5F; }
    .stTabs [data-baseweb="tab"][aria-selected="true"] { color: #FF5A5F; border-bottom: 3px solid #FF5A5F; }
    .stDownloadButton > button { background-color: #FF5A5F; color: white; border-radius: 8px; font-weight: 500; }
    .stDownloadButton > button:hover { background-color: #E31C79; }
    .desc { font-size: 0.9rem; color: #666; margin-top: 8px; font-style: italic; }
</style>
"""
st.markdown(_AIRBNB_CSS, unsafe_allow_html=True)

# ---------------------------
# COLOR PALETTES
# ---------------------------
# Discrete palette: one colour per city in grouped bars and boxplots. Every
# entry must be unique, otherwise two cities are drawn in the same colour when
# all five cities are compared (the fifth slot used to repeat "#FF5A5F").
AIRBNB_DISCRETE = ["#FF5A5F", "#484848", "#767676", "#E31C79", "#00A699"]
# Continuous scale used by the treemap (coral -> magenta -> charcoal).
AIRBNB_CONTINUOUS = ["#FF5A5F", "#E31C79", "#484848"]
# Pink ramp ordered from darkest (index 0) to lightest (last index).
ROSA_CONTINUA = ["#880E4F", "#AD1457", "#C2185B", "#D81B60", "#EC407A", "#F06292", "#F48FB1", "#F8BBD0"]
REGRESSION_LINE_COLOR = "#1f77b4"  # blue used for the fitted regression line

# ---------------------------
# CONFIG
# ---------------------------
# Display name -> CSV file. The paths are relative, so they resolve against the
# working directory of the Streamlit process.
# NOTE(review): filename casing is inconsistent ("Amsterdam.csv" vs
# "oporto.csv"); confirm against the files on disk (matters on case-sensitive
# filesystems).
ARCHIVOS = {
    "Ámsterdam": "Amsterdam.csv",
    "Bérgamo": "Bergamo.csv",
    "Oporto": "oporto.csv",
    "Albany": "albany.csv",
    "Berlín": "berlin.csv"
}

# Columns offered in the "Variable a Analizar" selector (only those present in
# the selected city's file are shown). Despite the name, the list holds 19
# entries and includes numeric columns such as "price_numeric"; the name is
# kept because other code references it.
FIXED_15_CATEGORICALS = [
    "neighbourhood_group_cleansed", "last_scraped", "has_availability", "host_is_superhost",
    "host_has_profile_pic", "host_identity_verified", "instant_bookable", "host_response_time",
    "room_type", "host_verifications", "bedrooms", "accommodates", "bathrooms",
    "calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms",
    "price_numeric", "availability_365", "number_of_reviews", "review_scores_value"
]

# ---------------------------
# HELPERS
# ---------------------------
# st.cache_data (not cache_resource) is used because the result is plain data:
# per the Streamlit caching docs, callers then receive their own copy instead
# of sharing one mutable DataFrame across reruns and sessions.
@st.cache_data
def load_df(path):
    """Read a listings CSV, fill gaps and derive a numeric price column.

    Parameters
    ----------
    path : str or path-like
        CSV file handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        File contents with missing values filled backward, then forward, plus
        a float ``price_numeric`` column. The column is all-NaN when the file
        has no ``price`` column; individual prices that cannot be parsed
        become NaN.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when ``path`` does not exist.
    """
    df = pd.read_csv(path)
    # DataFrame.fillna(method=...) is deprecated; bfill()/ffill() are the
    # direct equivalents.
    df = df.bfill().ffill()
    if "price" in df.columns:
        # Keep only digits, dots and minus signs (drops "$", "," and spaces).
        cleaned = df["price"].astype(str).str.replace(r"[^0-9\.\-]", "", regex=True)
        # errors="coerce" turns "" and malformed leftovers such as "-" or
        # "1.2.3" into NaN instead of aborting the whole load.
        df["price_numeric"] = pd.to_numeric(cleaned, errors="coerce").astype(float)
    else:
        df["price_numeric"] = np.nan
    return df

def available_fixed_vars(df):
    """Return the entries of FIXED_15_CATEGORICALS that are columns of ``df``.

    The order of FIXED_15_CATEGORICALS is preserved in the result.
    """
    present = []
    for name in FIXED_15_CATEGORICALS:
        if name in df.columns:
            present.append(name)
    return present

def safe_top_freq(df, cat_col, top_k):
    """Return the ``top_k`` most frequent values of ``df[cat_col]``.

    The result has two columns: ``categoria`` (the value as a string; missing
    values are counted too) and ``frecuencia`` (its count), sorted by
    descending count. An empty frame with those two columns is returned when
    ``cat_col`` is not a column of ``df``.
    """
    if cat_col not in df.columns:
        return pd.DataFrame(columns=["categoria", "frecuencia"])
    return (
        df[cat_col]
        .value_counts(dropna=False)
        .reset_index()
        .set_axis(["categoria", "frecuencia"], axis=1)
        .assign(categoria=lambda table: table["categoria"].astype(str))
        .sort_values("frecuencia", ascending=False)
        .head(top_k)
    )

def freq_for_city(df, cat_col):
    """Return the full frequency table of ``df[cat_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings of one city.
    cat_col : str
        Column whose values are counted (missing values are counted too).

    Returns
    -------
    pandas.DataFrame
        Columns ``categoria`` (value as string) and ``frecuencia`` (count).
        When ``cat_col`` is not a column of ``df`` the frame is empty but keeps
        the same two columns, so callers can concatenate and group on
        ``categoria`` without special-casing missing variables.
    """
    if cat_col not in df.columns:
        # Same schema as the populated case (previously the first column was
        # named after cat_col, which broke grouping on "categoria").
        return pd.DataFrame({
            "categoria": pd.Series(dtype="object"),
            "frecuencia": pd.Series(dtype="int64"),
        })
    f = df[cat_col].value_counts(dropna=False).reset_index()
    f.columns = ["categoria", "frecuencia"]
    f["categoria"] = f["categoria"].astype(str)
    return f

def pairwise_ttests(summary_dfs, num_var):
    """Run a two-sample t-test (unequal variances) for every pair of cities.

    Parameters
    ----------
    summary_dfs : dict
        Maps city name -> 1-D array of numeric observations.
    num_var : str
        Name of the compared variable. Not used in the computation; kept for
        interface compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair, in the order the cities appear in
        ``summary_dfs``, with columns ``city_a, city_b, mean_a, mean_b,
        delta_mean, tstat, pvalue, n_a, n_b``. Pairs where either sample has
        fewer than 2 observations get NaN statistics. The columns are present
        even when there are no pairs (fewer than two cities).
    """
    columns = ["city_a", "city_b", "mean_a", "mean_b", "delta_mean", "tstat", "pvalue", "n_a", "n_b"]
    rows = []
    cities = list(summary_dfs.keys())
    for i, a in enumerate(cities):
        for b in cities[i + 1:]:
            arr_a, arr_b = summary_dfs[a], summary_dfs[b]
            row = {"city_a": a, "city_b": b, "mean_a": np.nan, "mean_b": np.nan,
                   "delta_mean": np.nan, "tstat": np.nan, "pvalue": np.nan,
                   "n_a": len(arr_a), "n_b": len(arr_b)}
            if len(arr_a) >= 2 and len(arr_b) >= 2:
                mean_a, mean_b = float(np.nanmean(arr_a)), float(np.nanmean(arr_b))
                try:
                    tstat, pvalue = stats.ttest_ind(arr_a, arr_b, equal_var=False, nan_policy="omit")
                except Exception:
                    # Best effort: a failed test yields NaN for this pair, but
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    tstat, pvalue = np.nan, np.nan
                row.update(mean_a=mean_a, mean_b=mean_b, delta_mean=mean_a - mean_b,
                           tstat=tstat, pvalue=pvalue)
            rows.append(row)
    return pd.DataFrame(rows, columns=columns)

# ---------------------------
# SESSION STATE
# ---------------------------
# List of cities the user has added for comparison; kept in session state so
# it survives Streamlit reruns.
if "compare_list" not in st.session_state:
    st.session_state.compare_list = []

# ---------------------------
# SIDEBAR WITH LOGO
# ---------------------------
st.sidebar.image("https://upload.wikimedia.org/wikipedia/commons/thumb/6/69/Airbnb_Logo_B%C3%A9lo.svg/2560px-Airbnb_Logo_B%C3%A9lo.svg.png", width=120)
st.sidebar.markdown("<h2 style='color:#FF5A5F; margin-top:10px;'>Análisis Comparativo</h2>", unsafe_allow_html=True)
st.sidebar.markdown("**1. Selecciona ciudad principal**  \n**2. Agrega ciudades para comparar**")

primary_city = st.sidebar.selectbox("Ciudad Principal (Treemap)", options=list(ARCHIVOS.keys()), index=0)
primary_path = ARCHIVOS[primary_city]
# Same handling as the comparison loader: a missing file is reported in the
# page instead of surfacing as an unhandled traceback.
try:
    df_primary = load_df(primary_path)
except FileNotFoundError:
    st.error(f"Archivo no encontrado: {primary_city} → {primary_path}")
    st.stop()
fixed_available_primary = available_fixed_vars(df_primary)
if not fixed_available_primary:
    # With no options the selectbox would return None and every later
    # frequency computation would fail.
    st.error(f"{primary_city}: el archivo no contiene ninguna de las variables de análisis.")
    st.stop()

var_cat = st.sidebar.selectbox("Variable a Analizar", options=fixed_available_primary)
top_k = st.sidebar.slider("Top de categorías a mostrar", 5, 30, 10)

st.sidebar.markdown("---")
st.sidebar.markdown("**Ciudades en Comparación**")
multisel = st.sidebar.multiselect("Agregar ciudades", options=list(ARCHIVOS.keys()), default=[primary_city])
if st.sidebar.button("Agregar Seleccionadas"):
    # Append only cities that are not already in the list.
    for c in multisel:
        if c not in st.session_state.compare_list:
            st.session_state.compare_list.append(c)
if st.sidebar.button("Limpiar Lista"):
    st.session_state.compare_list = []

st.sidebar.markdown("**Ciudades activas:**")
for c in st.session_state.compare_list:
    st.sidebar.write(f"• {c}")
if not st.session_state.compare_list:
    st.sidebar.info("Lista vacía")

use_persistent = st.sidebar.checkbox("Mantener lista entre sesiones", value=True)
# Use the stored list when the checkbox is ticked and the list is non-empty;
# otherwise fall back to an ad-hoc multiselect (only rendered in that case).
if use_persistent and st.session_state.compare_list:
    cities_compare = st.session_state.compare_list
else:
    cities_compare = st.sidebar.multiselect("Ciudades (modo temporal)", options=list(ARCHIVOS.keys()), default=[primary_city])

# ---------------------------
# MAIN TITLE
# ---------------------------
st.markdown(
    '<p style="color:#FF5A5F; font-size:32px; font-weight:700; text-align:center;">Airbnb Analytics Dashboard</p>',
    unsafe_allow_html=True,
)
st.markdown(f"**Ciudad principal:** {primary_city} — **Variable:** {var_cat}")

# ---------------------------
# TREEMAP
# ---------------------------
# Top-k frequency table of the selected variable for the primary city.
treemap_df = safe_top_freq(df_primary, var_cat, top_k)
if not treemap_df.empty:
    fig_tree = px.treemap(
        treemap_df,
        path=["categoria"],
        values="frecuencia",
        title=f"Distribución Jerárquica de {var_cat} en {primary_city}",
        color="frecuencia",
        color_continuous_scale=AIRBNB_CONTINUOUS,
    )
    fig_tree.update_layout(margin=dict(t=50, l=10, r=10, b=10), height=520)
    st.plotly_chart(fig_tree, use_container_width=True)
    st.markdown(f"<p class='desc'>Visualización jerárquica del top {top_k} de {var_cat}. El tamaño del bloque representa la frecuencia.</p>", unsafe_allow_html=True)
else:
    st.info("No hay datos suficientes para generar el treemap.")

st.markdown("---")

# ---------------------------
# LOAD COMPARISON DATA
# ---------------------------
if not cities_compare:
    st.info("Selecciona al menos una ciudad para continuar.")
    st.stop()

# city name -> DataFrame for every city being compared.
data_compare = {}
for c in cities_compare:
    path = ARCHIVOS.get(c)
    try:
        data_compare[c] = load_df(path)
    except FileNotFoundError:
        st.error(f"Archivo no encontrado: {c} → {path}")
        st.stop()

# Long-format frequency table: one row per (city, category).
freqs = []
for c, dfc in data_compare.items():
    f = freq_for_city(dfc, var_cat)
    f["city"] = c
    freqs.append(f)
freq_all = pd.concat(freqs, ignore_index=True, sort=False) if freqs else pd.DataFrame()
cities_sorted = sorted(data_compare.keys())

if freq_all.empty or "categoria" not in freq_all.columns:
    # None of the compared cities has the selected variable: keep the schema
    # so the charts and the export simply see an empty table instead of a
    # KeyError on "categoria".
    top_global = []
    full_index = pd.MultiIndex.from_product([cities_sorted, top_global], names=["city", "categoria"])
    freq_comp = pd.DataFrame(columns=["city", "categoria", "frecuencia"])
else:
    freq_all = freq_all.dropna(subset=["categoria"])
    freq_all["frecuencia"] = pd.to_numeric(freq_all["frecuencia"])
    # Categories with the highest total count across all compared cities.
    top_global = (
        freq_all.groupby("categoria")["frecuencia"].sum()
        .sort_values(ascending=False).head(top_k).index.tolist()
    )
    full_index = pd.MultiIndex.from_product([cities_sorted, top_global], names=["city", "categoria"])
    # Aggregating before reindexing guarantees unique (city, categoria)
    # labels; reindex() raises on duplicates, which can occur when distinct
    # raw values map to the same string.
    freq_comp = (
        freq_all[freq_all["categoria"].isin(top_global)]
        .groupby(["city", "categoria"])["frecuencia"].sum()
        .reindex(full_index, fill_value=0)
        .reset_index()
    )

# ---------------------------
# GLOBAL NUMERIC VARIABLES
# ---------------------------
# Sorted union of the numeric-dtype columns found in any compared city.
numeric_union = sorted({
    column
    for city_frame in data_compare.values()
    for column in city_frame.select_dtypes(include=[np.number]).columns.tolist()
})
# "price_numeric" goes first so it becomes the default choice in selectors.
if "price_numeric" in numeric_union:
    numeric_union = ["price_numeric"] + [name for name in numeric_union if name != "price_numeric"]

# ---------------------------
# TABS
# ---------------------------
st.markdown("## Análisis Comparativo entre Ciudades")

_tab_labels = [
    "Visualizaciones",
    "Pruebas Estadísticas",
    "Resumen Detallado",
    "Modelos Predictivos",
    "Correlaciones",
    "Exportar Resultados",
]
(tab_graficos, tab_tests, tab_summary,
 tab_regresion, tab_correlacion, tab_downloads) = st.tabs(_tab_labels)

# Placeholders filled in by the test and summary tabs and read by the export tab.
test_df, combined_summary_list, chosen_num_tests = pd.DataFrame(), [], None

# --- Tab 1: Visualizations ---
with tab_graficos:
    # Grouped bars: one bar per (category, city) for the globally top categories.
    st.markdown("### Comparación por Barras Agrupadas")
    if not freq_comp.empty:
        fig_group = px.bar(freq_comp, x="categoria", y="frecuencia", color="city", barmode="group",
                           color_discrete_sequence=AIRBNB_DISCRETE,
                           title=f"Distribución de {var_cat} en las Ciudades Seleccionadas")
        fig_group.update_layout(xaxis_tickangle=-40, height=520)
        st.plotly_chart(fig_group, use_container_width=True)
        st.markdown(f"<p class='desc'>Compara la frecuencia del top {top_k} de {var_cat} entre ciudades. Cada barra representa una categoría.</p>", unsafe_allow_html=True)

    # Donuts: each city's own top-k categories, laid out on a 3-column grid.
    st.markdown("### Distribución Proporcional por Ciudad")
    pies = []
    for c, dfc in data_compare.items():
        f = freq_for_city(dfc, var_cat).sort_values("frecuencia", ascending=False).head(top_k)
        if not f.empty:
            f["city"] = c
            pies.append(f)

    if pies:
        pies_df = pd.concat(pies, ignore_index=True)

        fig = go.Figure()
        cities = pies_df["city"].unique()
        n_cities = len(cities)
        cols = 3
        rows = (n_cities + cols - 1) // cols  # ceiling division
        last_shade = len(ROSA_CONTINUA) - 1

        for idx, city in enumerate(cities):
            city_data = pies_df[pies_df["city"] == city].copy()
            row = (idx // cols) + 1
            col = (idx % cols) + 1

            # Normalise frequencies within the city to [0, 1].
            city_min = city_data["frecuencia"].min()
            city_max = city_data["frecuencia"].max()
            span = city_max - city_min
            if span > 0:
                norm_local = (city_data["frecuencia"] - city_min) / span
            else:
                # Single category (or all equal): treat every slice as the maximum.
                norm_local = pd.Series(1.0, index=city_data.index)
            # ROSA_CONTINUA runs dark -> light, so the highest frequency must
            # map to index 0 to honour "Rosa Oscuro = Mayor" in the title
            # (the previous mapping gave the largest slice the lightest pink).
            colors = [ROSA_CONTINUA[int(round((1.0 - val) * last_shade))] for val in norm_local]

            fig.add_trace(go.Pie(
                labels=city_data["categoria"],
                values=city_data["frecuencia"],
                name=city,
                hole=0.45,
                textinfo='percent+label',
                marker=dict(colors=colors, line=dict(color='white', width=2)),
                showlegend=False,
                # Explicit cell of the rows x cols grid, filled top row first.
                domain={'x': [(col-1)/cols, col/cols], 'y': [(rows-row)/rows, (rows-row+1)/rows]}
            ))

            # City name placed near the top of the donut's cell.
            fig.add_annotation(
                x=(col - 0.5) / cols,
                y=(rows - row + 0.9) / rows,
                xref="paper", yref="paper",
                text=f"<b>{city}</b>",
                showarrow=False,
                font=dict(size=14, color="#484848"),
                align="center"
            )

        # The former layout `coloraxis` was dropped: the pie traces carry
        # explicit marker colours and none of them referenced it.
        fig.update_layout(
            title=f"Proporción del Top {top_k} de {var_cat} — Color: Frecuencia (Rosa Oscuro = Mayor)",
            height=380 + 160 * rows,
            margin=dict(l=20, r=160, t=80, b=20),
            grid=dict(rows=rows, columns=cols)
        )
        st.plotly_chart(fig, use_container_width=True)
        st.markdown(f"<p class='desc'>Cada donut muestra la distribución proporcional del top {top_k} de {var_cat} en una ciudad. El color indica la frecuencia relativa.</p>", unsafe_allow_html=True)

    # Boxplots: one facet row per selected numeric variable, one box per city.
    st.markdown("### Distribución de Variables Numéricas")
    default_vars = [v for v in ["price_numeric", "availability_365", "review_scores_value"] if v in numeric_union]
    vars_to_plot = st.multiselect("Selecciona variables numéricas", options=numeric_union, default=default_vars, key="box_vars")

    if vars_to_plot:
        plot_data_list = []
        for city, city_df in data_compare.items():
            present = [v for v in vars_to_plot if v in city_df.columns]
            if present:
                temp = city_df[present].copy()
                temp["city"] = city
                plot_data_list.append(temp)
        if plot_data_list:
            combined_df = pd.concat(plot_data_list, ignore_index=True)
            melted_df = combined_df.melt(id_vars=["city"], value_vars=vars_to_plot, var_name="variable", value_name="valor")
            fig_box = px.box(melted_df, x="city", y="valor", color="city", facet_row="variable",
                             color_discrete_sequence=AIRBNB_DISCRETE, title="Distribución de Variables Numéricas por Ciudad")
            # Independent y-axes: the variables live on very different scales.
            fig_box.update_yaxes(matches=None, showticklabels=True)
            fig_box.update_layout(height=250 + 250 * len(vars_to_plot))
            st.plotly_chart(fig_box, use_container_width=True)
            st.markdown("<p class='desc'>Boxplots que muestran la dispersión, mediana y valores atípicos de variables numéricas clave.</p>", unsafe_allow_html=True)
# --- Tab 2: Statistical tests ---
with tab_tests:
    st.markdown("### Pruebas de Diferencia entre Ciudades")
    if not numeric_union:
        st.info("No hay variables numéricas disponibles para análisis.")
    else:
        chosen_num_tests = st.selectbox("Selecciona variable para comparar", options=numeric_union, key="test_var")
        # city -> non-missing observations of the chosen variable (empty array
        # when the city's file lacks the column).
        arrays = {}
        for c, dfc in data_compare.items():
            if chosen_num_tests in dfc.columns:
                arrays[c] = dfc[chosen_num_tests].dropna().values
            else:
                arrays[c] = np.array([])
        test_df = pairwise_ttests(arrays, chosen_num_tests)
        if not test_df.empty:
            # Force plain floats so formatting and the threshold comparison are uniform.
            test_df["pvalue"] = test_df["pvalue"].apply(lambda x: np.nan if pd.isna(x) else float(x))

            def highlight_pval(val):
                """CSS for one p-value cell: highlighted when below 0.05."""
                if pd.notna(val) and val < 0.05:
                    return "background-color: #FF5A5F20; font-weight: bold;"
                return ""

            styler = test_df.style.format({
                "mean_a": "{:.2f}", "mean_b": "{:.2f}", "delta_mean": "{:.2f}",
                "tstat": "{:.3f}", "pvalue": "{:.4f}"
            })
            # Styler.applymap was renamed to Styler.map in pandas 2.1 and the
            # old name is deprecated; use whichever this pandas provides.
            apply_cellwise = getattr(styler, "map", None) or styler.applymap
            styled = apply_cellwise(highlight_pval, subset=["pvalue"])
            st.dataframe(styled, use_container_width=True)
            st.markdown("<p class='desc'>Prueba t de Student entre pares de ciudades. <b>p < 0.05</b> indica diferencia estadísticamente significativa.</p>", unsafe_allow_html=True)
        else:
            st.info("No hay suficientes datos para realizar la prueba.")

# --- Tab 3: Detailed summary ---
with tab_summary:
    st.markdown("### Estadísticas por Categoría y Ciudad")
    selected_cities = st.multiselect("Ciudades a incluir", options=cities_sorted, default=cities_sorted, key="summary_cities")
    for c in selected_cities:
        dfc = data_compare[c]
        if var_cat not in dfc.columns:
            # Say so explicitly instead of surfacing a raw KeyError message.
            st.warning(f"{c}: la variable {var_cat} no está disponible.")
            continue
        # Numeric columns, excluding the grouping variable itself: aggregating
        # a column by its own values carries no information.
        numcols = [col for col in dfc.columns
                   if col != var_cat and np.issubdtype(dfc[col].dtype, np.number)]
        if not numcols:
            continue
        cols_to_show = numcols[:6]  # at most six columns, in file order
        try:
            agg = dfc.groupby(var_cat)[cols_to_show].agg(["count","mean","median","std"])
            st.markdown(f"**{c}**")
            st.dataframe(agg.head(50), use_container_width=True)
            flat = agg.reset_index()
            # Flatten (column, statistic) pairs to "column_statistic"; empty
            # parts are skipped so real leading/trailing underscores in column
            # names are left intact.
            flat.columns = ["_".join(str(part) for part in col if str(part) != "") for col in flat.columns]
            flat.insert(0, "city", c)
            combined_summary_list.append(flat)
            st.markdown(f"<p class='desc'>Resumen estadístico de hasta 6 variables numéricas agrupadas por <b>{var_cat}</b>.</p>", unsafe_allow_html=True)
        except Exception as e:
            st.warning(f"Error al procesar {c}: {e}")

# --- Tab 4: Predictive models (one chart per city: data points + fitted line) ---
with tab_regresion:
    st.markdown("### Modelos de Regresión Lineal por Ciudad")
    all_numeric_cols = numeric_union
    if len(all_numeric_cols) < 2:
        st.info("Se requieren al menos 2 variables numéricas para realizar regresión.")
    else:
        analysis_type = st.selectbox("Tipo de modelo", ["Simple", "Múltiple"], key="reg_type")
        col1, col2 = st.columns(2)
        with col1:
            y_var = st.selectbox("Variable dependiente (Y)", options=all_numeric_cols, index=0)
        # Every numeric column except the chosen target may act as a predictor.
        x_candidates = [v for v in all_numeric_cols if v != y_var]
        with col2:
            if analysis_type == "Simple":
                x_var = st.selectbox("Variable independiente (X)", options=x_candidates)
                x_vars = [x_var]
            else:
                x_vars = st.multiselect("Variables independientes (X)",
                                        options=x_candidates,
                                        default=x_candidates[:2])
        if not x_vars:
            st.warning("Selecciona al menos una variable independiente.")
        else:
            results = []
            figs = []
            for city, dfc in data_compare.items():
                available_x = [v for v in x_vars if v in dfc.columns]
                if y_var not in dfc.columns or not available_x:
                    st.warning(f"{city}: Faltan variables para el modelo.")
                    continue
                cols_to_use = [y_var] + available_x
                df_clean = dfc[cols_to_use].dropna()
                if len(df_clean) < 10:
                    st.info(f"{city}: Datos insuficientes (n={len(df_clean)}).")
                    continue

                X = df_clean[available_x]
                y = df_clean[y_var]
                # scikit-learn fit: predictions and R².
                model = LinearRegression()
                model.fit(X, y)
                y_pred = model.predict(X)
                r2 = r2_score(y, y_pred)
                # statsmodels fit: per-coefficient p-values.
                X_sm = sm.add_constant(X)
                ols_model = sm.OLS(y, X_sm).fit()
                # Label each p-value with the name statsmodels reports for it.
                # add_constant() adds no "const" column when X already holds a
                # constant column, so a hard-coded ["const", ...] label list
                # would shift every label by one.
                coef_str = ", ".join(f"{name}: {p:.3f}" for name, p in ols_model.pvalues.items())
                results.append({"Ciudad": city, "R²": round(r2, 4), "n": len(df_clean), "p-values": coef_str, "Vars X": ", ".join(available_x)})

                # --- Chart: observed points + fitted values ---
                fig = go.Figure()

                # Observed data, plotted against the first predictor.
                fig.add_trace(go.Scatter(
                    x=df_clean[available_x[0]],
                    y=y,
                    mode='markers',
                    name='Datos reales',
                    marker=dict(color='#FF5A5F', size=10, opacity=0.7),
                    showlegend=True
                ))

                # Sort by the first predictor so the line is drawn left to right.
                df_plot = df_clean.sort_values(by=available_x[0])
                x_sorted = df_plot[available_x[0]]
                y_pred_sorted = model.predict(df_plot[available_x])

                # Fitted values as a dotted line with markers.
                fig.add_trace(go.Scatter(
                    x=x_sorted,
                    y=y_pred_sorted,
                    mode='markers+lines',
                    name='Regresión',
                    marker=dict(color=REGRESSION_LINE_COLOR, size=8),
                    line=dict(color=REGRESSION_LINE_COLOR, width=2, dash='dot'),
                    showlegend=True
                ))

                # Title and layout.
                title = f"{city}: {' → '.join(available_x + [y_var])} (R²={r2:.3f})"
                fig.update_layout(
                    title=title,
                    xaxis_title=available_x[0],
                    yaxis_title=y_var,
                    height=420,
                    margin=dict(l=10, r=10, t=60, b=10),
                    legend=dict(x=0.72, y=1.0),
                    template="simple_white"
                )

                figs.append((city, fig))

            if results:
                results_df = pd.DataFrame(results)
                st.success("Modelos generados exitosamente")
                st.dataframe(results_df, use_container_width=True)
                st.markdown("<p class='desc'>"
                            "Puntos rojos = datos reales. "
                            "Línea azul con puntos = regresión ajustada. "
                            "<b>R²</b> mide el ajuste del modelo.</p>", unsafe_allow_html=True)

                # Lay the per-city charts out on a grid of at most three columns.
                n_cols = min(len(figs), 3)
                cols = st.columns(n_cols)
                for i, (city, fig) in enumerate(figs):
                    with cols[i % n_cols]:
                        st.markdown(f"**{city}**")
                        st.plotly_chart(fig, use_container_width=True)
            else:
                st.error("No se pudo generar ningún modelo con los datos disponibles.")

# --- Tab 5: Correlations ---
with tab_correlacion:
    st.markdown("### Matriz de Correlación por Ciudad")

    # Keep only variables that are present, non-empty and convertible to
    # numbers in every compared city.
    common_numeric = set(numeric_union)
    for city, dfc in data_compare.items():
        safe_cols = []
        for col in numeric_union:
            if col not in dfc.columns:
                continue
            sample = dfc[col].dropna()
            if len(sample) == 0:
                continue
            try:
                pd.to_numeric(sample, errors='raise')
            except (ValueError, TypeError):
                # Not numeric in this city: exclude it from the common set.
                continue
            safe_cols.append(col)
        common_numeric &= set(safe_cols)

    common_numeric = sorted(common_numeric)

    if len(common_numeric) < 2:
        st.warning("No hay al menos 2 variables numéricas reales y comunes entre las ciudades para calcular correlaciones.")
    else:
        corr_vars = st.multiselect(
            "Variables para correlación",
            options=common_numeric,
            default=common_numeric[:3],
            key="corr_vars"
        )

        if len(corr_vars) < 2:
            st.info("Selecciona al menos 2 variables.")
        else:
            # One heatmap per city on a grid of at most three columns.
            cols = st.columns(min(len(cities_compare), 3))
            for idx, city in enumerate(cities_compare):
                with cols[idx % len(cols)]:
                    dfc = data_compare[city]
                    available = [v for v in corr_vars if v in dfc.columns]
                    if len(available) < 2:
                        st.caption(f"**{city}**: Insuficientes variables numéricas")
                        continue

                    # Coerce defensively; unparseable cells become NaN.
                    df_num = dfc[available].apply(pd.to_numeric, errors='coerce')
                    corr_matrix = df_num.corr()

                    # "RdBu_r" (reversed) puts red at +1 and blue at -1, which
                    # is what the caption below states; plain "RdBu" is the
                    # opposite (red = low, blue = high).
                    fig = px.imshow(
                        corr_matrix,
                        text_auto=".2f",
                        aspect="auto",
                        color_continuous_scale="RdBu_r",
                        zmin=-1, zmax=1,
                        title=f"Correlación: {city}"
                    )
                    fig.update_layout(height=400, margin=dict(l=10, r=10, t=50, b=10))
                    st.plotly_chart(fig, use_container_width=True)

            st.markdown(
                "<p class='desc'>Correlación de Pearson. <b>Rojo</b> = positiva fuerte, <b>azul</b> = negativa fuerte. "
                "Solo se incluyen variables numéricas reales.</p>",
                unsafe_allow_html=True
            )
# --- Tab 6: Export results ---
with tab_downloads:
    st.markdown("### Descarga de Resultados Analíticos")

    # Frequency comparison table, ordered by category and then city.
    export_freq = freq_comp.copy().sort_values(["categoria","city"]).reset_index(drop=True)
    csv_freq = export_freq.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Frecuencias Comparadas (CSV)",
        data=csv_freq,
        file_name=f"freq_{var_cat}.csv",
        mime="text/csv",
    )

    # t-test results: offered only when the tests tab produced a table.
    has_tests = (not test_df.empty) and bool(chosen_num_tests)
    if has_tests:
        csv_tests = test_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "Resultados de Pruebas t (CSV)",
            data=csv_tests,
            file_name=f"tests_{chosen_num_tests}.csv",
            mime="text/csv",
        )

    # Per-city summaries collected by the summary tab, stacked into one file.
    if len(combined_summary_list) > 0:
        combined = pd.concat(combined_summary_list, ignore_index=True, sort=False)
        csv_combined = combined.to_csv(index=False).encode("utf-8")
        st.download_button(
            "Resumen Estadístico Completo (CSV)",
            data=csv_combined,
            file_name=f"resumen_{var_cat}.csv",
            mime="text/csv",
        )

# Page footer.
st.markdown("---")
st.caption("Airbnb Analytics Dashboard | Análisis comparativo avanzado con visualización interactiva, regresión y correlaciones.")

Overwriting airbnb.py


In [11]:
%%writefile airbnb.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm   

# Page-level settings: browser tab title, wide layout, sidebar open on load.
st.set_page_config(
    page_title="Airbnb Analytics Dashboard",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ---------------------------
# INITIALISE SESSION STATE (BEFORE ANYTHING ELSE)
# ---------------------------
# Defaults are written only on the first run of a session.
if "compare_list" not in st.session_state:
    st.session_state["compare_list"] = []
if "active_tab" not in st.session_state:
    st.session_state["active_tab"] = "Visualizaciones"

# ---------------------------
# Airbnb-style CSS
# ---------------------------
# Raw stylesheet injected into the page; `.desc` styles chart captions and
# `.insight` styles highlighted call-out boxes.
_AIRBNB_CSS = """
<style>
    .stApp { background-color: #FAFAFA; }
    div.stPlotlyChart, div.stDataFrame {
        border: 1px solid #E0E0E0;
        border-radius: 12px;
        padding: 12px;
        margin-bottom: 24px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.05);
        background-color: white;
    }
    .stTabs [data-baseweb="tab-list"] { gap: 16px; border-bottom: 2px solid #E0E0E0; }
    .stTabs [data-baseweb="tab"] { font-weight: 500; color: #484848; padding: 12px 20px; border-radius: 8px 8px 0 0; }
    .stTabs [data-baseweb="tab"]:hover { background-color: #FF5A5F10; color: #FF5A5F; }
    .stTabs [data-baseweb="tab"][aria-selected="true"] { color: #FF5A5F; border-bottom: 3px solid #FF5A5F; }
    .stDownloadButton > button { background-color: #FF5A5F; color: white; border-radius: 8px; font-weight: 500; }
    .stDownloadButton > button:hover { background-color: #E31C79; }
    .desc { font-size: 0.9rem; color: #666; margin-top: 8px; font-style: italic; }
    .insight { background-color: #FF5A5F10; padding: 12px; border-radius: 8px; border-left: 4px solid #FF5A5F; margin-top: 16px; }
</style>
"""
st.markdown(_AIRBNB_CSS, unsafe_allow_html=True)

# ---------------------------
# COLOR PALETTES
# ---------------------------
# Discrete palette: one colour per city. Every entry must be unique, otherwise
# two cities are drawn in the same colour when all five cities are compared
# (the fifth slot used to repeat "#FF5A5F").
AIRBNB_DISCRETE = ["#FF5A5F", "#484848", "#767676", "#E31C79", "#00A699"]
# Continuous scale (coral -> magenta -> charcoal).
AIRBNB_CONTINUOUS = ["#FF5A5F", "#E31C79", "#484848"]
# Pink ramp ordered from darkest (index 0) to lightest (last index).
ROSA_CONTINUA = ["#880E4F", "#AD1457", "#C2185B", "#D81B60", "#EC407A", "#F06292", "#F48FB1", "#F8BBD0"]
DATA_COLOR = "#1f77b4"      # blue = observed data
REGRESSION_COLOR = "#FF5A5F" # red = regression

# ---------------------------
# CONFIG
# ---------------------------
# Display name -> CSV file. The paths are relative, so they resolve against the
# working directory of the Streamlit process.
# NOTE(review): filename casing is inconsistent ("Amsterdam.csv" vs
# "oporto.csv"); confirm against the files on disk.
ARCHIVOS = {
    "Ámsterdam": "Amsterdam.csv",
    "Bérgamo": "Bergamo.csv",
    "Oporto": "oporto.csv",
    "Albany": "albany.csv",
    "Berlín": "berlin.csv"
}

# Candidate analysis columns. Despite the name, the list holds 19 entries and
# includes numeric columns such as "price_numeric"; the name is kept because
# other code references it.
FIXED_15_CATEGORICALS = [
    "neighbourhood_group_cleansed", "last_scraped", "has_availability", "host_is_superhost",
    "host_has_profile_pic", "host_identity_verified", "instant_bookable", "host_response_time",
    "room_type", "host_verifications", "bedrooms", "accommodates", "bathrooms",
    "calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms",
    "price_numeric", "availability_365", "number_of_reviews", "review_scores_value"
]

# ---------------------------
# HELPERS
# ---------------------------
# st.cache_data (not cache_resource) is used because the result is plain data:
# per the Streamlit caching docs, callers then receive their own copy instead
# of sharing one mutable DataFrame across reruns and sessions.
@st.cache_data
def load_df(path):
    """Read a listings CSV, fill gaps and derive a numeric price column.

    Parameters
    ----------
    path : str or path-like
        CSV file handed directly to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        File contents with missing values filled backward, then forward, plus
        a float ``price_numeric`` column. The column is all-NaN when the file
        has no ``price`` column; individual prices that cannot be parsed
        become NaN.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when ``path`` does not exist.
    """
    df = pd.read_csv(path)
    # DataFrame.fillna(method=...) is deprecated; bfill()/ffill() are the
    # direct equivalents.
    df = df.bfill().ffill()
    if "price" in df.columns:
        # Keep only digits, dots and minus signs (drops "$", "," and spaces).
        cleaned = df["price"].astype(str).str.replace(r"[^0-9\.\-]", "", regex=True)
        # errors="coerce" turns "" and malformed leftovers such as "-" or
        # "1.2.3" into NaN instead of aborting the whole load.
        df["price_numeric"] = pd.to_numeric(cleaned, errors="coerce").astype(float)
    else:
        df["price_numeric"] = np.nan
    return df

def available_fixed_vars(df):
    """Return the entries of FIXED_15_CATEGORICALS that are columns of ``df``.

    The order of FIXED_15_CATEGORICALS is preserved in the result.
    """
    present = []
    for name in FIXED_15_CATEGORICALS:
        if name in df.columns:
            present.append(name)
    return present

def safe_top_freq(df, cat_col, top_k):
    """Return the ``top_k`` most frequent values of ``df[cat_col]``.

    The result has two columns: ``categoria`` (the value as a string; missing
    values are counted too) and ``frecuencia`` (its count), sorted by
    descending count. An empty frame with those two columns is returned when
    ``cat_col`` is not a column of ``df``.
    """
    if cat_col not in df.columns:
        return pd.DataFrame(columns=["categoria", "frecuencia"])
    return (
        df[cat_col]
        .value_counts(dropna=False)
        .reset_index()
        .set_axis(["categoria", "frecuencia"], axis=1)
        .assign(categoria=lambda table: table["categoria"].astype(str))
        .sort_values("frecuencia", ascending=False)
        .head(top_k)
    )

def freq_for_city(df, cat_col):
    """Return the full frequency table of ``df[cat_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings of one city.
    cat_col : str
        Column whose values are counted (missing values are counted too).

    Returns
    -------
    pandas.DataFrame
        Columns ``categoria`` (value as string) and ``frecuencia`` (count).
        When ``cat_col`` is not a column of ``df`` the frame is empty but keeps
        the same two columns, so callers can concatenate and group on
        ``categoria`` without special-casing missing variables.
    """
    if cat_col not in df.columns:
        # Same schema as the populated case (previously the first column was
        # named after cat_col, which broke grouping on "categoria").
        return pd.DataFrame({
            "categoria": pd.Series(dtype="object"),
            "frecuencia": pd.Series(dtype="int64"),
        })
    f = df[cat_col].value_counts(dropna=False).reset_index()
    f.columns = ["categoria", "frecuencia"]
    f["categoria"] = f["categoria"].astype(str)
    return f

def pairwise_ttests(summary_dfs, num_var):
    """Run a two-sample t-test (``equal_var=False``) for every pair of cities.

    Parameters
    ----------
    summary_dfs : dict
        Maps a city name to the numeric observations for that city
        (anything ``np.asarray(..., dtype=float)`` accepts).
    num_var : str
        Name of the variable being compared. Not used by the computation;
        kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per unordered pair of cities, in the dict's insertion order,
        with columns ``city_a``, ``city_b``, ``mean_a``, ``mean_b``,
        ``delta_mean`` (``mean_a - mean_b``), ``tstat``, ``pvalue``, ``n_a``
        and ``n_b``. Statistics are NaN when either city has fewer than two
        non-missing observations or the test itself fails.
    """
    # Drop NaNs once so the sample sizes, the "< 2" guard, the means and the
    # test are all based on the same observations.
    samples = {}
    for city, values in summary_dfs.items():
        arr = np.asarray(values, dtype=float).ravel()
        samples[city] = arr[~np.isnan(arr)]

    cities = list(samples)
    rows = []
    for pos, a in enumerate(cities):
        for b in cities[pos + 1:]:
            arr_a, arr_b = samples[a], samples[b]
            mean_a = mean_b = delta = tstat = pvalue = np.nan
            if len(arr_a) >= 2 and len(arr_b) >= 2:
                mean_a, mean_b = float(arr_a.mean()), float(arr_b.mean())
                delta = mean_a - mean_b
                try:
                    tstat, pvalue = stats.ttest_ind(arr_a, arr_b, equal_var=False)
                except Exception:
                    # Best effort: report NaN for this pair. `Exception` (not a
                    # bare except) so KeyboardInterrupt and other BaseException
                    # subclasses still propagate.
                    tstat, pvalue = np.nan, np.nan
            rows.append({
                "city_a": a, "city_b": b,
                "mean_a": mean_a, "mean_b": mean_b, "delta_mean": delta,
                "tstat": tstat, "pvalue": pvalue,
                "n_a": len(arr_a), "n_b": len(arr_b),
            })
    return pd.DataFrame(rows)

# ---------------------------
# CARGA DE DATOS
# ---------------------------
# Load every configured city up front; a missing file aborts the script run
# with a visible error.
data_compare = {}
for city_name, csv_path in ARCHIVOS.items():
    try:
        data_compare[city_name] = load_df(csv_path)
    except FileNotFoundError:
        st.error(f"Archivo no encontrado: {city_name} → {csv_path}")
        st.stop()

# ---------------------------
# VARIABLES NUMÉRICAS SEGURAS
# ---------------------------
# A numeric column qualifies when more than this fraction of its rows is non-null.
MIN_NON_NULL_FRACTION = 0.1

# Union, across all loaded cities, of the numeric columns that pass the
# non-null threshold. select_dtypes already guarantees a numeric dtype, so no
# extra conversion check (and no blanket exception handler) is needed.
numeric_candidates = set()
for city_df in data_compare.values():
    numeric_cols = city_df.select_dtypes(include=[np.number]).columns
    min_non_null = MIN_NON_NULL_FRACTION * len(city_df)
    numeric_candidates.update(
        col for col in numeric_cols if city_df[col].notna().sum() > min_non_null
    )

# Alphabetical order, with the price column (when present) moved to the front.
numeric_union = sorted(numeric_candidates)
if "price_numeric" in numeric_union:
    numeric_union.remove("price_numeric")
    numeric_union.insert(0, "price_numeric")

# ---------------------------
# DETECTAR PESTAÑA ACTIVA DESDE URL
# ---------------------------
# Defensive default: later sections read st.session_state.active_tab, so make
# sure the key exists even when no "tab" query parameter is supplied.
if "active_tab" not in st.session_state:
    st.session_state.active_tab = "Visualizaciones"

if hasattr(st, "query_params"):
    # Current API. Normalised to the {key: [values]} shape that the legacy
    # experimental_get_query_params() returned.
    query_params = {key: st.query_params.get_all(key) for key in st.query_params.keys()}
else:
    # Older Streamlit releases that predate st.query_params.
    query_params = st.experimental_get_query_params()

valid_tabs = ["Visualizaciones", "Pruebas Estadísticas", "Resumen Detallado", "Modelos Predictivos", "Correlaciones", "Exportar Resultados"]
tab_values = query_params.get("tab")
if tab_values:  # also skips an empty value list instead of raising IndexError
    tab_from_url = tab_values[0]
    if tab_from_url in valid_tabs:
        st.session_state.active_tab = tab_from_url

# ---------------------------
# SIDEBAR CON SELECTORES CONDICIONALES
# ---------------------------
st.sidebar.image("https://upload.wikimedia.org/wikipedia/commons/thumb/6/69/Airbnb_Logo_B%C3%A9lo.svg/2560px-Airbnb_Logo_B%C3%A9lo.svg.png", width=120)
st.sidebar.markdown("<h2 style='color:#FF5A5F; margin-top:10px;'>Análisis Comparativo</h2>", unsafe_allow_html=True)

# --- ALWAYS-VISIBLE SELECTORS ---
city_options = list(ARCHIVOS.keys())

st.sidebar.markdown("**1. Ciudad principal**  \n**2. Ciudades para comparar**")
primary_city = st.sidebar.selectbox("Ciudad Principal (Treemap)", options=city_options, index=0, key="primary_city")
df_primary = data_compare[primary_city]
fixed_available_primary = available_fixed_vars(df_primary)

var_cat = st.sidebar.selectbox("Variable a Analizar", options=fixed_available_primary, key="var_cat")
top_k = st.sidebar.slider("Top de categorías", 5, 30, 10, key="top_k")

# Defensive: guarantee the persistent comparison list exists before it is read
# below. This is a no-op when it was already initialised elsewhere.
if "compare_list" not in st.session_state:
    st.session_state.compare_list = []

st.sidebar.markdown("**Ciudades en Comparación**")
multisel = st.sidebar.multiselect("Agregar ciudades", options=city_options, default=[primary_city], key="multisel")
if st.sidebar.button("Agregar Seleccionadas", key="add_cities"):
    # Append only cities that are not already in the list (keeps it duplicate-free).
    for c in multisel:
        if c not in st.session_state.compare_list:
            st.session_state.compare_list.append(c)
if st.sidebar.button("Limpiar Lista", key="clear_list"):
    st.session_state.compare_list = []

st.sidebar.markdown("**Ciudades activas:**")
for c in st.session_state.compare_list:
    st.sidebar.write(f"• {c}")
if not st.session_state.compare_list:
    st.sidebar.info("Lista vacía")

use_persistent = st.sidebar.checkbox("Mantener lista entre sesiones", value=True, key="use_persistent")
if use_persistent and st.session_state.compare_list:
    # Use the list built with the buttons above.
    cities_compare = st.session_state.compare_list
else:
    # Fall back to a one-off selection; this widget is only rendered in this case.
    cities_compare = st.sidebar.multiselect("Ciudades (temporal)", options=city_options, default=[primary_city], key="temp_cities")

# --- SELECTORES DE REGRESIÓN: SOLO EN PESTAÑA ---
# Regression controls are rendered only while the predictive-models tab is the
# active one; otherwise neutral defaults are assigned.
show_model_controls = st.session_state.active_tab == "Modelos Predictivos"
if not show_model_controls:
    reg_analysis_type = "Simple"
    reg_y_var = numeric_union[0] if numeric_union else None
    reg_x_vars = []
else:
    st.sidebar.markdown("---")
    st.sidebar.markdown("### Configuración de Modelos")
    reg_analysis_type = st.sidebar.selectbox("Tipo de modelo", ["Simple", "Múltiple"], key="reg_type_sidebar")
    reg_y_var = st.sidebar.selectbox("Variable dependiente (Y)", options=numeric_union, index=0, key="reg_y")
    # The dependent variable cannot also be offered as a predictor.
    reg_x_options = [name for name in numeric_union if name != reg_y_var]
    if reg_analysis_type == "Simple":
        reg_x_var = st.sidebar.selectbox("Variable independiente (X)", options=reg_x_options, key="reg_x_simple")
        reg_x_vars = [reg_x_var]
    else:
        # Pre-select at most the first two candidate predictors.
        default_predictors = reg_x_options[:2]
        reg_x_vars = st.sidebar.multiselect(
            "Variables independientes (X)",
            options=reg_x_options,
            default=default_predictors,
            key="reg_x_multi",
        )

# ---------------------------
# TÍTULO PRINCIPAL
# ---------------------------
st.markdown('<p style="color:#FF5A5F; font-size:32px; font-weight:700; text-align:center;">Airbnb Analytics Dashboard</p>', unsafe_allow_html=True)
st.markdown(f"**Ciudad principal:** {primary_city} — **Variable:** {var_cat}")

# ---------------------------
# TREEMAP
# ---------------------------
# Top-k frequency table of the selected variable for the primary city.
treemap_df = safe_top_freq(df_primary, var_cat, top_k)
if not treemap_df.empty:
    fig_tree = px.treemap(
        treemap_df,
        path=["categoria"],
        values="frecuencia",
        title=f"Distribución Jerárquica de {var_cat} en {primary_city}",
        color="frecuencia",
        color_continuous_scale=AIRBNB_CONTINUOUS,
    )
    fig_tree.update_layout(margin={"t": 50, "l": 10, "r": 10, "b": 10}, height=520)
    st.plotly_chart(fig_tree, use_container_width=True)
    st.markdown(f"<p class='desc'>Visualización jerárquica del top {top_k} de {var_cat}.</p>", unsafe_allow_html=True)
else:
    st.info("No hay datos suficientes para generar el treemap.")

st.markdown("---")

# ---------------------------
# CARGA DE DATOS COMPARATIVOS
# ---------------------------
# Nothing to compare without at least one city.
if not cities_compare:
    st.info("Selecciona al menos una ciudad para continuar.")
    st.stop()

# One frequency table per selected city, each tagged with its city name.
freqs = [
    freq_for_city(data_compare[city], var_cat).assign(city=city)
    for city in cities_compare
]
freq_all = pd.concat(freqs, ignore_index=True, sort=False) if freqs else pd.DataFrame()

# The top_k categories by total frequency summed over the selected cities.
totals_by_category = freq_all.groupby("categoria")["frecuencia"].sum()
top_global = totals_by_category.sort_values(ascending=False).head(top_k).index.tolist()

# Complete city x category grid; combinations with no rows get frequency 0.
cities_sorted = sorted(cities_compare)
full_index = pd.MultiIndex.from_product([cities_sorted, top_global], names=["city","categoria"])
in_top_global = freq_all["categoria"].isin(top_global)
freq_comp = (
    freq_all[in_top_global]
    .set_index(["city","categoria"])
    .reindex(full_index, fill_value=0)
    .reset_index()
)

# ---------------------------
# TABS
# ---------------------------
st.markdown("## Análisis Comparativo entre Ciudades")

tab_names = ["Visualizaciones", "Pruebas Estadísticas", "Resumen Detallado", "Modelos Predictivos", "Correlaciones", "Exportar Resultados"]
tabs = st.tabs(tab_names)

# NOTE: the containers returned by st.tabs() do not tell the script which tab
# the user has open. The previous loop tested a private `_is_selected`
# attribute that is not part of Streamlit's API, so it could not track the
# selection and interfered with the tab chosen through the "tab" query
# parameter. st.session_state.active_tab is therefore driven only by that
# query parameter.
# TODO: to switch the active section from the UI, use a widget that returns a
# value (e.g. st.radio) instead of st.tabs.

tab_graficos, tab_tests, tab_summary, tab_regresion, tab_correlacion, tab_downloads = tabs

# --- [THE REST OF THE CODE IS THE SAME AS THE PREVIOUS VERSION] ---
# TODO: placeholder — the tab bodies are not included here; copy the remaining
# code from the previous version of this script starting at this point.

st.markdown("---")
st.caption("Airbnb Analytics Dashboard | Análisis avanzado con regresión, correlaciones y hallazgos automáticos.")

Overwriting airbnb.py
