In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import statsmodels.api as smi
import scipy.special as special
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
# Limpiar y convertir la columna 'price' a float
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)

  df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)


In [8]:
# Limpiar y convertir la columna 'price' a float
df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)

  df['price'] = df['price'].replace('[\$,]', '', regex=True).astype(float)


In [41]:
%%writefile appp.py
import streamlit as st
import pandas as pd
import plotly.express as px
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image  # Para añadir logos/images
import plotly.graph_objects as go

# Configuración inicial 
st.set_page_config(
    layout="wide",
    page_title="Airbnb Menorca Analytics",
    page_icon="🏠",
    initial_sidebar_state="expanded"
)

# Cache  con hash_funcs para objetos personalizados
@st.cache_data(hash_funcs={pd.DataFrame: lambda _: None})
def load_data():
    df = pd.read_csv('spain_menorca_procesado_outliers (2) (1).csv')
    
    # Limpieza  de precios
    df['price'] = pd.to_numeric(df['price'].replace('[\$,]', '', regex=True), errors='coerce')
    df = df.dropna(subset=['price'])  # Eliminar filas sin precio
    
    # Extracción de columnas
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    # Categorías relevantes con manejo de nulos
    host_response_time_cats = df['host_response_time'].dropna().unique()
    host_is_superhost_cats = df['host_is_superhost'].dropna().unique()
    room_type_cats = df['room_type'].dropna().unique()
    
    return df, numeric_cols, text_cols, host_response_time_cats, host_is_superhost_cats, room_type_cats

df, numeric_cols, text_cols, host_response_time_cats, host_is_superhost_cats, room_type_cats = load_data()

# Sidebar con logo y secciones
with st.sidebar:
    # Puedes añadir un logo si tienes uno
    # logo = Image.open('logo.png')
    # st.image(logo, width=200)
    
    st.title("Menú de Navegación")
    view = st.radio(
        "Seleccionar vista",
        options=['Vista General', 'Análisis de Precios', 'Regresión Lineal', 'Regresión Logística'],
        index=0
    )
    
    st.markdown("---")
    st.markdown("**Configuración general**")
    st.caption("Ajustes aplicables a todas las vistas")

# Función para mostrar métricas con estilo 
def display_metrics(metrics_dict, cols=3):
    columns = st.columns(cols)
    for i, (name, value) in enumerate(metrics_dict.items()):
        columns[i % cols].metric(label=name, value=value)
if view == 'Vista General':
    st.title('🏠 Airbnb Menorca - Dashboard Analítico')
    st.markdown("""
    <style>
    .big-font {
        font-size:16px !important;
    }
    </style>
    """, unsafe_allow_html=True)
    
    st.markdown('<p class="big-font">Exploración interactiva de los datos de alojamientos en Menorca</p>', unsafe_allow_html=True)
    
    
    with st.sidebar:
        st.header("🔍 Filtros Avanzados")
        
        price_range = st.slider(
            'Rango de Precio (€)',
            float(df['price'].min()),
            float(df['price'].max()),
            (float(df['price'].quantile(0.25)), float(df['price'].quantile(0.75))),
            help="Selecciona el rango de precios a visualizar"
        )
        
        superhost_filter = st.selectbox(
            '¿Es Superhost?',
            options=['Todos', 'Sí', 'No'],
            index=0
        )
        
        room_type_filter = st.multiselect(
            'Tipo de Alojamiento',
            options=room_type_cats,
            default=room_type_cats,
            help="Selecciona uno o varios tipos de alojamiento"
        )
        
        # Checkbox para mostrar datos
        show_data = st.checkbox("Mostrar datos filtrados", value=False)
    
    
    filtered_df = df.copy()
    filtered_df = filtered_df[
        (filtered_df['price'] >= price_range[0]) & 
        (filtered_df['price'] <= price_range[1])
    ]

    if superhost_filter != 'Todos':
        superhost_value = 't' if superhost_filter == 'Sí' else 'f'
        filtered_df = filtered_df[filtered_df['host_is_superhost'] == superhost_value]

    if room_type_filter:
        filtered_df = filtered_df[filtered_df['room_type'].isin(room_type_filter)]
    
    # Métricas clave 
    metrics = {
        "Total Propiedades": len(filtered_df),
        "Precio Promedio": f"€{filtered_df['price'].mean():.2f}",
        "Rating Promedio": f"{filtered_df['review_scores_rating'].mean():.2f}",
        "Ocupación (30 días)": f"{filtered_df['availability_30'].mean():.1f} días"
    }
    display_metrics(metrics)
    
    # Gráficos 
    tab1, tab2 = st.tabs(["📊 Distribución de Precios", "📈 Relaciones Clave"])
    
    with tab1:
        fig1 = px.histogram(
            filtered_df, 
            x='price', 
            nbins=50, 
            title='Distribución de Precios',
            color_discrete_sequence=['#1f77b4'],
            labels={'price': 'Precio (€)'}
        )
        fig1.update_layout(
            hovermode='x unified',
            xaxis_title="Precio (€)",
            yaxis_title="Número de propiedades"
        )
        st.plotly_chart(fig1, use_container_width=True)
        
    with tab2:
        fig2 = px.scatter(
            filtered_df,
            x='accommodates',
            y='price',
            color='room_type',
            size='bathrooms',
            hover_name='room_type',
            title='Capacidad vs Precio',
            labels={
                'accommodates': 'Capacidad (personas)',
                'price': 'Precio (€)',
                'room_type': 'Tipo de habitación'
            }
        )
        st.plotly_chart(fig2, use_container_width=True)
    
    # Mostrar datos 
    if show_data:
        st.subheader("Datos Filtrados")
        # Columnas a mostrar
        selected_columns = [
            'last_scraped', 'host_name', 'host_since',
            'host_location', 'host_response_time', 'host_response_rate',
            'host_acceptance_rate', 'host_is_superhost', 'host_has_profile_pic',
            'host_identity_verified', 'property_type', 'room_type',
            'bathrooms_text', 'price', 'has_availability', 'calendar_last_scraped',
            'first_review', 'last_review', 'license', 'instant_bookable', 'id',
            'host_id', 'host_listings_count', 'host_total_listings_count',
            'accommodates', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights',
            'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
            'minimum_maximum_nights', 'maximum_maximum_nights',
            'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30',
            'availability_90', 'number_of_reviews', 'number_of_reviews_ltm',
            'number_of_reviews_l30d', 'review_scores_rating',
            'review_scores_accuracy', 'review_scores_cleanliness',
            'review_scores_checkin', 'review_scores_location',
            'calculated_host_listings_count_entire_homes',
            'calculated_host_listings_count_private_rooms',
            'calculated_host_listings_count_shared_rooms', 'reviews_per_month'
        ]
        
        # Filtrar columnas y mostrar
        st.dataframe(filtered_df[selected_columns])
        
        
elif view == 'Análisis de Precios':
    st.title('💰 Análisis de Precios')
    st.markdown("Explora cómo diferentes factores afectan el precio de los alojamientos.")
    
    # Widgets con descripciones
    with st.sidebar:
        st.header('Opciones de Análisis')
        analysis_type = st.radio(
            "Tipo de análisis",
            options=["Distribución", "Correlación", "Comparación"],
            index=0
        )
    
    if analysis_type == "Distribución":
        col1, col2 = st.columns(2)
        
        with col1:
            x_axis = st.selectbox(
                'Variable para análisis',
                options=['room_type', 'host_is_superhost', 'host_response_time', 'accommodates', 'bathrooms'],
                index=0,
                help="Selecciona la variable para el eje X"
            )
            
        with col2:
            color_by = st.selectbox(
                'Variable de color',
                options=['room_type', 'host_is_superhost', None],
                index=0,
                help="Selecciona cómo colorear los datos"
            )
        
        fig = px.box(
            df, 
            x=x_axis, 
            y='price', 
            color=color_by,
            title=f'Distribución de Precios por {x_axis}',
            labels={'price': 'Precio (€)'}
        )
        st.plotly_chart(fig, use_container_width=True)
        
    elif analysis_type == "Correlación":
        st.subheader('Correlación entre Variables')
        
        # Selección de variables numéricas
        numeric_features = ['price', 'accommodates', 'bathrooms', 'beds', 
                          'review_scores_rating', 'number_of_reviews']
        
        selected_features = st.multiselect(
            'Selecciona variables para el análisis de correlación',
            options=numeric_features,
            default=['price', 'accommodates', 'bathrooms']
        )
        
        if len(selected_features) >= 2:
            corr_matrix = df[selected_features].corr()
            
            fig = px.imshow(
                corr_matrix,
                text_auto=True,
                aspect="auto",
                title="Matriz de Correlación",
                labels=dict(color="Correlación"),
                x=selected_features,
                y=selected_features,
                color_continuous_scale='RdBu',
                zmin=-1,
                zmax=1
            )
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("Selecciona al menos 2 variables para el análisis de correlación")
            
    elif analysis_type == "Comparación":
        st.subheader('Comparación de Precios')
        
        feature = st.selectbox(
            'Selecciona característica para comparar',
            options=['room_type', 'host_is_superhost', 'beds', 'accommodates'],
            index=0
        )
        
        fig = px.violin(
            df,
            x=feature,
            y='price',
            box=True,
            points="all",
            hover_data=df.columns,
            title=f'Distribución de Precios por {feature}'
        )
        st.plotly_chart(fig, use_container_width=True)
elif view == 'Regresión Lineal':
    st.title('📈 Modelo de Regresión Lineal')
    st.markdown("""
    Predice el precio basado en diferentes características del alojamiento.
    """)
    
    model_type = st.radio(
        'Tipo de modelo:',
        options=['Regresión Simple', 'Regresión Múltiple'],
        horizontal=True
    )
    
    if model_type == 'Regresión Simple':
        st.sidebar.header('Configuración - Regresión Simple')
        
        feature_options = [
            'accommodates', 'bathrooms', 'beds',
            'minimum_nights', 'number_of_reviews', 'availability_30'
        ]
        
        selected_feature = st.sidebar.selectbox(
            'Variable predictora:',
            options=feature_options,
            index=0
        )
        
        if selected_feature:
            temp_df = df[[selected_feature, 'price']].dropna()
            
            if len(temp_df) > 10:
                X = temp_df[[selected_feature]]
                y = temp_df['price']
                
                if X[selected_feature].dtype == 'object':
                    X = pd.get_dummies(X, drop_first=True)
                
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.3, random_state=42
                )
                
                if X[selected_feature].dtype in ['int64', 'float64']:
                    scaler = StandardScaler()
                    X_train = scaler.fit_transform(X_train)
                    X_test = scaler.transform(X_test)
                
                model = LinearRegression()
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                
                
                fig = go.Figure()
                
                # Valores reales
                fig.add_trace(go.Scatter(
                    x=X_test.squeeze(),
                    y=y_test,
                    mode='markers',
                    name='Valores Reales',
                    marker=dict(color='#1f77b4', size=8),
                    opacity=0.7
                ))
                
                # Predicciones
                fig.add_trace(go.Scatter(
                    x=X_test.squeeze(),
                    y=y_pred,
                    mode='markers',
                    name='Predicciones',
                    marker=dict(color='#ff7f0e', size=8, symbol='x'),
                    opacity=0.7
                ))
                
                # Línea de tendencia
                if X[selected_feature].dtype in ['int64', 'float64']:
                    x_range = np.linspace(X_test.min(), X_test.max(), 100)
                    y_range = model.predict(x_range.reshape(-1, 1))
                    fig.add_trace(go.Scatter(
                        x=x_range,
                        y=y_range,
                        mode='lines',
                        name='Línea de Regresión',
                        line=dict(color='#2ca02c', width=3)
                    ))
                
                fig.update_layout(
                    title=f'Relación entre {selected_feature} y Precio',
                    xaxis_title=selected_feature,
                    yaxis_title='Precio (€)',
                    hovermode='x unified'
                )
                
                col1, col2 = st.columns(2)
                
                with col1:
                    st.subheader("Métricas del Modelo")
                    metrics = {
                        "R²": f"{r2_score(y_test, y_pred):.3f}",
                        #"MSE": f"{mean_squared_error(y_test, y_pred):.1f}",
                        #"RMSE": f"{np.sqrt(mean_squared_error(y_test, y_pred)):.1f}"
                    }
                    for name, value in metrics.items():
                        st.metric(label=name, value=value)
                    
                    if hasattr(model, 'coef_'):
                        st.write(f"Coeficiente: **{model.coef_[0]:.2f}**")
                    #st.write(f"Intercepto: **{model.intercept_:.2f}**")
                
                with col2:
                    st.subheader(f"Relación con {selected_feature}")
                    st.plotly_chart(fig, use_container_width=True)
                
            else:
                st.error("No hay suficientes datos para este análisis.")
    
    elif model_type == 'Regresión Múltiple':
        st.sidebar.header('Configuración - Regresión Múltiple')
        
        feature_options = [
            'accommodates', 'bathrooms', 'beds', 'review_scores_rating',
            'minimum_nights', 'number_of_reviews', 'availability_30',
            'room_type', 'host_is_superhost', 'instant_bookable',
            'bedrooms', 'host_total_listings_count'
        ]
        
        selected_features = st.sidebar.multiselect(
            'Variables predictoras:',
            options=feature_options,
            default=['accommodates', 'bathrooms', 'room_type']
        )
        
        if len(selected_features) >= 2:
            temp_df = df[selected_features + ['price']].dropna()
            
            X = temp_df[selected_features]
            y = temp_df['price']
            
            X_encoded = pd.get_dummies(X, drop_first=True)
            
            if not X_encoded.empty:
                scaler = StandardScaler()
                X_scaled = scaler.fit_transform(X_encoded)
                
                X_train, X_test, y_train, y_test = train_test_split(
                    X_scaled, y, test_size=0.3, random_state=42
                )
                
                model = LinearRegression()
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                
                
                coef_df = pd.DataFrame({
                    'Variable': X_encoded.columns,
                    'Coeficiente': model.coef_,
                    #'Importancia': np.abs(model.coef_)
                }).sort_values('Coeficiente', ascending=False)
                
                top_feature = coef_df.iloc[0]['Variable']
                
                # Reconstruir X_test original para la variable principal
                if any(top_feature.startswith(col) for col in selected_features if df[col].dtype == 'object'):
                    # Para variables categóricas
                    cat_var = next(col for col in selected_features if top_feature.startswith(col))
                    x_test_values = X_test[:, X_encoded.columns.get_loc(top_feature)]
                    x_test_values = x_test_values * scaler.scale_[X_encoded.columns.get_loc(top_feature)] + scaler.mean_[X_encoded.columns.get_loc(top_feature)]
                else:
                    # Para variables numéricas
                    x_test_values = X_test[:, X_encoded.columns.get_loc(top_feature)]
                    x_test_values = x_test_values * scaler.scale_[X_encoded.columns.get_loc(top_feature)] + scaler.mean_[X_encoded.columns.get_loc(top_feature)]
                
                
                fig = go.Figure()
                
                # Valores reales
                fig.add_trace(go.Scatter(
                    x=x_test_values,
                    y=y_test,
                    mode='markers',
                    name='Valores Reales',
                    marker=dict(color='#1f77b4', size=8),
                    opacity=0.7
                ))
                
                # Predicciones
                fig.add_trace(go.Scatter(
                    x=x_test_values,
                    y=y_pred,
                    mode='markers',
                    name='Predicciones',
                    marker=dict(color='#ff7f0e', size=8, symbol='x'),
                    opacity=0.7
                ))
                
                fig.update_layout(
                    title=f'Relación entre {top_feature} y Precio',
                    xaxis_title=top_feature,
                    yaxis_title='Precio (€)',
                    hovermode='x unified'
                )
                
                st.subheader('Resultados del Modelo')
                
                col1, col2 = st.columns(2)
                
                with col1:
                    st.metric("R²", f"{r2_score(y_test, y_pred):.3f}")
                    #st.metric("MSE", f"{mean_squared_error(y_test, y_pred):.1f}")
                    #st.metric("RMSE", f"{np.sqrt(mean_squared_error(y_test, y_pred)):.1f}")
                    
                    st.subheader("Variables más importantes")
                    st.dataframe(coef_df.style.format({'Coeficiente': '{:.4f}', 'Importancia': '{:.4f}'}))
                
                with col2:
                    st.plotly_chart(fig, use_container_width=True)
                
            else:
                st.error("Error en la preparación de datos.")
        else:
            st.warning("Selecciona al menos 2 variables para la regresión múltiple.")
elif view == 'Regresión Logística':
    st.title('📊 Modelo de Regresión Logística')
    st.markdown("""
    Predice si un alojamiento tiene un precio alto (por encima de la mediana) 
    basado en sus características.
    """)
    
    # Crear variable objetivo 
    price_median = df['price'].median()
    df['high_price'] = (df['price'] > price_median).astype(int)
    
    # Sidebar 
    with st.sidebar:
        st.header('Configuración del Modelo')
        
        feature_options = [
            'accommodates', 'bathrooms', 'beds', 'review_scores_rating',
            'minimum_nights', 'number_of_reviews', 'availability_30',
            'room_type', 'host_is_superhost'
        ]
        
        selected_features = st.multiselect(
            'Variables predictoras:',
            options=feature_options,
            default=['accommodates', 'bathrooms', 'host_is_superhost']
        )
        
        test_size = st.slider(
            'Tamaño del conjunto de prueba (%)',
            10, 40, 30
        )
    
    if selected_features:
        # Preparación de datos
        temp_df = df[selected_features + ['high_price']].dropna()
        X = temp_df[selected_features]
        y = temp_df['high_price']
        
        # Codificación
        X_encoded = pd.get_dummies(X, drop_first=True)
        
        if not X_encoded.empty:
            # Escalado
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_encoded)
            
            # División de datos
            X_train, X_test, y_train, y_test = train_test_split(
                X_scaled, y, 
                test_size=test_size/100, 
                random_state=42,
                stratify=y
            )
            
            # Entrenamiento del modelo
            model = LogisticRegression(max_iter=1000)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1]
            
            # Resultados
            st.subheader('Evaluación del Modelo')
            
            col1, col2 = st.columns(2)
            
            with col1:
                st.metric("Precisión (Accuracy)", f"{accuracy_score(y_test, y_pred):.2f}")
                st.metric("Precisión (Precision)", f"{precision_score(y_test, y_pred):.2f}")
                st.metric("Sensibilidad (Recall)", f"{recall_score(y_test, y_pred):.2f}")
                st.metric("F1-Score", f"{2 * (precision_score(y_test, y_pred) * recall_score(y_test, y_pred)) / (precision_score(y_test, y_pred) + recall_score(y_test, y_pred)):.2f}")
            
            with col2:
                # Matriz de confusión mejorada
                cm = confusion_matrix(y_test, y_pred)
                fig_cm = px.imshow(
                    cm,
                    text_auto=True,
                    labels=dict(x="Predicho", y="Real", color="Casos"),
                    x=['Bajo', 'Alto'],
                    y=['Bajo', 'Alto'],
                    title="Matriz de Confusión"
                )
                st.plotly_chart(fig_cm, use_container_width=True)
            
      
            
            # Coeficientes del modelo
            st.subheader('Importancia de las Variables')
            
            if hasattr(model, 'coef_'):
                coef_df = pd.DataFrame({
                    'Variable': X_encoded.columns,
                    'Coeficiente': model.coef_[0],
                    'Odds Ratio': np.exp(model.coef_[0])
                }).sort_values('Odds Ratio', ascending=False)
                
                fig_coef = px.bar(
                    coef_df,
                    x='Variable',
                    y='Odds Ratio',
                    color='Odds Ratio',
                    title='Odds Ratio de las Variables'
                )
                st.plotly_chart(fig_coef, use_container_width=True)
        else:
            st.error("Error en la preparación de datos.")
    else:
        st.warning("Selecciona al menos una variable predictora.")



Overwriting appp.py
