In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

In [50]:
df = pd.read_csv("listings_tokyo_original.csv")

In [51]:
valores_nulos = df.isnull().sum()
valores_nulos

id                                                 0
listing_url                                        0
scrape_id                                          0
last_scraped                                       0
source                                             0
                                                ... 
calculated_host_listings_count                     0
calculated_host_listings_count_entire_homes        0
calculated_host_listings_count_private_rooms       0
calculated_host_listings_count_shared_rooms        0
reviews_per_month                               2883
Length: 75, dtype: int64

In [52]:
df = df.dropna()
df = df.dropna(subset=['reviews_per_month'])

In [53]:
valores_nulos = df.isnull().sum()
valores_nulos

id                                              0
listing_url                                     0
scrape_id                                       0
last_scraped                                    0
source                                          0
                                               ..
calculated_host_listings_count                  0
calculated_host_listings_count_entire_homes     0
calculated_host_listings_count_private_rooms    0
calculated_host_listings_count_shared_rooms     0
reviews_per_month                               0
Length: 75, dtype: int64

In [54]:
#outliers con el método IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]



for col in df.select_dtypes(include=np.number).columns:
    df = remove_outliers_iqr(df, col)

print("listo, outliers eliminados con el método IQR")

listo, outliers eliminados con el método IQR


In [55]:
#análisis gráficas y tablas
categorical_columns = [
    col for col in df.select_dtypes(include=['object']).columns[:15]
    if df[col].notna().sum() > 0  # filtrar solo columnas con datos
]


In [56]:
print(f"Total columnas categóricas seleccionadas: {len(categorical_columns)}")
print("Columnas categóricas seleccionadas:", categorical_columns)

Total columnas categóricas seleccionadas: 0
Columnas categóricas seleccionadas: []


In [57]:
num_cols = len(categorical_columns)

if num_cols > 0:  # asegurar que hay columnas para graficar
    rows = (num_cols // 3) + (num_cols % 3 > 0)  # ajustar número de filas dinámicamente
    cols = min(num_cols, 3)  # si hay menos de 3 columnas, evitar error en subplots

    fig, axes = plt.subplots(rows, cols, figsize=(18, rows * 5))  # ajuste dinámico
    axes = np.array(axes).reshape(-1)  # convierto en array 1D para evitar errores

    for i, col in enumerate(categorical_columns):
        sns.countplot(y=df[col], order=df[col].value_counts().index, ax=axes[i])
        axes[i].set_title(f"Frecuencia de {col}")

    for j in range(i + 1, len(axes)):
        axes[j].axis("off")

    plt.tight_layout()
    plt.show()
else:
    print("no hay columnas con datos suficientes para poder graficarlas")

no hay columnas con datos suficientes para poder graficarlas


In [58]:
plt.tight_layout()
plt.show()

<Figure size 640x480 with 0 Axes>

In [59]:
#categorias de variables con la regla de sturges. #más gráficas
variables = ["host_response_rate", "host_acceptance_rate", "host_total_listings_count",
             "accommodates", "bathrooms_text", "beds", "price", "maximum_nights_avg_ntm",
             "availability_365", "number_of_reviews", "review_scores_value", "reviews_per_month"]


In [60]:
for var in variables:
    if var in df.columns:
        df[var] = pd.to_numeric(df[var], errors="coerce")  # convertir a numérico (NaN) si no se puede


In [61]:
for var in variables:
    if var in df.columns and df[var].notna().sum() > 0:  # Verificar que no esté vacía
        n_bins = int(1 + math.log2(len(df.dropna(subset=[var]))))  # Regla de Sturges
        df[var + "_bins"] = pd.cut(df[var], bins=n_bins, labels=[f"Bin{i}" for i in range(n_bins)])

        # Graficar las frecuencias de las categorías
        plt.figure(figsize=(8, 5))
        df[var + "_bins"].value_counts().sort_index().plot(kind="bar", color="skyblue")
        plt.title(f"Frecuencia de Categorías - {var}")
        plt.xticks(rotation=45)
        plt.show()