In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import statsmodels.api as sm
from plotly.subplots import make_subplots
from distutils import util

In [None]:
# Jaime

# Folder that holds every CSV read by this notebook. It is defined once so the
# notebook can be pointed at another location by editing a single line instead
# of six separate read_csv calls.
DATA_DIR = "/Users/jaime/Documents/ICAI/Quinto/Desarrollo Apps de Visualización/Trabajo"

df_calendar = pd.read_csv(f"{DATA_DIR}/calendar.csv")
df_listings = pd.read_csv(f"{DATA_DIR}/listings.csv")
df_neighbourhoods = pd.read_csv(f"{DATA_DIR}/neighbourhoods.csv")
df_reviews = pd.read_csv(f"{DATA_DIR}/reviews.csv")
df_reviews_det = pd.read_csv(f"{DATA_DIR}/reviews_detailed.csv")
df_listings_det = pd.read_csv(f"{DATA_DIR}/listings_detailed.csv")

In [None]:
# Preview the first rows of the calendar table.
df_calendar.head()

In [None]:
# Number of missing values in each calendar column.
df_calendar.isna().sum()

In [None]:
# Distinct listing ids that have at least one calendar row with a missing price.
df_calendar.loc[df_calendar['price'].isna(), 'listing_id'].unique()

In [None]:
# Distinct listing ids that have at least one calendar row with a missing adjusted price.
df_calendar.loc[df_calendar['adjusted_price'].isna(), 'listing_id'].unique()

In [None]:
# Histogram of the distinct price values.
# At this point `price` still holds currency strings (they are only parsed with
# split('$') later in the notebook), so the distinct values are stripped of
# '$' and ',' and converted to numbers first. Plotting the raw strings would
# make every value its own category instead of binning along a numeric axis.
data = [
    go.Histogram(
        x = pd.to_numeric(
            df_calendar['price'].drop_duplicates().astype(str)
                .str.replace(r'[$,]', '', regex=True),
            errors='coerce',  # missing prices become NaN and are dropped below
        ).dropna(),
        opacity=0.6,
        name = "Precio"
    )
]

layout = go.Layout(title = "Distribución del precio", xaxis_title = "Precio", yaxis_title = "Frecuencia",
                   barmode = "overlay")

fig = go.Figure(data = data, layout = layout)

fig.show()

In [None]:
# Histogram of the distinct adjusted-price values.
# NOTE(review): `adjusted_price` is presumably stored as currency strings like
# `price` - confirm. The conversion below strips '$' and ',' and also works
# unchanged if the column is already numeric, so the histogram always bins
# along a numeric axis rather than treating each string as a category.
data = [
    go.Histogram(
        x = pd.to_numeric(
            df_calendar['adjusted_price'].drop_duplicates().astype(str)
                .str.replace(r'[$,]', '', regex=True),
            errors='coerce',  # missing values become NaN and are dropped below
        ).dropna(),
        opacity=0.6,
        name = "Precio Ajustado"
    )
]

layout = go.Layout(title = "Distribución del precio ajustado", xaxis_title = "Precio Ajustado", yaxis_title = "Frecuencia",
                   barmode = "overlay")

fig = go.Figure(data = data, layout = layout)

fig.show()

Las distribuciones de `adjusted_price` y `price` son iguales, por lo que nos quedamos solo con `price`.

In [None]:
# New frame without the adjusted price column; df_calendar itself is left unchanged.
df_calendar_v2 = df_calendar.drop(columns=["adjusted_price"])

In [None]:
# Number of distinct listings in the calendar (a missing id would count once, as with unique()).
df_calendar_v2['listing_id'].nunique(dropna=False)

De un total de 19617 publicaciones, 10 tienen precio nulo (0,05%). Podemos por tanto eliminar estas publicaciones, ya que constituyen una mínima parte del dataset.

In [None]:
# Ids of the listings that have at least one calendar row without a price.
# The null mask and the unique ids are computed a single time; the previous
# loop rebuilt both on every iteration.
listings_nulos = list(
    df_calendar_v2.loc[df_calendar_v2['price'].isna(), 'listing_id'].unique()
)

In [None]:
# Number of distinct listings with at least one missing maximum_nights value.
df_calendar_v2.loc[df_calendar_v2['maximum_nights'].isna(), 'listing_id'].nunique(dropna=False)

In [None]:
# Number of distinct listings with at least one missing minimum_nights value.
df_calendar_v2.loc[df_calendar_v2['minimum_nights'].isna(), 'listing_id'].nunique(dropna=False)

En este caso, el número de listings afectados es mayor, por lo que imputaremos la mediana como valor de referencia.

In [None]:
# Fill the gaps in both night-limit columns with each column's own median.
# fillna() receives the per-column medians and matches them by column name.
df_calendar_v2[['minimum_nights', 'maximum_nights']] = (
    df_calendar_v2[['minimum_nights', 'maximum_nights']]
    .fillna(df_calendar_v2[['minimum_nights', 'maximum_nights']].median())
)

In [None]:
# Re-check the missing values per column after the median imputation.
df_calendar_v2.isna().sum()

Únicamente quedan los nulos del precio, los cuales vamos a eliminar.

In [None]:
# Drop every row that still has a missing value. The explicit .copy() makes
# df_calendar_v3 an independent frame: its columns are reassigned in the next
# cells, which would otherwise raise SettingWithCopyWarning.
df_calendar_v3 = df_calendar_v2.dropna().copy()

In [None]:
# Missing values per column after dropping the incomplete rows.
df_calendar_v3.isna().sum()

In [None]:
# Column dtypes before the type conversions that follow.
df_calendar_v3.dtypes

Por último, convertimos las variables a sus respectivos tipos: price a float, available a booleano y date a fecha (datetime).

In [None]:
# Parse the date column into pandas datetimes.
df_calendar_v3['date'] = df_calendar_v3['date'].pipe(pd.to_datetime)

In [None]:
def precio_a_float(x):
  """Convert a price such as "$1,234.00" into a float.

  Args:
    x: Price as a string, optionally prefixed with '$' and using ',' as the
      thousands separator, or a value that is already numeric.

  Returns:
    The price as a float.

  Raises:
    ValueError: If the text left after removing '$' and ',' is not a number.
  """
  # Values that are already numeric are passed through, so re-running the
  # conversion cell on an already converted column does not crash.
  if not isinstance(x, str):
    return float(x)

  texto = x.replace(',', '')
  # Keep the text that follows the currency sign, as before; strings with no
  # '$' at all are now parsed directly instead of raising IndexError.
  if '$' in texto:
    texto = texto.split('$')[1]
  return float(texto)

In [None]:
# Convert each price to a float; the function is passed directly, no wrapper lambda needed.
df_calendar_v3['price'] = df_calendar_v3['price'].apply(precio_a_float)

In [None]:
def available_to_bool(x):
  """Convert a truthy/falsy flag such as 't' or 'f' into a real bool.

  Accepts the same case-insensitive spellings as distutils.util.strtobool
  ('y', 'yes', 't', 'true', 'on', '1' and 'n', 'no', 'f', 'false', 'off', '0')
  without depending on distutils, which was removed in Python 3.12.

  Args:
    x: Flag value. Non-string values are converted with str() first, so
      values that were already converted (True/False, 1/0) are accepted too.

  Returns:
    True or False. bool is a subclass of int, so comparisons against the
    1/0 values that strtobool returned still hold.

  Raises:
    ValueError: If the value is not one of the recognised spellings.
  """
  valor = str(x).strip().lower()
  if valor in ('y', 'yes', 't', 'true', 'on', '1'):
    return True
  if valor in ('n', 'no', 'f', 'false', 'off', '0'):
    return False
  raise ValueError("invalid truth value %r" % (x,))

In [None]:
# Convert each availability flag; the function is passed directly, no wrapper lambda needed.
df_calendar_v3['available'] = df_calendar_v3['available'].apply(available_to_bool)

In [None]:
# Final calendar table: an independent copy of the cleaned and typed frame.
df_calendar_vf = df_calendar_v3.copy()

# Preprocesado Listings

In [None]:
# Preview the first rows of the listings table.
df_listings.head()

In [None]:
# Number of missing values in each listings column.
df_listings.isna().sum()

In [None]:
# Column dtypes of the raw listings table.
df_listings.dtypes

Primero, convertimos last_review a formato date

In [None]:
# Copy of the listings with last_review parsed as datetime; df_listings stays untouched.
df_listings_v2 = df_listings.assign(
    last_review=pd.to_datetime(df_listings['last_review'])
)

Por otro lado, la columna host_name no influye en el precio, por lo que no nos aporta información. En el caso de la columna name, al existir únicamente 3 nulos, borraremos dichas filas.

In [None]:
# Remove the host_name column, then discard the rows whose name is missing.
df_listings_v3 = (
    df_listings_v2
    .drop(columns=["host_name"])
    .dropna(subset=['name'])
)

In [None]:
# Missing values per column after removing host_name and the unnamed listings.
df_listings_v3.isna().sum()

Finalmente, en el caso de reviews_per_month, los nulos corresponden a publicaciones sin reviews, por lo que los convertiremos a 0. En el caso de last_review, tendremos que tomar una decisión.

In [None]:
# Final listings table: a copy in which missing reviews_per_month values become 0.
df_listings_vf = df_listings_v3.assign(
    reviews_per_month=df_listings_v3['reviews_per_month'].fillna(0)
)

In [None]:
# Preview the cleaned listings table.
df_listings_vf.head()

# Preprocesado Detailed Reviews

In [None]:
# Number of missing values in each detailed-reviews column.
df_reviews_det.isna().sum()

In [None]:
# Total number of rows in the comments column (missing comments included).
df_reviews_det['comments'].size

El número de reviews nulas es mínimo, por lo que borraremos dichas reviews.


In [None]:
# Drop every review row that has a missing value. The explicit .copy() makes
# the result an independent frame, so assigning to its date column afterwards
# does not raise SettingWithCopyWarning.
df_reviews_det_vf = df_reviews_det.dropna().copy()
df_reviews_det_vf.isna().sum()

In [None]:
# Parse the review date into pandas datetimes, then show the resulting dtypes.
df_reviews_det_vf['date'] = pd.to_datetime(df_reviews_det_vf['date'])
df_reviews_det_vf.dtypes

# Preprocesado Detailed Listings 

In [None]:
# Number of missing values in each detailed-listings column.
df_listings_det.isna().sum()

In [None]:
# Preview the first rows of the detailed listings table.
df_listings_det.head()

Comenzaremos borrando una serie de variables que no nos proporcionan información: listing_url, scrape_id, last_scraped, neighborhood_overview, picture_url, host_url, host_name, host_thumbnail_url, host_picture_url, neighbourhood, host_neighbourhood, etc.

In [None]:
# Remove the columns judged uninformative for the analysis.
# One name per line so additions and removals show up as single-line diffs.
df_listings_det_v2 = df_listings_det.drop(
    columns=[
        "listing_url",
        "scrape_id",
        "last_scraped",
        "neighborhood_overview",
        "picture_url",
        "host_url",
        "host_name",
        "host_since",
        "host_location",
        "host_thumbnail_url",
        "host_picture_url",
        "neighbourhood",
        "host_neighbourhood",
        "minimum_minimum_nights",
        "maximum_minimum_nights",
        "minimum_maximum_nights",
        "maximum_maximum_nights",
        "minimum_nights_avg_ntm",
        "maximum_nights_avg_ntm",
        "calendar_updated",
        "calendar_last_scraped",
        "number_of_reviews_ltm",
        "number_of_reviews_l30d",
        "first_review",
        "last_review",
        "license",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
    ]
)

In [None]:
# Missing values per column after removing the uninformative columns.
df_listings_det_v2.isna().sum()

Existen variables con valores nulos en más del 30% de las filas, por lo que las borraremos.

In [None]:
# Remove the review-score and host-profile columns listed below.
df_listings_det_v3 = df_listings_det_v2.drop(
    columns=[
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
        "host_about",
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
    ]
)

In [None]:
# Missing reviews_per_month values are replaced with 0.
df_listings_det_v3['reviews_per_month'] = df_listings_det_v3['reviews_per_month'].fillna(0)

# Discard the rows missing any of these fields in a single pass. The explicit
# .copy() makes the result an independent frame, so adding the bathrooms
# column later does not raise SettingWithCopyWarning.
df_listings_det_v3 = df_listings_det_v3.dropna(
    subset=['bedrooms', 'beds', 'bathrooms_text', 'name', 'description', 'host_has_profile_pic']
).copy()

In [None]:
# Missing values per column after the row filtering above.
df_listings_det_v3.isna().sum()

In [None]:
def bathrooms_float(x):
  """Extract the leading number of a bathrooms description as a float.

  Args:
    x: Text whose first space-separated token is expected to be a number,
      e.g. "1.5 baths".

  Returns:
    The leading number as a float, or the string sentinel "Nan" when the
    first token is not numeric or x is not a string. The sentinel is kept
    as a string because the cells that use this function compare against "Nan".
  """
  try:
    y = float(x.split(' ')[0])
  # Only the two expected failures are handled: a non-string input has no
  # .split (AttributeError) and a non-numeric first token fails float()
  # (ValueError). A bare except would also swallow KeyboardInterrupt.
  except (AttributeError, ValueError):
    y = "Nan"
  return y

In [None]:
# Derive the bathrooms column from the free-text description; the function is passed directly.
df_listings_det_v3['bathrooms'] = df_listings_det_v3['bathrooms_text'].apply(bathrooms_float)

In [None]:
# Number of rows whose bathrooms text could not be parsed (marked with the "Nan" sentinel).
(df_listings_det_v3['bathrooms'] == "Nan").sum()

In [None]:
# Final detailed listings: keep only the rows whose bathrooms value was parsed.
df_listings_det_vf = df_listings_det_v3.loc[df_listings_det_v3['bathrooms'].ne("Nan")]

In [None]:
# Final check of missing values per column in the cleaned detailed listings.
df_listings_det_vf.isna().sum()