In [1]:
# importamos las librerías que necesitamos
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [2]:
df = pd.read_csv("data/finanzas_hotel_bookings.csv", index_col=0, low_memory=False)

In [3]:
df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'agent', 'company',
       'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', '0'],
      dtype='object')

In [4]:
df = df.iloc[:119390]

In [5]:
# →  Estandarizar para que en todos los casos sean números y cambiar el datatype de la columna a número entero.

def cambiar_formato_mes(df):
    # Diccionario de mapeo de meses y valores numéricos
    dic_map = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 
        'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10, 
        'November': 11, 'December': 12, 
        1: 1, 2: 2, 3: 3, 4:4, 5:5, 6:6, 7:7, 8:8, 9:9, 10:10, 11:11, 12:12, 'nan': np.nan}
    
    # Asegurarse de que los valores en la columna sean de tipo string para mapear correctamente
    df['arrival_date_month'] = df['arrival_date_month'].astype(str).map(dic_map).astype('Int64')

    return df

df = cambiar_formato_mes(df)

In [6]:
df['arrival_date_year'].isna().sum()

54561

In [7]:


def rellenar_año_llegada(df):  

    print(f'nulos antes de hacer la operación {df["arrival_date_year"].isna().sum()}')

    # Asegúrate de que las columnas de fecha estén en formato datetime
    df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'], errors='coerce')

    # Crear una columna para el total de la estancia
    df['total_stays'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

    # Inicializar la columna de fecha de llegada estimada con valores NaT (Not a Time)
    df['estimated_arrival_date'] = pd.NaT

    # Crear filtro para cuando el estado de la reserva es "Checkout"
    filtro_checkout = df['reservation_status'] == 'Checkout'

    # Calcular fecha de llegada estimada solo para "Checkout"
    df.loc[filtro_checkout, 'estimated_arrival_date'] = df.loc[filtro_checkout, 'reservation_status_date'] - pd.to_timedelta(df.loc[filtro_checkout, 'total_stays'], unit='D')

    # Extraer el año de la fecha de llegada estimada
    df['estimated_arrival_year'] = df['estimated_arrival_date'].dt.year

    # Para los que no son "Checkout", usar el año de la fecha de estado de reserva
    df.loc[~filtro_checkout, 'estimated_arrival_year'] = df.loc[~filtro_checkout, 'reservation_status_date'].dt.year

    # Rellenar los valores nulos en arrival_date_year con el año estimado
    df['arrival_date_year'] = df['arrival_date_year'].fillna(df['estimated_arrival_year'])

    print(f'nulos después de hacer la operación {df["arrival_date_year"].isna().sum()}')

    return df.head()


In [8]:
rellenar_año_llegada(df)

nulos antes de hacer la operación 54561
nulos después de hacer la operación 5936


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,0,total_stays,estimated_arrival_date,estimated_arrival_year
0,Resort Hotel,False,342.0,2015.0,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,PRT,,Direct,0.0,,0.0,C,C,3.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01,,0.0,NaT,2015.0
1,Resort Hotel,False,737.0,2015.0,7,27.0,1.0,0.0,0.0,2.0,,0.0,BB,,,Direct,0.0,0.0,0.0,,C,4.0,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01,,0.0,NaT,2015.0
2,Resort Hotel,False,7.0,2015.0,7,27.0,1.0,0.0,1.0,1.0,0.0,0.0,BB,GBR,,Direct,0.0,0.0,0.0,A,C,0.0,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02,,1.0,NaT,2015.0
3,Resort Hotel,False,13.0,2015.0,7,27.0,1.0,0.0,1.0,1.0,,0.0,BB,GBR,Corporate,Corporate,0.0,0.0,0.0,A,A,0.0,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02,,1.0,NaT,2015.0
4,Resort Hotel,False,14.0,2015.0,7,,1.0,0.0,2.0,2.0,,0.0,BB,,Online TA,TA/TO,0.0,0.0,0.0,A,A,0.0,240.0,,0.0,Transient,98.0,0.0,1.0,Check-Out,2015-07-03,,2.0,NaT,2015.0
