In [314]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
import numpy as np


In [315]:
# Parse the CSV once and take independent deep copies instead of reading the
# same file three times. The three frames are used separately further down:
#   data   - run through each transformer one step at a time
#   backup - transformed by hand as the reference result
#   test   - run through the assembled pipeline in one go
data = pd.read_csv('../data.csv')
backup = data.copy()
test = data.copy()

In [316]:
class DataImputer:
    """Fill missing values in selected columns with fixed replacement values.

    Instances are callables taking ``(data, y=None)``, so they can be wrapped
    in a ``FunctionTransformer``.
    """

    def __init__(self, column_to_value):
        """Store the imputation plan.

        Parameters
        ----------
        column_to_value : dict
            Maps a column name to the value that replaces its missing entries.
        """
        self.column_to_value = column_to_value

    def __call__(self, data: pd.DataFrame, y=None) -> pd.DataFrame:
        """Impute every configured column of ``data`` and return it.

        ``data`` is modified in place (the columns are reassigned on the frame
        that was passed in) and the same object is returned. ``y`` is ignored.
        """
        for column, fill_value in self.column_to_value.items():
            data[column] = data[column].fillna(fill_value)
        # Return only after *all* columns were processed; returning from inside
        # the loop would stop after the first column and yield None for an
        # empty mapping.
        return data

In [317]:
class ArrivalDateTransformer:
    """Build a single ``arrival_date`` datetime column from the split arrival columns.

    The callable converts month names to month numbers, renames the three
    arrival columns to ``year`` / ``month`` / ``day`` and inserts the assembled
    ``arrival_date`` column at position 3. The frame is modified in place and
    returned.
    """

    # Month name -> month number. Values not listed here are left unchanged.
    MONTH_NUMBERS = {
        'January': 1, 'February': 2, 'March': 3, 'April': 4,
        'May': 5, 'June': 6, 'July': 7, 'August': 8,
        'September': 9, 'October': 10, 'November': 11, 'December': 12,
    }

    def __call__(self, data: pd.DataFrame, y=None) -> pd.DataFrame:
        """Transform ``data`` in place and return it; ``y`` is ignored.

        Raises whatever ``DataFrame.insert`` raises when ``arrival_date``
        already exists or the frame has fewer than three columns.
        """
        # Assign the result back instead of calling replace(inplace=True) on
        # the selected column: the chained in-place form operates on an
        # intermediate object and does not update `data` under Copy-on-Write.
        data['arrival_date_month'] = data['arrival_date_month'].replace(self.MONTH_NUMBERS)

        # pd.to_datetime assembles dates from columns named year/month/day.
        data.rename(
            columns={
                'arrival_date_year': 'year',
                'arrival_date_month': 'month',
                'arrival_date_day_of_month': 'day',
            },
            inplace=True,
        )

        # Creating new arrival_date column of type datetime
        data.insert(3, 'arrival_date', pd.to_datetime(data[['year', 'month', 'day']]))
        return data

In [318]:
class ColumnRemover:
    """Callable that removes a fixed set of columns from a DataFrame."""

    def __init__(self, columns):
        """Remember the column labels that every call should drop."""
        self.columns = columns

    def __call__(self, data:pd.DataFrame, y=None) -> pd.DataFrame:
        """Drop the configured columns from ``data`` in place and return it.

        ``y`` is accepted for transformer-style call signatures and ignored.
        """
        data.drop(columns=self.columns, inplace=True)
        return data


In [319]:
class RoomTypeTransformer:
    """Collapse the reserved/assigned room type columns into one flag column."""

    def __call__(self, data:pd.DataFrame, y=None) -> pd.DataFrame:
        """Add ``reserved_assigned_match`` and drop the two room type columns.

        The flag is 0 where the reserved and assigned room types are equal and
        1 where they differ. ``data`` is modified in place and returned; ``y``
        is ignored.
        """
        room_columns = ['reserved_room_type', 'assigned_room_type']
        reserved, assigned = (data[name] for name in room_columns)
        data['reserved_assigned_match'] = np.where(reserved == assigned, 0, 1)
        data.drop(columns=room_columns, inplace=True)
        return data

In [320]:
class MealTypeTransformer:
    """Replace the meal plan codes in the ``meal`` column with numeric scores."""

    # Meal code -> score. 'Undefined' and 'SC' share the lowest score; values
    # not listed here are left unchanged.
    MEAL_SCORES = {'Undefined': 0, 'SC': 0, 'BB': 0.33, 'HB': 0.67, 'FB': 1}

    def __call__(self, data: pd.DataFrame, y=None) -> pd.DataFrame:
        """Recode ``data['meal']`` in place and return ``data``; ``y`` is ignored."""
        # Assign the result back instead of calling replace(inplace=True) on
        # the selected column: the chained in-place form operates on an
        # intermediate object and does not update `data` under Copy-on-Write.
        data['meal'] = data['meal'].replace(self.MEAL_SCORES)
        return data

In [321]:
class UncleanDataPointsRemover:
    """Drop three groups of rows from the bookings frame.

    The groups are removed one after another, each mask being evaluated on the
    rows that survived the previous step. ``data`` is modified in place.
    """

    def __call__(self, data:pd.DataFrame, y=None) -> pd.DataFrame:
        """Remove the rows described below from ``data`` and return it.

        ``y`` is ignored.
        """
        # 1) Not cancelled, yet no adults, children or babies and no nights.
        empty_booking = (
            (data['adults'] == 0)
            & (data['children'] == 0)
            & (data['babies'] == 0)
            & (data['stays_in_weekend_nights'] == 0)
            & (data['stays_in_week_nights'] == 0)
            & (data['is_canceled'] == 0)
        )
        data.drop(data.loc[empty_booking].index, inplace=True)

        # 2) Not cancelled with adr == 0, unless the market segment is one of
        #    Complementary / Corporate / Aviation.
        exempt_segments = ["Complementary", "Corporate", "Aviation"]
        zero_rate = (
            (data["adr"] == 0)
            & (data["is_canceled"] == 0)
            & ~data["market_segment"].isin(exempt_segments)
        )
        data.drop(data.loc[zero_rate].index, inplace=True)

        # 3) Zero nights, arrival date equal to the reservation status date,
        #    and a reservation status other than "Check-Out".
        # NOTE(review): the two date columns are compared exactly as they
        # arrive here; if reservation_status_date has not been converted to
        # datetime before this step, confirm the comparison matches as intended.
        same_day_no_stay = (
            (data["stays_in_week_nights"] == 0)
            & (data["stays_in_weekend_nights"] == 0)
            & (data["arrival_date"] == data["reservation_status_date"])
            & (data["reservation_status"] != "Check-Out")
        )
        data.drop(data.loc[same_day_no_stay].index, inplace=True)

        return data

In [322]:
class CancellationsDaysInserter:
    """Add a ``cancellation_days`` column: days from status date to arrival."""

    def __call__(self, data:pd.DataFrame, y=None) -> pd.DataFrame:
        """Parse the status date and store the day difference on ``data``.

        ``reservation_status_date`` is parsed with the ``%Y-%m-%d`` format and
        written back as datetime; ``cancellation_days`` is the whole number of
        days of ``arrival_date - reservation_status_date``. ``data`` is
        modified in place and returned; ``y`` is ignored.
        """
        status_dates = pd.to_datetime(data['reservation_status_date'], format='%Y-%m-%d')
        data['reservation_status_date'] = status_dates

        days_before_arrival = data['arrival_date'] - status_dates
        data['cancellation_days'] = days_before_arrival.dt.days
        return data

In [323]:
# Step check: DataImputer wrapped in a FunctionTransformer must give the same
# frame as filling `children` by hand on `backup`.
selector = FunctionTransformer(DataImputer(column_to_value={"children":0}))
# Keep the returned frame (as the later step checks do) instead of relying on
# the transformer having mutated `data` as a side effect.
data = selector.fit_transform(data)

backup['children'] = backup['children'].fillna(0)
print(backup.equals(data))

True


In [324]:
# Step check: ArrivalDateTransformer must give the same frame as the
# hand-written reference steps applied to `backup` below.
selector = FunctionTransformer(ArrivalDateTransformer())
# Keep the returned frame instead of relying on in-place mutation of `data`.
data = selector.fit_transform(data)

# Converting arrival_date_month values from string to integer.
# The result is assigned back: calling replace(inplace=True) on the selected
# column is a chained in-place operation that does not update `backup` under
# Copy-on-Write.
backup['arrival_date_month'] = backup['arrival_date_month'].replace(
    ['July', 'August', 'September', 'October', 'November', 'December', 'January', 'February', 'March', 'April', 'May', 'June'],
    [7,8,9,10,11,12,1,2,3,4,5,6]
)

backup = backup.rename(
    columns = {
        'arrival_date_year': 'year', 
        'arrival_date_month': 'month',
        'arrival_date_day_of_month': 'day'
    }
)

# Creating new arrival_date column of type datetime
backup.insert(3,'arrival_date',pd.to_datetime(backup[['year', 'month', 'day']]))
print(backup.equals(data))

True


In [325]:
# Step check: ColumnRemover must give the same frame as dropping the columns
# from `backup` by hand.

# Removing country column for now or we can use target encoding
backup = backup.drop(['country'], axis=1)

# Removing agent and company columns since market_segment gives same information. 
backup = backup.drop(['agent','company'], axis=1)

# Removing all columns relating to arrival date except week number and year.
backup = backup.drop(['month','day'], axis=1)

# We can remove the distribution channels column as market_segment column gives us same information with more granularity
backup = backup.drop(['distribution_channel'], axis=1)

selector = FunctionTransformer(ColumnRemover(['country','agent','company','month','day','distribution_channel']))
data = selector.fit_transform(data)

print(backup.equals(data))


True


In [326]:
# We can combine reserved_room_type and assigned_room_type columns into one flag column that records whether reserved and assigned matched
# 0 -> Reserved and assigned room types matched
# 1 -> Reserved and assigned room types didn't match
# Both operands come from `backup`, so the reference result is computed
# independently of the frame under test (`data`).
backup['reserved_assigned_match'] = np.where(backup['reserved_room_type'] == backup['assigned_room_type'], 0, 1)
backup = backup.drop(['reserved_room_type','assigned_room_type'], axis=1)

selector = FunctionTransformer(RoomTypeTransformer())
data = selector.fit_transform(data)

print(backup.equals(data))

True


In [327]:
# We can change the values in the meal types column to 0, 0.33, 0.67, 1 since the meal types are incremental.
# The result is assigned back: calling replace(inplace=True) on the selected
# column is a chained in-place operation that does not update `backup` under
# Copy-on-Write.
backup['meal'] = backup['meal'].replace(['Undefined', 'SC', 'BB', 'HB', 'FB'], [0, 0, 0.33, 0.67, 1])
selector = FunctionTransformer(MealTypeTransformer())
data = selector.fit_transform(data)

print(backup.equals(data))

True


In [328]:
# Step check: UncleanDataPointsRemover must remove the same rows as the three
# hand-written drops applied to `backup` below.

# Remove records where adults, babies, children, stays_in_weekend_nights, stays_in_week_nights, is_canceled values are 0
backup.drop(
    backup[
        (backup['adults']==0) &
        (backup['children']==0) &
        (backup['babies']==0) &
        (backup['stays_in_weekend_nights']==0) & 
        (backup['stays_in_week_nights']==0) &
        (backup['is_canceled']==0)
    ].index,
    inplace=True
)

# Remove records with adr and is_canceled values as 0 and whose market segment is neither Complementary, Corporate nor Aviation
backup.drop(
    backup[
        (backup["adr"]==0) &  
        (backup["is_canceled"]==0)  &
        (backup["market_segment"] != "Complementary") &
        (backup["market_segment"] != "Corporate") &
        (backup["market_segment"] != "Aviation") 
    ].index, 
    inplace = True
)

# Remove records with stays_in_weekend_nights and stays_in_week_nights values as 0 and arrival and reservation status date are the same
# and reservation status is not "Check-Out"
backup.drop(
    backup[
        (backup["stays_in_week_nights"]==0) &
        (backup["stays_in_weekend_nights"]==0) &
        (backup["arrival_date"] == backup["reservation_status_date"]) &
        (backup["reservation_status"]!="Check-Out")
    ].index,
    inplace=True
)

selector = FunctionTransformer(UncleanDataPointsRemover())
data = selector.fit_transform(data)

# Row labels are reset on both sides so the comparison is purely positional.
print(backup.reset_index(drop=True).equals(data.reset_index(drop=True)))

True


In [329]:
# Step check: CancellationsDaysInserter must give the same frame as the
# hand-written computation on `backup`.

# Creating cancellation_days column which represents how many days in advance the guest cancels 
backup['reservation_status_date'] = pd.to_datetime(backup['reservation_status_date'], format = '%Y-%m-%d')
backup['cancellation_days'] = (backup['arrival_date'] - backup['reservation_status_date']).dt.days

selector = FunctionTransformer(CancellationsDaysInserter())
data = selector.fit_transform(data)

print(backup.equals(data))

True


In [330]:
# reservation_status is dropped for both modelling tasks:
#   - classification: it is the quantity being predicted
#   - regression: training only uses the rows where the booking was cancelled
# reservation_status_date and arrival_date are dropped together with it.
backup = backup.drop(['reservation_status','reservation_status_date','arrival_date'], axis=1)
selector = FunctionTransformer(ColumnRemover(['reservation_status','reservation_status_date','arrival_date']))
data = selector.fit_transform(data)

print(backup.equals(data))

True


In [331]:
# End-to-end check: all steps chained in one pipeline, applied to the untouched
# `test` frame, must reproduce the hand-built `backup` frame.
pipeline = make_pipeline(
    FunctionTransformer(DataImputer(column_to_value={"children":0})),
    FunctionTransformer(ArrivalDateTransformer()),
    FunctionTransformer(ColumnRemover(['country','agent','company','month','day','distribution_channel'])),
    FunctionTransformer(RoomTypeTransformer()),
    FunctionTransformer(MealTypeTransformer()),
    FunctionTransformer(UncleanDataPointsRemover()),
    FunctionTransformer(CancellationsDaysInserter()),
    FunctionTransformer(ColumnRemover(['reservation_status','reservation_status_date','arrival_date']))
)

# Compare the frame the pipeline returns rather than assuming every step
# mutated `test` in place.
transformed = pipeline.fit_transform(test)
transformed.equals(backup)

True