# Feature Engineering - MLE2

In [76]:
import numpy as np
import pandas as pd
import uuid

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from loguru import logger

In [2]:
# Load the raw hotel-bookings CSV; the path is relative to the notebook's folder.
RAW_DATA_PATH = "../data/raw/hotel_bookings.csv"
raw_data = pd.read_csv(RAW_DATA_PATH)

### Train - Test Split

In [30]:
# Row counts for an 80/20 partition of the raw dataset: the training share is
# rounded to a whole number of rows and the test split takes the remainder.
TRAIN_SIZE = round(0.8 * len(raw_data))
TEST_SIZE = len(raw_data) - TRAIN_SIZE

In [42]:
# Draw the training rows at random. The fixed seed makes the split, and every
# output computed from it, reproducible across kernel restarts.
RANDOM_SEED = 42
train_indices = raw_data.sample(n=TRAIN_SIZE, random_state=RANDOM_SEED).index

In [60]:
# Every index label that was not sampled for training belongs to the test split.
# A vectorised `isin` mask replaces the per-row Python membership loop and keeps
# the labels in their original order; `.tolist()` keeps the result a plain list.
test_indices = raw_data.index[~raw_data.index.isin(train_indices)].tolist()

In [63]:
# `train_indices` and `test_indices` hold index *labels* (they come from
# `raw_data.index`), so the rows are selected by label with `.loc`. Positional
# `.iloc` only returns the same rows while the index is the default 0..n-1 range.
train_raw_data = raw_data.loc[train_indices]
test_raw_data = raw_data.loc[test_indices]

In [93]:
# Frequency of each `reserved_room_type` level in the training split. In the
# recorded run the rarest levels (P, L) appear fewer than ten times.
train_raw_data["reserved_room_type"].value_counts()

reserved_room_type
A    68802
D    15345
E     5222
F     2312
G     1683
B      903
C      736
H      494
P        9
L        6
Name: count, dtype: int64

In [67]:
# Summary of the training split: column names, non-null counts and dtypes.
train_raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95512 entries, 58958 to 109869
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           95512 non-null  object 
 1   is_canceled                     95512 non-null  int64  
 2   lead_time                       95512 non-null  int64  
 3   arrival_date_year               95512 non-null  int64  
 4   arrival_date_month              95512 non-null  object 
 5   arrival_date_week_number        95512 non-null  int64  
 6   arrival_date_day_of_month       95512 non-null  int64  
 7   stays_in_weekend_nights         95512 non-null  int64  
 8   stays_in_week_nights            95512 non-null  int64  
 9   adults                          95512 non-null  int64  
 10  children                        95508 non-null  float64
 11  babies                          95512 non-null  int64  
 12  meal                            

In [86]:
# Quick one-hot encoding check on a single column: learn the categories of
# `hotel`, encode the column, then display the learned category labels.
hotel_column = train_raw_data[["hotel"]]
encoder = OneHotEncoder().fit(hotel_column)
encoded = encoder.transform(hotel_column)
encoder.categories_[0]

array(['City Hotel', 'Resort Hotel'], dtype=object)

In [139]:
class FeatureEngineeringProcessor:
    """Build a modeling dataset from a raw bookings frame.

    Three categorical columns are one-hot encoded, five numeric columns are
    imputed, standardised and reduced with PCA, and the combined frame is then
    passed through a variance filter and a robust scaler.

    All returned frames are built from plain arrays, so they carry a fresh
    0..n-1 index; rows stay in the same order as ``raw_data``.

    NOTE(review): every transformer is fitted on ``raw_data`` inside each call.
    Separate instances for a train and a test split therefore learn different
    categories, scalings and components, and can return different column sets.
    Confirm whether transformers fitted on the training split should be reused.
    """

    def __init__(self, raw_data: pd.DataFrame, pipeline_name: str) -> None:
        """Store the input frame and the label used in log messages."""
        self.raw_data = raw_data
        self.pipeline_name = pipeline_name

    def impute_scale(self, n_components: int = 2) -> pd.DataFrame:
        """Impute, standardise and project the numeric columns with PCA.

        Missing values are replaced by the column mean, the columns are
        standardised, and PCA keeps ``n_components`` principal components.

        Args:
            n_components: Number of principal components to keep. Defaults to
                2 so that ``run()`` can call this method without arguments.

        Returns:
            A frame with one column per component, named ``pc1``, ``pc2``, ...
        """
        numeric_cols = [
            "lead_time",
            "adults",
            "children",
            "babies",
            "adr",
        ]
        pipe = Pipeline(
            steps=[
                ("imputer_mean", SimpleImputer(strategy="mean")),
                ("std_scaling", StandardScaler()),
                ("pca", PCA(n_components=n_components)),
            ]
        )
        components = pipe.fit_transform(self.raw_data[numeric_cols])
        # Name the columns from the array actually produced, so that any
        # component count gets matching labels instead of a fixed pair.
        component_names = [f"pc{i}" for i in range(1, components.shape[1] + 1)]
        return pd.DataFrame(components, columns=component_names)

    def encode_categoricals(self) -> pd.DataFrame:
        """One-hot encode ``hotel``, ``market_segment`` and ``reserved_room_type``.

        Each variable gets its own encoder, fitted on ``self.raw_data``.

        Returns:
            A frame with one ``<variable>_<category>`` column per category
            found in the data.
        """
        encoded_vars = []
        for var in ["hotel", "market_segment", "reserved_room_type"]:
            logger.info(f"Codificando con OHE {var}")
            encoder = OneHotEncoder()
            # The encoder returns a sparse matrix; densify it for the DataFrame.
            encoded = encoder.fit_transform(self.raw_data[[var]]).toarray()
            cols = [f"{var}_{col}" for col in encoder.categories_[0]]
            encoded_vars.append(pd.DataFrame(encoded, columns=cols))
        return pd.concat(encoded_vars, axis=1)

    def run(self, n_components: int = 2) -> pd.DataFrame:
        """Run the full feature-engineering flow and return the modeling frame.

        Args:
            n_components: Number of principal components to derive from the
                numeric columns; forwarded to ``impute_scale``.

        Returns:
            The encoded categoricals and principal components after variance
            filtering and robust scaling. Columns removed by the variance
            filter are absent from the result.
        """
        logger.info(f"Inicializando pipeline {self.pipeline_name}")

        categorical = self.encode_categoricals()
        numerics = self.impute_scale(n_components)

        # Dataset before the final selection/scaling pipeline.
        modeling_dataset = pd.concat([categorical, numerics], axis=1)
        pipe = Pipeline(
            steps=[
                ("feature_selection", VarianceThreshold()),
                ("scaling_robust", RobustScaler()),
            ]
        )
        transformed = pipe.fit_transform(modeling_dataset)
        # The variance filter can drop columns, so label the output with the
        # surviving columns only; reusing every input column would fail with a
        # length mismatch as soon as one feature is removed.
        kept_mask = pipe.named_steps["feature_selection"].get_support()
        return pd.DataFrame(transformed, columns=modeling_dataset.columns[kept_mask])
        
        

In [140]:
# Build the training feature matrix. The call is the cell's last expression,
# so the notebook displays the resulting frame.
train_processor = FeatureEngineeringProcessor(
    pipeline_name="DSRP MLE2 Feature Engineering - TRAIN",
    raw_data=train_raw_data,
)
train_processor.run()

[32m2025-07-07 20:50:27.266[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m48[0m - [1mInicializando pipeline DSRP MLE2 Feature Engineering[0m
[32m2025-07-07 20:50:27.272[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m34[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-07 20:50:27.301[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m34[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-07 20:50:27.322[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m34[0m - [1mCodificando con OHE reserved_room_type[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,pc1,pc2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276838,0.023273
1,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.439499,0.472357
2,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.740757,-0.384869
3,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.499002,-0.185201
4,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.558233,-0.289795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.902179,-0.390804
95508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.140063,-0.588582
95509,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157756,-0.256200
95510,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.203393,-0.225992


In [141]:
# Build the test feature matrix with a second, independent processor.
# NOTE(review): this refits every encoder and scaler on the test split, and the
# recorded output has fewer columns than the training one — confirm whether the
# transformers fitted on the training split should be reused here instead.
test_processor = FeatureEngineeringProcessor(
    pipeline_name="DSRP MLE2 Feature Engineering - TEST",
    raw_data=test_raw_data,
)
test_processor.run()

[32m2025-07-07 20:56:07.109[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m48[0m - [1mInicializando pipeline DSRP MLE2 Feature Engineering - TEST[0m
[32m2025-07-07 20:56:07.110[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m34[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-07 20:56:07.122[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m34[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-07 20:56:07.131[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m34[0m - [1mCodificando con OHE reserved_room_type[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_P,pc1,pc2
0,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.319793,1.942104
1,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.415230,4.440340
2,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.929625,-1.213835
3,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.101449,0.220992
4,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.232331,0.130326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.421842,1.528162
23874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.072202,1.239926
23875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.922096,0.287608
23876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208606,0.346647
