# Feature Engineering - MLE2

In [72]:
import numpy as np
import pandas as pd
import uuid
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from loguru import logger

In [2]:
raw_data = pd.read_csv("../data/raw/hotel_bookings.csv")

### Train - Test Split

In [3]:
TRAIN_SIZE = round(raw_data.shape[0] * 0.8)
TEST_SIZE =  raw_data.shape[0] - TRAIN_SIZE

In [4]:
train_indices = raw_data.sample(TRAIN_SIZE).index

In [5]:
test_indices = [i for i in raw_data.index if i not in train_indices]

In [6]:
train_raw_data = raw_data.iloc[train_indices]
test_raw_data = raw_data.iloc[test_indices]

In [7]:
train_raw_data["reserved_room_type"].value_counts()

reserved_room_type
A    68688
D    15418
E     5210
F     2324
G     1703
B      917
C      767
H      469
P       10
L        6
Name: count, dtype: int64

In [119]:
train_raw_data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
62631,City Hotel,1,62,2017,January,3,18,0,2,2,...,No Deposit,9.0,,0,Transient,93.60,0,1,Canceled,2016-12-08
6383,Resort Hotel,1,12,2016,June,24,7,0,1,2,...,No Deposit,240.0,,0,Transient,109.00,0,0,Canceled,2016-05-27
29293,Resort Hotel,0,386,2016,October,43,20,2,3,2,...,No Deposit,,,0,Transient-Party,55.00,0,0,Check-Out,2016-10-25
102235,City Hotel,0,6,2016,November,48,23,0,2,1,...,No Deposit,53.0,,0,Transient-Party,75.00,0,0,Check-Out,2016-11-25
89114,City Hotel,0,0,2016,May,21,17,0,0,2,...,No Deposit,6.0,,0,Transient,0.00,0,0,Check-Out,2016-05-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32422,Resort Hotel,0,0,2017,January,3,19,2,5,1,...,No Deposit,240.0,,0,Transient,43.00,0,1,Check-Out,2017-01-26
69023,City Hotel,1,243,2017,May,21,24,0,4,3,...,No Deposit,9.0,,0,Transient,157.45,0,2,Canceled,2017-04-22
73146,City Hotel,1,49,2017,August,33,15,0,4,2,...,No Deposit,9.0,,0,Transient,140.00,0,1,Canceled,2017-06-27
74479,City Hotel,1,318,2015,August,36,31,1,1,2,...,Non Refund,1.0,,0,Contract,62.00,0,0,Canceled,2015-01-01


In [9]:
encoder = OneHotEncoder()
encoded = encoder.fit_transform(train_raw_data[["hotel"]])
encoder.categories_[0]

array(['City Hotel', 'Resort Hotel'], dtype=object)

In [73]:
class FeatureEngineeringProcessor:

    def __init__(self, raw_data: pd.DataFrame, pipeline_name: str) -> None:
        self.raw_data = raw_data
        self.pipeline_name = pipeline_name
        self.feature_table = None


    def impute_scale(self, n_components: int = 2) -> pd.DataFrame:
        """Pipeline que imputa variables numericas y luego las escala, para finalmente
        Aplicar PCA y quedarse con N componentes principales
        """
        numeric_cols= [
            "lead_time",
            "adults",
            "children",
            "babies",
            "adr"
            
        ]
        pipe = Pipeline(
            steps=[
                ("imputer_mean", SimpleImputer(strategy="mean")),
                ("std_scaling", StandardScaler()),
                ("pca", PCA(n_components=n_components))
            ]
        )
        return pd.DataFrame(
            pipe.fit_transform(self.raw_data[numeric_cols]),
            columns=["great_feature1", "great_feature2"]
        )
        

    def encode_categoricals(self) -> pd.DataFrame:

        encoded_vars = []
        for var in ["hotel", "market_segment", "reserved_room_type"]:
            logger.info(f"Codificando con OHE {var}")
            encoder = OneHotEncoder()
            encoded = encoder.fit_transform(self.raw_data[[var]]).toarray()
            cols  = [f"{var}_{col}" for col in encoder.categories_[0]]
            _dataframe = pd.DataFrame(
                encoded,
                columns= cols
            )
            encoded_vars.append(_dataframe)
        return pd.concat(encoded_vars,axis=1)


    def run(self) -> pd.DataFrame:
        # acá pondremos nuestro codigo
        logger.info(f"Inicializando pipeline {self.pipeline_name}")

        categorical = self.encode_categoricals()
        numerics = self.impute_scale()

        modeling_dataset = pd.concat([categorical, numerics], axis=1)
        # Dataset Previo el pipeline
        pipe = Pipeline(
            steps=[
                ("feature_selection", VarianceThreshold()),
                ("scaling_robust", RobustScaler())
            ]
        )
        self.feature_table =  pd.DataFrame(
            pipe.fit_transform(modeling_dataset),
            columns=modeling_dataset.columns
        )
        self.feature_table["booking_id"] = [str(uuid.uuid4()) for _ in range(self.feature_table.shape[0])]
        self.feature_table["event_timestamp"] = [datetime.now() for _ in range(self.feature_table.shape[0])]

        return self.feature_table

    def write_feature_table(self, filepath: str) -> None:
        """Escribimos la feature table final para modelamiento
        """

        if not self.feature_table.empty: # -> True o False
            self.feature_table.to_parquet(filepath, index=False)
        else:
            raise Exception("La feature table no ha sido creada. Ejecutar el comando .run()")            
        
        
        

In [74]:
train_processor = FeatureEngineeringProcessor(
    raw_data=train_raw_data, 
    pipeline_name="DSRP MLE2 Feature Engineering - TRAIN"
) 
train_processor.run()
train_processor.write_feature_table(
    filepath="../feast_service/fs_dsrp_mle2_jul9/feature_repo/data/bookings_feature_table.parquet"
)

[32m2025-07-09 20:25:26.798[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m52[0m - [1mInicializando pipeline DSRP MLE2 Feature Engineering - TRAIN[0m
[32m2025-07-09 20:25:26.803[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-09 20:25:26.874[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-09 20:25:26.899[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE reserved_room_type[0m


In [75]:
pd.read_parquet("../feast_service/fs_dsrp_mle2_jul9/feature_repo/data/bookings_feature_table.parquet")

Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,market_segment_Undefined,...,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,great_feature1,great_feature2,booking_id,event_timestamp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.070891,0.059801,557afdfe-f573-47b6-8ca4-ff9e33d3b06e,2025-07-09 20:25:27.212008
1,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.307593,-0.275533,0d40e7bc-6f22-4512-a2e3-8a87b4ef14bd,2025-07-09 20:25:27.212012
2,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.612714,2.177072,809f68c8-30eb-429f-9575-5ed2ad4e39f7,2025-07-09 20:25:27.212012
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.882206,-1.196781,a48ceebd-50a5-4658-a8e6-f36e2a97f0fc,2025-07-09 20:25:27.212012
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.201434,-0.253438,9b5c3e71-39ae-4d16-ba42-df469336467c,2025-07-09 20:25:27.212013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95507,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.324097,-1.206213,504d992b-79e2-4924-9b85-b74862956ab8,2025-07-09 20:25:27.238299
95508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.595884,2.078492,61a0c0a7-91ae-4c8a-bead-b339328b7097,2025-07-09 20:25:27.238300
95509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.721498,-0.065974,3ea15369-10fd-490d-b6aa-f2485ac5b149,2025-07-09 20:25:27.238300
95510,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.484542,1.733709,eb0ca654-c8d4-48a3-97b7-c318e8b1aed1,2025-07-09 20:25:27.238300


In [53]:
test_processor = FeatureEngineeringProcessor(
    raw_data=test_raw_data, 
    pipeline_name="DSRP MLE2 Feature Engineering - TEST"
) 
test_processor.run()

[32m2025-07-09 19:53:26.196[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m52[0m - [1mInicializando pipeline DSRP MLE2 Feature Engineering - TEST[0m
[32m2025-07-09 19:53:26.198[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-09 19:53:26.213[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-09 19:53:26.222[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE reserved_room_type[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,reserved_room_type_A,...,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_P,great_feature1,great_feature2,booking_id
0,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.415185,4.490960,b9e4812c-1383-4d20-91fd-d72dfa54e755
1,-1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.974246,-1.139600,beb681d5-85f8-415e-9547-69328c3532f3
2,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.820389,-0.175608,702a47c7-cabd-45ae-a28f-768f39b94d7c
3,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.096703,-0.091719,bf444b87-b239-4215-bf3d-3554c46e8eaf
4,-1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287413,-0.344850,400e2c8b-2a2a-4b2c-807f-a240eeb82e8b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126730,0.842777,3965844a-2a62-430d-ab2b-69d1708474d3
23874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.061275,1.253995,1b1621ad-3984-410a-8f95-358bd25956a8
23875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.032728,0.725453,b7d87e0a-adb5-495b-a90a-27ea86b967bd
23876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120265,-0.182715,3e31a74b-e028-4aae-ab0f-601b093fa7ca


# Trabajando con Feast

In [50]:
str(uuid.uuid4())

'914294a7-c27c-4724-8142-9e1279ab9dfc'

In [97]:
from feast import FeatureStore

In [103]:
fs = FeatureStore("../feast_service/fs_dsrp_mle2_jul9/feature_repo/")

In [104]:
feature_service = fs.get_feature_service("dsrp_feature_service")

In [129]:
features_retrieved = fs.get_online_features(features=feature_service, entity_rows=[
    {"booking_id": "557afdfe-f573-47b6-8ca4-ff9e33d3b06e", "test": 1},
])

KeyError: "Missing join key values for keys: ['booking_id']. No values provided for keys: ['booking_id']. Provided join_key_values: []"

In [123]:
features_retrieved.to_df()

AttributeError: 'DataFrame' object has no attribute 'persist'

In [96]:
train_processor.feature_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95512 entries, 0 to 95511
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   hotel_City Hotel              95512 non-null  float64       
 1   hotel_Resort Hotel            95512 non-null  float64       
 2   market_segment_Aviation       95512 non-null  float64       
 3   market_segment_Complementary  95512 non-null  float64       
 4   market_segment_Corporate      95512 non-null  float64       
 5   market_segment_Direct         95512 non-null  float64       
 6   market_segment_Groups         95512 non-null  float64       
 7   market_segment_Offline TA/TO  95512 non-null  float64       
 8   market_segment_Online TA      95512 non-null  float64       
 9   market_segment_Undefined      95512 non-null  float64       
 10  reserved_room_type_A          95512 non-null  float64       
 11  reserved_room_type_B        