# Feature Engineering - MLE2

In [16]:
import time
import uuid
from datetime import datetime

import numpy as np
import pandas as pd
from loguru import logger
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [17]:
# Load the raw hotel-bookings extract that every later cell derives from.
raw_data = pd.read_csv(filepath_or_buffer="../data/raw/hotel_bookings.csv")

### Train - Test Split

In [18]:
# 80 % of the rows go to training; whatever remains is the test partition.
TRAIN_SIZE = round(0.8 * len(raw_data))
TEST_SIZE = len(raw_data) - TRAIN_SIZE

In [19]:
# Draw the training rows with a fixed seed so that the train/test partition
# (and everything derived from it) is identical on every Restart & Run All.
train_indices = raw_data.sample(n=TRAIN_SIZE, random_state=42).index

In [20]:
# Row labels that were not drawn into the training sample, kept in the
# original index order.
test_indices = raw_data.index[~raw_data.index.isin(train_indices)].tolist()

In [21]:
# train_indices / test_indices hold index *labels* (they come from
# `raw_data.index`), so select by label with .loc. The positional .iloc only
# returned the right rows because the frame happens to carry a default
# 0..n-1 index.
train_raw_data = raw_data.loc[train_indices]
test_raw_data = raw_data.loc[test_indices]

In [22]:
# Frequency of each reserved room type in the training partition.
train_raw_data.loc[:, "reserved_room_type"].value_counts()

reserved_room_type
A    68798
D    15387
E     5254
F     2315
G     1654
B      886
C      720
H      487
P        6
L        5
Name: count, dtype: int64

In [23]:
# Display the training partition; the notebook renders a truncated preview.
train_raw_data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
70671,City Hotel,1,106,2017,June,25,20,0,4,2,...,No Deposit,9.0,,0,Transient,126.00,0,0,Canceled,2017-03-07
10964,Resort Hotel,1,32,2017,April,15,13,0,3,2,...,No Deposit,240.0,,0,Transient,120.00,0,1,Canceled,2017-04-01
60558,City Hotel,1,34,2016,November,47,14,2,5,1,...,No Deposit,9.0,,0,Transient,74.80,0,2,Canceled,2016-11-06
74876,City Hotel,1,153,2015,August,32,3,1,2,2,...,Non Refund,1.0,,0,Transient,60.00,0,0,Canceled,2015-05-19
104343,City Hotel,0,27,2017,January,2,12,0,1,2,...,No Deposit,,,0,Transient-Party,78.00,0,2,Check-Out,2017-01-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87762,City Hotel,0,31,2016,April,17,23,2,1,2,...,No Deposit,83.0,,0,Transient,87.04,0,0,Check-Out,2016-04-26
14165,Resort Hotel,0,104,2016,July,30,17,4,10,2,...,No Deposit,,,0,Transient,188.14,1,1,Check-Out,2016-07-31
51551,City Hotel,1,90,2016,May,21,21,1,1,2,...,No Deposit,9.0,,0,Transient,116.10,0,0,Canceled,2016-03-21
94124,City Hotel,0,198,2016,July,31,30,1,1,2,...,No Deposit,1.0,,0,Transient-Party,96.00,0,1,Check-Out,2016-08-01


In [24]:
# Sanity check: fit a one-hot encoder on the `hotel` column alone and list
# the categories it learned.
encoder = OneHotEncoder().fit(train_raw_data[["hotel"]])
encoded = encoder.transform(train_raw_data[["hotel"]])
encoder.categories_[0]

array(['City Hotel', 'Resort Hotel'], dtype=object)

In [103]:
class FeatureEngineeringProcessor:
    """Build a modeling feature table from a raw bookings DataFrame.

    Typical use: instantiate with the raw rows, call ``run()`` to build
    ``feature_table``, then ``write_feature_table()`` to persist it.

    Every transformer (encoders, imputer, scalers, PCA) is created and fitted
    inside the methods on ``raw_data`` itself, so two processors built from
    different frames learn independent transformations.
    """

    def __init__(self, raw_data: pd.DataFrame, pipeline_name: str) -> None:
        """Store the inputs; no processing happens until ``run()`` is called.

        Args:
            raw_data: Raw bookings rows to transform.
            pipeline_name: Label used in the log messages.
        """
        self.raw_data = raw_data
        self.pipeline_name = pipeline_name
        # Populated by run(); stays None until then.
        self.feature_table = None

    def impute_scale(self, n_components: int = 2) -> pd.DataFrame:
        """Impute, standardize and project the numeric columns with PCA.

        Missing values are mean-imputed, the columns are standard-scaled and
        PCA keeps ``n_components`` components.

        Args:
            n_components: Value forwarded to ``PCA(n_components=...)``.

        Returns:
            DataFrame with a fresh 0..n-1 index and one column per retained
            component, named ``great_feature1``, ``great_feature2``, ...
        """
        numeric_cols = [
            "lead_time",
            "adults",
            "children",
            "babies",
            "adr",
        ]
        pipe = Pipeline(
            steps=[
                ("imputer_mean", SimpleImputer(strategy="mean")),
                ("std_scaling", StandardScaler()),
                ("pca", PCA(n_components=n_components)),
            ]
        )
        components = pipe.fit_transform(self.raw_data[numeric_cols])
        # Name the columns after the number of components actually produced,
        # so values of n_components other than 2 no longer fail on a
        # hard-coded two-name list.
        component_names = [
            f"great_feature{position}"
            for position in range(1, components.shape[1] + 1)
        ]
        return pd.DataFrame(components, columns=component_names)

    def encode_categoricals(self) -> pd.DataFrame:
        """One-hot encode ``hotel``, ``market_segment`` and ``reserved_room_type``.

        A separate encoder is fitted per variable on ``self.raw_data``, so the
        output columns are exactly the categories present in this frame.

        Returns:
            DataFrame with a fresh 0..n-1 index and one ``<variable>_<category>``
            column per learned category.
        """
        encoded_vars = []
        for var in ["hotel", "market_segment", "reserved_room_type"]:
            logger.info(f"Codificando con OHE {var}")
            encoder = OneHotEncoder()
            encoded = encoder.fit_transform(self.raw_data[[var]]).toarray()
            cols = [f"{var}_{col}" for col in encoder.categories_[0]]
            encoded_vars.append(pd.DataFrame(encoded, columns=cols))
        return pd.concat(encoded_vars, axis=1)

    def run(self) -> pd.DataFrame:
        """Build the feature table and store it in ``self.feature_table``.

        Steps: one-hot encode the categoricals, reduce the numerics with PCA,
        concatenate both, pass the result through ``VarianceThreshold`` and
        ``RobustScaler``, then append the ``booking_id``, ``event_timestamp``
        and ``created`` columns.

        Returns:
            The finished feature table (the same object as ``self.feature_table``).
        """
        logger.info(f"Inicializando pipeline {self.pipeline_name}")

        categorical = self.encode_categoricals()
        numerics = self.impute_scale()

        # Dataset as it stands before the selection/scaling pipeline.
        modeling_dataset = pd.concat([categorical, numerics], axis=1)
        pipe = Pipeline(
            steps=[
                ("feature_selection", VarianceThreshold()),
                ("scaling_robust", RobustScaler()),
            ]
        )
        transformed = pipe.fit_transform(modeling_dataset)
        # VarianceThreshold may drop columns; label the output with only the
        # columns that survived, instead of assuming every input column is
        # still present (which fails with a length mismatch when one is not).
        kept_mask = pipe.named_steps["feature_selection"].get_support()
        self.feature_table = pd.DataFrame(
            transformed,
            columns=modeling_dataset.columns[kept_mask],
        )

        n_rows = self.feature_table.shape[0]
        # A random UUID per row acts as the booking identifier.
        self.feature_table["booking_id"] = [str(uuid.uuid4()) for _ in range(n_rows)]
        self.feature_table["event_timestamp"] = [datetime.now() for _ in range(n_rows)]
        # Pause so that every `created` value is at least one second later
        # than the `event_timestamp` values.
        # NOTE(review): both columns are naive local-time datetimes - confirm
        # that is what the downstream feature store expects.
        time.sleep(1)
        self.feature_table["created"] = [datetime.now() for _ in range(n_rows)]

        return self.feature_table

    def write_feature_table(self, filepath: str) -> None:
        """Write the final feature table to ``<filepath>.parquet`` and ``<filepath>.csv``.

        Args:
            filepath: Destination path without extension.

        Raises:
            RuntimeError: If the feature table has not been built yet
                (``run()`` was never called) or is empty.
        """
        # Check for None first: before run() the attribute is None, and
        # calling `.empty` on it raised AttributeError instead of the
        # intended message.
        if self.feature_table is None or self.feature_table.empty:
            raise RuntimeError("La feature table no ha sido creada. Ejecutar el comando .run()")

        self.feature_table.to_parquet(f"{filepath}.parquet", index=False)
        self.feature_table.to_csv(f"{filepath}.csv", index=False)
        
        
        

In [104]:
# Build the training feature table and persist it (parquet + csv) inside the
# Feast feature repository's data folder.
train_processor = FeatureEngineeringProcessor(
    pipeline_name="DSRP MLE2 Feature Engineering - TRAIN",
    raw_data=train_raw_data,
)
train_processor.run()
train_processor.write_feature_table(
    "../feast_service/fs_dsrp_mle2_jul9/feature_repo/data/bookings_feature_table"
)

[32m2025-07-14 19:04:49.632[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m52[0m - [1mInicializando pipeline DSRP MLE2 Feature Engineering - TRAIN[0m
[32m2025-07-14 19:04:49.634[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-14 19:04:49.666[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-14 19:04:49.687[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE reserved_room_type[0m


In [90]:
# Build (and display) the feature table for the held-out rows.
# NOTE(review): this processor fits its own encoders/scalers/PCA on the test
# rows, independently of train_processor - confirm that is intended.
test_processor = FeatureEngineeringProcessor(
    pipeline_name="DSRP MLE2 Feature Engineering - TEST",
    raw_data=test_raw_data,
)
test_processor.run()

[32m2025-07-14 19:01:40.660[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m52[0m - [1mInicializando pipeline DSRP MLE2 Feature Engineering - TEST[0m
[32m2025-07-14 19:01:40.661[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE hotel[0m
[32m2025-07-14 19:01:40.664[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE market_segment[0m
[32m2025-07-14 19:01:40.668[0m | [1mINFO    [0m | [36m__main__[0m:[36mencode_categoricals[0m:[36m38[0m - [1mCodificando con OHE reserved_room_type[0m


Unnamed: 0,hotel_City Hotel,hotel_Resort Hotel,market_segment_Aviation,market_segment_Complementary,market_segment_Corporate,market_segment_Direct,market_segment_Groups,market_segment_Offline TA/TO,market_segment_Online TA,reserved_room_type_A,...,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P,great_feature1,great_feature2,booking_id,created
0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.147522,-0.238090,aea5c00b-0142-4e7f-bf59-fe8734583578,2025-07-14 19:01:40.751961
1,-1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.282881,-0.335235,42bd7223-e895-4ac1-b21a-32c8c1f77ec1,2025-07-14 19:01:40.751965
2,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.051920,0.473755,03e543eb-b135-4cdf-859c-64c3e35f5ffd,2025-07-14 19:01:40.751965
3,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.653528,0.074575,9174fa50-b3e2-42f2-bca0-5b50e1ef5c25,2025-07-14 19:01:40.751966
4,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,3.472667,-1.320230,7ebb59a1-43ad-435e-b608-4b3c34d1bed2,2025-07-14 19:01:40.751966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.026628,1.009695,f50b7df2-af5b-40eb-bb54-1d3c74d3e151,2025-07-14 19:01:40.758948
23874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.116510,1.016209,dedfd18a-cd31-42f7-85ff-30ffe56b8411,2025-07-14 19:01:40.758948
23875,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.031492,0.974292,4f49a6ee-caba-4eda-8ef5-80d7eb52db15,2025-07-14 19:01:40.758948
23876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.118773,-0.192339,1e218994-511b-4215-ad6d-92d90e33f730,2025-07-14 19:01:40.758948


# Trabajando con Feast

In [105]:
# Read back the feature table written above and peek at a few booking ids
# that can be used as entity keys in the Feast queries below.
_test_fs_df = pd.read_parquet(
    path="../feast_service/fs_dsrp_mle2_jul9/feature_repo/data/bookings_feature_table.parquet"
)
_test_fs_df.loc[:, ["booking_id"]].head()

Unnamed: 0,booking_id
0,00bb4fea-525b-4757-9f1d-540f131dd5c3
1,1236fc59-d92c-4227-8fe8-41f464c5565c
2,95f636b8-cbcd-411c-b1fb-4eecebb0e725
3,eaed376f-bc93-45aa-b173-3aaa028347c8
4,1629e178-d66f-4fef-a67b-327387aa2e1a


In [106]:
from feast import FeatureStore

In [107]:
# Handle to the Feast feature repository that owns the bookings feature table.
fs = FeatureStore(repo_path="../feast_service/fs_dsrp_mle2_jul9/feature_repo/")

## Históricas

In [110]:
# Inspect the column names and dtypes of the driver_stats.parquet file that
# sits in the same feature-repo data folder.
pd.read_parquet(
    path="../feast_service/fs_dsrp_mle2_jul9/feature_repo/data/driver_stats.parquet"
).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1807 entries, 0 to 1806
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   event_timestamp  1807 non-null   datetime64[ns, UTC]
 1   driver_id        1807 non-null   int64              
 2   conv_rate        1807 non-null   float32            
 3   acc_rate         1807 non-null   float32            
 4   avg_daily_trips  1807 non-null   int32              
 5   created          1807 non-null   datetime64[us]     
dtypes: datetime64[ns, UTC](1), datetime64[us](1), float32(2), int32(1), int64(1)
memory usage: 63.7 KB


In [113]:
# Entity frame for the point-in-time lookup: one booking, stamped with the
# current UTC time.
entity_df = pd.DataFrame(
    {
        # entity's join key -> entity values
        "booking_id": ["1629e178-d66f-4fef-a67b-327387aa2e1a"],
        # (optional) label name -> label values. Feast does not process these
        "great_feature1": [1],
        "great_feature2": [1],
        # values we're using for an on-demand transformation
        "kpi1": [2],
        "kpi2": [10],
    }
).assign(event_timestamp=pd.to_datetime("now", utc=True))
entity_df

Unnamed: 0,booking_id,great_feature1,great_feature2,kpi1,kpi2,event_timestamp
0,1629e178-d66f-4fef-a67b-327387aa2e1a,1,1,2,10,2025-07-15 00:07:28.285786+00:00


In [114]:
# Fetch the stored PCA features plus the on-demand features for the entity
# rows, then materialise the result as a DataFrame.
retrieval_job = fs.get_historical_features(
    features=[
        "pc_booking_view:great_feature1",
        "pc_booking_view:great_feature2",
        "great_feature_view:great_feature1_kpi1",
        "great_feature_view:great_feature2_kpi2",
    ],
    entity_df=entity_df,
)
retrieval_job.to_df()

Unnamed: 0,booking_id,great_feature1,great_feature2,kpi1,kpi2,event_timestamp,great_feature1__,great_feature2__,great_feature1_kpi1,great_feature2_kpi2
0,1629e178-d66f-4fef-a67b-327387aa2e1a,1,1,2,10,2025-07-15 00:07:28.285786+00:00,-0.131863,-0.149217,2,10
