In [7]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the cleaned hotel-bookings table. The path is relative, so it is resolved
# against the kernel's current working directory.
bookings_csv = '../data/hotel_bookings_clean.csv'
df = pd.read_csv(bookings_csv)

#### Handling Missing Values
- Children: fill with 0
- Country: fill with 'Unknown'
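- Agent/Company: drop both columns (the same cell also drops `reservation_status`, `reservation_status_date`, and `arrival_date_year`)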

In [8]:

# Columns removed before modelling.
# NOTE(review): the reason for each drop is not stated in this notebook
# (reservation_status presumably leaks the target) — confirm.
drop_cols = [
    'reservation_status',
    'reservation_status_date',
    'company',
    'agent',
    'arrival_date_year',
]

# Fill missing child counts with 0 and missing countries with 'Unknown', then
# remove the unwanted columns, all in one chain.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# raises KeyError because the columns in drop_cols are already gone.
df = df.assign(
    children=df['children'].fillna(0),
    country=df['country'].fillna('Unknown'),
).drop(columns=drop_cols)

#### Separate Features and Target

In [9]:
# Target is the cancellation flag; the feature frame is every other column.
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])

# Explicit feature lists, so downstream code knows exactly which columns to expect.
# NOTE(review): 18 columns are listed here while X has 26 (per the printed split
# shape). The ColumnTransformer built in the next cell uses its default
# remainder='drop', so unlisted columns are discarded — confirm this is intended.
numeric_features = [
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'adr',
    'total_of_special_requests',
    'required_car_parking_spaces',
]

categorical_features = [
    'hotel',
    'arrival_date_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]

#### Create Preprocessing Pipeline
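1. Numerical: standardize with `StandardScaler`
2. Categorical: one-hot encode with `OneHotEncoder` (categories not seen during fitting are ignored rather than raising an error)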

In [10]:
# One transformer per column group:
#   - numeric columns are standardized;
#   - categorical columns are one-hot encoded. handle_unknown='ignore' encodes a
#     category unseen during fit as all zeros instead of raising, and
#     sparse_output=False returns a dense array.
# Columns named in neither list are dropped (ColumnTransformer's default remainder).
# The transformer is only constructed here; it is not fitted in this cell.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

#### Split Data

In [11]:
# Hold out 20% of the rows for testing. stratify=y keeps the proportion of
# cancelled bookings (approximately) equal in the train and test partitions,
# so evaluation is not skewed by a lucky or unlucky shuffle. random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Data: {X_train.shape}")
print(f"Testing Data: {X_test.shape}")

Training Data: (69916, 26)
Testing Data: (17480, 26)


#### Save the split raw data:

In [12]:
# Persist the raw (untransformed) splits. index=False omits the row index, so
# the X and y files correspond to each other by row order only.
X_train.to_csv('../data/X_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

# Only the feature-name lists are saved here, not the preprocessor object:
# the preprocessor is meant to be rebuilt from these lists and fitted in the
# next notebook inside the full pipeline.
feature_meta = {
    'numeric': numeric_features,
    'categorical': categorical_features
}

# Create the output directory if it does not exist yet; otherwise joblib.dump
# raises FileNotFoundError (e.g. on a fresh checkout without a models folder).
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(feature_meta, '../models/feature_metadata.pkl')
print("Feature metadata saved to '../models/feature_metadata.pkl'")

Feature metadata saved to '../models/feature_metadata.pkl'
