In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Feature matrices written by the previous (train/test split) step.
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')

# The target CSVs are read as DataFrames; flatten each to a 1D array
# before handing it to the estimator.
y_train = pd.read_csv('../data/y_train.csv').to_numpy().ravel()
y_test = pd.read_csv('../data/y_test.csv').to_numpy().ravel()

# Feature metadata: the column-name lists used to route columns through the
# preprocessor. NOTE(review): joblib.load unpickles the file, so it should only
# be pointed at artifacts produced by this project.
meta = joblib.load('../models/feature_metadata.pkl')
categorical_features = meta['categorical']
numeric_features = meta['numeric']

#### Reconstruct the Preprocessor

In [3]:
# Column-wise preprocessing:
#   - 'num': standardize the numeric feature columns
#   - 'cat': one-hot encode the categorical feature columns; categories not
#            seen during fit are ignored at transform time instead of raising
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

#### Define Model Pipeline
We use a Random Forest classifier (`RandomForestClassifier`) as the final step of the pipeline, after the preprocessor.

In [4]:
# Chain preprocessing and the classifier so both are fitted, applied, and
# persisted as a single object.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            n_estimators=100,  # number of trees in the forest
            random_state=42,   # fixed seed for reproducible results
            n_jobs=-1,         # use all available CPU cores
        ),
    ),
])

#### Train

In [5]:
# Fit the whole pipeline (preprocessor + classifier) on the training split only.
print("Training Random Forest Model...")
model_pipeline.fit(X=X_train, y=y_train)
print("Training Complete.")

Training Random Forest Model...
Training Complete.


#### Evaluate

In [6]:
# Predict on the held-out test split; the pipeline applies the fitted
# preprocessing before the classifier.
y_pred = model_pipeline.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.4f}")
print("\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8308

Classification Report:

              precision    recall  f1-score   support

           0       0.86      0.92      0.89     12733
           1       0.74      0.59      0.65      4747

    accuracy                           0.83     17480
   macro avg       0.80      0.75      0.77     17480
weighted avg       0.82      0.83      0.82     17480



#### Save the FINAL Model
This `.pkl` file contains the entire fitted pipeline: the scaler, the one-hot encoder, and the classifier.

In [7]:
# Persist the fitted pipeline as one artifact; the path is defined once so the
# file written and the message printed cannot drift apart.
model_path = '../models/hotel_cancellation_model.pkl'
joblib.dump(model_pipeline, model_path)
print(f"Model saved to '{model_path}'")

Model saved to '../models/hotel_cancellation_model.pkl'
