# Phase 4: Machine Learning Models

This notebook trains multiple machine learning models to predict hotel booking cancellations:
1. Naive Bayes
2. Decision Tree

All models are trained using PySpark MLlib for distributed processing.


In [None]:
# Import libraries
import os
import warnings

import numpy as np
import pandas as pd
from pyspark.ml.classification import (
    NaiveBayes, DecisionTreeClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
warnings.filterwarnings('ignore')

print("✓ Libraries imported")


## Step 1: Initialize Spark and Load Data

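**Note**: This notebook expects `train_df` and `test_df` — each with the `features` and `label` columns used by the models below — to already be defined in the current kernel session. They come from the preprocessing notebook; if they are not defined, run `03_spark_preprocessing.ipynb` first.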


In [None]:
# Reuse the `spark` session if one is already bound in this kernel;
# otherwise build a new one for this notebook.
try:
    spark  # bare name lookup: raises NameError when no session is bound yet
except NameError:
    spark = (
        SparkSession.builder
        .appName("HotelBookingML")
        .config("spark.sql.adaptive.enabled", "true")
        .getOrCreate()
    )
    # Only applied to a freshly created session, never to a reused one.
    spark.sparkContext.setLogLevel("WARN")
    print("✓ New Spark session created")
else:
    print("✓ Using existing Spark session")

# This cell does not load any data itself: train_df and test_df must already
# be defined in the kernel (see the note above), or the training cells below
# will fail with a NameError.
print("\nNote: Make sure train_df and test_df are available from previous notebook")


## Step 2: Model Training Functions

Define functions to train and evaluate models.


In [None]:
def train_naive_bayes(train_df):
    """Fit a Naive Bayes classifier and return the fitted model.

    Args:
        train_df: Training data handed to ``NaiveBayes.fit``. The estimator
            is configured to read the ``features`` and ``label`` columns.

    Returns:
        The model object produced by ``NaiveBayes.fit(train_df)``.
    """
    print("Training Naive Bayes...")
    # NOTE(review): the estimator runs with its default model type; if that is
    # the multinomial variant, feature values must be non-negative — confirm
    # against the preprocessing output.
    estimator_params = {'featuresCol': 'features', 'labelCol': 'label'}
    fitted = NaiveBayes(**estimator_params).fit(train_df)
    print("✓ Naive Bayes trained")
    return fitted

def train_decision_tree(train_df, max_depth=10):
    """Fit a Gini-impurity decision tree classifier and return the fitted model.

    Args:
        train_df: Training data handed to ``DecisionTreeClassifier.fit``. The
            estimator is configured to read the ``features`` and ``label``
            columns.
        max_depth: Value forwarded as the classifier's ``maxDepth``
            (default 10).

    Returns:
        The model object produced by ``DecisionTreeClassifier.fit(train_df)``.
    """
    print("Training Decision Tree...")
    tree_params = {
        'featuresCol': 'features',
        'labelCol': 'label',
        'maxDepth': max_depth,
        'impurity': 'gini',
    }
    fitted = DecisionTreeClassifier(**tree_params).fit(train_df)
    print("✓ Decision Tree trained")
    return fitted

print("✓ Model training functions defined")


In [None]:
def evaluate_model(predictions, model_name):
    """Score a predictions DataFrame and collect the results in a dict.

    Args:
        predictions: Output of a fitted model's ``transform``; the evaluators
            read its ``label``, ``prediction`` and ``rawPrediction`` columns.
        model_name: Display name stored under the ``'model'`` key.

    Returns:
        dict with keys, in this order: ``'model'``, ``'accuracy'``, ``'auc'``
        (area under ROC), ``'weightedPrecision'``, ``'weightedRecall'``, ``'f1'``.
    """
    def _multiclass_score(metric):
        # A fresh evaluator per metric; each evaluate() call scores
        # `predictions` independently.
        scorer = MulticlassClassificationEvaluator(
            labelCol='label',
            predictionCol='prediction',
            metricName=metric
        )
        return scorer.evaluate(predictions)

    # AUC is computed from the raw prediction scores, not the hard predictions.
    roc_scorer = BinaryClassificationEvaluator(
        labelCol='label',
        rawPredictionCol='rawPrediction',
        metricName='areaUnderROC'
    )

    metrics = {'model': model_name}
    metrics['accuracy'] = _multiclass_score('accuracy')
    metrics['auc'] = roc_scorer.evaluate(predictions)
    metrics.update(
        (name, _multiclass_score(name))
        for name in ('weightedPrecision', 'weightedRecall', 'f1')
    )
    return metrics

print("✓ Evaluation function defined")


## Step 3: Train Model 1 - Naive Bayes


In [None]:
# Train Naive Bayes on the training split
nb_model = train_naive_bayes(train_df)

# Score the held-out test split
nb_predictions = nb_model.transform(test_df)

# Compute accuracy, AUC, weighted precision/recall and F1
nb_metrics = evaluate_model(nb_predictions, "Naive Bayes")

print("\n=== Naive Bayes Results ===")
for metric, value in nb_metrics.items():
    if metric == 'model':
        continue  # the model name is a label, not a numeric score
    # Upper-case only the first letter. str.capitalize() would also lower-case
    # the remainder, printing 'weightedPrecision' as 'Weightedprecision'.
    print(f"{metric[:1].upper()}{metric[1:]}: {value:.4f}")


In [None]:
# Preview the first ten scored rows: true label, predicted label and the
# model's probability output.
print("\n=== Sample Predictions ===")
nb_predictions.select(["label", "prediction", "probability"]).show(n=10)


## Step 4: Train Model 2 - Decision Tree


In [None]:
# Train Decision Tree on the training split
dt_model = train_decision_tree(train_df, max_depth=10)

# Score the held-out test split
dt_predictions = dt_model.transform(test_df)

# Compute accuracy, AUC, weighted precision/recall and F1
dt_metrics = evaluate_model(dt_predictions, "Decision Tree")

print("\n=== Decision Tree Results ===")
for metric, value in dt_metrics.items():
    if metric == 'model':
        continue  # the model name is a label, not a numeric score
    # Upper-case only the first letter. str.capitalize() would also lower-case
    # the remainder, printing 'weightedPrecision' as 'Weightedprecision'.
    print(f"{metric[:1].upper()}{metric[1:]}: {value:.4f}")


## Step 5: Decision Tree Feature Importance


In [None]:
# Decision Tree feature importance (if available)
try:
    feature_importance = dt_model.featureImportances
except AttributeError:
    # Only a model without the attribute counts as "not available". Any other
    # failure (e.g. dt_model was never trained, so the name is undefined) is
    # left to surface instead of being reported as a missing feature.
    print("Feature importance not available for this model")
else:
    print("\n=== Top 10 Most Important Features (Decision Tree) ===")
    # Note: features are reported by vector index; mapping indices back to
    # column names would need the assembler metadata from preprocessing.
    importances = feature_importance.toArray()
    # argsort is ascending: take the last ten, then reverse for descending order.
    top_indices = np.argsort(importances)[-10:][::-1]
    for idx in top_indices:
        print(f"Feature {idx}: {importances[idx]:.4f}")


## Step 6: Model Comparison


In [None]:
# Gather the per-model metric dicts in the order they should be listed
all_metrics = [nb_metrics, dt_metrics]

# One row per model, indexed by the model's display name
metrics_df = pd.DataFrame(data=all_metrics).set_index(keys='model')

print("=== Model Comparison ===")
# Round for display only; metrics_df itself keeps full precision.
display(metrics_df.round(decimals=4))


In [None]:
# Models, predictions and metrics stay in kernel variables for the next
# notebook; only the comparison table is written to disk.
print("✓ All models trained and evaluated")
print("\nModels and predictions available:")
print("  - nb_model, nb_predictions, nb_metrics")
print("  - dt_model, dt_predictions, dt_metrics")
print("  - metrics_df (comparison table)")

# Save metrics to CSV for report.
# Keep writing to '/content' where that directory exists (presumably a Colab
# runtime — confirm), and fall back to the current working directory elsewhere
# so the cell does not fail on a missing directory.
output_dir = '/content' if os.path.isdir('/content') else os.getcwd()
metrics_path = os.path.join(output_dir, 'model_metrics.csv')
metrics_df.to_csv(metrics_path)
print(f"\n✓ Metrics saved to {metrics_path}")


## Summary


In [None]:
# Summary
# Printed rather than written as bare Markdown text, which is not valid Python
# and would raise a SyntaxError when this code cell runs.
print("✓ Naive Bayes trained and evaluated")
print("✓ Decision Tree trained and evaluated")
print("✓ Model comparison completed")
print("\nNext Steps: Proceed to 05_evaluation_visualization.ipynb for detailed evaluation and visualizations.")
