# Lab 1: Data Preprocessing for Hotel Booking Cancellations
Machine Learning in Tourism - Lab 1

## Learning objectives:

1. Preprocess and clean hotel booking data, handling missing values and data conversions
2. Create meaningful derived variables through feature engineering
3. Apply exploratory data analysis with visualization tools
4. Build a machine learning pipeline for numerical and categorical features
5. Evaluate a booking cancellation prediction model and interpret feature importance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Global plotting defaults: white grid background and a 12x8 default figure
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_style('whitegrid')

## 1. DATA LOADING

In [None]:
# For this lab, we'll use the Hotel Booking Demand dataset.
# It contains booking information for a city hotel and a resort hotel,
# including when the booking was made, length of stay, number of adults,
# children, and/or babies, and many other features.

# The data is read directly from a public URL. If that fails for any reason
# (no network, moved file, parse error, ...) a synthetic stand-in is generated
# instead so that the rest of the lab can still run.
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv"
try:
    # Try to load the data from the URL
    df = pd.read_csv(url)
    print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")
except Exception as e:  # deliberately broad: any failure triggers the fallback
    print(f"Error loading the dataset: {e}")
    print("Using backup approach with sample data...")

    # If the URL fails, we'll create a synthetic dataset for demonstration.
    # Every column is drawn independently, so the synthetic data carries no
    # real relationship between the features and 'is_canceled'.
    np.random.seed(42)
    n_samples = 5000

    # Create synthetic data that mimics hotel booking features
    df = pd.DataFrame({
        'hotel': np.random.choice(['Resort Hotel', 'City Hotel'], n_samples),
        'is_canceled': np.random.choice([0, 1], n_samples, p=[0.7, 0.3]),
        'lead_time': np.random.randint(0, 365, n_samples),
        'arrival_date_year': np.random.choice([2018, 2019], n_samples),
        'arrival_date_month': np.random.choice(['January', 'February', 'March', 'April', 'May', 'June',
                                               'July', 'August', 'September', 'October', 'November', 'December'], n_samples),
        'arrival_date_week_number': np.random.randint(1, 53, n_samples),
        # randint's upper bound is exclusive, so 32 is needed to include day 31
        'arrival_date_day_of_month': np.random.randint(1, 32, n_samples),
        'stays_in_weekend_nights': np.random.randint(0, 5, n_samples),
        'stays_in_week_nights': np.random.randint(0, 15, n_samples),
        'adults': np.random.randint(1, 4, n_samples),
        # Stored as float so that NaN can be written into it below without
        # relying on an implicit int -> float upcast
        'children': np.random.choice([0, 1, 2], n_samples, p=[0.6, 0.3, 0.1]).astype(float),
        'babies': np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        'meal': np.random.choice(['BB', 'HB', 'FB', 'SC'], n_samples),
        'country': np.random.choice(['PRT', 'GBR', 'FRA', 'ESP', 'DEU'], n_samples),
        'market_segment': np.random.choice(['Online TA', 'Offline TA', 'Direct', 'Corporate', 'Groups'], n_samples),
        'distribution_channel': np.random.choice(['TA/TO', 'Direct', 'Corporate'], n_samples),
        'is_repeated_guest': np.random.choice([0, 1], n_samples, p=[0.9, 0.1]),
        'previous_cancellations': np.random.choice([0, 1, 2], n_samples, p=[0.8, 0.15, 0.05]),
        'previous_bookings_not_canceled': np.random.choice([0, 1, 2, 3], n_samples, p=[0.7, 0.2, 0.05, 0.05]),
        'reserved_room_type': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_samples),
        'assigned_room_type': np.random.choice(['A', 'B', 'C', 'D', 'E'], n_samples),
        'booking_changes': np.random.choice([0, 1, 2, 3], n_samples, p=[0.7, 0.2, 0.05, 0.05]),
        'deposit_type': np.random.choice(['No Deposit', 'Refundable', 'Non Refund'], n_samples, p=[0.8, 0.1, 0.1]),
        'days_in_waiting_list': np.random.choice([0, 1, 2, 3, 5, 10], n_samples, p=[0.7, 0.1, 0.05, 0.05, 0.05, 0.05]),
        'customer_type': np.random.choice(['Transient', 'Contract', 'Transient-Party', 'Group'], n_samples),
        'adr': np.random.uniform(50, 300, n_samples),  # Average Daily Rate
        'required_car_parking_spaces': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'total_of_special_requests': np.random.choice([0, 1, 2, 3], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
        'reservation_status': np.random.choice(['Check-Out', 'Canceled', 'No-Show'], n_samples, p=[0.6, 0.35, 0.05]),
        # Arbitrary booking-agent IDs. The column must exist before the
        # missing-value loop below; otherwise assigning NaN to it would
        # create a column that is entirely NaN and cannot be imputed.
        'agent': np.random.choice([1, 9, 14, 240, 250], n_samples).astype(float),
    })

    # Introduce some missing values (about 5% per column) to demonstrate handling
    for col in ['children', 'country', 'agent', 'adr']:
        mask = np.random.choice([True, False], size=df.shape[0], p=[0.05, 0.95])
        df.loc[mask, col] = np.nan

    print(f"Synthetic dataset created with {df.shape[0]} rows and {df.shape[1]} columns.")

# Display the first few rows to understand the structure
print("\nFirst few rows of the dataset:")
df.head()

## 2. INITIAL DATA EXPLORATION

In [None]:
# Basic information about the dataset.
# df.info() prints its summary directly, so no display()/print() wrapper is
# needed around it.
print("\nDataset information:")
df.info()

In [None]:
# Descriptive statistics, transposed so that each column of the dataset
# becomes one row of the summary table
print("\nStatistical summary:")
df.describe().transpose()

In [None]:
# Count the missing entries per column and show only the affected columns
missing_values = df.isna().sum()
print("\nMissing values count:")
missing_values.loc[missing_values.gt(0)]

In [None]:
# Share of canceled vs. not canceled bookings (target variable), in percent
print("\nDistribution of booking cancellations:")
cancellation_counts = df['is_canceled'].value_counts(normalize=True).mul(100)
print(cancellation_counts)

plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x='is_canceled', hue='is_canceled',
                   palette='viridis', legend=False)
plt.title('Distribution of Booking Cancellations')
plt.xlabel('Canceled (1) vs. Not Canceled (0)')
plt.ylabel('Count')

# Write each bar's share of all bookings just above the bar
total = len(df)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(f'{100 * bar_height / total:.1f}%', (bar_center, bar_height),
                ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 3. DATA CLEANING

In [None]:
# Make a copy of the dataframe for preprocessing so the raw data stays intact
df_clean = df.copy()

# Check datatypes and convert if necessary
print("\nData types before conversion:")
display(df_clean.dtypes)

# Convert 'children' column to numeric if it's not already;
# values that cannot be parsed become NaN and are imputed below
if df_clean['children'].dtype == 'object':
    df_clean['children'] = pd.to_numeric(df_clean['children'], errors='coerce')

# Handle missing values
print("\nHandling missing values...")

# For numerical columns, impute with median
numerical_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    if df_clean[col].isna().any():
        median_value = df_clean[col].median()
        if pd.isna(median_value):
            # Column has no observed values at all, so there is no median
            print(f"Skipped {col}: column is entirely missing, no median available")
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # df_clean[col]: the in-place call acts on a temporary object and
        # does not update df_clean when pandas Copy-on-Write is active.
        df_clean[col] = df_clean[col].fillna(median_value)
        print(f"Imputed {col} with median value: {median_value}")

# For categorical columns, impute with mode
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_clean[col].isna().any():
        modes = df_clean[col].mode()
        if modes.empty:
            # Column has no observed values at all, so there is no mode
            print(f"Skipped {col}: column is entirely missing, no mode available")
            continue
        mode_value = modes.iloc[0]
        df_clean[col] = df_clean[col].fillna(mode_value)
        print(f"Imputed {col} with mode value: {mode_value}")

# Verify no missing values remain
missing_after = df_clean.isnull().sum().sum()
print(f"\nTotal missing values after imputation: {missing_after}")

In [None]:
# Show the dtypes of the first few columns of the cleaned dataframe
df_clean.dtypes.head()

## 4. FEATURE ENGINEERING

In [None]:
# 4.1 Total length of stay: weekend nights plus week nights
weekend_nights = df_clean['stays_in_weekend_nights']
week_nights = df_clean['stays_in_week_nights']
df_clean['total_nights'] = weekend_nights.add(week_nights)
print("Created 'total_nights' feature")

# 4.2 Party size: adults, children and babies combined
df_clean['total_guests'] = (
    df_clean['adults']
    .add(df_clean['children'])
    .add(df_clean['babies'])
)
print("Created 'total_guests' feature")

# 4.3 Extract season from arrival_date_month
def get_season(month):
    """Return the season name for an English month name.

    December-February map to 'Winter', March-May to 'Spring' and
    June-August to 'Summer'. Every other value, including the autumn
    months and any unrecognised input, maps to 'Fall'.
    """
    season_months = (
        ('Winter', ('December', 'January', 'February')),
        ('Spring', ('March', 'April', 'May')),
        ('Summer', ('June', 'July', 'August')),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above falls through to the default
    return 'Fall'

df_clean['season'] = df_clean['arrival_date_month'].map(get_season)
print("Created 'season' feature based on arrival month")

# 4.4 Flag bookings whose lead time is above the dataset median
median_lead_time = df_clean['lead_time'].median()
df_clean['high_lead_time'] = df_clean['lead_time'].gt(median_lead_time).astype(int)
print(f"Created 'high_lead_time' feature (1 if lead time > {median_lead_time} days)")

# 4.5 Average daily rate per guest; a party size of 0 is treated as 1
# so the division never hits zero
guest_divisor = df_clean['total_guests'].replace(0, 1)
df_clean['price_per_person'] = df_clean['adr'].div(guest_divisor)
print("Created 'price_per_person' feature")

# 4.6 1 when the assigned room type equals the reserved one, else 0
room_matches = df_clean['reserved_room_type'].eq(df_clean['assigned_room_type'])
df_clean['got_requested_room'] = room_matches.astype(int)
print("Created 'got_requested_room' feature")

# 4.7 Fraction of the stay spent on weekend nights; zero-night stays are
# divided by 1 instead of 0
night_divisor = df_clean['total_nights'].replace(0, 1)
df_clean['weekend_ratio'] = df_clean['stays_in_weekend_nights'].div(night_divisor)
print("Created 'weekend_ratio' feature")

# Preview all engineered columns side by side
print("\nDataset with new features:")
new_feature_columns = [
    'total_nights', 'total_guests', 'season', 'high_lead_time',
    'price_per_person', 'got_requested_room', 'weekend_ratio',
]
display(df_clean[new_feature_columns].head())

## 5. EXPLORATORY DATA ANALYSIS

In [None]:
def _label_cancellation_legend(ax):
    """Retitle the existing legend on *ax* and rename its entries to No/Yes.

    The legend that the plotting call already created is edited in place.
    Rebuilding it with ``plt.legend(labels=[...])`` would pair the labels
    with whatever artists happen to come first on the axes, which can show
    the wrong colour or artist next to each label.

    Assumes ``is_canceled`` is coded 0/1 so the legend entries appear in the
    order not canceled, canceled.
    """
    legend = ax.get_legend()
    if legend is None:
        return
    legend.set_title('Canceled')
    for text, label in zip(legend.get_texts(), ['No', 'Yes']):
        text.set_text(label)


# 5.1 Cancellation rate by hotel type
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='hotel', hue='is_canceled', data=df_clean, palette='viridis')
plt.title('Cancellation Rate by Hotel Type')
plt.xlabel('Hotel Type')
plt.ylabel('Count')
_label_cancellation_legend(ax)
plt.tight_layout()
plt.show()

# 5.2 Cancellation rate by season (row-normalised, in percent)
cancellation_by_season = pd.crosstab(df_clean['season'], df_clean['is_canceled'], normalize='index') * 100
# Draw on an explicitly created axes: DataFrame.plot(figsize=...) without
# ax= opens its own figure, which would leave a preceding plt.figure() blank.
fig, ax = plt.subplots(figsize=(10, 6))
cancellation_by_season.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
plt.title('Cancellation Rate by Season')
plt.xlabel('Season')
plt.ylabel('Percentage')
_label_cancellation_legend(ax)
plt.tight_layout()
plt.show()

# 5.3 Lead time distribution for canceled vs. not canceled bookings
plt.figure(figsize=(12, 6))
ax = sns.histplot(data=df_clean, x='lead_time', hue='is_canceled', bins=30, kde=True, element='step')
plt.title('Lead Time Distribution by Cancellation Status')
plt.xlabel('Lead Time (days)')
plt.ylabel('Count')
_label_cancellation_legend(ax)
plt.tight_layout()
plt.show()

# 5.4 Average Daily Rate (ADR) by hotel type and cancellation status
plt.figure(figsize=(12, 6))
ax = sns.boxplot(x='hotel', y='adr', hue='is_canceled', data=df_clean, palette='viridis')
plt.title('ADR Distribution by Hotel Type and Cancellation Status')
plt.xlabel('Hotel Type')
plt.ylabel('Average Daily Rate (ADR)')
_label_cancellation_legend(ax)
plt.tight_layout()
plt.show()

# 5.5 Correlation matrix of numerical features
numerical_features = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
                     'adults', 'children', 'babies', 'is_repeated_guest',
                     'previous_cancellations', 'previous_bookings_not_canceled',
                     'booking_changes', 'days_in_waiting_list', 'adr',
                     'required_car_parking_spaces', 'total_of_special_requests',
                     'total_nights', 'total_guests', 'high_lead_time',
                     'price_per_person', 'got_requested_room', 'weekend_ratio']

# Select only columns that exist in the dataframe
numerical_features = [col for col in numerical_features if col in df_clean.columns]

correlation_matrix = df_clean[numerical_features].corr()

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            linewidths=0.5, square=True)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# 5.6 Cancellation rate by deposit type
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='deposit_type', hue='is_canceled', data=df_clean, palette='viridis')
plt.title('Cancellation Rate by Deposit Type')
plt.xlabel('Deposit Type')
plt.ylabel('Count')
_label_cancellation_legend(ax)
plt.tight_layout()
plt.show()

# 5.7 Cancellation rate by market segment
plt.figure(figsize=(12, 6))
ax = sns.countplot(x='market_segment', hue='is_canceled', data=df_clean, palette='viridis')
plt.title('Cancellation Rate by Market Segment')
plt.xlabel('Market Segment')
plt.ylabel('Count')
_label_cancellation_legend(ax)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:

# 5.4b Average Daily Rate (ADR) by hotel type and cancellation status (ZOOM)
# Same plot as 5.4, restricted to bookings with adr < 1000 so that extreme
# rates do not compress the boxes.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(x='hotel', y='adr', hue='is_canceled', data=df_clean[df_clean['adr']<1000], palette='viridis')
plt.title('ADR Distribution by Hotel Type and Cancellation Status (ZOOM)')
plt.xlabel('Hotel Type')
plt.ylabel('Average Daily Rate (ADR)')
# Relabel the legend seaborn already created. Rebuilding it with
# plt.legend(labels=[...]) would pair the labels with whichever artists come
# first on the axes rather than with the two hue levels.
# Assumes is_canceled is coded 0/1, so entries appear as not canceled, canceled.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Canceled')
    for text, label in zip(legend.get_texts(), ['No', 'Yes']):
        text.set_text(label)
plt.tight_layout()
plt.show()

## 6. FEATURE SELECTION AND PREPROCESSING PIPELINE

In [None]:
# 6.1 Select features based on business knowledge and correlation analysis
selected_features = [
    'hotel', 'lead_time', 'arrival_date_month', 'arrival_date_week_number',
    'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
    'meal', 'market_segment', 'distribution_channel', 'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'reserved_room_type', 'assigned_room_type', 'booking_changes',
    'deposit_type', 'days_in_waiting_list', 'customer_type', 'adr',
    'required_car_parking_spaces', 'total_of_special_requests',
    'total_nights', 'season', 'high_lead_time', 'price_per_person',
    'got_requested_room', 'weekend_ratio'
]

# Select only columns that exist in the dataframe
selected_features = [col for col in selected_features if col in df_clean.columns]

# Split the dataset into features and target
X = df_clean[selected_features]
y = df_clean['is_canceled']

# Split the data into training and testing sets.
# stratify=y keeps the cancellation ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# 6.2 Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()
# 'number' matches every integer/float width. Listing only int64/float64
# would miss columns created with .astype(int) on platforms where that
# yields int32, and the ColumnTransformer below would then silently drop
# them (its default is remainder='drop').
numerical_features = X.select_dtypes(include='number').columns.tolist()

print(f"\nCategorical features ({len(categorical_features)}): {categorical_features}")
print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")

# 6.3 Create preprocessing pipelines
# Numerical pipeline: imputation + scaling
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: imputation + one-hot encoding.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Column transformer to apply the appropriate preprocessing to each column
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 6.4 Fit the preprocessing pipeline to the training data only, then apply
# the fitted transformation to the test data
print("\nFitting the preprocessing pipeline...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")

# Get the feature names after one-hot encoding. The fitted encoder is looked
# up by its step name rather than by position, and it produces the
# "<column>_<category>" names itself.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features).tolist()

# Combine with numerical feature names (same order as the transformer output:
# numerical block first, then the one-hot block)
processed_feature_names = numerical_features + cat_feature_names

## 7. MODEL TRAINING

In [None]:
# 7.1 Fit a Random Forest and use its impurity-based feature importances
from sklearn.ensemble import RandomForestClassifier

print("\nTraining a Random Forest model for feature importance...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_processed, y_train)

# The importances can only be labelled when there is exactly one name per
# processed column, so check that first
n_feature_names = len(processed_feature_names)
n_processed_columns = X_train_processed.shape[1]

if n_feature_names != n_processed_columns:
    print("\nWarning: Feature names length does not match processed data width.")
    print(f"Feature names length: {n_feature_names}")
    print(f"Processed data width: {n_processed_columns}")
else:
    importances = rf_model.feature_importances_
    feature_importance = pd.DataFrame(
        {'Feature': processed_feature_names, 'Importance': importances}
    )
    feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
    top_features = feature_importance.head(20)

    # Tabular view of the 20 strongest features
    print("\nTop 20 most important features:")
    print(top_features)

    # Horizontal bar chart of the same 20 features
    plt.figure(figsize=(12, 10))
    sns.barplot(data=top_features, x='Importance', y='Feature',
                hue='Feature', palette='viridis', legend=False)
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.show()

## 8. MODEL EVALUATION

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict cancellations for the held-out test set
y_pred = rf_model.predict(X_test_processed)

# Overall share of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

# Per-class precision, recall and F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

## 9. SUMMARY AND CONCLUSIONS
In this lab, we have:
1. Loaded and explored hotel booking data
2. Cleaned the data by handling missing values
3. Engineered new features to improve predictive power
4. Performed exploratory data analysis to understand relationships
5. Created a preprocessing pipeline for both numerical and categorical features
6. Built a simple Random Forest model to analyze feature importance
7. Evaluated the model's performance in predicting booking cancellations

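Key insights (expected when the real dataset is loaded; the synthetic fallback draws `is_canceled` independently of every feature, so it will not reproduce these patterns):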
- Lead time is one of the most important features for predicting cancellations
- Deposit type significantly affects cancellation rates
- Price and room type assignment also play important roles
- Seasonal patterns exist in booking cancellations

Next steps:
- Further explore interaction effects between features
- Try more advanced models and compare performance
- Implement hyperparameter tuning to optimize model performance
- Consider business context when interpreting and applying models