# Load Datasets

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from keras import Sequential, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.metrics import Precision, Recall
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
import os

def get_data_path(slug_name):
    """Return the directory the CSV files should be read from.

    Candidates are probed in order: the dataset-specific Kaggle folder
    ``/kaggle/input/<slug_name>``, then the generic ``/kaggle/input`` root.
    The first one that exists on disk is returned; if neither exists the
    local ``./Data`` folder is returned (without checking that it exists).
    """
    kaggle_root = "/kaggle/input"
    for candidate in (f"{kaggle_root}/{slug_name}", kaggle_root):
        if os.path.exists(candidate):
            return candidate
    return "./Data"

# Resolve the dataset folder once; every CSV below is read relative to it.
data_dir = get_data_path("international-hotel-booking-analytics")

# Raw tables, loaded as-is (cleaning happens later in the notebook).
users = pd.read_csv(f"{data_dir}/users.csv")
hotels = pd.read_csv(f"{data_dir}/hotels.csv")
reviews = pd.read_csv(f"{data_dir}/reviews.csv")

# Data Cleaning

In [None]:
def clean_data(hotels, reviews, users):
    """
    Clean and merge datasets before model training.
    Focuses only on data consistency and structure.

    Steps, in order:
      1. Ordinal-encode ``users['age_group']`` and parse
         ``reviews['review_date']`` as datetimes.
      2. Rename the ``country`` column of the users and hotels tables
         *before* merging, so the user's and the hotel's country can never
         be confused by merge-order-dependent ``_x`` / ``_y`` suffixes.
      3. Inner-join reviews -> users (on ``user_id``) -> hotels
         (on ``hotel_id``).
      4. Derive ``country_group`` from the *user's* country; countries not
         in the mapping become ``'Other'``.
      5. Drop identifier, free-text, date and geo columns, then exact
         duplicate rows.
      6. Fill missing numeric values with the column mean and missing text
         values with ``'Unknown'``.

    Parameters
    ----------
    hotels, reviews, users : pandas.DataFrame
        Raw tables. They are not modified in place.

    Returns
    -------
    pandas.DataFrame
        The cleaned, merged dataframe.

    Side effects
    ------------
    Rebinds the module-level ``DEFAULT_VALUES`` dict to
    ``{column: default}``: the mean for numeric columns and the most
    frequent value for text columns, both computed on the cleaned frame.
    """
    global DEFAULT_VALUES

    # --- Clean Users ---
    users_clean = users.copy()
    age_mapping = {'18-24': 0, '25-34': 1, '35-44': 2, '45-54': 3, '55+': 4}
    users_clean['age_group'] = users_clean['age_group'].map(age_mapping)
    users_clean = users_clean.rename(columns={'country': 'user_country'})

    # --- Clean Reviews ---
    reviews_clean = reviews.copy()
    reviews_clean['review_date'] = pd.to_datetime(reviews_clean['review_date'])

    # --- Clean Hotels ---
    # rename() returns a new frame, so the caller's table stays untouched.
    hotels_clean = hotels.rename(columns={'country': 'hotel_country'})

    # --- Merge ---
    # The country columns already carry unambiguous names, so no suffix
    # bookkeeping is needed after the joins.
    df = pd.merge(reviews_clean, users_clean, on='user_id', how='inner')
    df = pd.merge(df, hotels_clean, on='hotel_id', how='inner')

    # --- Map countries to regions ---
    country_to_group = {
        'United States': 'North_America', 'Canada': 'North_America',
        'Germany': 'Western_Europe', 'France': 'Western_Europe',
        'United Kingdom': 'Western_Europe', 'Netherlands': 'Western_Europe',
        'Spain': 'Western_Europe', 'Italy': 'Western_Europe',
        'Russia': 'Eastern_Europe', 'China': 'East_Asia', 'Japan': 'East_Asia',
        'South Korea': 'East_Asia', 'Thailand': 'Southeast_Asia', 'Singapore': 'Southeast_Asia',
        'United Arab Emirates': 'Middle_East', 'Turkey': 'Middle_East',
        'Egypt': 'Africa', 'Nigeria': 'Africa', 'South Africa': 'Africa',
        'Australia': 'Oceania', 'New Zealand': 'Oceania',
        'Brazil': 'South_America', 'Argentina': 'South_America',
        'India': 'South_Asia', 'Mexico': 'North_America_Mexico'
    }
    df['country_group'] = df['user_country'].map(country_to_group).fillna('Other')

    # --- Final cleanup ---
    cols_to_drop = [
        'user_id', 'hotel_id', 'review_id', 'join_date',
        'lat', 'lon', 'user_country', 'hotel_name',
        'review_text', 'review_date'
    ]
    # errors='ignore' already skips names that are absent from the frame.
    df = df.drop(columns=cols_to_drop, errors='ignore')

    df = df.drop_duplicates()
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    text_cols = df.select_dtypes(include=['object']).columns
    df[text_cols] = df[text_cols].fillna('Unknown')

    # Per-column defaults, published through the module-level DEFAULT_VALUES.
    defaults = df[numeric_cols].mean().to_dict()
    text_modes = df[text_cols].mode()
    # mode() is empty when there are no text columns (or no rows); indexing
    # its first row would raise in that case.
    if not text_modes.empty:
        defaults.update(text_modes.iloc[0].to_dict())
    DEFAULT_VALUES = defaults

    return df

df = clean_data(hotels, reviews, users)

In [None]:
def prepare_for_model_training(df):
    """
    Prepare dataframe for model training using one-hot encoding.

    Every ``object``-dtype column except ``country_group`` is one-hot
    encoded; ``country_group`` is left as-is in the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned dataframe. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the encoded source columns, followed by the one-hot
        columns (``uint8``), on the same index as ``df``.

    Side effects
    ------------
    Rebinds the module-level ``ohe_fit`` to the fitted encoder so the same
    encoding can be re-applied later.
    """
    global ohe_fit

    # Get all categorical columns, leaving 'country_group' out of the encoding
    categorical_cols = [
        col for col in df.select_dtypes(include=['object']).columns
        if col != 'country_group'
    ]

    # one-hot encode categorical columns using sklearn and store the encoder for later use
    ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.uint8, handle_unknown='ignore')
    ohe_fit = ohe.fit(df[categorical_cols])

    # Build the encoded block once and attach it with a single concat rather
    # than inserting the new columns into an existing frame.
    encoded = pd.DataFrame(
        ohe_fit.transform(df[categorical_cols]),
        columns=ohe_fit.get_feature_names_out(categorical_cols),
        index=df.index,
    )
    return pd.concat([df.drop(columns=categorical_cols), encoded], axis=1)

In [None]:
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nSample data:")
df.head()

In [None]:
print("\nSummary statistics for numeric columns:")
display(df.describe())

In [None]:
print("\nValue counts for categorical columns:")
categorical_cols = ['age_group', 'traveller_type', 'user_gender', 'city', 'hotel_country', 'country_group']
for col in categorical_cols:
    print(f"\n{col} distribution:")
    print(df[col].value_counts())

In [None]:
users

In [None]:
hotels

In [None]:
reviews

In [None]:
plt.figure(figsize=(10,6))
ax = sns.countplot(data=df, x='age_group', palette='pastel')
plt.title('Number of Reviews per Age Group')
plt.xlabel('Age Group')
plt.ylabel('Count')

# Add counts above bars
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2., height + 20,  # adjust +20 for spacing
            f'{int(height)}', ha="center", fontsize=10)

plt.show()

In [None]:
plt.figure(figsize=(10,6))
ax = sns.countplot(data=df, x='traveller_type', palette='muted')
plt.title('Number of Reviews per Traveller Type')
plt.xlabel('Traveller Type')
plt.ylabel('Count')
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2, p.get_height() + 5, str(int(p.get_height())), 
            ha='center', fontsize=10)
plt.show()

In [None]:
# Review counts per gender. `user_gender` is a text category at this point,
# so the axis shows the category names directly and needs no numeric legend.
plt.figure(figsize=(8,6))
ax = sns.countplot(data=df, x='user_gender', palette='coolwarm')
plt.title('Number of Reviews by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
# Write each bar's count just above it.
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2, p.get_height() + 5, str(int(p.get_height())), 
            ha='center', fontsize=10)
plt.show()

In [None]:
top_cities = df.groupby('city')['score_overall'].mean().sort_values(ascending=False).head(10).reset_index()
plt.figure(figsize=(12,6))
ax = sns.barplot(data=top_cities, x='city', y='score_overall', palette='viridis')
plt.xticks(rotation=55)
plt.title('Top 10 Cities by Average Overall Score')
plt.xlabel('City')
plt.ylabel('Average Overall Score')
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2, p.get_height() + 0.02, f'{p.get_height():.2f}', 
            ha='center', fontsize=9)
plt.show()

In [None]:
top_countries = df.groupby('hotel_country')['score_value_for_money'].mean().sort_values(ascending=False).head(10).reset_index()
plt.figure(figsize=(12,6))
ax = sns.barplot(data=top_countries, x='hotel_country', y='score_value_for_money', palette='magma')
plt.xticks(rotation=55)
plt.title('Top 10 Countries by Average Value-for-Money Score')
plt.xlabel('Country')
plt.ylabel('Average Value-for-Money Score')
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2, p.get_height() + 0.02, f'{p.get_height():.2f}', 
            ha='center', fontsize=9)
plt.show()

# Histogram Analysis
Histograms show the distribution of numerical variables, helping us understand the shape, skewness, center, and spread of data.

In [None]:
# Histogram of Overall Scores
plt.figure(figsize=(10, 6))
plt.hist(df['score_overall'], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
plt.title('Distribution of Overall Scores', fontsize=14, fontweight='bold')
plt.xlabel('Overall Score')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)
plt.show()

print(f"Mean Overall Score: {df['score_overall'].mean():.2f}")
print(f"Standard Deviation: {df['score_overall'].std():.2f}")
print(f"Skewness: {df['score_overall'].skew():.2f}")

In [None]:
# Bar Plot: Average Overall Score by Age Group (using Seaborn)
plt.figure(figsize=(10, 6))

# Create age groups aggregation - using original columns before one-hot encoding
age_score_data = df.groupby('age_group')['score_overall'].mean().reset_index()

sns.barplot(data=age_score_data, x='age_group', y='score_overall', palette='viridis')
plt.title('Average Overall Score by Age Group', fontweight='bold')
plt.xlabel('Age Group')
plt.ylabel('Average Overall Score')
plt.xticks(rotation=45)

# Add value labels on top of bars
for i, v in enumerate(age_score_data['score_overall']):
    plt.text(i, v + 0.02, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("Age Group Score Analysis:")
for idx, row in age_score_data.iterrows():
    print(f"• {row['age_group']}: {row['score_overall']:.2f} average score")

In [None]:
# Bar Plot: Average Scores by Traveler Type (Comparison)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Left plot: Overall Score by Traveler Type  
traveler_overall = df.groupby('traveller_type')['score_overall'].mean().reset_index()
sns.barplot(data=traveler_overall, x='traveller_type', y='score_overall', 
           palette='Set2', ax=axes[0])
axes[0].set_title('Average Overall Score by Traveler Type', fontweight='bold')
axes[0].set_xlabel('Traveler Type')
axes[0].set_ylabel('Average Overall Score')

# Add value labels
for i, v in enumerate(traveler_overall['score_overall']):
    axes[0].text(i, v + 0.02, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

# Right plot: Value for Money Score by Traveler Type
traveler_value = df.groupby('traveller_type')['score_value_for_money'].mean().reset_index()
sns.barplot(data=traveler_value, x='traveller_type', y='score_value_for_money', 
           palette='Set3', ax=axes[1])
axes[1].set_title('Average Value for Money Score by Traveler Type', fontweight='bold')
axes[1].set_xlabel('Traveler Type')
axes[1].set_ylabel('Average Value for Money Score')

# Add value labels
for i, v in enumerate(traveler_value['score_value_for_money']):
    axes[1].text(i, v + 0.02, f'{v:.2f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("Traveler Type Comparison:")
print("Overall Scores:")
for idx, row in traveler_overall.iterrows():
    print(f"• {row['traveller_type']}: {row['score_overall']:.2f}")
print("\nValue for Money Scores:")  
for idx, row in traveler_value.iterrows():
    print(f"• {row['traveller_type']}: {row['score_value_for_money']:.2f}")

In [None]:
# Mean overall score for every (traveller type, city) pair.
grouped1_df = df.groupby(['traveller_type', 'city'])['score_overall'].mean().reset_index()
n = 10
# Keep the n best-scoring cities within each traveller type.
top_n_cities = (
    grouped1_df
    .sort_values(['traveller_type', 'score_overall'], ascending=[True, False])
    .groupby('traveller_type')
    .head(n)
)
traveller_types = top_n_cities['traveller_type'].unique()

# Size the subplot grid from the data instead of assuming exactly four
# traveller types; four types still give a 2x2 grid on a 12x8 figure.
grid_cols = 2
grid_rows = max(1, (len(traveller_types) + grid_cols - 1) // grid_cols)
plt.figure(figsize=(12, 4 * grid_rows))

for i, ttype in enumerate(traveller_types):
    plt.subplot(grid_rows, grid_cols, i + 1)
    subset = (
        top_n_cities[top_n_cities['traveller_type'] == ttype]
        .sort_values('score_overall', ascending=False)
    )
    
    # Plot vertical bar chart for each traveller type
    sns.barplot(data=subset, x='city', y='score_overall', color='skyblue')
    
    # Annotate each bar with its value
    for index, value in enumerate(subset['score_overall']):
        plt.text(index, value + 0.009, f'{value:.2f}', ha='center', fontsize=7)
    
    # Title and axis labels
    plt.title(f'{ttype} Travelers - Top {n} Cities by Avg Overall Score', fontsize=10)
    plt.xlabel('City')
    plt.ylabel('Average Overall Score')
    # Zoom the y-axis onto the plotted range so small differences are visible.
    plt.ylim(subset['score_overall'].min() - 0.02, subset['score_overall'].max() + 0.02)
    plt.xticks(rotation=55, ha='right')

plt.tight_layout()
plt.show()

### Based on the average overall scores from traveler reviews, the best city for each traveler type is:

- **Business Travelers:** **Dubai** (Average Score: 8.97)
- **Couple Travelers:** **Amsterdam** (Average Score: 9.10)
- **Family Travelers:** **Dubai** (Average Score: 9.21)
- **Solo Travelers:** **Amsterdam** (Average Score: 9.11)

These recommendations are based on the top-rated cities according to user reviews for each traveler category.

In [None]:
# Group by age_group and hotel_country to get mean value_for_money score
x = 3
grouped_age = (
    df.groupby(['age_group', 'hotel_country'])['score_value_for_money']
      .mean()
      .reset_index()
      .sort_values(['age_group', 'score_value_for_money'], ascending=[True, False])
      .groupby('age_group')
      .head(x)
      .reset_index(drop=True)
)

# Plot top 3 countries per age group
plt.figure(figsize=(12, 8))
age_groups = grouped_age['age_group'].unique()

ncols = 3
nrows = (len(age_groups) + ncols - 1) // ncols

for i, ag in enumerate(age_groups):
    ax = plt.subplot(nrows, ncols, i + 1)
    subset = grouped_age[grouped_age['age_group'] == ag].sort_values('score_value_for_money', ascending=False)
    sns.barplot(data=subset, x='hotel_country', y='score_value_for_money', ax=ax, color='skyblue')
    for idx, val in enumerate(subset['score_value_for_money']):
        ax.text(idx, val + 0.005, f'{val:.2f}', ha='center', fontsize=9)
    ax.set_title(f'Age Group: {ag}')
    ax.set_xlabel('')
    ax.set_ylabel('Avg Value for Money')
    ax.set_ylim(subset['score_value_for_money'].min() - 0.05, subset['score_value_for_money'].max() + 0.05)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.suptitle('Top 3 Countries by Avg Value-for-Money per Age Group', fontsize=14, y=1.02)
plt.show()

### Top 3 Countries by Avg Value-for-Money per Traveler Age Group

- **Age Group 18-24:**  
  1. Egypt – 9.01  
  2. Spain – 8.77 
  3. Argentina – 8.69  

- **Age Group 25-34:**  
  1. Spain – 8.73  
  2. South Korea – 8.63  
  3. Netherlands – 8.54

- **Age Group 35-44:**  
  1. Singapore – 8.80  
  2. Argentina – 8.70  
  3. New Zealand – 8.64  

- **Age Group 45-54:**  
  1. Turkey – 8.64  
  2. Singapore – 8.56  
  3. China – 8.55  

- **Age Group 55+:**  
  1. New Zealand  – 8.69  
  2. Canada – 8.62  
  3. Japan – 8.52

In [None]:
# Correlation Heatmap of Key Score Features
plt.figure(figsize=(10, 8))

# Select key score columns for correlation analysis
score_columns = ['score_overall', 'score_cleanliness', 'score_comfort', 
                'score_facilities', 'score_location', 'score_staff', 'score_value_for_money']

correlation_matrix = df[score_columns].corr()

# Create correlation heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='RdYlBu_r', center=0,
           square=True, fmt='.3f', cbar_kws={"shrink": .8})
plt.title('Correlation Matrix of Hotel Score Dimensions', fontweight='bold')
plt.tight_layout()
plt.show()

print("Correlation Heatmap Analysis:")
print("• Values range from -1 to 1")
print("• Red colors indicate strong positive correlation")  
print("• Blue colors indicate strong negative correlation")
print("• White/neutral colors indicate weak correlation")

# Predictive Modeling Task

## Model Development Process

Before jumping into model training, we first performed **Exploratory Data Analysis (EDA)** to understand how different variables relate to each other.  
This included generating **correlation heatmaps** to visualize the relationships between hotel review scores and identifying which features might contribute most to prediction accuracy.

---

### 🔹 Step 1 — Initial Experiments

We started with a **simple baseline model** that included only a few basic hotel and user attributes.  
This initial setup was mainly used to test the modeling pipeline and identify potential issues such as **underfitting** or poor feature representation.  

As expected, the model achieved **inconsistent precision** between training and validation sets, suggesting that more contextual information was needed.

---

### 🔹 Step 2 — Feature Expansion

Next, we **expanded the feature set** by including additional review-based scores and user demographic attributes (like *traveler type* and *age group*).  
This step significantly improved the model’s **performance and stability**, as it allowed the neural network to capture more complex relationships across different traveler profiles and hotel characteristics.

---

### 🔹 Step 3 — Correlation and Feature Understanding

To better understand how features interacted, we revisited the **correlation analysis results** and used them to refine our final input set.  
The correlation study helped **eliminate redundant variables** and ensured that the model relied on **informative features** rather than overlapping ones.

---

### 🔹 Step 4 — Model Interpretability with SHAP

After achieving a stable model, we used **SHAP (SHapley Additive Explanations)** to interpret the model’s predictions.  
The SHAP analysis revealed the **relative importance** of each feature — for example, how much *score_location* or *star_rating* contributed to the predicted traveler group.  

This interpretability step confirmed that the model’s decisions aligned with **logical and data-driven patterns** rather than random correlations.


Data Pre-processing and Feature Engineering
Understanding the step
Need for the step
Effect of the step on the data distribution, range, values, etc.

# Data Pre-processing and Feature Engineering

## Data Pre-processing
- **Purpose:** Clean and prepare raw data for modeling.  
- **Steps:** Map `age_group` to numbers, fill missing values, drop irrelevant columns.  
- **Effect:** Ensures consistent, complete, and usable data.

## Feature Engineering
- **Purpose:** Transform data for the model.  
- **Steps:** One-hot encode the categorical text columns (e.g. `user_gender`, `traveller_type`), map countries to regions.  
- **Effect:** Converts categorical text to numeric, adds useful features.

## Example: Before & After

**Before:**  
| user_gender | traveller_type | age_group | cleanliness |
|-------------|----------------|-----------|------------|
| Male        | Solo           | 25-34     | 9          |
| Female      | Family         | 35-44     | NaN        |

**After:**  
| age_group | cleanliness | user_gender_Male | traveller_type_Solo | traveller_type_Family |
|-----------|------------|-----------------|-------------------|---------------------|
| 1         | 9.0        | 1               | 1                 | 0                   |
| 2         | 9.0        | 0               | 0                 | 1                   |


In [None]:
# Data Preparation
df_model = prepare_for_model_training(df)

In [None]:
df_model.columns.tolist()

In [None]:
features = [
    'star_rating', 'cleanliness_base', 'comfort_base', 'facilities_base',
    'location_base', 'staff_base', 'value_for_money_base',
    'score_overall', 'score_cleanliness', 'score_comfort',
    'score_facilities', 'score_location', 'score_staff',
    'score_value_for_money',
    # Encoded user attributes
    'user_gender_Male',
    'user_gender_Other',
    'traveller_type_Couple',
    'traveller_type_Family',
    'traveller_type_Solo'
]

X = df_model[features]

y = df_model['country_group']

# Label encoding and train-test split
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42)

In [None]:
# Model Architecture
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(y_train.shape[1], activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall')]
)

model.summary()

In [None]:
# Model Training
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop]
)

In [None]:
# Training Progress Visualization
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['precision'], label='Train Precision')
plt.plot(history.history['val_precision'], label='Val Precision')
plt.plot(history.history['recall'], label='Train Recall')
plt.plot(history.history['val_recall'], label='Val Recall')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Precision & Recall')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Model Evaluation
from sklearn.metrics import f1_score
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)

print("Test Results:")
print(f"Loss: {test_loss:.4f}")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")

# Predictions
y_pred = model.predict(X_test, verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# F1 Scores
f1_macro = f1_score(y_true_classes, y_pred_classes, average='macro')
f1_weighted = f1_score(y_true_classes, y_pred_classes, average='weighted')

print(f"\nF1 Scores:")
print(f"Macro: {f1_macro:.4f}")
print(f"Weighted: {f1_weighted:.4f}")

In [None]:
# Confusion Matrix and Classification Report
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Compute confusion matrix
cm = confusion_matrix(y_true_classes, y_pred_classes)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            cbar=False,
            annot_kws={"size": 12})
plt.title('Confusion Matrix', fontsize=14, pad=15)
plt.xlabel('Predicted', fontsize=12, labelpad=10)
plt.ylabel('Actual', fontsize=12, labelpad=10)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Print Detailed Classification Report below
print("Detailed Classification Report:\n")
print(classification_report(
    y_true_classes, 
    y_pred_classes, 
    target_names=label_encoder.classes_, 
    digits=3
))


# Model Explainability

In [None]:
import shap
shap.initjs()

import lime
from lime.lime_tabular import LimeTabularExplainer

In [None]:
def shapley_explainer(input_data, model, baseline_data=None, sample_count=100):
    """Compute SHAP values for ``input_data`` under ``model``.

    Parameters
    ----------
    input_data
        Rows to explain; passed straight to the SHAP explainer.
    model
        Model handed to ``shap.Explainer``.
    baseline_data : optional
        Background data. When it is ``None`` or empty, the explainer is
        built from the model alone; otherwise a background sample is drawn
        from it with ``shap.sample`` (fixed ``random_state=42``).
    sample_count : int
        Number of background rows requested from ``shap.sample``.

    Returns
    -------
    Whatever the constructed explainer returns when called on
    ``input_data``.
    """
    if baseline_data is None or len(baseline_data) == 0:
        explainer = shap.Explainer(model)
    else:
        background = shap.sample(baseline_data, sample_count, random_state=42)
        explainer = shap.Explainer(model, background)
    return explainer(input_data)

In [None]:
shap_values = shapley_explainer(X_test[:100], model, X_train)


# Absolute Mean SHAP
This plot shows each feature's global contribution (its mean absolute SHAP value) and ranks the features by it.

In [None]:
class_names = label_encoder.classes_
print("Classes:", class_names)

# One bar plot per class. Iterating over the encoder's classes keeps the loop
# in step with however many classes the data actually contains.
for class_idx, class_name in enumerate(class_names):
    # Extract SHAP values for this specific class
    shap_values_class = shap_values[:, :, class_idx]
    
    # Create bar plot for this class
    fig, ax = plt.subplots(figsize=(16, 6))  # new figure and axis
    shap.plots.bar(shap_values_class, show=False)
    ax.set_title(f'{class_name}', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()


# Waterfall Plot
This plot shows the contribution of each feature, moving it from the baseline value, to the prediction value.

# Force Plot
This is a similar perspective, in a different format
from IPython.display import display

In [None]:
# Index (within the explained rows) of the observation to break down.
observation = 0

# One waterfall and one force plot per class. Iterating over `class_names`
# keeps the loop in step with however many classes the data contains.
for class_idx, class_name in enumerate(class_names):
    # Extract SHAP values for this specific class and observation
    shap_values_class_observation = shap_values[observation, :, class_idx]

    shap.plots.waterfall(shap_values_class_observation, max_display=5, show=False)

    # show=False leaves the figure open, so it can still be decorated here.
    fig = plt.gcf()   # get the current figure
    ax = plt.gca()    # get the current axes

    ax.set_title(f'{class_name}', fontsize=12, fontweight='bold')
    ax.set_xlabel("Feature contribution to prediction")
    fig.suptitle("SHAP Waterfall Explanation", fontsize=16, fontweight="bold", y=1.03)
    fig.set_size_inches(16, 6)

    plt.tight_layout()
    plt.show()

    display(shap.plots.force(shap_values_class_observation))


# LIME
Using LIME to make local explanations for the NN classifier

In [None]:
from IPython.display import HTML, display

# LIME explainer built on the training features.
explainer_lime = LimeTabularExplainer(
    X_train.values,
    feature_names=list(X.columns),
    class_names=class_names,
    mode="classification",
    discretize_continuous=True
    )

# Row of X_test to explain. It is set explicitly here so the cell does not
# depend on a loop variable left behind by an earlier plotting cell.
lime_observation = 0

exp = explainer_lime.explain_instance(
    X_test.values[lime_observation],
    predict_fn=lambda x: model.predict(x, verbose=0),
    labels=range(len(class_names)),
    num_features=5
)

# Render one explanation per class.
for class_idx, class_name in enumerate(class_names):
    html = exp.as_html(labels=[class_idx])  # generate figure for this class
    display(HTML(html))

In [None]:
df.info()

# Inference Function
This function takes raw `user`, `hotel` and `review` records in the same format as the training data, applies the same preprocessing that was used for training, and uses the trained model to make a prediction.

In [None]:
# Target dtype for each raw input column. Each key is listed exactly once,
# even when the column appears in more than one source table (`user_id`,
# `hotel_id`, `country`).
schema = {
    # reviews
    'review_id': 'int64',
    'user_id': 'int64',
    'hotel_id': 'int64',
    'review_date': 'object',
    'score_overall': 'float64',
    'score_cleanliness': 'float64',
    'score_comfort': 'float64',
    'score_facilities': 'float64',
    'score_location': 'float64',
    'score_staff': 'float64',
    'score_value_for_money': 'float64',
    'review_text': 'object',
    # hotels
    'hotel_name': 'object',
    'city': 'object',
    'country': 'object',
    'star_rating': 'int64',
    'lat': 'float64',
    'lon': 'float64',
    'cleanliness_base': 'float64',
    'comfort_base': 'float64',
    'facilities_base': 'float64',
    'location_base': 'float64',
    'staff_base': 'float64',
    'value_for_money_base': 'float64',
    # users
    'user_gender': 'object',
    'age_group': 'int64',
    'traveller_type': 'object',
    'join_date': 'object',
}

def inference(user, hotel, review):
    """Predict the country group for one (user, hotel, review) triple.

    The three records are combined into a single row, aligned to the
    columns of the cleaned training frame, one-hot encoded with the fitted
    encoder and passed to the trained model. The predicted class is
    printed, the class probabilities are plotted, and LIME explanations for
    the three most probable classes are displayed.

    Parameters
    ----------
    user, hotel, review : dict or pandas.Series
        Raw records keyed by column name. ``age_group`` is expected as one
        of the range labels ('18-24', '25-34', ...). Input columns that are
        not part of the cleaned training frame are discarded.

    Returns
    -------
    numpy.ndarray
        Flattened model output: one probability per class, in the order of
        ``label_encoder.classes_``.

    Raises
    ------
    TypeError
        If any input is neither a dict nor a pandas Series.

    Notes
    -----
    Relies on module-level state created by earlier cells: ``df``,
    ``DEFAULT_VALUES``, ``schema``, ``ohe_fit``, ``features``, ``model``,
    ``label_encoder`` and ``explainer_lime``.
    """
    # Imported locally so the function does not depend on another cell
    # having brought HTML into scope.
    from IPython.display import HTML, display

    inputs = [user, hotel, review]

    for i, x in enumerate(inputs):
        # If it's a dictionary, convert to Series
        if isinstance(x, dict):
            inputs[i] = pd.Series(x)
        # If it's not a Series (e.g., list, numpy array), raise an error
        elif not isinstance(x, pd.Series):
            raise TypeError(f"Input {i} must be a dict or pandas Series, not {type(x)}")

    user, hotel, review = inputs

    # One-row frames. The two `country` columns get explicit names before the
    # merge so the result does not depend on merge order / `_x`, `_y` suffixes.
    user_df = user.to_frame().T.rename(columns={'country': 'user_country'})
    hotel_df = hotel.to_frame().T.rename(columns={'country': 'hotel_country'})
    review_df = review.to_frame().T

    tmp_df = pd.merge(hotel_df, review_df, how='cross')
    new_df = pd.merge(tmp_df, user_df, how='cross')

    age_mapping = {'18-24': 0, '25-34': 1, '35-44': 2, '45-54': 3, '55+': 4}
    new_df['age_group'] = new_df['age_group'].map(age_mapping)

    # Align to the training columns, then fill anything missing with the
    # per-column defaults. reindex's `fill_value` is a single scalar, so the
    # defaults dict has to be applied through fillna instead.
    new_df = new_df.reindex(columns=df.columns)
    new_df = new_df.fillna(value=DEFAULT_VALUES)

    intersection_schema = {k: v for k, v in schema.items() if k in new_df.columns}
    new_df = new_df.astype(intersection_schema, errors='raise')

    # Encode exactly the columns the encoder was fitted on, in the same order.
    categorical_cols = list(ohe_fit.feature_names_in_)

    # Transform and replace the categorical columns
    ohe_features = ohe_fit.transform(new_df[categorical_cols])
    ohe_feature_names = ohe_fit.get_feature_names_out(categorical_cols)
    new_df = new_df.drop(columns=categorical_cols)
    new_df[ohe_feature_names] = ohe_features

    new_df = new_df[features]
    new_df.info()

    observation = new_df.iloc[0].values
    y_pred = model.predict(observation.reshape((1, -1)), verbose=0).flatten()

    # Explain Prediction
    class_names = label_encoder.classes_
    predicted_class = class_names[y_pred.argmax()]

    print(f"The predicted class is: {predicted_class}")
    print(f"The probability distribution for the class predictions are:")

    plt.figure(figsize=(12, 5))  # wider figure
    plt.bar(class_names, y_pred, color='skyblue')
    plt.xlabel('Class')
    plt.ylabel('Predicted Probability')
    plt.title('Class Prediction Probability Distribution')
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right')  # rotate x-ticks 45 degrees
    plt.show()

    print("LIME Explanation: Feature contributions for the top 3 predicted classes are shown below.")
    exp = explainer_lime.explain_instance(
        observation,
        predict_fn=lambda x: model.predict(x, verbose=0),
        top_labels=3,   # Show explanations for top 3 predicted classes
        num_features=5
    )

    for class_idx in exp.top_labels:
        print(f"\nLIME Explanation for class: {class_names[class_idx]}")
        html = exp.as_html(labels=[class_idx])  # generate figure for this class
        display(HTML(html))

    return y_pred

In [None]:
# Testing inference Function with two inputs

user = {'user_id': 1600,
  'user_gender': 'Female',
  'country': 'New Zealand',
  'age_group': '25-34',
  'traveller_type': 'Solo',
  'join_date': '2021-03-21'
}

review = {'review_id': 1,
 'user_id': 1600,
 'hotel_id': 1,
 'review_date': '2022-10-07',
 'score_overall': 8.7,
 'score_cleanliness': 8.6,
 'score_comfort': 8.7,
 'score_facilities': 8.5,
 'score_location': 9.0,
 'score_staff': 8.8,
 'score_value_for_money': 8.7,
 'review_text': 'Practice reduce young our because machine. Recent forget phone fast stuff adult.'
 }

hotel = {
 'hotel_id': 1,
 'hotel_name': 'The Azure Tower',
 'city': 'New York',
 'country': 'United States',
 'star_rating': 5,
 'lat': 40.758,
 'lon': -73.9855,
 'cleanliness_base': 9.1,
 'comfort_base': 8.8,
 'facilities_base': 8.9,
 'location_base': 9.5,
 'staff_base': 8.6,
 'value_for_money_base': 8.0
}

inference(user=user, hotel=hotel, review=review)