# Load Datasets

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from keras import Sequential, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.metrics import Precision, Recall
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
import os

def get_data_path(slug_name):
    """Return the directory the dataset CSV files should be read from.

    Candidates are tried in order and the first one that exists on disk is
    returned: the Kaggle dataset folder for ``slug_name``, then the Kaggle
    input root. If neither exists, the relative ``./Data`` folder is returned
    without checking that it exists.
    """
    kaggle_root = "/kaggle/input"
    for candidate in (f"/kaggle/input/{slug_name}", kaggle_root):
        if os.path.exists(candidate):
            return candidate
    return "./Data"

# Resolve the dataset folder once, then read the three source tables from it.
data_dir = get_data_path("international-hotel-booking-analytics")

users_csv = f"{data_dir}/users.csv"
hotels_csv = f"{data_dir}/hotels.csv"
reviews_csv = f"{data_dir}/reviews.csv"

users = pd.read_csv(users_csv)
hotels = pd.read_csv(hotels_csv)
reviews = pd.read_csv(reviews_csv)

# Data Cleaning

In [None]:
def clean_data(hotels, reviews, users):
    """Join the three raw tables and return a single cleaned DataFrame.

    Steps, in order:
      1. Merge hotels with reviews on ``hotel_id``, then with users on
         ``user_id`` (pandas' default inner join).
      2. Rename the ``country_x`` / ``country_y`` merge columns to
         ``hotel_country`` / ``user_country``.
      3. Parse ``review_date`` and ``join_date`` as datetimes.
      4. Drop duplicate rows, fill missing numeric values with the column
         mean and missing object-typed values with ``'Unknown'``.
      5. Derive ``country_group`` from ``user_country`` (``'Other'`` for any
         country not in the lookup).
      6. Encode ``user_gender`` as 0 (Male), 1 (Female) or -1 (anything else).
      7. Drop identifier, date, coordinate and free-text columns; columns in
         that list that are absent are ignored.

    Args:
        hotels: DataFrame containing at least ``hotel_id``.
        reviews: DataFrame containing at least ``hotel_id``, ``user_id`` and
            ``review_date``.
        users: DataFrame containing at least ``user_id``, ``join_date`` and
            ``user_gender``.

    Returns:
        The merged and cleaned DataFrame.

    Note:
        ``user_country`` only exists if the merges produce a ``country_y``
        column, i.e. presumably both ``hotels`` and ``users`` carry a
        ``country`` column — verify against the input files.
    """
    # merge all datasets
    merged = hotels.merge(reviews, on='hotel_id').merge(users, on='user_id')

    # rename columns
    merged = merged.rename(
        columns={'country_x': 'hotel_country', 'country_y': 'user_country'}
    )

    # convert to datetime
    for date_col in ('review_date', 'join_date'):
        merged[date_col] = pd.to_datetime(merged[date_col])

    # clean NaNs and duplicates
    merged = merged.drop_duplicates()

    number_cols = merged.select_dtypes(include=['number']).columns
    column_means = merged[number_cols].mean()
    merged[number_cols] = merged[number_cols].fillna(column_means)

    object_cols = merged.select_dtypes(include=['object']).columns
    merged[object_cols] = merged[object_cols].fillna('Unknown')

    # region -> member countries; inverted below into a country -> region lookup
    countries_by_group = {
        'North_America': ('United States', 'Canada'),
        'Western_Europe': (
            'Germany', 'France', 'United Kingdom',
            'Netherlands', 'Spain', 'Italy',
        ),
        'Eastern_Europe': ('Russia',),
        'East_Asia': ('China', 'Japan', 'South Korea'),
        'Southeast_Asia': ('Thailand', 'Singapore'),
        'Middle_East': ('United Arab Emirates', 'Turkey'),
        'Africa': ('Egypt', 'Nigeria', 'South Africa'),
        'Oceania': ('Australia', 'New Zealand'),
        'South_America': ('Brazil', 'Argentina'),
        'South_Asia': ('India',),
        'North_America_Mexico': ('Mexico',),
    }
    country_to_group = {
        country: group
        for group, countries in countries_by_group.items()
        for country in countries
    }

    # create the country_group column
    merged['country_group'] = (
        merged['user_country'].map(country_to_group).fillna('Other')
    )

    # binary mapping for gender; unmapped values become -1
    gender_codes = {'Male': 0, 'Female': 1}
    merged['user_gender'] = merged['user_gender'].map(gender_codes).fillna(-1)

    # drop unnecessary columns
    unused_columns = [
        'user_id', 'hotel_id', 'review_id', 'join_date', 'lat', 'lon',
        'user_country', 'hotel_name', 'review_text', 'review_date',
    ]
    return merged.drop(columns=unused_columns, errors='ignore')
# Build the merged, cleaned frame that the analysis and modelling cells operate on.
df = clean_data(hotels, reviews, users)

In [None]:
# Quick look at the cleaned frame: dimensions, column names and the first rows.
frame_shape = df.shape
column_names = df.columns.tolist()
print("Dataset shape:", frame_shape)
print("\nColumns:", column_names)
print("\nSample data:")
df.head()

In [None]:
# Descriptive statistics as produced by DataFrame.describe().
print("\nSummary statistics for numeric columns:")
numeric_summary = df.describe()
display(numeric_summary)

In [None]:
# Frequency table for each categorical column of interest.
print("\nValue counts for categorical columns:")
categorical_cols = [
    'age_group',
    'traveller_type',
    'user_gender',
    'city',
    'hotel_country',
    'country_group',
]
for column_name in categorical_cols:
    print(f"\n{column_name} distribution:")
    frequencies = df[column_name].value_counts()
    print(frequencies)

In [None]:
# Bar chart of the number of reviews in each age group, with the count
# written above every bar.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='age_group', palette='pastel')
ax.set_title('Number of Reviews per Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')

# Add counts above bars
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    # +20 lifts the label clear of the bar
    ax.text(bar_centre, bar_top + 20, f'{int(bar_top)}', ha="center", fontsize=10)

plt.show()

In [None]:
# Bar chart of the number of reviews per traveller type, labelled with counts.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='traveller_type', palette='muted')
ax.set(
    title='Number of Reviews per Traveller Type',
    xlabel='Traveller Type',
    ylabel='Count',
)
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_top + 5, str(int(bar_top)), ha='center', fontsize=10)
plt.show()

In [None]:
# Bar chart of the number of reviews per encoded gender value, labelled with counts.
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=df, x='user_gender', palette='coolwarm')
ax.set_title('Number of Reviews by Gender')
ax.set_xlabel('Gender (0=Male,1=Female)')
ax.set_ylabel('Count')
for bar in ax.patches:
    count = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, count + 5, str(int(count)), ha='center', fontsize=10)
plt.show()

In [None]:
# Ten cities with the highest mean overall score, best first.
city_means = df.groupby('city')['score_overall'].mean()
top_cities = city_means.sort_values(ascending=False).head(10).reset_index()

plt.figure(figsize=(12, 6))
ax = sns.barplot(data=top_cities, x='city', y='score_overall', palette='viridis')
plt.xticks(rotation=55)
ax.set_title('Top 10 Cities by Average Overall Score')
ax.set_xlabel('City')
ax.set_ylabel('Average Overall Score')

# Write each bar's value just above it.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_top + 0.02, f'{bar_top:.2f}', ha='center', fontsize=9)
plt.show()

In [None]:
# Ten hotel countries with the highest mean value-for-money score, best first.
country_means = df.groupby('hotel_country')['score_value_for_money'].mean()
top_countries = country_means.sort_values(ascending=False).head(10).reset_index()

plt.figure(figsize=(12, 6))
ax = sns.barplot(
    data=top_countries,
    x='hotel_country',
    y='score_value_for_money',
    palette='magma',
)
plt.xticks(rotation=55)
ax.set(
    title='Top 10 Countries by Average Value-for-Money Score',
    xlabel='Country',
    ylabel='Average Value-for-Money Score',
)

# Write each bar's value just above it.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar_top + 0.02, f'{bar_top:.2f}', ha='center', fontsize=9)
plt.show()

# Data-Engineering Questions

In [None]:
# Mean overall score for every (traveller type, city) pair.
grouped1_df = df.groupby(['traveller_type', 'city'])['score_overall'].mean().reset_index()

# Keep the n best-scoring cities within each traveller type.
n = 10
top_n_cities = (
    grouped1_df
    .sort_values(['traveller_type', 'score_overall'], ascending=[True, False])
    .groupby('traveller_type')
    .head(n)
)
traveller_types = top_n_cities['traveller_type'].unique()

# Size the subplot grid from the number of traveller types instead of assuming
# exactly four; with four types this is the same 2x2 grid on a 12x8 figure.
ncols = 2
nrows = max(1, (len(traveller_types) + ncols - 1) // ncols)
plt.figure(figsize=(12, 4 * nrows))

for i, ttype in enumerate(traveller_types):
    plt.subplot(nrows, ncols, i + 1)
    subset = (
        top_n_cities[top_n_cities['traveller_type'] == ttype]
        .sort_values('score_overall', ascending=False)
    )

    # Plot vertical bar chart for each traveller type
    sns.barplot(data=subset, x='city', y='score_overall', color='skyblue')

    # Annotate each bar with its value
    for index, value in enumerate(subset['score_overall']):
        plt.text(index, value + 0.009, f'{value:.2f}', ha='center', fontsize=7)

    # Title and axis labels
    plt.title(f'{ttype} Travelers - Top {n} Cities by Avg Overall Score', fontsize=10)
    plt.xlabel('City')
    plt.ylabel('Average Overall Score')
    # Zoom the y-axis onto the plotted range so small differences stay visible.
    plt.ylim(subset['score_overall'].min() - 0.02, subset['score_overall'].max() + 0.02)
    plt.xticks(rotation=55, ha='right')

plt.tight_layout()
plt.show()

### Based on the average overall scores from traveler reviews, the best city for each traveler type is:

- **Business Travelers:** **Dubai** (Average Score: 8.97)
- **Couple Travelers:** **Amsterdam** (Average Score: 9.10)
- **Family Travelers:** **Dubai** (Average Score: 9.21)
- **Solo Travelers:** **Amsterdam** (Average Score: 9.11)

These recommendations are based on the top-rated cities according to user reviews for each traveler category.

In [None]:
# Mean value-for-money score per (age group, hotel country), then the x
# best-scoring countries inside each age group.
x = 3
mean_vfm = (
    df.groupby(['age_group', 'hotel_country'])['score_value_for_money']
      .mean()
      .reset_index()
)
ranked_vfm = mean_vfm.sort_values(
    ['age_group', 'score_value_for_money'], ascending=[True, False]
)
grouped_age = ranked_vfm.groupby('age_group').head(x).reset_index(drop=True)

# One panel per age group, laid out three to a row.
plt.figure(figsize=(12, 8))
age_groups = grouped_age['age_group'].unique()

ncols = 3
nrows = -(-len(age_groups) // ncols)  # ceiling division

for panel_no, ag in enumerate(age_groups, start=1):
    ax = plt.subplot(nrows, ncols, panel_no)
    subset = (
        grouped_age[grouped_age['age_group'] == ag]
        .sort_values('score_value_for_money', ascending=False)
    )
    scores = subset['score_value_for_money']
    sns.barplot(data=subset, x='hotel_country', y='score_value_for_money', ax=ax, color='skyblue')
    # Write each bar's value just above it.
    for bar_pos, score in enumerate(scores):
        ax.text(bar_pos, score + 0.005, f'{score:.2f}', ha='center', fontsize=9)
    ax.set_title(f'Age Group: {ag}')
    ax.set_xlabel('')
    ax.set_ylabel('Avg Value for Money')
    # Zoom the y-axis onto the plotted range so small differences stay visible.
    ax.set_ylim(scores.min() - 0.05, scores.max() + 0.05)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.suptitle('Top 3 Countries by Avg Value-for-Money per Age Group', fontsize=14, y=1.02)
plt.show()

### Top 3 Countries by Avg Value-for-Money per Traveler Age Group

- **Age Group 18-24:**  
  1. China – 8.71  
  2. Netherlands – 8.70  
  3. Canada – 8.66  

- **Age Group 25-34:**  
  1. China – 8.73  
  2. Netherlands – 8.68  
  3. Spain – 8.63  

- **Age Group 35-44:**  
  1. China – 8.70  
  2. Netherlands – 8.69  
  3. New Zealand – 8.65  

- **Age Group 45-54:**  
  1. China – 8.72  
  2. New Zealand – 8.67  
  3. Netherlands – 8.65  

- **Age Group 55+:**  
  1. Netherlands – 8.70  
  2. New Zealand – 8.63  
  3. China – 8.60

# Predictive Modeling Task

We used a neural network for the predictive modelling task because it is well suited to classification (here, predicting the country group) and can make use of a large number of samples. We started with a feature set that did not include the encoded city or age-group columns, but moved away from it because the precision was inconsistent:

```python
features = [
    'star_rating', 'cleanliness_base', 'comfort_base', 'facilities_base',
    'location_base', 'staff_base', 'value_for_money_base',

    'score_overall', 'score_cleanliness', 'score_comfort',
    'score_facilities', 'score_location', 'score_staff',
    'score_value_for_money',

    # User attributes
    'user_gender',

    'traveller_type_Couple',
    'traveller_type_Family',
    'traveller_type_Solo',
]
```

Afterwards we switched to a feature set containing all the encoded features, including city and age group, as shown below. After some initial fluctuation, both the validation and the training precision became more consistent over the last 10 or so epochs.

In [None]:
# Show the current column names of df (displayed as the cell's output).
df.columns.tolist()

In [None]:
# One-hot encode the categorical columns as uint8 indicator columns; the first
# level of each column is dropped (drop_first=True).
categorical_cols = ['traveller_type', 'age_group', 'city']
encode_options = dict(columns=categorical_cols, drop_first=True, dtype=np.uint8)
df = pd.get_dummies(df, **encode_options)

In [None]:
# Feature columns, grouped by kind; the concatenation order below is the
# column order of X.
base_features = [
    'star_rating',
    'cleanliness_base', 'comfort_base', 'facilities_base',
    'location_base', 'staff_base', 'value_for_money_base',
]
score_features = [
    'score_overall', 'score_cleanliness', 'score_comfort', 'score_facilities',
    'score_location', 'score_staff', 'score_value_for_money',
]
user_features = ['user_gender']
traveller_dummies = [
    'traveller_type_Couple', 'traveller_type_Family', 'traveller_type_Solo',
]
age_dummies = [
    'age_group_25-34', 'age_group_35-44', 'age_group_45-54', 'age_group_55+',
]
city_dummies = [
    'city_Bangkok', 'city_Barcelona', 'city_Berlin', 'city_Buenos Aires',
    'city_Cairo', 'city_Cape Town', 'city_Dubai', 'city_Istanbul', 'city_Lagos',
    'city_London', 'city_Mexico City', 'city_Moscow', 'city_Mumbai', 'city_New York',
    'city_Paris', 'city_Rio de Janeiro', 'city_Rome', 'city_Seoul', 'city_Shanghai',
    'city_Singapore', 'city_Sydney', 'city_Tokyo', 'city_Toronto', 'city_Wellington',
]
feature_columns = (
    base_features
    + score_features
    + user_features
    + traveller_dummies
    + age_dummies
    + city_dummies
)
X = df[feature_columns]

# Target: the reviewer's country group.
y = df['country_group']

In [None]:
# Turn the string labels into integer ids, then into one-hot rows.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
y_categorical = to_categorical(y_encoded)

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, **split_options)

print("\nLabel-encoded (numeric) y:")
print(y_encoded)

print("\nOne-hot encoded y (matrix):")
print(y_categorical)

In [None]:
# Fully connected classifier: two ReLU hidden layers and a softmax output with
# one unit per one-hot label column.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

network_layers = [
    Input(shape=(n_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(n_classes, activation='softmax'),
]
model = Sequential(network_layers)
model.summary()

In [None]:
# Adam optimiser with categorical cross-entropy (labels are one-hot); track
# accuracy, precision and recall during training and evaluation.
tracked_metrics = [
    'accuracy',
    Precision(name='precision'),
    Recall(name='recall'),
]
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=tracked_metrics,
)

In [None]:
# Stop once validation accuracy has not improved for 3 consecutive epochs and
# restore the weights of the best epoch. EarlyStopping is reached through the
# already-imported `keras` module because the bare name is never imported.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Train for at most 50 epochs, holding out 20% of the training rows for validation.
history = model.fit(
    X_train, y_train,
    epochs=50,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stop],
)

In [None]:
# NOTE(review): this fits the same `model` object for a further 30 epochs with
# no callbacks and rebinds `history`, so the curves plotted afterwards come
# from this call only — confirm that training again after the early-stopped
# fit is intended.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

In [None]:
# One line per recorded metric, training and validation side by side.
curves = [
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Val Accuracy'),
    ('precision', 'Train Precision'),
    ('val_precision', 'Val Precision'),
    ('recall', 'Train Recall'),
    ('val_recall', 'Val Recall'),
]
for metric_key, legend_label in curves:
    plt.plot(history.history[metric_key], label=legend_label)

plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Training Progress (Accuracy, Precision, Recall)')
plt.legend()
plt.show()

In [None]:
# Evaluate on the held-out test split. The result is unpacked as the loss
# followed by the three metrics passed to compile(): accuracy, precision, recall.
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(
    X_test, y_test, verbose=0
)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)
print("Test Precision:", test_precision)
print("Test Recall:", test_recall)

from sklearn.metrics import f1_score, classification_report
import numpy as np


# Class probabilities for every test row.
y_pred = model.predict(X_test)

# Collapse probabilities / one-hot rows to integer class ids.
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

f1_macro = f1_score(y_true_classes, y_pred_classes, average='macro')
f1_weighted = f1_score(y_true_classes, y_pred_classes, average='weighted')

print("F1 (macro):", f1_macro)
print("F1 (weighted):", f1_weighted)

# Pass the full list of class ids explicitly: without `labels`, the report
# raises when a class is absent from both y_true and y_pred, because the number
# of observed classes no longer matches len(target_names).
class_ids = np.arange(len(label_encoder.classes_))

print("\nClassification Report:")
print(classification_report(
    y_true_classes,
    y_pred_classes,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# Model Explainability

In [None]:
import shap
shap.initjs()

import lime
import lime.lime_tabular

In [None]:
# Draw 100 rows from the training features (fixed seed) to use as the
# background data handed to the explainer.
background = shap.sample(X_train, 100, random_state=42)
# NOTE(review): shap.Explainer picks the explanation algorithm itself; which
# one it selects for this model presumably depends on the shap version — confirm.
explainer = shap.Explainer(model, background)
# Explain a 100-row slice of the test features.
shap_values = explainer(X_test[0:100])


# Absolute Mean SHAP
This plot shows the contribution of each feature globally and ranks them

In [None]:
class_names = label_encoder.classes_
print("Classes:", class_names)

# One global-importance bar plot per target class. Iterating over class_names
# (rather than a fixed count) keeps the loop in step with however many classes
# the label encoder actually found.
for class_idx, class_name in enumerate(class_names):
    # Extract SHAP values for this specific class
    shap_values_class = shap_values[:, :, class_idx]

    # Create bar plot for this class
    fig, ax = plt.subplots(figsize=(16, 6))  # new figure and axis
    # show=False keeps the figure open so the title can be added before display
    shap.plots.bar(shap_values_class, show=False)
    ax.set_title(f'{class_name}', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()


# Waterfall Plot
This plot shows the contribution of each feature, moving it from the baseline value, to the prediction value

# Force Plot
This is a similar perspective, in a different format. The force plot is shown with `display` (`from IPython.display import display`).

In [None]:
# Index of the explained row to inspect (within the rows passed to the explainer).
observation = 0

# One waterfall plot and one force plot per target class for that row. The
# number of classes is taken from class_names rather than hard-coded.
for class_idx in range(len(class_names)):
    # Extract SHAP values for this specific class and observation
    shap_values_class_observation = shap_values[observation, :, class_idx]

    # show=False keeps the figure open so titles and size can be adjusted
    shap.plots.waterfall(shap_values_class_observation, max_display=5, show=False)

    fig = plt.gcf()   # get the current figure
    ax = plt.gca()    # get the current axes

    ax.set_title(f'{class_names[class_idx]}', fontsize=12, fontweight='bold')
    ax.set_xlabel("Feature contribution to prediction")
    fig.suptitle("SHAP Waterfall Explanation", fontsize=16, fontweight="bold", y=1.03)
    fig.set_size_inches(16, 6)

    plt.tight_layout()
    plt.show()

    display(shap.plots.force(shap_values_class_observation))
