# Load Datasets

In [1]:
import numpy as np
import pandas as pd

# Directory that holds the three CSV files of the dataset. Keeping it in one
# place means the notebook can be pointed at another location by editing a
# single line instead of three.
DATA_DIR = '/kaggle/input/international-hotel-booking-analytics'

# Raw inputs; they are only read here and combined later by clean_data().
users = pd.read_csv(f'{DATA_DIR}/users.csv')
hotels = pd.read_csv(f'{DATA_DIR}/hotels.csv')
reviews = pd.read_csv(f'{DATA_DIR}/reviews.csv')

# Data Cleaning

In [5]:
# Default mapping from a user's country to a coarse region label.
# Countries that are not listed here are labelled 'Other' by clean_data().
COUNTRY_TO_GROUP = {
    'United States': 'North_America',
    'Canada': 'North_America',
    'Germany': 'Western_Europe',
    'France': 'Western_Europe',
    'United Kingdom': 'Western_Europe',
    'Netherlands': 'Western_Europe',
    'Spain': 'Western_Europe',
    'Italy': 'Western_Europe',
    'Russia': 'Eastern_Europe',
    'China': 'East_Asia',
    'Japan': 'East_Asia',
    'South Korea': 'East_Asia',
    'Thailand': 'Southeast_Asia',
    'Singapore': 'Southeast_Asia',
    'United Arab Emirates': 'Middle_East',
    'Turkey': 'Middle_East',
    'Egypt': 'Africa',
    'Nigeria': 'Africa',
    'South Africa': 'Africa',
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Brazil': 'South_America',
    'Argentina': 'South_America',
    'India': 'South_Asia',
    'Mexico': 'North_America_Mexico'
}

# Identifier, free-text, date and location columns removed from the result.
_DROPPED_COLUMNS = [
    'user_id', 'hotel_id', 'review_id', 'join_date', 'lat', 'lon',
    'hotel_country', 'user_country', 'hotel_name', 'review_text', 'review_date'
]


def clean_data(hotels, reviews, users, country_to_group=None):
    """Merge the three raw tables and return one cleaned, encoded DataFrame.

    Steps, in order:
      1. Inner-join ``hotels`` with ``reviews`` on ``hotel_id`` and the result
         with ``users`` on ``user_id``.
      2. Parse ``review_date`` and ``join_date`` as datetimes.
      3. Drop fully duplicated rows, fill numeric NaNs with the column mean
         and missing text values with ``'Unknown'``.
      4. Derive ``country_group`` from the user's country.
      5. Encode ``user_gender`` as Male=0, Female=1, anything else=-1.
      6. One-hot encode ``traveller_type``, ``age_group`` and ``city``
         (``drop_first=True``, so one level per column is the baseline).
      7. Drop the columns listed in ``_DROPPED_COLUMNS`` when present.

    Parameters
    ----------
    hotels, reviews, users : pandas.DataFrame
        Raw tables. They are not modified.
    country_to_group : dict, optional
        Mapping from user country to region label. Defaults to
        ``COUNTRY_TO_GROUP``. Unmapped countries become ``'Other'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. ``country_group`` is left as a text column.

    Raises
    ------
    KeyError
        If a column that is accessed directly (join keys, the two date
        columns, the user's country, ``user_gender`` or the one-hot encoded
        columns) is missing.
    """
    if country_to_group is None:
        country_to_group = COUNTRY_TO_GROUP

    # Both hotels and users carry a 'country' column. Give each its final name
    # before merging instead of depending on pandas' implicit '_x'/'_y'
    # suffixes, which only appear when the names actually collide.
    # rename() returns a new frame, so the caller's inputs stay untouched.
    hotels = hotels.rename(columns={'country': 'hotel_country'})
    users = users.rename(columns={'country': 'user_country'})

    # merge all datasets (inner joins: rows without a match are discarded)
    df = hotels.merge(reviews, on='hotel_id').merge(users, on='user_id')

    # convert to datetime
    # NOTE(review): both columns are dropped again below, so this conversion
    # currently only affects duplicate detection and validates the values.
    df['review_date'] = pd.to_datetime(df['review_date'])
    df['join_date'] = pd.to_datetime(df['join_date'])

    # clean NaNs and duplicates
    df = df.drop_duplicates()
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    text_cols = df.select_dtypes(include=['object']).columns
    df[text_cols] = df[text_cols].fillna('Unknown')

    # create the country_group column (must happen before user_country is dropped)
    df['country_group'] = df['user_country'].map(country_to_group).fillna('Other')

    # numeric encoding for gender; values other than Male/Female map to -1
    df['user_gender'] = df['user_gender'].map({'Male': 0, 'Female': 1}).fillna(-1)

    # one-hot encode categorical columns
    categorical_cols = ['traveller_type', 'age_group', 'city']
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # drop unnecessary columns; errors='ignore' tolerates inputs lacking some of them
    return df.drop(columns=_DROPPED_COLUMNS, errors='ignore')
# Build the modelling frame from the three raw tables loaded above.
df = clean_data(hotels=hotels, reviews=reviews, users=users)

In [8]:
# Preview the first five rows of the frame returned by clean_data().
df.head(n=5)

Unnamed: 0,star_rating,cleanliness_base,comfort_base,facilities_base,location_base,staff_base,value_for_money_base,score_overall,score_cleanliness,score_comfort,...,city_Paris,city_Rio de Janeiro,city_Rome,city_Seoul,city_Shanghai,city_Singapore,city_Sydney,city_Tokyo,city_Toronto,city_Wellington
0,5,9.1,8.8,8.9,9.5,8.6,8.0,8.7,8.6,8.7,...,False,False,False,False,False,False,False,False,False,False
1,5,9.1,8.8,8.9,9.5,8.6,8.0,8.7,8.8,8.9,...,False,False,False,False,False,False,False,False,False,False
2,5,9.1,8.8,8.9,9.5,8.6,8.0,8.8,8.3,8.6,...,False,False,False,False,False,False,False,False,False,False
3,5,9.1,8.8,8.9,9.5,8.6,8.0,8.8,9.3,9.4,...,False,False,False,False,False,False,False,False,False,False
4,5,9.1,8.8,8.9,9.5,8.6,8.0,8.7,8.2,8.8,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# List every column name so the one-hot encoded features can be inspected.
list(df.columns)

['star_rating',
 'cleanliness_base',
 'comfort_base',
 'facilities_base',
 'location_base',
 'staff_base',
 'value_for_money_base',
 'score_overall',
 'score_cleanliness',
 'score_comfort',
 'score_facilities',
 'score_location',
 'score_staff',
 'score_value_for_money',
 'user_gender',
 'country_group',
 'traveller_type_Couple',
 'traveller_type_Family',
 'traveller_type_Solo',
 'age_group_25-34',
 'age_group_35-44',
 'age_group_45-54',
 'age_group_55+',
 'city_Bangkok',
 'city_Barcelona',
 'city_Berlin',
 'city_Buenos Aires',
 'city_Cairo',
 'city_Cape Town',
 'city_Dubai',
 'city_Istanbul',
 'city_Lagos',
 'city_London',
 'city_Mexico City',
 'city_Moscow',
 'city_Mumbai',
 'city_New York',
 'city_Paris',
 'city_Rio de Janeiro',
 'city_Rome',
 'city_Seoul',
 'city_Shanghai',
 'city_Singapore',
 'city_Sydney',
 'city_Tokyo',
 'city_Toronto',
 'city_Wellington']

# Data-Engineering Questions

# Predictive Modeling Task

In [None]:
# Look at the first five rows again before selecting model inputs.
df.head(n=5)

In [None]:
# NOTE(review): the previous `df['']` raised KeyError because no column is named ''.
# Assumes `country_group` (the only column left un-encoded by clean_data) is the
# prediction target — TODO confirm against the task statement.
TARGET_COL = 'country_group'
X = df.drop(columns=[TARGET_COL])  # feature matrix: every column except the target
y = df[TARGET_COL]                 # label vector

# Model Explainability