In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the raw hotel bookings CSV; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer=r'hotel_bookings.csv')

In [3]:
# Split the bookings by hotel type. Each subset is an independent copy, so
# changes made to `df` afterwards are not reflected in it (and vice versa).
# NOTE: the split is taken before the derived columns are added to `df`
# further down, so these two frames contain only the columns loaded from the CSV.
resort_hotel = df.loc[df['hotel'].eq('Resort Hotel')].copy()
city_hotel = df.loc[df['hotel'].eq('City Hotel')].copy()

# Sanity check: report (rows, columns) of each subset
print(
    f"Resort Hotel Shape: {resort_hotel.shape}",
    f"City Hotel Shape:   {city_hotel.shape}",
    sep="\n",
)

Resort Hotel Shape: (40060, 32)
City Hotel Shape:   (79330, 32)


In [4]:
# Derive 'is_family': 1 when the booking includes at least one child or baby, else 0.
# Missing 'children' values are treated as zero children before the comparison.
df['children'] = df['children'].fillna(0)
df['is_family'] = df[['children', 'babies']].gt(0).any(axis=1).astype(int)

In [5]:
# Derive 'date_of_arrival'.
# Year (int), month name (str) and day of month (int) are joined into strings
# such as '2015-July-1' and parsed into a single datetime column.
# The format is passed explicitly so parsing does not depend on pandas guessing
# it from the data, and so the slow per-element fallback parser is never used:
#   %Y = four-digit year, %B = full month name, %d = day of month.
df['date_of_arrival'] = pd.to_datetime(
    df['arrival_date_year'].astype(str) + '-' +
    df['arrival_date_month'] + '-' +
    df['arrival_date_day_of_month'].astype(str),
    format='%Y-%B-%d',
)

In [6]:
# Derive 'duration_of_stay': total nights booked = weekend nights + week nights
df['duration_of_stay'] = df['stays_in_weekend_nights'].add(df['stays_in_week_nights'])

In [7]:
# Month name -> Northern Hemisphere season lookup, built from one
# (season, months) pair per season.
season_mapping = {
    month: season
    for season, months in (
        ('Winter', ('December', 'January', 'February')),
        ('Spring', ('March', 'April', 'May')),
        ('Summer', ('June', 'July', 'August')),
        ('Autumn', ('September', 'October', 'November')),
    )
    for month in months
}

# Derive 'season_of_booking' from the arrival month.
# Note: despite the column name, the season reflects the month of arrival
# (arrival_date_month). Month names missing from the mapping become NaN.
df['season_of_booking'] = df['arrival_date_month'].map(season_mapping)


In [8]:
# Number of bookings per season (most frequent first)
season_counts = df['season_of_booking'].value_counts()
print("Counts per season:", season_counts, sep="\n")

# Share of bookings per season, expressed as a percentage of all non-null rows
season_percentages = df['season_of_booking'].value_counts(normalize=True).mul(100)
print("\nPercentage per season:", season_percentages, sep="\n")

Counts per season:
season_of_booking
Summer    37477
Spring    32674
Autumn    28462
Winter    20777
Name: count, dtype: int64

Percentage per season:
season_of_booking
Summer    31.390401
Spring    27.367451
Autumn    23.839518
Winter    17.402630
Name: proportion, dtype: float64


In [9]:
# Columns to discard: the raw inputs of the derived features (children/babies,
# the two stay-night counts, the arrival year/month/day parts) plus other
# columns this analysis drops.
columns_to_remove = [
    'company',
    'days_in_waiting_list',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'arrival_date_week_number',
    'children',
    'babies',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
]

# Drop by reassignment rather than inplace=True, and ignore columns that are
# already gone, so re-running this cell is a no-op instead of raising KeyError.
# Trade-off: errors='ignore' also silences a misspelled name in the list above.
df = df.drop(columns=columns_to_remove, errors='ignore')