# 1. Data Preparation

This notebook covers the initial data preparation steps:
1.  Loading the dataset.
2.  Inspecting the data for quality issues.
3.  Cleaning the data (handling missing values, duplicates).
4.  Performing feature engineering.
5.  Saving the cleaned dataset for further analysis.


In [None]:
# Essential libraries for data manipulation
import pandas as pd
import numpy as np

# Show every column when a dataframe is rendered (no column truncation)
pd.options.display.max_columns = None

# Read the raw bookings CSV into the working dataframe `df`
print("Loading 'Hotel_bookings_final.csv'...")
df = pd.read_csv(filepath_or_buffer='Hotel_bookings_final.csv')
print("Dataset loaded successfully.")

# --- Initial Inspection ---
# Column names, dtypes and non-null counts
print("\n--- Data Overview ---")
df.info()

# Rich (HTML) rendering of the first five records
print("\n--- First 5 Rows ---")
display(df.head(5))


In [None]:
# --- Data Cleaning ---
# Removes rows containing missing values, then exact duplicate rows, and
# reports how many rows each step discards so the impact is visible.

# 1. Handling Missing Values
print("--- Missing Value Check ---")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# Rows with a null in ANY column are dropped. That is only reasonable while
# few rows are affected, so the removed count is printed for the reader to
# confirm instead of being assumed.
if missing_values.sum() > 0:  # reuse the per-column counts computed above
    rows_before_dropna = len(df)
    # Rebind rather than mutate with inplace=True, so the step reads as a
    # plain transformation of `df`.
    df = df.dropna()
    print("\nRows with missing values have been removed.")
    print(f"Removed {rows_before_dropna - len(df)} of {rows_before_dropna} rows.")

# 2. Handling Duplicates
print("\n--- Duplicate Row Check ---")
# duplicated() flags every repeat of an earlier, fully identical row
duplicate_rows = df.duplicated().sum()
print(f"Found {duplicate_rows} duplicate rows.")

if duplicate_rows > 0:
    # Keeps the first occurrence of each duplicated row (pandas default)
    df = df.drop_duplicates()
    print("Duplicate rows have been removed.")

print(f"\nShape of the dataframe after cleaning: {df.shape}")


In [None]:
# --- Feature Engineering ---
# Derive calendar and revenue columns from the existing booking fields.

# Parse 'booking_date' only when the column is not already datetime-typed
booking_dates = df['booking_date']
if not pd.api.types.is_datetime64_any_dtype(booking_dates):
    df['booking_date'] = pd.to_datetime(booking_dates)

# Calendar components for seasonal analysis.
# Maps each new column to the `.dt` attribute it is read from;
# pandas' dayofweek numbers Monday as 0 through Sunday as 6.
print("Extracting date components (year, month, day of week)...")
date_parts = {
    'booking_year': 'year',
    'booking_month': 'month',
    'booking_day_of_week': 'dayofweek',
}
for new_column, dt_attribute in date_parts.items():
    df[new_column] = getattr(df['booking_date'].dt, dt_attribute)

# Booking value = average daily rate x length of stay (for profitability analysis)
print("Calculating 'total_booking_value'...")
df['total_booking_value'] = df['avg_daily_rate'].mul(df['length_of_stay'])

print("\nFeature engineering complete. Preview of new columns:")
preview_columns = [
    'booking_date',
    'booking_year',
    'booking_month',
    'booking_day_of_week',
    'total_booking_value',
]
display(df[preview_columns].head())


In [None]:
# --- Save Cleaned Data ---
# Write the prepared frame to its own CSV so that the cleaning process
# stays separate from the analysis phase.

output_filename = 'hotel_bookings_cleaned.csv'
# index=False: the row index is not written as a column
df.to_csv(path_or_buf=output_filename, index=False)

# One print call, one message per line (same output as three separate prints)
print(
    "Data preparation complete.",
    f"Cleaned and feature-engineered data saved to '{output_filename}'.",
    f"Final shape: {df.shape}",
    sep="\n",
)
