**Step 1: Import necessary libraries and load the data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw hotel-bookings CSV into `df` and report its (rows, columns)
# shape as a quick sanity check that the load succeeded.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab mount);
# adjust it when running in another environment.
df = pd.read_csv(filepath_or_buffer='/content/hotel_bookings.csv')
print(f"Dataset shape: {df.shape}")

Dataset shape: (7324, 32)


**Step 2: Initial data exploration**

In [None]:
# First look at the raw frame: schema, a sample of rows, summary statistics,
# and per-column missing-value counts.

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Show the first few rows
print(df.head())

# Display summary statistics
print(df.describe())

# Count missing values per column
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7324 entries, 0 to 7323
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           7324 non-null   object 
 1   is_canceled                     7324 non-null   int64  
 2   lead_time                       7324 non-null   int64  
 3   arrival_date_year               7324 non-null   int64  
 4   arrival_date_month              7324 non-null   object 
 5   arrival_date_week_number        7323 non-null   float64
 6   arrival_date_day_of_month       7323 non-null   float64
 7   stays_in_weekend_nights         7323 non-null   float64
 8   stays_in_week_nights            7323 non-null   float64
 9   adults                          7323 non-null   float64
 10  children                        7323 non-null   float64
 11  babies                          7323 non-null   float64
 12  meal                            73

**Step 3: Handle missing values**

In [None]:
# Snapshot of per-column null counts before any imputation
print("Missing values before handling:")
print(df.isnull().sum())

# Column -> replacement value. 'children' gets a numeric 0, and 'country'
# gets an explicit 'Unknown' category. Only these two columns are imputed;
# every other column is left as-is.
fill_values = {'children': 0, 'country': 'Unknown'}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

# Same null-count report again so the effect of the fills is visible
print("\nMissing values after handling:")
print(df.isnull().sum())

Missing values before handling:
hotel                                0
is_canceled                          0
lead_time                            0
arrival_date_year                    0
arrival_date_month                   0
arrival_date_week_number             1
arrival_date_day_of_month            1
stays_in_weekend_nights              1
stays_in_week_nights                 1
adults                               1
children                             1
babies                               1
meal                                 1
country                              4
market_segment                       1
distribution_channel                 1
is_repeated_guest                    1
previous_cancellations               1
previous_bookings_not_canceled       1
reserved_room_type                   1
assigned_room_type                   1
booking_changes                      1
deposit_type                         1
agent                             1157
company                         

**Step 4: Remove duplicates**

In [None]:
# Count rows that are exact repeats of an earlier row (all columns compared)
duplicate_count = df.duplicated().sum()
print(f"Number of duplicates: {duplicate_count}")

# Only rebuild the frame (and report the new shape) when there is something
# to drop; a zero count leaves `df` untouched.
if duplicate_count:
    df = df.drop_duplicates()
    print(f"Shape after removing duplicates: {df.shape}")



Number of duplicates: 0


**Step 5: Correct data types**

In [None]:
# Report the column dtypes as loaded
print("Data types before conversion:")
print(df.dtypes)

# Parse 'reservation_status_date' into datetime values
# (format is inferred by pandas; no explicit format is given here)
df['reservation_status_date'] = df['reservation_status_date'].pipe(pd.to_datetime)

# Force 'adr' to 64-bit float
df['adr'] = df['adr'].astype('float64')

# Report the dtypes again to confirm the conversions took effect
print("\nData types after conversion:")
print(df.dtypes)

Data types before conversion:
hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number          float64
arrival_date_day_of_month         float64
stays_in_weekend_nights           float64
stays_in_week_nights              float64
adults                            float64
children                          float64
babies                            float64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                 float64
previous_cancellations            float64
previous_bookings_not_canceled    float64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                   float64
deposit_type                       object
agen

**Step 6: Handle outliers**

In [None]:
# Remove 'adr' outliers with the 1.5 * IQR rule: keep only rows whose value
# lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds inclusive.
# NOTE: the bounds are derived from the current contents of `df`, so running
# this cell a second time recomputes them on already-filtered data and may
# trim further rows. Run it once per fresh load.
Q1 = df['adr'].quantile(0.25)
Q3 = df['adr'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("ADR statistics before handling outliers:")
print(df['adr'].describe())

# between() is inclusive on both ends and evaluates to False for NaN, so rows
# with a missing 'adr' are dropped here too (same as the explicit >= / <=
# comparison). .copy() makes the result an independent frame so that later
# column assignments on `df` do not raise SettingWithCopyWarning.
df = df[df['adr'].between(lower_bound, upper_bound)].copy()

print("\nADR statistics after handling outliers:")
print(df['adr'].describe())

ADR statistics before handling outliers:
count    5487.000000
mean       97.091881
std        54.241078
min         0.000000
25%        56.000000
50%        85.000000
75%       130.150000
max       332.000000
Name: adr, dtype: float64

ADR statistics after handling outliers:
count    5426.000000
mean       95.244095
std        51.623025
min         0.000000
25%        56.000000
50%        85.000000
75%       128.705000
max       241.000000
Name: adr, dtype: float64


**Step 7: Create derived total columns**

In [None]:
# Derive per-booking totals from their component columns:
#   total_guests  = adults + children + babies
#   stay_duration = stays_in_weekend_nights + stays_in_week_nights
# assign() returns a new frame rather than writing into the existing one,
# which avoids SettingWithCopyWarning when `df` is the result of an earlier
# row filter. A NaN in any component propagates to the corresponding total.
df = df.assign(
    total_guests=df['adults'] + df['children'] + df['babies'],
    stay_duration=df['stays_in_weekend_nights'] + df['stays_in_week_nights'],
)

print("New columns added:")
print(df[['total_guests', 'stay_duration']].describe())

New columns added:
       total_guests  stay_duration
count   5426.000000    5426.000000
mean       2.146701       4.964615
std        1.600992       3.477146
min        0.000000       0.000000
25%        2.000000       2.000000
50%        2.000000       4.000000
75%        2.000000       7.000000
max       55.000000      46.000000


**Step 8: Final check and save cleaned data**

In [None]:
# Final check for missing values (per-column null counts)
print("Final check for missing values:")
print(df.isnull().sum())

# Display final dataset info.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df.info()) would append a stray "None" to the output.
print("\nFinal dataset info:")
df.info()

# Save the cleaned dataset to the working directory, without the row index
df.to_csv('cleaned_hotel_booking.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_hotel_booking.csv'")

Final check for missing values:
hotel                                0
is_canceled                          0
lead_time                            0
arrival_date_year                    0
arrival_date_month                   0
arrival_date_week_number             0
arrival_date_day_of_month            0
stays_in_weekend_nights              0
stays_in_week_nights                 0
adults                               0
children                             0
babies                               0
meal                                 0
country                              0
market_segment                       0
distribution_channel                 0
is_repeated_guest                    0
previous_cancellations               0
previous_bookings_not_canceled       0
reserved_room_type                   0
assigned_room_type                   0
booking_changes                      0
deposit_type                         0
agent                              715
company                         