# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the hotel bookings dataset from the CSV file in the working directory.
raw_data = pd.read_csv('hotel_bookings.csv')

# Preview a random handful of rows. The fixed seed keeps the preview identical
# on every Restart & Run All, so the surrounding narrative stays in sync.
raw_data.sample(8, random_state=42)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
23624,Resort Hotel,0,12,2016,April,18,27,0,3,2,...,No Deposit,385.0,,0,Transient-Party,85.33,0,0,Check-Out,30-04-16
74205,City Hotel,1,363,2015,October,42,15,0,2,2,...,Non Refund,1.0,,0,Contract,62.0,0,0,Canceled,01-01-15
26620,Resort Hotel,0,1,2016,August,32,5,0,1,1,...,No Deposit,,,0,Transient,172.0,0,0,Check-Out,06-08-16
110400,City Hotel,0,4,2017,April,17,27,0,3,2,...,No Deposit,9.0,,0,Transient,0.0,0,2,Check-Out,30-04-17
23148,Resort Hotel,0,5,2016,April,15,9,2,5,1,...,No Deposit,40.0,,0,Contract,51.0,0,0,Check-Out,16-04-16
64674,City Hotel,1,37,2017,March,11,15,0,2,2,...,Non Refund,378.0,,0,Transient,110.0,0,0,Canceled,06-02-17
62194,City Hotel,1,86,2017,January,1,4,0,3,2,...,No Deposit,9.0,,0,Transient,79.2,0,2,Canceled,12-12-16
86413,City Hotel,0,4,2016,April,14,2,0,1,2,...,No Deposit,9.0,,0,Transient,99.0,0,1,Check-Out,03-04-16


## 1. Data Collection


In [3]:
# Summarise every column: name, non-null count and dtype, plus the row count.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

- There are 32 columns in the dataset.
- The dataset contains 119,390 rows.
- Data Types:
    1. 16 columns are of the `int64` data type.
    2. 4 columns are of the `float64` data type.
    3. 12 columns are of the `object` data type.
- Missing Values:
    - The column `children` has <span style="color:red">4</span> missing values.
    - The column `country` has <span style="color:red">488</span> missing values.
    - The column `agent` has <span style="color:red">16,340</span> missing values.
    - The column `company` has <span style="color:red">112,593</span> missing values.


## 2. Exploratory Data Analysis (EDA)

### 2.1 Looking for missing values.

In [4]:
# Count the missing values in each column.
raw_data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

- Dropping rows with null values is easy, but it throws away otherwise usable data, so we start by replacing the missing values with zero instead.

In [5]:
# Replace every missing value with 0. The result is rebound instead of using
# inplace=True so the cell is a plain "input -> output" step.
# NOTE(review): this blanket fill also writes 0 into `country` (488 missing
# values), which is presumably a text column — confirm that a numeric 0 is an
# acceptable placeholder there, or fill it with a string sentinel instead.
raw_data = raw_data.fillna(0)

### 2.2 Cleaning Data

In [6]:
# Summary statistics for the numeric columns, transposed so that each column
# of the dataset becomes one row of the table.
raw_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
is_canceled,119390.0,0.370416,0.482918,0.0,0.0,0.0,1.0,1.0
lead_time,119390.0,104.011416,106.863097,0.0,18.0,69.0,160.0,737.0
arrival_date_year,119390.0,2016.156554,0.707476,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,119390.0,27.165173,13.605138,1.0,16.0,28.0,38.0,53.0
arrival_date_day_of_month,119390.0,15.798241,8.780829,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,119390.0,0.927599,0.998613,0.0,0.0,1.0,2.0,19.0
stays_in_week_nights,119390.0,2.500302,1.908286,0.0,1.0,2.0,3.0,50.0
adults,119390.0,1.856403,0.579261,0.0,2.0,2.0,2.0,55.0
children,119390.0,0.103886,0.398555,0.0,0.0,0.0,0.0,10.0
babies,119390.0,0.007949,0.097436,0.0,0.0,0.0,0.0,10.0


In [7]:
# Display rows with negative adr values (this notebook treats them as invalid).
negative_adr_rows = raw_data[raw_data['adr'] < 0]
display(negative_adr_rows)

# Drop those rows by index label. The result is rebound rather than dropped
# with inplace=True, so the frame object that earlier cells displayed is not
# silently modified.
raw_data = raw_data.drop(index=negative_adr_rows.index)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
14969,Resort Hotel,0,195,2017,March,10,5,4,6,2,...,No Deposit,273.0,0.0,0,Transient-Party,-6.38,0,0,Check-Out,15-03-17


### The following will be dropped from the dataset:

1.  Rows containing negative values in the `adr` column.
2.  The `distribution_channel` column, as it depicts the same information as the `market_segment` column.
3.  The `arrival_date_week_number` column, as the week number can be derived from the arrival date (`arrival_date_year`, `arrival_date_month`, and `arrival_date_day_of_month`).
4.  Rows where both the `children` and `adults` columns contain zero.

In [8]:
def clean_dataframe(data):
    """
    Return a cleaned copy of a bookings DataFrame without modifying the input.

    The following are removed:
        1. Rows whose 'adr' value is negative.
        2. The 'distribution_channel' column, if present.
        3. The 'arrival_date_week_number' column, if present.
        4. Rows where both 'children' and 'adults' are zero.

    Args:
        data: The pandas DataFrame to clean. It must contain the 'adr',
            'children' and 'adults' columns; the two columns dropped in
            steps 2 and 3 are optional.

    Returns:
        A new pandas DataFrame with the rows and columns above removed.
        The original index labels of the remaining rows are preserved.

    Raises:
        KeyError: If 'adr', 'children' or 'adults' is missing from `data`.
    """

    # 1. Drop rows with negative 'adr' values. A non-inplace drop is used so
    #    that the caller's DataFrame is left untouched.
    cleaned = data.drop(index=data.index[data['adr'] < 0])

    # 2./3. Drop the redundant columns; errors='ignore' skips any that are
    #       already absent, so the function can safely be re-run on its output.
    cleaned = cleaned.drop(
        columns=['distribution_channel', 'arrival_date_week_number'],
        errors='ignore',
    )

    # 4. Drop rows where both 'children' and 'adults' are zero.
    no_adults_or_children = (cleaned['children'] == 0) & (cleaned['adults'] == 0)
    return cleaned[~no_adults_or_children]


# Apply the cleaning rules and rebind raw_data to the cleaned result.
raw_data = clean_dataframe(raw_data)

In [9]:
# Inspect the cleaned frame (pandas shows only the first and last rows).
raw_data

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,1,0,0,2,0.0,...,No Deposit,0.0,0.0,0,Transient,0.00,0,0,Check-Out,01-07-15
1,Resort Hotel,0,737,2015,July,1,0,0,2,0.0,...,No Deposit,0.0,0.0,0,Transient,0.00,0,0,Check-Out,01-07-15
2,Resort Hotel,0,7,2015,July,1,0,1,1,0.0,...,No Deposit,0.0,0.0,0,Transient,75.00,0,0,Check-Out,02-07-15
3,Resort Hotel,0,13,2015,July,1,0,1,1,0.0,...,No Deposit,304.0,0.0,0,Transient,75.00,0,0,Check-Out,02-07-15
4,Resort Hotel,0,14,2015,July,1,0,2,2,0.0,...,No Deposit,240.0,0.0,0,Transient,98.00,0,1,Check-Out,03-07-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,30,2,5,2,0.0,...,No Deposit,394.0,0.0,0,Transient,96.14,0,0,Check-Out,06-09-17
119386,City Hotel,0,102,2017,August,31,2,5,3,0.0,...,No Deposit,9.0,0.0,0,Transient,225.43,0,2,Check-Out,07-09-17
119387,City Hotel,0,34,2017,August,31,2,5,2,0.0,...,No Deposit,9.0,0.0,0,Transient,157.71,0,4,Check-Out,07-09-17
119388,City Hotel,0,109,2017,August,31,2,5,2,0.0,...,No Deposit,89.0,0.0,0,Transient,104.40,0,0,Check-Out,07-09-17


### 2.3 Feature Selection

#### Why Feature Selection Might Be Beneficial:

* **Improved LLM Performance:** LLMs can be sensitive to irrelevant or redundant information. Feature selection can help focus the LLM on the most important data points, potentially improving accuracy and relevance.

* **Reduced Dimensionality:** If your dataset has a large number of features, feature selection can reduce the dimensionality of the data. This can lead to:

    1.  Faster vector embedding generation and retrieval.
    2.  Reduced storage requirements for your vector database.
    3.  Improved efficiency of your API.