In [110]:
# import pandas

import pandas as pd

In [98]:
# Read in the dataset
hotelBookings = pd.read_csv("hotel_bookings.csv")

In [99]:
# First look at the dataset
hotelBookings

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


### List Columns


Using the `.columns` attribute to display an `Index` object containing the name of all the columns.<br>

*The `Index` object looks like a list but **IT IS NOT** a list* <br>

Pandas uses Index objects because they provide:<br>

- Immutability (you can’t accidentally change it)<br>

- Fast lookups<br>

- Alignment behavior when joining/merging<br>

- Set-like operations (intersection, difference, union)<br>

- Ability to be used as row labels and column labels<br>

A normal list can’t do this.

See __[here](https://pandas.pydata.org/docs/reference/api/pandas.Index.html)__




In [100]:
hotelBookings.columns

# OPTIONAL: Use a for loop to display column names in a more organized manner:

for eachColumn in hotelBookings.columns:
    print(eachColumn)

hotel
is_canceled
lead_time
arrival_date_year
arrival_date_month
arrival_date_week_number
arrival_date_day_of_month
stays_in_weekend_nights
stays_in_week_nights
adults
children
babies
meal
country
market_segment
distribution_channel
is_repeated_guest
previous_cancellations
previous_bookings_not_canceled
reserved_room_type
assigned_room_type
booking_changes
deposit_type
agent
company
days_in_waiting_list
customer_type
adr
required_car_parking_spaces
total_of_special_requests
reservation_status
reservation_status_date


### Drop/Rename Columns

To drop columns we will use the `.drop()` function that, for this exercise, will take two parameters:<br>
    - `columns`, that specify the columns to be dropped <br>
    - `inplace`, that, when set to `True`, applies the drop directly to the existing DataFrame, while `inplace=False` returns a new DataFrame and leaves the original untouched.<br>

To rename columns we will use the `.rename()` function that, for this exercise, will take the following parameters:<br>
    - a dictionary where each **key** is the original column name and each **value** is the new column name.<br>
    <br>
      `df.rename({'OldName': 'NewName'})`.<br>
    <br>
    - `axis=1` that specifies whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’).<br>
    - `inplace=True`... 

In [101]:
hotelBookings.drop(columns=['required_car_parking_spaces'], inplace=True)
hotelBookings.rename({'adults': 'count_of_adults',
                      'children': 'count_of_children',
                      'babies': 'count_of_babies'}, axis=1, inplace=True)
hotelBookings.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,count_of_adults,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,3,No Deposit,,,0,Transient,0.0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,4,No Deposit,,,0,Transient,0.0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,No Deposit,,,0,Transient,75.0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,No Deposit,304.0,,0,Transient,75.0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,No Deposit,240.0,,0,Transient,98.0,1,Check-Out,2015-07-03


### Handling missing values (NaNs)

- Check what columns have `NaN` or `null` values;
- Check percentage of `null` values in each column;
- Make a determination of:
    1. which columns should be dropped based on the amount of `null` values in them using `.dropna()`;
    2. which columns could be filled in based on values from other columns using the `.unique()` function;
    3. Single out one unique value using comparison and look for similarities in columns;
- If columns can be filled in: GREAT! Else, use the `fillna()` method to replace `NaN` with a placeholder: `None`, for string values, `pd.NA` for boolean values and `-1` for numerical values.
- For columns with a small amount of `NaN` values, like the `count_of_children` column, use `isna()` to isolate the rows containing `NaN` values.

In [102]:
# OPTIONAL: Display as percentage => BE CAREFUL BECAUSE THIS WILL RETURN A STRING, THEREFORE UNABLE TO PERFORM CALCULATIONS
((hotelBookings.isnull().sum() * 100 / len(hotelBookings)) # AI Generated Code
    .round(2) # rounds two decimal points
    .astype(str) + '%') # converts it into string so it can display '%'

hotelBookings.isnull().sum() * 100 / len(hotelBookings)


hotel                              0.000000
is_canceled                        0.000000
lead_time                          0.000000
arrival_date_year                  0.000000
arrival_date_month                 0.000000
arrival_date_week_number           0.000000
arrival_date_day_of_month          0.000000
stays_in_weekend_nights            0.000000
stays_in_week_nights               0.000000
count_of_adults                    0.000000
count_of_children                  0.003350
count_of_babies                    0.000000
meal                               0.000000
country                            0.408744
market_segment                     0.000000
distribution_channel               0.000000
is_repeated_guest                  0.000000
previous_cancellations             0.000000
previous_bookings_not_canceled     0.000000
reserved_room_type                 0.000000
assigned_room_type                 0.000000
booking_changes                    0.000000
deposit_type                    

In [103]:
# Checking unique values in a certain column and using a placeholder to replace null values

hotelBookings['agent'].unique()

hotelBookings[hotelBookings['agent'] == 5]

# hotelBookings['agent'].fillna(-1, inplace=True) #inplace=True at the end of the chain does not modify the original dataframe.
# FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
# The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

# For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.

hotelBookings['agent'] = hotelBookings['agent'].fillna(-1) #AI Generated Code

display(hotelBookings)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,count_of_adults,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,3,No Deposit,-1.0,,0,Transient,0.00,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,4,No Deposit,-1.0,,0,Transient,0.00,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,No Deposit,-1.0,,0,Transient,75.00,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,No Deposit,304.0,,0,Transient,75.00,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,No Deposit,240.0,,0,Transient,98.00,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,0,No Deposit,394.0,,0,Transient,96.14,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,0,No Deposit,9.0,,0,Transient,225.43,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,0,No Deposit,9.0,,0,Transient,157.71,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,0,No Deposit,89.0,,0,Transient,104.40,0,Check-Out,2017-09-07


In [104]:
# Isolating columns with a few rows containing null values

hotelBookings[hotelBookings['count_of_children'].isna()] # There are four rows => these rows can be dropped

hotelBookings.dropna(subset=['count_of_children'], inplace=True) 

display(hotelBookings)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,count_of_adults,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,3,No Deposit,-1.0,,0,Transient,0.00,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,4,No Deposit,-1.0,,0,Transient,0.00,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,No Deposit,-1.0,,0,Transient,75.00,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,No Deposit,304.0,,0,Transient,75.00,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,No Deposit,240.0,,0,Transient,98.00,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,0,No Deposit,394.0,,0,Transient,96.14,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,0,No Deposit,9.0,,0,Transient,225.43,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,0,No Deposit,9.0,,0,Transient,157.71,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,0,No Deposit,89.0,,0,Transient,104.40,0,Check-Out,2017-09-07


In [105]:
hotelBookings[hotelBookings['country'].isna()] # Same concept as above... In this case there are 488 rows. Too much to be excluded. 

hotelBookings[hotelBookings['country'].isna()][['hotel', 'is_canceled', 'country']] # Filter rows where 'country' is missing and show only the hotel, cancellation status, and country columns 

# hotelBookings['country'].fillna('Unknown', inplace=True)

hotelBookings['country'] = hotelBookings['country'].fillna('Unknown')
 

In [106]:
hotelBookings.drop(columns=['company'], inplace=True)

### Investigate Data Types 

- Use `.dtypes` attribute to investigate the types of data in each column.
- Change the types of data accordingly assiging the df to the result of the `astype()` function.
- `astype` will take a dictionary where each **key** is the column containing the wrong data type and each **value** is the new data type for that column.

In [107]:
hotelBookings.dtypes

hotelBookings = hotelBookings.astype({'is_canceled': 'boolean',
                                      'is_repeated_guest': 'boolean',
                                      'count_of_children': 'int'
                                      })
hotelBookings.dtypes


hotel                              object
is_canceled                       boolean
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
count_of_adults                     int64
count_of_children                   int32
count_of_babies                     int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                 boolean
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

### Bin Columns

Group columns using `pd.cut` function to group values. It requires the "bins" and the "labels" for each group. <br>
This function will lookup the value in a column and assign a value based on which bin it falls under

In [108]:
# Check Unique values:
hotelBookings['lead_time'].unique()

# Use Describe to check for min and max values for a certain column:
hotelBookings['lead_time'].describe()

# Create bins by especifying the edges o the bin:
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800]

# Create labels to specify ranges:
labels = ['0-100', '101-200', '201-300', '301-400', '401-500', '501-600', '601-700', '701-800']

# Use pd.cut to assign value to a new column:
hotelBookings['lead_time_binned'] = pd.cut(hotelBookings['lead_time'], bins=bins, labels=labels)

# Print outputs:
hotelBookings[['lead_time', 'lead_time_binned']]

Unnamed: 0,lead_time,lead_time_binned
0,342,301-400
1,737,701-800
2,7,0-100
3,13,0-100
4,14,0-100
...,...,...
119385,23,0-100
119386,102,101-200
119387,34,0-100
119388,109,101-200


### Separate Columns

Split a column into two columns using a delimiter using the `.split()` function. It returns an `array`.<br>
The `.split()` function will take two arguments in this case: the character to be used as a delimiter and `expand=True`.<br>
`expand=True` allocate each portion of the split string into a new column, which makes it easier to manipulate.<br>
`[0]` returns the first items of the array that results from this operation, in thsi case, the month.



In [109]:
# Create appropriate columns
# hotelBookings['arrival_date_month'] = hotelBookings['arrival_date'].str.split('-', expand=True)[0]
# hotelBookings['arrival_date_year'] = hotelBookings['arrival_date'].str.split('-', expand=True)[1]
# hotelBookings.head()

# This is not making any sense