# Data Exploration - Hotel Bookings
Group #6: Allyson Vasquez, Alex Miller, Vena Khamvanthong, Mandev Doshi

This notebook explores our dataset to gain deeper insights in order to create meaningful visualizations.

In [16]:
import pandas as pd
import numpy as np
import altair as alt
import streamlit as st
from pandas_profiling import ProfileReport

Let's do some exploratory data analysis on our hotel_booking.csv file. This will help us to identify any patterns, relations, or cleaning that needs to be done.

In [17]:
# Read the raw bookings export into a DataFrame; the cells below work from `df`.
df = pd.read_csv(filepath_or_buffer='hotel_booking.csv')

# Optional quick looks, left disabled to keep the notebook output short:
#   df.head(10)  -> first 10 rows of the dataset
#   df.info()    -> column dtypes and non-null counts

We can observe that:
- There are 36 columns/attributes.
- There are 119,390 rows/entries.
- Our attributes are objects, integers, or floats.

Let's see if there is any missing data below.

In [18]:
# Count the missing (NaN) entries in every column of the raw data
df.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

Our dataset does contain missing values, specifically in:
- children (4 rows)
- country
- agent
- company

In [19]:
#Looking at the unique values for each column/attribute
#This output also gives us insight into which columns are quantitative and which are categorical
# NOTE: the loop below is wrapped in a triple-quoted string, so it is NOT executed.
# Because that string is the last expression in the cell, the notebook echoes it back as the cell output.

'''#NOTE: Missing data is nan in the dataset. needs to be cleaned/addressed with
for col in df.columns:
    print('{} : {}'.format(col,df[col].unique()))'''


"#NOTE: Missing data is nan in the dataset. needs to be cleaned/addressed with\nfor col in df.columns:\n    print('{} : {}'.format(col,df[col].unique()))"

We can see above that missing values in our dataset appear as 'nan'. We will address/clean these once we are done with our data exploration.

Let's also create a Profile Report below to see if we can make any other observations.

In [20]:
# Profile report generation is currently disabled; uncomment both lines to build the report and write it to an HTML file.
#profile = ProfileReport(df, title="Hotel Bookings Profile Report", minimal=True)
#profile.to_file("hotel_booking_report.html")

## Data Cleaning

In [21]:
# Discard the columns we will not use in the analysis.
# drop() returns a new frame, which is rebound to `df`.
df = df.drop(
    columns=[
        'agent',
        'company',
        'name',
        'email',
        'phone-number',
        'credit_card',
        'reservation_status_date',
        'reservation_status',
        'required_car_parking_spaces',
    ],
)

In [22]:
# Re-count missing values per column now that the unused columns are gone
df.isna().sum()

hotel                               0
is_canceled                         0
lead_time                           0
arrival_date_year                   0
arrival_date_month                  0
arrival_date_week_number            0
arrival_date_day_of_month           0
stays_in_weekend_nights             0
stays_in_week_nights                0
adults                              0
children                            4
babies                              0
meal                                0
country                           488
market_segment                      0
distribution_channel                0
is_repeated_guest                   0
previous_cancellations              0
previous_bookings_not_canceled      0
reserved_room_type                  0
assigned_room_type                  0
booking_changes                     0
deposit_type                        0
days_in_waiting_list                0
customer_type                       0
adr                                 0
total_of_spe

In [23]:
# Drop the bookings (rows) whose `children` value is missing
df = df.dropna(subset=['children'])

In [24]:
# Remove the two bookings treated as outliers, identified by their index labels.
# NOTE(review): why rows 48515 and 14969 count as outliers is not recorded in this
# notebook -- confirm and document the criterion.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell safe to
# re-run: a second execution no longer raises KeyError for labels that are already gone.
df = df.drop(index=[48515, 14969], errors='ignore')

In [25]:
# Keep bookings with no recorded country by labelling them 'Unknown',
# then re-count missing values per column to confirm none remain.
df['country'] = df['country'].fillna(value='Unknown')
df.isna().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
total_of_special_requests         0
dtype: int64

In [26]:
# Save the cleaned frame to CSV so it can be reloaded without repeating the cleaning steps.
from pathlib import Path

# Use a path relative to the notebook's working directory instead of an absolute path
# inside one user's home directory, so the notebook runs on any machine and writes the
# same 'hotel.csv' file that is read back afterwards.
filepath = Path('hotel.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)

# By default to_csv writes the index column with a blank header; naming it 'index'
# lets pd.read_csv('hotel.csv', index_col='index') locate it and keep the row labels.
df.to_csv(filepath, index_label='index')

In [27]:
# Reload the cleaned data from disk, using the CSV's 'index' column as the row labels
df = pd.read_csv(
    'hotel.csv',
    index_col='index',
)


In [28]:
# List the column labels of the reloaded frame
df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type',
       'days_in_waiting_list', 'customer_type', 'adr',
       'total_of_special_requests'],
      dtype='object')

In [29]:
# COUNTRIES OF ORIGIN
# Note: count() returns the number of non-null entries in the `country` column
# (one per booking), not the number of distinct countries.
df['country'].count()

119384

In [30]:
# Locate the booking with the largest `adr` value.
# Note: idxmax() returns that row's index label, not the amount itself.
highest_cost = df['adr'].idxmax()
highest_cost

111403

In [31]:
# Mean of the `adr` column across all bookings in the frame
# (presumably the average daily rate -- confirm against the dataset documentation)
df['adr'].mean()

101.79006801581215

In [32]:
# Display the frame reloaded from hotel.csv
df

Unnamed: 0_level_0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,total_of_special_requests
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,0,0,C,C,3,No Deposit,0,Transient,0.00,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,0,0,C,C,4,No Deposit,0,Transient,0.00,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,0,A,C,0,No Deposit,0,Transient,75.00,0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,0,A,A,0,No Deposit,0,Transient,75.00,0
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,0,A,A,0,No Deposit,0,Transient,98.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,0,0,A,A,0,No Deposit,0,Transient,96.14,0
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,0,0,E,E,0,No Deposit,0,Transient,225.43,2
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,0,0,D,D,0,No Deposit,0,Transient,157.71,4
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,0,0,A,A,0,No Deposit,0,Transient,104.40,0
