# Import Libraries

In [27]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns  # needed for sns.set_style() below
from scipy.stats import chi2_contingency, chi2

# Notebook-wide configuration.
# Silence every warning category so cell output stays uncluttered.
warnings.filterwarnings("ignore")
# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Apply seaborn's "darkgrid" plotting style.
sns.set_style("darkgrid")

# Load Dataset

In [2]:
# Read the booking records from the CSV file in the working directory.
csv_path = 'hotel_bookings.csv'
df = pd.read_csv(csv_path)

# Preview the first rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


## Apakah terdapat pengaruh besar pada deposit_type dengan pembatalan hotel?
- H0 : Tidak terdapat pengaruh besar antara deposit_type dengan pembatalan hotel
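- H1 : Terdapat pengaruh besar antara deposit_type dengan pembatalan hotel

Catatan: uji chi-square kontingensi menguji ada atau tidaknya hubungan (independensi) antara deposit_type dan is_canceled; hasil yang signifikan tidak mengukur besar kecilnya pengaruh tersebut.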

Rumus Chi-square kontingensi

$X^2 = \sum \frac{(O_{ij}-E_{ij})^2}{E_{ij}}$

- $X^2$ Nilai Chi Square
- $O_{ij}$ Frekuensi hasil yang diamati (observed value)
- $E_{ij}$ Frekuensi yang diharapkan (expected value)


In [23]:
# Contingency table: cancellation status (rows) by deposit type (columns).
# margins=True appends an "All" row and column holding the marginal totals.
data = df[['is_canceled', 'deposit_type']]
cross_tab = pd.crosstab(
    index=data['is_canceled'],
    columns=data['deposit_type'],
    margins=True,
)

cross_tab

deposit_type,No Deposit,Non Refund,Refundable,All
is_canceled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,74947,93,126,75166
1,29694,14494,36,44224
All,104641,14587,162,119390


In [16]:
# Observed frequencies O_ij, counted directly from the data instead of being
# typed in by hand, so the matrix always matches the loaded dataset.
# Rows follow is_canceled and columns follow deposit_type; no margins are
# requested, so only the cell counts are kept.
Oij = pd.crosstab(df['is_canceled'], df['deposit_type']).to_numpy()
Oij

array([[74947,    93,   126],
       [29694, 14494,    36]])

Rumus frekuensi yang diharapkan

$E_{ij} = \frac{O_{i\cdot}\, O_{\cdot j}}{n}$

- $O_{i\cdot} = \sum_{j=1}^{c} O_{ij}$ jumlah baris ke-i
- $O_{\cdot j} = \sum_{i=1}^{r} O_{ij}$ jumlah kolom ke-j
- $n$ jumlah total

In [15]:
# Expected frequencies under independence:
#   E_ij = (total of row i) * (total of column j) / n
# The marginal totals are computed from Oij rather than typed in by hand.
observed = np.asarray(Oij, dtype=float)  # float keeps the products from overflowing a narrow integer dtype
row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)
n_total = observed.sum()

# The outer product pairs every row total with every column total.
Eij = np.outer(row_totals, col_totals) / n_total
Eij

array([[6.58802698e+04, 9.18373768e+03, 1.01992562e+02],
       [3.87607302e+04, 5.40326232e+03, 6.00074378e+01]])

In [33]:
# Pearson chi-square statistic: sum over all cells of (O_ij - E_ij)^2 / E_ij.
# The element-wise expression covers every cell of the table, so it does not
# depend on the table being exactly 2 x 3.
chi2_val = np.sum((Oij - Eij) ** 2 / Eij)

print('chi2 values: ', chi2_val)

chi2 values:  27677.32924132434


In [30]:
# Critical value of the chi-square distribution at significance level alpha.
alpha = 0.05

# Degrees of freedom for an r x c contingency table: (r - 1) * (c - 1).
n_rows, n_cols = 2, 3
dof = (n_rows - 1) * (n_cols - 1)

# ppf is the inverse CDF, so this is the (1 - alpha) quantile.
confidence = 1 - alpha
chi2_table = chi2.ppf(confidence, dof)
print('chi2 table: ', chi2_table)

chi2 table:  5.991464547107979


In [32]:
# Decision rule: H0 is kept only when the statistic is strictly below the
# critical value; otherwise H0 is rejected.
reject_h0 = not (chi2_val < chi2_table)

if reject_h0:
    print('Tolak H0, Terdapat pengaruh besar antara deposit_type dengan pembatalan hotel')
else:
    print('Gagal tolak H0, Tidak terdapat pengaruh besar antara deposit_type dengan pembatalan hotel')

Tolak H0, Terdapat pengaruh besar antara deposit_type dengan pembatalan hotel


In [24]:
# Rebuild the contingency table without margins: chi2_contingency expects
# only the observed cell counts.
data = df[['is_canceled', 'deposit_type']]
cross_tab = pd.crosstab(
    index=data['is_canceled'],
    columns=data['deposit_type'],
)

# Chi-square test of independence via scipy; the call yields the statistic,
# the p-value, the degrees of freedom and the expected frequencies.
stat, p_value, dof, expected = chi2_contingency(cross_tab)
print('chi2 : ', stat)
print('p-value : ', p_value)

chi2 :  27677.32924132434
p-value :  0.0
