In [None]:
import pandas as pd
import seaborn as sb
import io
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import statsmodels.api as sm
from patsy import dmatrices
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from google.colab import files


In [None]:
# Prompt for a file upload in Colab; `uploaded` is indexed by filename further
# down to get the uploaded file's raw bytes.
uploaded = files.upload()

Saving hotel_cancellation.csv to hotel_cancellation.csv


We will use the hotel cancellation CSV dataset to estimate the treatment effect, using ‘different room assigned’ as the treatment indicator, and interpret its effect on the room being ‘canceled’. We use all the other columns as the covariates.

In [None]:
# Read the uploaded CSV straight from the in-memory bytes
raw_bytes = uploaded['hotel_cancellation.csv']
data = pd.read_csv(io.BytesIO(raw_bytes))

# Recode the outcome and the treatment flag as 0/1 integers
data['is_canceled'] = data['is_canceled'].eq(True).astype(int)
data['different_room_assigned'] = data['different_room_assigned'].eq(True).astype(int)

# Regressors: the treatment indicator first, followed by the covariates
regressor_cols = [
    'different_room_assigned',
    'lead_time',
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'days_in_waiting_list',
]
y = data['is_canceled']
x = data[regressor_cols]

# Logistic regression of cancellation on treatment + covariates.
# NOTE(review): x contains no constant column, so this model is fitted without
# an intercept -- confirm that is intended.
model = sm.Logit(y, x)
result = model.fit()

# Show the coefficient table (the treatment effect is the first row)
print(result.summary2())


Optimization terminated successfully.
         Current function value: 0.598443
         Iterations 7
                              Results: Logit
Model:                  Logit              Pseudo R-squared:   0.105      
Dependent Variable:     is_canceled        AIC:                123164.4738
Date:                   2023-02-15 05:58   BIC:                123221.7225
No. Observations:       102894             Log-Likelihood:     -61576.    
Df Model:               5                  LL-Null:            -68825.    
Df Residuals:           102888             LLR p-value:        0.0000     
Converged:              1.0000             Scale:              1.0000     
No. Iterations:         7.0000                                            
--------------------------------------------------------------------------
                           Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
--------------------------------------------------------------------------
different_room_assigned   -2

In [None]:
import math

# Treatment coefficient (log-odds scale), read from the fitted model instead of
# being copied by hand from the printed summary, so it cannot go stale when the
# data or the model specification changes.
treatment_log_odds = result.params['different_room_assigned']

# Odds ratio implied by the coefficient, and the same effect expressed as a
# percent change in the odds. Both are printed explicitly because a notebook
# cell only displays its last bare expression.
treatment_odds_ratio = math.exp(treatment_log_odds)
print(f'Odds ratio: {treatment_odds_ratio:.4f}')
print(f'Percent change in odds: {(treatment_odds_ratio - 1) * 100:.2f}%')



0.08057232908620612

Interpretation: The coefficient (treatment effect estimate) for different_room_assigned is -2.5186. This indicates that having a different room assigned multiplies the odds of cancellation by a factor of exp(-2.5186) ≈ 0.0806, holding all other variables constant.
Equivalently, the treatment effect of assigning a different room is a 91.94% decrease in the odds of cancellation, holding all other variables constant.


We will now use double logistic regression to measure the effect of ‘different room assigned’ on the room being ‘canceled’.

In [None]:
# Double logistic regression: re-estimate the treatment effect while also
# controlling for the first-stage predicted probability of treatment.
y = data['is_canceled']
x = data[['different_room_assigned', 'lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'days_in_waiting_list']]

# First stage: regress the treatment indicator on the covariates
y2 = data['different_room_assigned']
x2 = data[['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'days_in_waiting_list']]
model1 = sm.Logit(y2, x2).fit()

# dhat: first-stage predicted probability of treatment, as an (n, 1) column
dhat = np.asarray(model1.predict(x2)).reshape(len(x), 1)

# Second-stage regressors: treatment + covariates + dhat. Built as a DataFrame
# (rather than np.hstack) so the summary labels each coefficient with its
# column name instead of the anonymous x1..x7; the fitted values are unchanged.
x3 = x.assign(dhat=dhat.ravel())

# Second stage: regress the outcome on treatment, covariates and dhat
model2 = sm.Logit(y, x3).fit()
print(model2.summary2())

Optimization terminated successfully.
         Current function value: 0.334283
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.596827
         Iterations 7
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.108      
Dependent Variable: is_canceled      AIC:              122833.8028
Date:               2023-02-15 06:05 BIC:              122900.5930
No. Observations:   102894           Log-Likelihood:   -61410.    
Df Model:           6                LL-Null:          -68825.    
Df Residuals:       102887           LLR p-value:      0.0000     
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     7.0000                                        
--------------------------------------------------------------------
        Coef.     Std.Err.      z       P>|z|     [0.025     0.975] 
--------------------------------------------------------------------
x1      -2.5

In [None]:
# Odds ratio for the treatment from the second-stage fit. The treatment is the
# first column of the second-stage design matrix, so its coefficient is the
# first entry of params; reading it from the model avoids a hand-copied value.
math.exp(np.asarray(model2.params)[0])



0.0817327909190746

Interpretation: With double logistic regression, the coefficient for the treatment changes to -2.5043, so the strength of the relationship has weakened slightly. This is the independent effect that the treatment has on the response (cancellation), accounting for any confounders. When a different room is assigned, the odds of cancellation are multiplied by a factor of 0.0817 (about a 91.8% decrease).

Then, we use the bootstrap to estimate the standard error of the treatment effect measured in the earlier part.

In [None]:
# Number of bootstrap resamples
n_resamples = 1000

# Seeded generator so the bootstrap standard errors are reproducible
bootstrap_rng = np.random.default_rng(0)

treatment_col = 'different_room_assigned'
covariate_cols = ['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'days_in_waiting_list']
n_obs = len(data)

# One row per resample; the columns are the second-stage coefficients of the
# treatment and the covariates (the trailing dhat coefficient is dropped).
# Sized from the column lists so the cell does not depend on an earlier fit.
treat_effects = np.zeros((n_resamples, 1 + len(covariate_cols)))

# Refit the two-stage (double logistic) procedure on each resample. The fits
# use loop-local names so the full-sample model1/model2/dhat are not clobbered.
for i in range(n_resamples):
    # Draw row *positions* with replacement; .iloc is positional, so this is
    # correct whatever index the DataFrame carries.
    resample_pos = bootstrap_rng.integers(0, n_obs, size=n_obs)
    resample = data.iloc[resample_pos]

    # First stage: treatment on covariates (disp=0 silences the per-fit
    # convergence message, which otherwise floods the cell output)
    x_resample = resample[covariate_cols]
    stage1_fit = sm.Logit(resample[treatment_col], x_resample).fit(disp=0)
    dhat_resample = np.asarray(stage1_fit.predict(x_resample)).reshape(-1, 1)

    # Second stage: outcome on treatment + covariates + dhat
    x3_resample = np.hstack((resample[[treatment_col] + covariate_cols], dhat_resample))
    stage2_fit = sm.Logit(resample['is_canceled'], x3_resample).fit(disp=0)
    treat_effects[i, :] = np.asarray(stage2_fit.params)[:-1]

# Bootstrap standard error = sample standard deviation across resamples
treat_effects_se = treat_effects.std(axis=0, ddof=1)

# Print the standard errors of the treatment effect estimates
print('Standard errors of the treatment effects:')
print(treat_effects_se)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         Current function value: 0.332471
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.598420
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.334254
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.595961
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.334804
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.596454
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.337752
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.595855
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.335689
         Iterations 6
Optimization terminated successfully.
         Current 

With double logistic regression, we see the standard error for the treatment to be 0.0438. With bootstrapping, the standard error for the treatment changes to 0.044. The bootstrapped standard error is slightly larger than the standard error estimated using double logistic regression. This could mean that the bootstrap was able to better capture the variability of the data and provides a more accurate estimate of the standard error. The small standard error indicates that the estimate of the treatment effect is precise, with little uncertainty.

