# install libraries

In [None]:
!pip install --upgrade pip
!pip install "snowflake-connector-python[pandas]" "snowflake-snowpark-python[pandas]" snowflake-snowpark-python==1.9.0 fosforio fosforml numpy pandas matplotlib scikit-learn xgboost seaborn python-dateutil tqdm holidays
!pip install --upgrade --q snowflake-snowpark-python==1.9.0
!pip uninstall urllib3 -y
!pip install urllib3==1.26.15

# Import helper libraries

In [None]:
from fosforio import snowflake
from fosforml import *
from fosforml.constants import MLModelFlavours
from fosforio import get_dataframe
from matplotlib import pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import warnings; warnings.simplefilter('ignore')
from joblib import dump, load
import requests
from tqdm import tqdm
import time
import calendar

from time import sleep
import configparser
from dateutil.relativedelta import relativedelta
import datetime
from dateutil.easter import easter
from scipy.optimize import minimize_scalar
from scipy.optimize import curve_fit

%matplotlib inline

# connect to snowflake

In [None]:
# Open the fosforio Snowflake connection registered under this name, then load
# the BOOKINGS_TRANSFORMED dataset into `data`.
# NOTE(review): presumably get_dataframe returns a pandas DataFrame (later cells
# use .columns and .iterrows on it) — confirm.
snowflake.get_connection(connection_name="TTH_REV_OPT_CXN")
data = get_dataframe("BOOKINGS_TRANSFORMED")
data

In [None]:
# Lower-case every column label so later cells can refer to columns in lower case.
data.columns = list(map(str.lower, data.columns))

In [None]:
data.columns

# Removing cancellations and no-shows (note: the filter below does not restrict the data to the City hotel)


In [None]:
# Keep bookings that were not cancelled and whose status is not 'No-Show'.
df = data.loc[data['is_canceled'].eq(0) & data['reservation_status'].ne('No-Show')]
df

In [None]:
df.groupby(['hotel','market_segment','reserved_room_type']).agg({'adr':'mean','reservation_status_date_transformed':'count'})


In [None]:
# Drop 'Complementary' market-segment bookings, then keep room types A, D and E only.
data = df[df.market_segment.ne('Complementary')]
data = data[data.reserved_room_type.isin(['A', 'D', 'E'])]
data.reserved_room_type.value_counts()


In [None]:
data.info()


In [None]:
# Distinct hotels and room types; the optimisation loops further down iterate over these.
hotels = pd.unique(data['hotel'])
room_types = pd.unique(data['reserved_room_type'])

In [None]:
# Total room nights per booking = week nights + weekend nights.
data['total_rns'] = data['stays_in_week_nights'].add(data['stays_in_weekend_nights'])
data_to_transform = data.loc[:, ['hotel', 'reserved_room_type', 'arrival_date_transformed', 'total_rns', 'adr']]
data_to_transform

In [None]:
import os
from snowflake.snowpark.session import Session

# Connection settings are read from environment variables of the same name;
# any variable that is not set comes back as None.
user, password, account = (os.getenv(key) for key in ("user", "password", "account"))
warehouse, database, schema, role = (
    os.getenv(key) for key in ("warehouse", "database", "schema", "role")
)

connection_params = {
    "user": user,
    "password": password,
    "account": account,
    "warehouse": warehouse,
    "database": database,
    "schema": schema,
    "role": role,
}

session = Session.builder.configs(connection_params).create()

# Select the warehouse, database and schema for the session, in that order.
for statement in (
    'use warehouse {};'.format(warehouse),
    'use database {};'.format(database),
    'use schema {}.{};'.format(database, schema),
):
    session.sql(statement).collect()

# Create a new dataframe to store the data by stay date

In [None]:
# Expand each booking into one row per stay night: a booking with N room nights
# becomes N rows dated from its expected arrival date onwards, each worth 1 room
# night at the booking's ADR.
# The per-booking frames are collected in a list and concatenated once at the
# end; concatenating inside the loop copies the growing frame on every
# iteration (quadratic time).
expanded_frames = []

for _, row in data.iterrows():
    num_stay_dates = row['total_rns']
    try:
        # Create a row for each stay date
        expanded_frames.append(pd.DataFrame({
            'hotel': row['hotel'],
            'room_type': row['reserved_room_type'],
            'arrival_date': pd.date_range(start=row['expected_arrival_date'], periods=num_stay_dates),
            'total_rns': 1,
            'adr': row['adr']
        }))
    except ValueError as e:
        # Report the booking and keep going with the remaining ones.
        print(f"Error processing booking for {row['hotel']} on {row['expected_arrival_date']} : {num_stay_dates} {e}")

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
expanded_df = pd.concat(expanded_frames, ignore_index=True) if expanded_frames else pd.DataFrame()




# Sort the final dataframe by date

In [None]:
# Order the per-night rows chronologically and renumber the index from 0.
expanded_df = expanded_df.sort_values('arrival_date').reset_index(drop=True)
expanded_df

In [None]:
# Round ADR to 2 decimals.
expanded_df['adr'] = expanded_df['adr'].round(2)

expanded_df


# Building seasonality

In [None]:
import holidays
# Holidays for country code 'PT', years 2020-2023, keeping three named holidays.
# Note: the name `holidays` is rebound from the imported module to a plain dict
# here, and a later cell rebinds it again to the output of generate_holiday_dates.
holiday_dates = holidays.CountryHoliday('PT', years=[2020,2021,2022,2023])
holidays = {}
for expected_arrival_date, name in holiday_dates.items():
    if name in ('Ano Novo', 'Páscoa', 'Dia de Natal'):
        holidays[expected_arrival_date] = name

# rename holiday columns

In [None]:
# Map the Portuguese holiday names to English column names (applies only to columns that exist).
expanded_df = expanded_df.rename(
    columns={'Ano Novo': 'new_year', 'Páscoa': 'easter', 'Dia de Natal': 'christmas'}
)

In [None]:
def generate_holiday_dates(start_year, end_year):
    """Return a dict mapping holiday dates to labels for start_year..end_year inclusive.

    For each year the dict gets 1 January ('new_year'), the date returned by
    easter(year) ('easter') and 25 December ('christmas').
    """
    labelled_dates = {}
    for yr in range(start_year, end_year + 1):
        labelled_dates.update({
            datetime.date(yr, 1, 1): 'new_year',
            easter(yr): 'easter',
            datetime.date(yr, 12, 25): 'christmas',
        })
    return labelled_dates

holidays = generate_holiday_dates(2020, 2023)


In [None]:
holidays

In [None]:
# Day offsets before (pre) and after (post) each holiday: 1 day for New Year,
# 2 for Easter, 3 for Christmas.
pre_range_offset = {
    name: relativedelta(days=-days)
    for name, days in (('new_year', 1), ('easter', 2), ('christmas', 3))
}

post_range_offset = {
    name: relativedelta(days=days)
    for name, days in (('new_year', 1), ('easter', 2), ('christmas', 3))
}

In [None]:
# Create one 0/1 indicator column per holiday label (deduplicated, order kept).
for holiday in dict.fromkeys(holidays.values()):
    expanded_df[holiday] = 0

# Flag rows whose stay date is a holiday, or the pre/post offset date of one.
for arrival_date, name in holidays.items():
    # The keys of `holidays` are datetime.date objects. pandas does not treat a
    # datetime.date as comparable with datetime64 values, so `==` against the
    # raw key matches no rows; convert to a Timestamp first.
    holiday_ts = pd.Timestamp(arrival_date)
    expanded_df.loc[expanded_df['arrival_date'] == holiday_ts, name] = 1

    # NOTE(review): only the single date at each offset is flagged (e.g. 22 and
    # 28 December for Christmas), not every day in between — confirm whether a
    # full range was intended.
    pre_offset = pre_range_offset.get(name)
    if pre_offset:
        expanded_df.loc[expanded_df['arrival_date'] == holiday_ts + pre_offset, name] = 1

    post_offset = post_range_offset.get(name)
    if post_offset:
        expanded_df.loc[expanded_df['arrival_date'] == holiday_ts + post_offset, name] = 1

In [None]:
##check
expanded_df.head()

In [None]:
# Add the weekday name ('dow') and month name ('month') of each stay date.
for column, fmt in (('dow', '%A'), ('month', '%B')):
    expanded_df[column] = expanded_df['arrival_date'].dt.strftime(fmt)

In [None]:
##check
expanded_df.head()

In [None]:
expanded_df.shape

In [None]:
expanded_df.info()

In [None]:
expanded_df[expanded_df['easter'] == 0]

In [None]:
from scipy.optimize import brentq

def demand_to_price(num_rooms, a, b, c, d, max_demand, optimal_price):
    """Return the price at which the fitted demand curve sells `num_rooms` rooms.

    Solves num_rooms = a * exp(-b * price) + c + d for price on [0, 200].

    Parameters
    ----------
    num_rooms : target number of rooms.
    a, b, c, d : fitted demand-curve parameters.
    max_demand : accepted for signature compatibility; the cap is not inverted.
    optimal_price : price returned when no root exists on [0, 200].

    Returns
    -------
    The root found by brentq, or `optimal_price` when the curve does not cross
    `num_rooms` inside the search interval.
    """
    def root_func(x):
        # demand_curve adds the offset d to a*exp(-b*x)+c, so the inverse has to
        # include it as well.
        return num_rooms - (a * np.exp(-b * x) + c + d)

    try:
        return brentq(root_func, 0, 200)  # Adjust the interval bounds as needed
    except ValueError:
        # brentq raises ValueError when the function has the same sign at both
        # ends. Fall back to a fixed price so repeated runs give the same
        # answer (previously a random price was drawn).
        return optimal_price

In [None]:
# Round ADR to 2 decimals (same operation as an earlier cell).
expanded_df['adr'] = expanded_df['adr'].round(2)
expanded_df

# Non holidays dataset

In [None]:
# Rows where none of the three holiday indicators is set.
non_holidays = expanded_df[expanded_df[['new_year', 'easter', 'christmas']].sum(axis=1).eq(0)]

non_holidays.head()

In [None]:
non_holidays.shape

In [None]:
# Step 1: total room nights per stay date, hotel and room type.
daily_rns = (
    non_holidays
    .groupby(['arrival_date', 'dow', 'month', 'hotel', 'room_type'])
    .agg({'total_rns': 'sum'})
    .reset_index()
)

# Step 2: sum / mean / median of those daily totals per day of week, month, hotel and room type.
daily_rns = (
    daily_rns
    .groupby(['dow', 'month', 'hotel', 'room_type'])
    .agg({'total_rns': ['sum', 'mean', 'median']})
    .reset_index()
)

# Flatten the two-level column index; key columns end up with a trailing
# underscore (e.g. 'dow_'), metrics become 'total_rns_sum' etc.
daily_rns.columns = list(map('_'.join, daily_rns.columns))
daily_rns

In [None]:
# Room nights sold at each ADR value, per day of week, month, hotel and room type.
adr_frequency = (
    non_holidays
    .groupby(['dow', 'month', 'adr', 'hotel', 'room_type'])
    .agg({'total_rns': 'sum'})
    .reset_index()
)

In [None]:
adr_frequency

In [None]:
# Attach the per-(dow, month, hotel, room_type) room-night metrics to every ADR row.
merged_df = adr_frequency.merge(
    daily_rns,
    how='left',
    left_on=['dow', 'month', 'hotel', 'room_type'],
    right_on=['dow_', 'month_', 'hotel_', 'room_type_'],
    suffixes=('_act', '_tot'),
)

# Drop two of the duplicated right-hand key columns ('hotel_' and 'room_type_' are kept).
merged_df = merged_df.drop(columns=['dow_', 'month_'])

merged_df

In [None]:
# Share of the group's room nights that were sold at this ADR.
merged_df['probability'] = merged_df['total_rns'].div(merged_df['total_rns_sum'])

In [None]:
# Scale that share by the group's median daily room nights.
merged_df['expected_rns'] = merged_df['probability'].mul(merged_df['total_rns_median'])

In [None]:
# Highest ADR first within each (dow, month), so a running sum gives demand at that price or above.
merged_df = merged_df.sort_values(['dow', 'month', 'adr'], ascending=[True, True, False])

In [None]:
# Running total of expected room nights, in the current row order.
# The group keys include hotel and room_type: the probabilities above are
# computed per (dow, month, hotel, room_type) and the curves are later fitted
# per hotel and room type, so accumulating over (dow, month) alone would add
# together demand from different hotels and room types.
merged_df['expected_demand'] = (
    merged_df.groupby(['dow', 'month', 'hotel', 'room_type'])['expected_rns'].cumsum()
)

In [None]:
# Expected revenue at each ADR = price times cumulative expected demand.
merged_df['expected_rev'] = merged_df['adr'].mul(merged_df['expected_demand'])

In [None]:
# Same computation as the previous cell: price times cumulative expected demand.
merged_df['expected_rev'] = merged_df['adr'].mul(merged_df['expected_demand'])

In [None]:
merged_df[(merged_df.dow == 'Friday') & (merged_df.month =='April')].plot(x='adr', y='expected_demand', kind='line')

In [None]:
merged_df[(merged_df.dow == 'Friday') & (merged_df.month =='April')].plot(x='adr', y='expected_rev', kind='line')

In [None]:
from scipy.optimize import curve_fit

# Define the demand curve function
def demand_curve(x, a, b, c, d, max_demand):
    """Evaluate the demand model at x.

    The base value is a * exp(-b * x) + c. Wherever x <= max_demand the base
    value is capped at max_demand; elsewhere it is left as is. The offset d is
    then added to the result.
    """
    uncapped = a * np.exp(-b * x) + c
    capped = np.minimum(uncapped, max_demand)
    return np.where(x <= max_demand, capped, uncapped) + d


In [None]:
# Fit the demand curve to all (ADR, cumulative expected demand) points at once.
x_data = merged_df['adr'].to_numpy()
y_data = merged_df['expected_demand'].to_numpy()

# Starting values for a, b, c, d, max_demand; every parameter is constrained to be >= 0.
initial_guess = [1, 0.01, 1, 1, 100]
bounds = ([0] * 5, [np.inf] * 5)

params, _ = curve_fit(demand_curve, x_data, y_data, p0=initial_guess, bounds=bounds)

a_fit, b_fit, c_fit, d_fit, max_demand = params

In [None]:
# Evaluate the fitted curve at the observed prices.
predicted_demand = demand_curve(x_data, a=a_fit, b=b_fit, c=c_fit, d=d_fit, max_demand=max_demand)

In [None]:
# Observed points against the fitted curve.
fig, ax = plt.subplots()
ax.scatter(x_data, y_data, label='Actual Demand')
ax.plot(x_data, predicted_demand, label='Fitted Curve')
ax.set_xlabel('Price')
ax.set_ylabel('Demand')
ax.set_title('Demand Curve Fit')
ax.legend()
plt.show()

In [None]:
def revenue(price):
    """Revenue at `price`: the price times the demand given by the module-level fitted parameters."""
    expected_demand = demand_curve(price, a_fit, b_fit, c_fit, d_fit, max_demand)
    return price * expected_demand

In [None]:
from scipy.optimize import minimize_scalar


def objective(price):
    """Negated revenue, so that minimising it maximises revenue."""
    return -revenue(price)


# Search for the revenue-maximising price between 60 and 180.
result = minimize_scalar(objective, method='bounded', bounds=(60, 180))
optimal_price = result.x
max_revenue = -result.fun
room_sold = demand_curve(optimal_price, a_fit, b_fit, c_fit, d_fit, max_demand)

In [None]:
# Report the optimum found above.
print("\n".join([
    f"The optimal price to maximize revenue: ${optimal_price}",
    f"The maximum revenue achievable: ${max_revenue}",
    f"The expected number of rooms to sell: {room_sold}",
]))

In [None]:
demand_to_price(50,a_fit,b_fit,c_fit,d_fit,max_demand, optimal_price)

In [None]:
# Empty frame that fixes the column order of the per-(hotel, room type, month, dow) results.
results = pd.DataFrame(columns=['month', 'hotel', 'room_type', 'dow', 'optimal_rate', 'expected_rn','expected_rev','optimal_rate_lim_inv'])

In [None]:
# Distinct month and weekday names present in the merged data.
months = merged_df['month'].unique()
dow = merged_df['dow'].unique()

In [None]:
# Visual check: fit the demand curve to every (month, day-of-week) slice and
# plot the fitted curve against the observed points.
for month in months:
    for day in dow:

        print(month,day)
        #get data
        data = merged_df[(merged_df.dow == day) & (merged_df.month ==month)].reset_index()

        #remove outliers: drop rows whose ADR is more than 2 standard deviations from the mean
        mean = data.adr.mean()
        std_dev = data.adr.std()
        data['z_scores'] = np.abs((data.adr - mean) / std_dev)
        data = data[data.z_scores <=2]

        ## Fit Demand curve
        x_data = data['adr'].values
        y_data = data['expected_demand'].values
        bounds = ([0, 0, 0, 0, 0], [np.inf, np.inf, np.inf, np.inf, np.inf])

        # Starting values for the demand cap, tried in order: the slice's median
        # daily room nights first, then a fixed fallback (40 for January, 50 otherwise).
        cap_guesses = []
        if len(data) > 0:
            cap_guesses.append(data['total_rns_median'].values[0])
        cap_guesses.append(40 if month == 'January' else 50)

        params = None
        fit_error = None
        for cap_guess in cap_guesses:
            initial_guess = [1, 0.01, 1, 1, cap_guess]
            try:
                params, _ = curve_fit(demand_curve, x_data, y_data, bounds=bounds, p0=initial_guess)
                break
            except (RuntimeError, ValueError, TypeError) as e:
                # Catch fitting failures only (not KeyboardInterrupt etc.) and try the next guess.
                fit_error = e

        if params is None:
            # Neither starting point worked; report it and move on instead of
            # aborting the whole loop.
            print(f"Skipping {month} {day}: demand curve could not be fitted ({fit_error})")
            continue

        # Extract the fitted parameters
        a_fit, b_fit, c_fit ,d_fit,max_demand= params

        #visually explore if the demand curve fits the data
        predicted_demand = demand_curve(x_data, a_fit, b_fit,c_fit,d_fit,max_demand)

        plt.scatter(x_data, y_data, label='Actual Demand')
        plt.plot(x_data, predicted_demand, label='Fitted Curve')
        plt.xlabel('Price')
        plt.ylabel('Demand')
        plt.legend()
        plt.title('Demand Curve Fit')
        plt.show()

In [None]:
# For every (hotel, room type, month, day of week) slice: fit the demand curve,
# find the revenue-maximising price between 45 and 200, and record the result.
# Rows are collected in a list and turned into `results` once at the end, so
# re-running this cell rebuilds the table instead of appending duplicates.
result_rows = []
bounds = ([0, 0, 0, 0, 0], [np.inf, np.inf, np.inf, np.inf, np.inf])
maxfev = 10000  # Increase the number of maximum function evaluations

for hotel in hotels:
    for room_type in room_types:
        for month in months:
            for day in dow:
                # Get data for the specific combination
                data_subset = merged_df[(merged_df['dow'] == day) &
                                        (merged_df['hotel'] == hotel) &
                                        (merged_df['room_type'] == room_type) &
                                        (merged_df['month'] == month)].reset_index()

                if data_subset.empty:
                    continue

                # Remove outliers (ADR more than 2 standard deviations from the mean).
                # With a single row or identical ADRs the standard deviation is
                # NaN or 0 and every z-score would be NaN, which would discard
                # the whole slice; skip the filter in that case.
                mean = data_subset['adr'].mean()
                std_dev = data_subset['adr'].std()
                if std_dev > 0:
                    z_scores = np.abs((data_subset['adr'] - mean) / std_dev)
                    data_subset = data_subset[z_scores <= 2]

                # Fit demand curve
                x_data = data_subset['adr'].values
                y_data = data_subset['expected_demand'].values
                initial_guess = [1, 0.01, 1, 1, data_subset['total_rns_median'].values[0]]

                try:
                    params, _ = curve_fit(demand_curve, x_data, y_data, bounds=bounds, p0=initial_guess, maxfev=maxfev)
                except (RuntimeError, ValueError) as e:
                    # RuntimeError: the optimiser did not converge.
                    # ValueError: the inputs were rejected (e.g. NaN values).
                    print(f"Error fitting demand curve for {hotel}, {room_type}, {month}, {day}: {e}")
                    continue

                a_fit, b_fit, c_fit, d_fit, max_demand = params

                # Optimize revenue. The fitted parameters are bound as a default
                # argument so the objective does not depend on module-level state.
                objective = lambda price, fitted=tuple(params): -(price * demand_curve(price, *fitted))
                optimize = minimize_scalar(objective, bounds=(45, 200), method='bounded')
                optimal_price = optimize.x
                max_revenue = -optimize.fun
                expected_rns = demand_curve(optimal_price, a_fit, b_fit, c_fit, d_fit, max_demand)

                optimal_rate_lim_inv = demand_to_price(50, a_fit, b_fit, c_fit, d_fit, max_demand, optimal_price)

                # float(...) unwraps the 0-d arrays returned by demand_curve so the
                # result columns get a numeric dtype.
                result_rows.append({'month': month,
                                    'hotel': hotel,
                                    'room_type': room_type,
                                    'dow': day,
                                    'optimal_rate': float(optimal_price),
                                    'expected_rn': float(expected_rns),
                                    'expected_rev': float(max_revenue),
                                    'optimal_rate_lim_inv': float(optimal_rate_lim_inv)})

results = pd.DataFrame(result_rows,
                       columns=['month', 'hotel', 'room_type', 'dow', 'optimal_rate',
                                'expected_rn', 'expected_rev', 'optimal_rate_lim_inv'])


In [None]:
results

In [None]:
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Sort the dataframe by calendar order rather than alphabetically
results['month'] = pd.Categorical(results['month'], categories=month_order, ordered=True)
results['dow'] = pd.Categorical(results['dow'], categories=dow_order, ordered=True)
results = results.sort_values(['month', 'dow'])

# Mean optimal rate with months as rows and days of the week as columns
grouped = results.groupby(['month', 'dow'])['optimal_rate'].mean().unstack()

# Create the graph: one cluster of bars per month (the index of `grouped`),
# one bar per day of the week (its columns).
fig, ax = plt.subplots(figsize=(10, 6))
grouped.plot(ax=ax, kind='bar')
ax.set_xlabel('Month')
ax.set_ylabel('Optimal Rate')
ax.set_title('Optimal Rate by Month and Day of the Week')

# The legend entries are the columns of `grouped`, i.e. the days of the week.
plt.legend(title='Day of the Week', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)

# Show the graph
plt.show()

In [None]:
# Clean up the results: round rates and revenue to whole numbers, room nights to integers.
for column in ('optimal_rate', 'optimal_rate_lim_inv', 'expected_rev'):
    results[column] = results[column].round()

results['expected_rn'] = results['expected_rn'].round().astype(int)
results

# Holidays

In [None]:
# Rows where at least one holiday indicator is set. This rebinds `holidays`,
# which previously held the date-to-label dict.
holidays = expanded_df[expanded_df[['new_year', 'easter', 'christmas']].sum(axis=1).ne(0)]
holidays

In [None]:
# Long format: one row per (stay row, holiday label) with that label's 0/1 indicator.
unpivoted = holidays.melt(
    id_vars=['arrival_date', 'total_rns', 'adr', 'dow', 'month', "hotel", "room_type"],
    value_vars=['new_year', 'easter', 'christmas'],
    var_name='holiday',
    value_name='holiday_indicator',
)

In [None]:
# Keep only the (row, holiday) pairs whose indicator is set.
unpivoted = unpivoted[unpivoted['holiday_indicator'].eq(1)]
unpivoted

In [None]:
# Room nights sold at each ADR, per holiday, hotel and room type.
holiday_adr = (
    unpivoted
    .groupby(['holiday', 'adr', 'holiday_indicator', "hotel", "room_type"])
    .agg({'total_rns': 'sum'})
    .reset_index()
)

# Daily room-night totals, then sum / mean / median of those per holiday, hotel and room type.
holiday_rns = (
    unpivoted
    .groupby(['arrival_date', 'holiday', "hotel", "room_type"])
    .agg({'total_rns': 'sum'})
    .reset_index()
    .groupby(['holiday', "hotel", "room_type"])
    .agg({'total_rns': ['sum', 'mean', 'median']})
    .reset_index()
)

# Flatten the two-level columns; key columns end up with a trailing underscore.
holiday_rns.columns = list(map('_'.join, holiday_rns.columns))

In [None]:
holiday_rns

In [None]:
# Attach the room-night metrics to every holiday ADR row.
# holiday_rns has one row per (holiday, hotel, room type), so the join must use
# all three keys; joining on the holiday alone pairs every ADR row with the
# totals of every hotel and room type.
merged_holidays = pd.merge(
    holiday_adr,
    holiday_rns,
    how='left',
    left_on=['holiday', 'hotel', 'room_type'],
    right_on=['holiday_', 'hotel_', 'room_type_'],
    suffixes=('_act', '_tot'),
)

# The right-hand key columns duplicate the left-hand ones after the join.
merged_holidays = merged_holidays.drop(columns=['holiday_', 'hotel_', 'room_type_'])

merged_holidays

In [None]:
# Share of room nights sold at each ADR, scaled by the median daily room nights.
merged_holidays['probability'] = merged_holidays['total_rns']/merged_holidays['total_rns_sum']
merged_holidays['expected_rns'] = merged_holidays['probability'] * merged_holidays['total_rns_median']

# Highest ADR first, so the running sum is the demand at that price or above.
merged_holidays = merged_holidays.sort_values(by=['holiday', 'adr'], ascending=[True, False])

# Accumulate separately for each hotel and room type: the curves are fitted per
# (holiday, hotel, room type) later, so a running sum over the holiday alone
# would add together demand from different hotels and room types.
merged_holidays['expected_demand'] = (
    merged_holidays.groupby(['holiday', 'hotel', 'room_type'])['expected_rns'].cumsum()
)

In [None]:
merged_holidays

In [None]:
# Distinct holiday labels present in the merged holiday data.
unique_holidays = merged_holidays['holiday'].unique()


In [None]:
# NOTE(review): this redefines demand_to_price, which is also defined in an
# earlier cell; consider keeping a single definition.
def demand_to_price(num_rooms, a, b, c, d, max_demand, optimal_price):
    """Return the price at which the fitted demand curve sells `num_rooms` rooms.

    Solves num_rooms = a * exp(-b * price) + c + d for price on [0, 200].

    Parameters
    ----------
    num_rooms : target number of rooms.
    a, b, c, d : fitted demand-curve parameters.
    max_demand : accepted for signature compatibility; the cap is not inverted.
    optimal_price : price returned when no root exists on [0, 200].

    Returns
    -------
    The root found by brentq, or `optimal_price` when the curve does not cross
    `num_rooms` inside the search interval.
    """
    def root_func(x):
        # demand_curve adds the offset d to a*exp(-b*x)+c, so the inverse has to
        # include it as well.
        return num_rooms - (a * np.exp(-b * x) + c + d)

    try:
        return brentq(root_func, 0, 200)  # Adjust the interval bounds as needed
    except ValueError:
        # brentq raises ValueError when the function has the same sign at both
        # ends. Fall back to a fixed price so repeated runs give the same
        # answer (previously a random price was drawn).
        return optimal_price

In [None]:
# For every (hotel, room type, holiday) slice: fit the demand curve, find the
# revenue-maximising price between 45 and 400, and record the result.
# Rows are collected in a list and turned into `holiday_results` once at the
# end, so re-running this cell rebuilds the table instead of appending to it.
holiday_rows = []
bounds = ([0, 0, 0, 0, 0], [np.inf, np.inf, np.inf, np.inf, np.inf])

for hotel in hotels:
    for room_type in room_types:
        for day in unique_holidays:
            data = merged_holidays[(merged_holidays.holiday == day) &
                                   (merged_holidays.hotel == hotel) &
                                   (merged_holidays.room_type == room_type)].reset_index()

            # Not every hotel / room type has bookings on every holiday; without
            # this guard the .values[0] lookup below raises IndexError.
            if data.empty:
                continue

            # Remove outliers (ADR more than 2 standard deviations from the mean).
            # With a single row or identical ADRs the standard deviation is NaN
            # or 0 and every z-score would be NaN, discarding the whole slice;
            # skip the filter in that case.
            mean = data.adr.mean()
            std_dev = data.adr.std()
            if std_dev > 0:
                z_scores = np.abs((data.adr - mean) / std_dev)
                data = data[z_scores <= 2]

            ## Fit Demand curve
            x_data = data['adr'].values
            y_data = data['expected_demand'].values
            initial_guess = [1, 0.01, 1, 1, data['total_rns_median'].values[0]]

            try:
                params, _ = curve_fit(demand_curve, x_data, y_data, bounds=bounds, p0=initial_guess)
            except Exception as e:
                print(f"Error fitting demand curve for {hotel}, {room_type}: {e}")
                continue

            # Extract the fitted parameters, rounded to 3 decimals
            a_fit, b_fit, c_fit, d_fit, max_demand = np.round(params, 3)

            # Optimize revenue. The rounded parameters are bound as a default
            # argument so the objective does not depend on a module-level
            # `revenue` function picking up the right globals.
            fitted = (a_fit, b_fit, c_fit, d_fit, max_demand)
            objective = lambda price, fitted=fitted: -(price * demand_curve(price, *fitted))

            optimize = minimize_scalar(objective, bounds=(45, 400), method='bounded')
            optimal_price = optimize.x
            max_revenue = -optimize.fun

            expected_rns = demand_curve(optimal_price, a_fit, b_fit, c_fit, d_fit, max_demand)
            optimal_rate_lim_inv = demand_to_price(50, a_fit, b_fit, c_fit, d_fit, max_demand, optimal_price)

            # float(...) unwraps the 0-d arrays returned by demand_curve so the
            # result columns get a numeric dtype.
            holiday_rows.append({'holiday': day,
                                 'hotel': hotel,
                                 'room_type': room_type,
                                 'optimal_rate': float(optimal_price),
                                 'expected_rev': float(max_revenue),
                                 'expected_rn': float(expected_rns),
                                 'optimal_rate_lim_inv': float(optimal_rate_lim_inv)})

holiday_results = pd.DataFrame(holiday_rows,
                               columns=['holiday', 'optimal_rate', 'expected_rn', 'expected_rev',
                                        'optimal_rate_lim_inv', 'hotel', 'room_type'])

In [None]:
holiday_results

In [None]:
results

In [None]:
room_types

In [None]:
holiday_results


years = [2020, 2021, 2022, 2023]


def _holiday_date(holiday, year):
    """Calendar date of the named holiday ('christmas', 'easter' or 'new_year') in `year`."""
    if holiday == 'easter':
        return easter(year)
    month_num, day_num = {'christmas': (12, 25), 'new_year': (1, 1)}[holiday]
    return datetime.date(year, month_num, day_num)


# Repeat every holiday result once per year, stamped with that year's holiday
# date and the month / weekday names of that date.
holiday_dates = [
    {
        'hotel': row['hotel'],
        'room_type': row['room_type'],
        'month': date.strftime('%B'),
        'dow': date.strftime("%A"),
        'holiday': row['holiday'],
        'optimal_rate': row['optimal_rate'],
        'expected_rn': row['expected_rn'],
        'expected_rev': row['expected_rev'],
        'optimal_rate_lim_inv': row['optimal_rate_lim_inv'],
        'arrival_date': pd.to_datetime(date),
    }
    for year in years
    for _, row in holiday_results.iterrows()
    for date in [_holiday_date(row['holiday'], year)]
]

holiday_results_yearly = pd.DataFrame(holiday_dates)




In [None]:
results

In [None]:
# Stack the regular results on top of the per-year holiday results (row-wise).
final_data = pd.concat((results, holiday_results_yearly))

In [None]:
# Remove the arrival_date column.
final_data = final_data.drop(columns='arrival_date')

In [None]:
# Rows without a holiday label are marked 'non_holiday'.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection can act on a temporary object (pandas Copy-on-Write) and leave
# final_data unchanged.
final_data['holiday'] = final_data['holiday'].fillna('non_holiday')

In [None]:
# Replace the 'holiday' label column with one indicator column per label.
final_data = pd.concat(
    [final_data, pd.get_dummies(final_data['holiday'])],
    axis=1,
).drop(columns='holiday')

In [None]:
#final_data.to_csv('final_data.csv', index=False)

In [None]:
final_data

In [None]:
df

In [None]:
start_year = 2020
end_year = 2023

# Month name -> month number (1-12). calendar.month_name[0] is an empty string
# and is skipped.
month_dict = {name: number for number, name in enumerate(calendar.month_name) if name}

def generate_dates(row):
    """Return every date in row['month'] that falls on weekday row['dow'], for each year in start_year..end_year.

    `row` only needs to support row['month'] (full month name) and row['dow']
    (full weekday name). The result is a list of pandas Timestamps in
    chronological order.
    """
    month_num = month_dict[row['month']]
    dates = []
    for year in range(start_year, end_year + 1):
        month_start = pd.Timestamp(year=year, month=month_num, day=1)
        # All days of the month: from the 1st up to the month end. (The previous
        # range held a single day, and MonthEnd was never imported.)
        month_dates = pd.date_range(start=month_start, end=month_start + pd.offsets.MonthEnd(0), freq='D')
        dow_dates = month_dates[month_dates.day_name() == row['dow']]
        dates.extend(dow_dates)
    return dates

# Build one output row per (final_data row, matching calendar date).
# The previous version returned a DataFrame from each apply() call and then
# called explode('arrival_date') on the resulting Series, which cannot produce
# this table; flat records are built directly instead.
# NOTE(review): holiday rows are expanded to every date sharing their month and
# weekday, not only the holiday date itself — confirm this is intended.
output_columns = ['month', 'hotel', 'room_type', 'dow', 'optimal_rate', 'expected_rn',
                  'expected_rev', 'optimal_rate_lim_inv', 'christmas', 'easter',
                  'new_year', 'non_holiday']

expanded_records = [
    {**{column: row[column] for column in output_columns}, 'arrival_date': arrival_date}
    for _, row in final_data.iterrows()
    for arrival_date in generate_dates(row)
]

expanded_data = pd.DataFrame(expanded_records, columns=output_columns + ['arrival_date'])

expanded_data