# install libraries

In [None]:
# Upgrade pip itself before installing the notebook's dependencies.
!pip install --upgrade pip
# Install the Snowflake connector and Snowpark (pinned to 1.9.0), fosforio/fosforml and the analysis stack.
!pip install "snowflake-connector-python[pandas]" "snowflake-snowpark-python[pandas]" snowflake-snowpark-python==1.9.0 fosforio fosforml numpy pandas matplotlib scikit-learn xgboost seaborn python-dateutil tqdm holidays
# Re-install Snowpark at the pinned 1.9.0 version (quiet, with --upgrade).
!pip install --upgrade --q snowflake-snowpark-python==1.9.0
# Remove whichever urllib3 is installed and replace it with the pinned 1.26.15 release.
!pip uninstall urllib3 -y
!pip install urllib3==1.26.15

# Import helper libraries

In [None]:
from fosforio import snowflake
from fosforml import *
from fosforml.constants import MLModelFlavours
from fosforio import get_dataframe
from matplotlib import pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', 500)
import seaborn as sns
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import warnings; warnings.simplefilter('ignore')
from joblib import dump, load
import requests
from tqdm import tqdm
import time
import calendar

from time import sleep
import configparser
from dateutil.relativedelta import relativedelta
import datetime
from dateutil.easter import easter
%matplotlib inline

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


# connect to snowflake

In [None]:
# Open the Snowflake connection registered under the name TTH_REV_OPT_CXN (via fosforio),
# then fetch the BOOKINGS_TRANSFORMED dataset and display it.
# NOTE(review): get_dataframe presumably returns a pandas DataFrame, given how `data` is used below — confirm.
snowflake.get_connection(connection_name="TTH_REV_OPT_CXN")
data = get_dataframe("BOOKINGS_TRANSFORMED")
data

In [None]:
# Lower-case every column label; the following cells address columns by lower-case name.
data.columns = [x.lower() for x in data.columns]

In [None]:
# Display the column labels of the loaded bookings frame.
data.columns

# Removing cancellations and no-shows and keeping City hotel data only


In [None]:
# Keep bookings that were not cancelled and whose status is anything other than 'No-Show'.
not_canceled = data['is_canceled'].eq(0)
not_no_show = data['reservation_status'].ne('No-Show')
df = data[not_canceled & not_no_show]
df

In [None]:
# Per hotel / market segment / reserved room type: mean ADR and the count of non-null
# reservation_status_date_transformed values.
df.groupby(['hotel','market_segment','reserved_room_type']).agg({'adr':'mean','reservation_status_date_transformed':'count'})


In [None]:
# Drop 'Complementary' bookings, then keep only reserved room types A, D and E.
# Note that `data` is rebound here from the raw load to this filtered subset.
non_complementary = df[df.market_segment != 'Complementary']
data = non_complementary[non_complementary.reserved_room_type.isin(['A', 'D', 'E'])]
data.reserved_room_type.value_counts()


In [None]:
# Print column dtypes and non-null counts for the filtered bookings.
data.info()


In [None]:
# Total room nights per booking = week nights + weekend nights.
# `assign` builds a new frame instead of writing a column onto `data`, which is a filtered
# slice of `df`; the in-place write is the pattern behind pandas' SettingWithCopyWarning
# (silenced in this notebook by the global warnings filter). Re-running the cell is idempotent.
data = data.assign(total_rns=data['stays_in_week_nights'] + data['stays_in_weekend_nights'])

# Narrow view with only the columns relevant to the stay-date transformation.
data_to_transform = data[['hotel','reserved_room_type','arrival_date_transformed','total_rns','adr']]
data_to_transform

In [None]:
import os
from snowflake.snowpark.session import Session

# Connection settings are read from environment variables with these exact (lower-case) names.
# os.getenv returns None for any variable that is not set.
user, password, account = os.getenv("user"), os.getenv("password"), os.getenv("account")
warehouse, database, schema = os.getenv("warehouse"), os.getenv("database"), os.getenv("schema")
role = os.getenv("role")

connection_params = {
    "user": user,
    "password": password,
    "account": account,
    "warehouse": warehouse,
    "database": database,
    "schema": schema,
    "role": role,
}

session = Session.builder.configs(connection_params).create()

# Explicitly select the warehouse, database and schema on the new session, in that order.
context_statements = (
    'use warehouse {};'.format(warehouse),
    'use database {};'.format(database),
    'use schema {}.{};'.format(database, schema),
)
for context_statement in context_statements:
    session.sql(context_statement).collect()

# Create a new dataframe to store the data by stay date

In [None]:
# Expand every booking into one row per stay night: `total_rns` consecutive dates starting at
# the booking's expected arrival date, each carrying one room night and the booking's ADR.
# NOTE(review): assumes `data` has an `expected_arrival_date` column; it is not created in the
# cells shown above — confirm it comes from the source table.
#
# The per-booking frames are collected in a list and concatenated once at the end. Growing the
# result with pd.concat inside the loop re-copies all accumulated rows on every iteration.
stay_frames = []

for _, row in data.iterrows():
    num_stay_dates = row['total_rns']
    try:
        # Create a row for each stay date
        expanded_booking = pd.DataFrame({
            'hotel': row['hotel'],
            'room_type': row['reserved_room_type'],
            'arrival_date': pd.date_range(start=row['expected_arrival_date'], periods=num_stay_dates),
            'total_rns': 1,
            'adr': row['adr']
        })
        stay_frames.append(expanded_booking)
    except ValueError as e:
        # Best effort: report the booking that could not be expanded and carry on with the rest.
        print(f"Error processing booking for {row['hotel']} on {row['expected_arrival_date']} : {num_stay_dates} {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was expanded.
expanded_df = pd.concat(stay_frames, ignore_index=True) if stay_frames else pd.DataFrame()

expanded_df

# Sort the final dataframe by date

In [None]:
# Order the per-night rows chronologically by stay date (stored in `arrival_date`) and renumber
# the index from 0. This cell previously sorted by `adr`, which did not match its stated
# purpose of sorting by date.
expanded_df = expanded_df.sort_values('arrival_date').reset_index(drop=True)
expanded_df

# Building seasonality

In [None]:
import holidays

# Portuguese public holidays for 2020-2023, reduced to New Year, Easter and Christmas by their
# Portuguese names. Note that the assignment below rebinds the name `holidays` from the
# imported module to a plain {date: name} dict.
holiday_dates = holidays.CountryHoliday('PT', years=[2020,2021,2022,2023])
wanted_holiday_names = ['Ano Novo', 'Páscoa', 'Dia de Natal']
holidays = {
    holiday_day: holiday_name
    for holiday_day, holiday_name in holiday_dates.items()
    if holiday_name in wanted_holiday_names
}

# rename holiday columns

In [None]:
# Rename columns labelled with the Portuguese holiday names to new_year / easter / christmas.
# NOTE(review): expanded_df, as built earlier in this notebook, has no columns with these
# Portuguese labels, so this rename looks like a no-op — confirm whether it is still needed.
expanded_df = expanded_df.rename({'Ano Novo':'new_year','Páscoa':'easter','Dia de Natal':'christmas'},axis=1)

In [None]:
def generate_holiday_dates(start_year, end_year):
    """Map New Year's Day, Easter and Christmas Day to flag names for a span of years.

    Args:
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        dict keyed by date with values 'new_year' (1 January), 'easter' (the date returned by
        ``easter(year)``) or 'christmas' (25 December). Empty when ``start_year > end_year``.
    """
    dates_to_flag = {}
    for year in range(start_year, end_year + 1):
        # Inserted in the same order for every year: New Year, Easter, Christmas.
        dates_to_flag.update({
            datetime.date(year, 1, 1): 'new_year',
            easter(year): 'easter',
            datetime.date(year, 12, 25): 'christmas',
        })
    return dates_to_flag

# Rebind `holidays` to the date -> flag-name mapping for 2020-2023, replacing the dict built
# from the `holidays` package in an earlier cell.
holidays = generate_holiday_dates(2020, 2023)


In [None]:
# Display the holiday date -> flag-name mapping.
holidays

In [None]:
# Day offsets applied around each holiday: the same number of days before (negative offset)
# and after (positive offset) the holiday date.
holiday_offset_days = {'new_year': 1, 'easter': 2, 'christmas': 3}

pre_range_offset = {
    flag_name: relativedelta(days=-n_days)
    for flag_name, n_days in holiday_offset_days.items()
}

post_range_offset = {
    flag_name: relativedelta(days=n_days)
    for flag_name, n_days in holiday_offset_days.items()
}

In [None]:
# Create one 0/1 flag column per distinct holiday name, initialised to 0.
for holiday in dict.fromkeys(holidays.values()):
    expanded_df[holiday] = 0

# Set the flag to 1 on the holiday itself and on the dates at the pre/post offsets.
for arrival_date, name in holidays.items():
    # The keys of `holidays` are datetime.date objects. pandas treats `datetime64 column ==
    # datetime.date` as a type mismatch (every row compares unequal), so the holiday day itself
    # was never flagged. Converting to a Timestamp makes all three comparisons consistent.
    holiday_ts = pd.Timestamp(arrival_date)
    flagged_dates = [holiday_ts]

    # NOTE(review): only the single date at each offset is flagged, not every day between the
    # offset and the holiday — confirm whether a full range was intended.
    pre_offset = pre_range_offset.get(name)
    if pre_offset:
        pre_date = holiday_ts + pre_offset
        flagged_dates.append(pre_date)

    post_offset = post_range_offset.get(name)
    if post_offset:
        post_date = holiday_ts + post_offset
        flagged_dates.append(post_date)

    expanded_df.loc[expanded_df['arrival_date'].isin(flagged_dates), name] = 1

In [None]:
# Sanity check: preview the first rows after the holiday flag columns were added.
expanded_df.head()

In [None]:
# Add the full weekday name (`%A`) and full month name (`%B`) of each stay date.
stay_dates = expanded_df['arrival_date']
expanded_df['dow'] = stay_dates.dt.strftime('%A')
expanded_df['month'] = stay_dates.dt.strftime('%B')

In [None]:
# Sanity check: preview the first rows after adding the `dow` and `month` columns.
expanded_df.head()

In [None]:
# (rows, columns) of the per-night frame.
expanded_df.shape

In [None]:
# Column dtypes and non-null counts of the per-night frame.
expanded_df.info()

In [None]:
# Show the per-night rows whose `easter` flag is 0.
expanded_df[expanded_df['easter'] == 0]

In [None]:
# Round ADR to two decimal places.
expanded_df['adr'] = expanded_df['adr'].round(2)

In [None]:
# Aggregate the per-night rows to one row per hotel / room type / stay date:
# room nights are summed, ADR is averaged, each holiday flag takes its max (1 if any row in
# the group is flagged), and `dow` / `month` take the group's first value.
# Aggregations are given by name: passing the builtin `sum` or `np.mean` callables to `agg`
# is deprecated in recent pandas, and the string names keep the same results.
holidays_df = (
    expanded_df
    .groupby(["hotel", "room_type", "arrival_date"])
    .agg({
        "total_rns": "sum",
        "adr": "mean",
        "new_year": "max",
        "easter": "max",
        "christmas": "max",
        "dow": "first",
        "month": "first",
    })
    .reset_index()
)

In [None]:
# Display the per-date aggregate.
holidays_df

In [None]:
# The averaged ADR can carry many decimals again; round it back to two places and display.
holidays_df['adr'] = holidays_df['adr'].round(2)
holidays_df

In [None]:
# Show the aggregated rows whose `easter` flag is 0.
holidays_df[holidays_df['easter'] == 0]

In [None]:
#holidays_df.columns = map(lambda x: str(x).upper(), holidays_df.columns)
#session.write_pandas(holidays_df, table_name="HOLIDAYS_DATASET", database=database, schema="TTH_REV_OPT_SCHEMA",  auto_create_table=True, overwrite=True)

In [None]:
# Display the column labels of the per-date aggregate.
holidays_df.columns

In [None]:
## PRICING MODEL 1 (WITHOUT HOLIDAYS)
# Keep only the per-night rows on which none of the three holiday flags is set.
holiday_flag_total = expanded_df[['new_year', 'easter', 'christmas']].sum(axis=1)
non_holidays = expanded_df[holiday_flag_total == 0]
non_holidays.head()

In [None]:
# Stack the non-holiday per-night rows on top of the per-date aggregate and renumber the index.
# NOTE(review): holidays_df is aggregated from all of expanded_df (not only holiday dates),
# while non_holidays is still one row per stay night. The result therefore mixes two
# granularities and non-holiday nights are represented in both inputs — confirm this is intended.
final_dataset = pd.concat([non_holidays , holidays_df], axis=0).reset_index(drop=True)
final_dataset
#final_dataset.to_csv("./dataset_latest.csv",index=True)

In [None]:
# (rows, columns) of the per-date aggregate.
holidays_df.shape

In [None]:
# Ordered (hotel, room type, value) rules used by add_rns; checked top to bottom.
# NOTE(review): the figures look like fixed per-hotel/room-type room-night capacities — confirm their source.
_RNS_RULES = (
    ('City Hotel', 'A', 406),
    ('City Hotel', 'D', 135),
    ('City Hotel', 'E', 30),
    ('Resort Hotel', 'A', 360),
    ('Resort Hotel', 'D', 125),
    ('Resort Hotel', 'E', 100),
)


def add_rns(row):
    """Return the fixed value configured for the row's hotel and room type.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) read via ``row['hotel']`` and
            ``row['room_type']``.

    Returns:
        The integer configured for the (hotel, room type) pair, or ``None`` when the
        combination is not one of the six known pairs.
    """
    for hotel_name, room_code, rns_value in _RNS_RULES:
        # `room_type` is only looked up once the hotel matches, as in an if/elif chain.
        if row['hotel'] == hotel_name and row['room_type'] == room_code:
            return rns_value
    return None

In [None]:
# Create the `dummy_rns` column (initialised to 0), then fill it row by row from add_rns.
final_dataset['dummy_rns'] = 0
dummy_rns_per_row = final_dataset.apply(add_rns, axis=1)
final_dataset['dummy_rns'] = dummy_rns_per_row

In [None]:
# Display the dataset, now including the `dummy_rns` column.
final_dataset

In [None]:
# Empty frame that fixes the result column layout; presumably filled in by later cells.
results = pd.DataFrame(columns=['month', 'dow', 'optimal_rate', 'expected_rn','expected_rev','optimal_rate_lim_inv'])


In [None]:
# Total room nights per calendar day, using the non-holiday per-night rows only.
# The per-night frame stores the stay date in `arrival_date` and the room nights in
# `total_rns`; it has no `date` or `rn` columns, so grouping on those names raises KeyError.
# The daily total is named `rn` so the final column names stay rn_sum / rn_mean / rn_median.
daily_rns = (
    non_holidays
    .groupby(['arrival_date', 'dow', 'month'])
    .agg(rn=('total_rns', 'sum'))
    .reset_index()
)

# Sum, mean and median of the daily totals for each day-of-week / month combination.
daily_rns = daily_rns.groupby(['dow', 'month']).agg({'rn': ['sum', 'mean', 'median']}).reset_index()

# Flatten the two-level column index by joining the levels with '_'. The key columns have an
# empty second level and therefore become 'dow_' and 'month_'.
daily_rns.columns = ['_'.join(col) for col in daily_rns.columns]
daily_rns