# Setup

In [1]:
!pip install feature-engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.6.0-py2.py3-none-any.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.4/319.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.0


In [2]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from feature_engine.creation import CyclicalFeatures
from scipy import stats

import pickle

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/IS4303/notebooks/new_cleaned

Mounted at /content/drive
/content/drive/MyDrive/IS4303/notebooks/new_cleaned


# Helpers

In [4]:
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1i-JbDm2nY15NnYl62GxlsBCoELp1HJZ3") # original dataset

def transform_test_df(data):
  data = data.copy()
  data['country'].fillna(f'unknown', inplace=True)
  data['agent'].fillna(0, inplace=True)
  data['company'].fillna(0, inplace=True)
  data['children'].fillna(0, inplace=True)
  data = data.drop_duplicates(keep="first")
  data = data[data['country'] != 'unknown']
  return data

df = transform_test_df(df)
df = df[df['hotel'] == 'Resort Hotel']
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,0.0,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,0.0,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,0.0,0.0,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,0.0,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,0.0,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [5]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33522 entries, 0 to 40059
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           33522 non-null  object 
 1   is_canceled                     33522 non-null  int64  
 2   lead_time                       33522 non-null  int64  
 3   arrival_date_year               33522 non-null  int64  
 4   arrival_date_month              33522 non-null  object 
 5   arrival_date_week_number        33522 non-null  int64  
 6   arrival_date_day_of_month       33522 non-null  int64  
 7   stays_in_weekend_nights         33522 non-null  int64  
 8   stays_in_week_nights            33522 non-null  int64  
 9   adults                          33522 non-null  int64  
 10  children                        33522 non-null  float64
 11  babies                          33522 non-null  int64  
 12  meal                            

In [6]:
cleaned_1 = pd.read_csv('1cleaned.csv')
pre_cols = ['hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'deposit_type']
pre_ohe = pd.get_dummies(cleaned_1[pre_cols], prefix=pre_cols)
cleaned_2 = pd.read_csv('2cleaned.csv')
col_list = cleaned_2.columns

In [7]:
with open('FittedCyclicalEncoder.pkl', 'rb') as f:
  cyclical = pickle.load(f)

In [8]:
# just need class definition to load pipeline
class Cleaner(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass
  
  def transform(self, X):
    X = X.copy()

    X = X.drop(columns=['arrival_date_year', 'reservation_status', 'reservation_status_date', 'arrival_date_day_of_month'])

    X['country'].fillna(f'unknown', inplace=True)
    X['agent'].fillna(0, inplace=True)
    X['company'].fillna(0, inplace=True)
    X['children'].fillna(0, inplace=True)

    X = X.drop_duplicates(keep="first")

    X = X.drop(columns=['company'])
    X = X.drop(columns=['agent'])

    X = X[X['country'] != 'unknown']
    return X

class Encoder(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def add_missing_columns(self, X):
    missing_columns = [col for col in pre_ohe if col not in X.columns]
    if missing_columns:
      df_missing = pd.DataFrame(0, index=X.index, columns=missing_columns)
      X = pd.concat([X, df_missing], axis=1)
    return X
  
  def transform(self, X):
    X = X.copy()

    cols = ['hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'deposit_type']
    ohe = pd.get_dummies(X[cols], prefix=cols)
    X = pd.concat([X, ohe], axis=1)
    X = X.drop(cols, axis=1)
    X = self.add_missing_columns(X)

    if 'is_canceled' not in X:
      X['is_canceled'] = 0 # doesn't matter, just because cyclical expects this after fitting

    months = {"JANUARY" : 1,"FEBRUARY" : 2,"MARCH" : 3,"APRIL": 4, "MAY": 5, "JUNE": 6, "JULY": 7, "AUGUST": 8, "SEPTEMBER": 9, "OCTOBER": 10, "NOVEMBER": 11, "DECEMBER": 12}
    X['arrival_date_month'] = X['arrival_date_month'].apply(lambda x: months[str(x).upper()])

    X.drop(X.columns.difference(col_list), 1, inplace=True)

    cyclical_features_X = cyclical.transform(X)
    cos_only = cyclical_features_X.drop(["arrival_date_month_sin", "arrival_date_week_number_sin"], axis=1)

    X = cos_only.drop(columns=['is_canceled'])
    return X

In [9]:
with open('Pre_Pipeline.pkl', 'rb') as f:
  pipeline = pickle.load(f)

with open('model.pkl', 'rb') as f:
  model = pickle.load(f)

# Dynamic discounts

- assuming nothing is null or duplicates in input dataframe for new data

In [10]:
def get_proba(data, pipeline, model):
  data = data.copy()
  data = pipeline.transform(data)
  probas = model.predict_proba(data)[:,1]
  return probas

In [11]:
import pandas as pd
import numpy as np

def month_to_num(month):
  return pd.to_datetime(month, format='%B').month

def get_season(month):
  if 3 <= month <= 5:
    return 'Spring'
  elif 6 <= month <= 8:
    return 'Summer'
  elif 9 <= month <= 11:
    return 'Autumn'
  else:
    return 'Winter'

def dynamic_discounts(df, model, pipeline, discounts=None):
  backup = df.copy()
  df = df.copy()
  backup = df.copy()

  if discounts is None:
    discounts = {
      'low': 0.1,
      'medium': 0.3,
      'high': 0.5,
    }

  df['arrival_date'] = pd.to_datetime(df[['arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month']].astype(str).agg('-'.join, axis=1))
  df['arrival_date_month_num'] = df['arrival_date_month'].apply(month_to_num)
  df['season'] = df['arrival_date_month_num'].apply(get_season)
  df['day_of_week'] = df['arrival_date'].apply(lambda x: x.weekday())

  proba = get_proba(backup, pipeline, model) 
  df['cancellation_proba'] = proba

  def apply_discount(row):
    season_weight = 1
    if row['season'] == 'Summer':
      season_weight = 0.9
    elif row['season'] == 'Winter':
      season_weight = 1.1

    day_of_week_weight = 1
    if row['day_of_week'] in [4, 5]: 
      day_of_week_weight = 0.9

    lead_time_weight = 1
    if row['lead_time'] > 180:
      lead_time_weight = 0.9
    elif row['lead_time'] < 30:
      lead_time_weight = 1.1

    discount = discounts['low']
    if row['cancellation_proba'] > 0.7:
      discount = discounts['high']
    elif row['cancellation_proba'] > 0.5:
      discount = discounts['medium']

    base_price = row['adr']
    return base_price * (1 - discount) * season_weight * day_of_week_weight * lead_time_weight

  df['cancellation_proba'].fillna(0.0, inplace=True)
  df['base_adr_price'] = df['adr']
  df['discounted_adr_price'] = df.apply(apply_discount, axis=1)

  backup['adr'] = df['discounted_adr_price']
  proba2 = get_proba(backup, pipeline, model)
  df['proba2'] = proba2
  df['reduced_risk'] = df['proba2'] < df['cancellation_proba']

  return df[['cancellation_proba', 'season', 'day_of_week', 'lead_time', 'base_adr_price', 'discounted_adr_price', 'reduced_risk']]

In [12]:
dynamic_discounts(df.iloc[2:10], model, pipeline)

  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)


Unnamed: 0,cancellation_proba,season,day_of_week,lead_time,base_adr_price,discounted_adr_price,reduced_risk
2,0.058417,Summer,2,7,75.0,66.825,True
3,0.129602,Summer,2,13,75.0,66.825,True
4,0.322977,Summer,2,14,98.0,87.318,True
6,0.728143,Summer,2,0,107.0,52.965,True
7,0.479478,Summer,2,9,103.0,91.773,True
8,0.756104,Summer,2,85,82.0,36.9,True
9,0.689074,Summer,2,75,105.5,66.465,True
10,0.873588,Summer,2,23,123.0,60.885,True
