# Helpers

In [1]:
!pip install feature-engine
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from feature_engine.creation import CyclicalFeatures
from scipy import stats

import pickle

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/IS4303/notebooks/new_cleaned

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.6.0-py2.py3-none-any.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.4/319.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.0
Mounted at /content/drive
/content/drive/MyDrive/IS4303/notebooks/new_cleaned


In [23]:
df = pd.read_csv("https://drive.google.com/uc?export=download&id=1i-JbDm2nY15NnYl62GxlsBCoELp1HJZ3") # original dataset

def transform_test_df(data):
  data = data.copy()
  data['country'].fillna(f'unknown', inplace=True)
  data['agent'].fillna(0, inplace=True)
  data['company'].fillna(0, inplace=True)
  data['children'].fillna(0, inplace=True)
  data = data.drop_duplicates(keep="first")
  data = data[data['country'] != 'unknown']
  return data

df = transform_test_df(df)
df = df[df['hotel'] == 'Resort Hotel']

cleaned_1 = pd.read_csv('1cleaned.csv')
pre_cols = ['hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'deposit_type']
pre_ohe = pd.get_dummies(cleaned_1[pre_cols], prefix=pre_cols)
cleaned_2 = pd.read_csv('2cleaned.csv')
col_list = cleaned_2.columns

with open('FittedCyclicalEncoder.pkl', 'rb') as f:
  cyclical = pickle.load(f)
  
# just need class definition to load pipeline
class Cleaner(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass
  
  def transform(self, X):
    X = X.copy()

    X = X.drop(columns=['arrival_date_year', 'reservation_status', 'reservation_status_date', 'arrival_date_day_of_month'])

    X['country'].fillna(f'unknown', inplace=True)
    X['agent'].fillna(0, inplace=True)
    X['company'].fillna(0, inplace=True)
    X['children'].fillna(0, inplace=True)

    X = X.drop_duplicates(keep="first")

    X = X.drop(columns=['company'])
    X = X.drop(columns=['agent'])

    X = X[X['country'] != 'unknown']
    return X

class Encoder(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def add_missing_columns(self, X):
    missing_columns = [col for col in pre_ohe if col not in X.columns]
    if missing_columns:
      df_missing = pd.DataFrame(0, index=X.index, columns=missing_columns)
      X = pd.concat([X, df_missing], axis=1)
    return X
  
  def transform(self, X):
    X = X.copy()

    cols = ['hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'deposit_type']
    ohe = pd.get_dummies(X[cols], prefix=cols)
    X = pd.concat([X, ohe], axis=1)
    X = X.drop(cols, axis=1)
    X = self.add_missing_columns(X)

    if 'is_canceled' not in X:
      X['is_canceled'] = 0 # doesn't matter, just because cyclical expects this after fitting

    months = {"JANUARY" : 1,"FEBRUARY" : 2,"MARCH" : 3,"APRIL": 4, "MAY": 5, "JUNE": 6, "JULY": 7, "AUGUST": 8, "SEPTEMBER": 9, "OCTOBER": 10, "NOVEMBER": 11, "DECEMBER": 12}
    X['arrival_date_month'] = X['arrival_date_month'].apply(lambda x: months[str(x).upper()])

    X.drop(X.columns.difference(col_list), 1, inplace=True)

    cyclical_features_X = cyclical.transform(X)
    cos_only = cyclical_features_X.drop(["arrival_date_month_sin", "arrival_date_week_number_sin"], axis=1)

    X = cos_only.drop(columns=['is_canceled'])
    return X

with open('Pre_Pipeline.pkl', 'rb') as f:
  pipeline = pickle.load(f)

with open('model.pkl', 'rb') as f:
  model = pickle.load(f)

# Reducing risk

In [28]:
def get_proba(data, pipeline, model):
  data = data.copy()
  data = pipeline.transform(data)
  probas = model.predict_proba(data)[:,1]
  return probas

In [25]:
meals = df['meal'].unique()
assigned_rooms = df['assigned_room_type'].unique()

In [26]:
meals

array(['BB', 'FB', 'HB', 'SC', 'Undefined'], dtype=object)

In [27]:
assigned_rooms

array(['C', 'A', 'D', 'E', 'G', 'F', 'I', 'B', 'H', 'L'], dtype=object)

In [64]:
def reduce_risk_gs(df, pipeline, model):
  df = df.copy()
  
  orig_meal = df['meal']
  orig_assigned_rooms = df['assigned_room_type']

  proba = get_proba(df, pipeline, model)
  df['cancellation_proba'] = proba

  def apply_func(row):
    if row['cancellation_proba'] < 0.5:
      row = row.drop('cancellation_proba')
      return pd.Series({'At risk': False, 'Reduced risk': False, 'New Meal': row['meal'], 'Original Meal': row['meal'], 'New Room': row['assigned_room_type'], 'Original Room': row['assigned_room_type']})

    for meal in meals:
      for room in assigned_rooms:
        gs_row = row.copy()
        gs_row['meal'] = meal
        gs_row['assigned_room_type'] = room
        row_df = pd.DataFrame([gs_row])
        proba = get_proba(row_df, pipeline, model)[0]
        
        if proba < 0.5:
          return pd.Series({'At risk': True, 'Reduced risk': True, 'New Meal': gs_row['meal'], 'Original Meal': row['meal'], 'New Room': gs_row['assigned_room_type'], 'Original Room': row['assigned_room_type']})

    return pd.Series({'At risk': True, 'Reduced risk': False, 'New Meal': row['meal'], 'Original Meal': row['meal'], 'New Room': row['assigned_room_type'], 'Original Room': row['assigned_room_type']})
  
  return df.apply(apply_func, axis=1)

In [65]:
reduce_risk_gs(df.head(10), pipeline, model)

  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.columns.difference(col_list), 1, inplace=True)
  X.drop(X.col

Unnamed: 0,At risk,Reduced risk,New Meal,Original Meal,New Room,Original Room
0,True,True,BB,BB,I,C
1,True,True,BB,BB,I,C
2,False,False,BB,BB,C,C
3,False,False,BB,BB,A,A
4,False,False,BB,BB,A,A
6,True,True,BB,BB,I,C
7,False,False,FB,FB,C,C
8,True,True,BB,BB,C,A
9,True,True,BB,HB,C,D
10,True,True,BB,BB,I,E


- `At Risk` is if the booking is originally likely to cancel.
- `Reduced Risk` is if we made the probability from >= 0.5 to < 0.5.
- "New .." and "Original .." are the changed aspect and previously assigned aspect respectively.