# Adding encoding to existing scaler/PCA/RFE pipeline

### Assumptions:
- "is_canceled" does not exist in new data
- No null data
- No duplicates for new data

In [1]:
!pip install feature-engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.6.0-py2.py3-none-any.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.4/319.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.0


In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from feature_engine.creation import CyclicalFeatures
from scipy import stats

import pandas as pd
import numpy as np

import pickle

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/IS4303/notebooks/new_cleaned

Mounted at /content/drive
/content/drive/MyDrive/IS4303/notebooks/new_cleaned


In [30]:
# Rebuild the one-hot columns created during the earlier cleaning run (the
# scaler expects them); Encoder adds any of these that new data does not produce.
pre_cols = [
  'hotel', 'meal', 'country', 'market_segment', 'distribution_channel',
  'reserved_room_type', 'assigned_room_type', 'customer_type', 'deposit_type',
]
df = pd.read_csv('1cleaned.csv')
pre_ohe = pd.get_dummies(df[pre_cols], prefix=pre_cols)

# Column set of the second cleaned file; Encoder drops every column outside it.
cleaned_2 = pd.read_csv('2cleaned.csv')
col_list = cleaned_2.columns

In [14]:
def _load_pickle(path):
  """Return the object deserialized from the pickle file at `path`."""
  # NOTE: unpickling can execute arbitrary code - only load trusted files.
  with open(path, 'rb') as handle:
    return pickle.load(handle)

# Previously saved pipeline and cyclical encoder.
pipeline = _load_pickle('PCA_RFE_pipeline.pkl')
cyclical = _load_pickle('FittedCyclicalEncoder.pkl')

In [15]:
# Display the pipeline loaded from 'PCA_RFE_pipeline.pkl' to inspect its steps.
pipeline

In [16]:
# Display the encoder loaded from 'FittedCyclicalEncoder.pkl'.
cyclical

In [31]:
class Cleaner(BaseEstimator, TransformerMixin):
  """Stateless clean-up of raw booking rows ahead of encoding.

  `transform` removes columns that are not used downstream, fills missing
  values, drops duplicate rows and drops rows whose country is unknown.
  Note that the output can therefore have fewer rows than the input.
  """

  def __init__(self):
    pass

  def fit(self, X, y=None):
    """Nothing is learned; present so Pipeline.fit / fit_transform work.

    Returns:
      self
    """
    return self

  def transform(self, X):
    """Return a cleaned copy of `X`; the input frame is not modified.

    Args:
      X: DataFrame containing at least the columns referenced below.

    Returns:
      DataFrame without the dropped columns, duplicate rows, or rows that
      had a missing country.

    Raises:
      KeyError: if a column that must be dropped or filtered on is absent.
    """
    X = X.drop(columns=['arrival_date_year', 'reservation_status', 'reservation_status_date', 'arrival_date_day_of_month'])

    # Fill on the frame itself: the chained `X[col].fillna(..., inplace=True)`
    # form acts on a temporary under pandas Copy-on-Write and is deprecated.
    X = X.fillna({'country': 'unknown', 'agent': 0, 'company': 0, 'children': 0})

    # 'agent' and 'company' still take part in the duplicate check, so they
    # are only removed afterwards.
    X = X.drop_duplicates(keep="first")
    X = X.drop(columns=['company', 'agent'])

    # Rows that had no country are discarded.
    X = X[X['country'] != 'unknown']
    print(X.shape)
    return X

class Encoder(BaseEstimator, TransformerMixin):
  """One-hot and cyclical encoding aligned to the previously fitted columns.

  Depends on three notebook-level objects that must exist when `transform`
  runs (including after this object is unpickled elsewhere):
    - `pre_ohe`:  one-hot frame whose column names are the expected dummies
    - `col_list`: columns that are kept before cyclical encoding
    - `cyclical`: encoder loaded from 'FittedCyclicalEncoder.pkl'
  """

  # Month name (upper-cased) -> month number.
  _MONTHS = {"JANUARY" : 1,"FEBRUARY" : 2,"MARCH" : 3,"APRIL": 4, "MAY": 5, "JUNE": 6, "JULY": 7, "AUGUST": 8, "SEPTEMBER": 9, "OCTOBER": 10, "NOVEMBER": 11, "DECEMBER": 12}

  def __init__(self):
    pass

  def fit(self, X, y=None):
    """Nothing is learned; present so Pipeline.fit / fit_transform work.

    Returns:
      self
    """
    return self

  def add_missing_columns(self, X):
    """Append a zero-filled column for every `pre_ohe` column absent from `X`.

    Returns:
      `X` itself when nothing is missing, otherwise a new concatenated frame.
    """
    missing_columns = [col for col in pre_ohe if col not in X.columns]
    if missing_columns:
      df_missing = pd.DataFrame(0, index=X.index, columns=missing_columns)
      X = pd.concat([X, df_missing], axis=1)
    return X

  def transform(self, X):
    """Encode categorical and cyclical features of a cleaned frame.

    Args:
      X: DataFrame with the categorical columns listed in `cols` and an
        'arrival_date_month' column holding month names.

    Returns:
      DataFrame restricted to `col_list`, with the cyclical encoder's output
      minus the two sine columns and minus 'is_canceled'.

    Raises:
      KeyError: if a required column is absent or a month name is unknown.
    """
    X = X.copy()

    cols = ['hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type', 'deposit_type']
    ohe = pd.get_dummies(X[cols], prefix=cols)
    X = pd.concat([X, ohe], axis=1)
    X = X.drop(columns=cols)
    X = self.add_missing_columns(X)

    if 'is_canceled' not in X:
      X['is_canceled'] = 0 # doesn't matter, just because cyclical expects this after fitting

    X['arrival_date_month'] = X['arrival_date_month'].apply(lambda x: self._MONTHS[str(x).upper()])

    # Keep only columns that were used during fit. `columns=` replaces the
    # positional axis argument, which raised a FutureWarning and is rejected
    # by pandas 2.x.
    X = X.drop(columns=X.columns.difference(col_list))

    cyclical_features_X = cyclical.transform(X)
    cos_only = cyclical_features_X.drop(columns=["arrival_date_month_sin", "arrival_date_week_number_sin"])

    X = cos_only.drop(columns=['is_canceled'])
    return X

# Put the raw-data cleaning and encoding stages in front of the steps loaded
# from 'PCA_RFE_pipeline.pkl', so the combined pipeline accepts raw bookings.
updated = Pipeline([
  ('cleaner', Cleaner()),  # step name was previously misspelled 'clearner'
  ('encoder', Encoder()),
  *pipeline.steps
])

updated

## Testing

In [32]:
# Initial (raw) dataset, used to exercise the combined pipeline end to end.
RAW_DATA_URL = "https://drive.google.com/uc?export=download&id=1i-JbDm2nY15NnYl62GxlsBCoELp1HJZ3"
df = pd.read_csv(RAW_DATA_URL)
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [33]:
# Run the raw frame through every step; the shape printed below comes from
# the print(X.shape) call inside Cleaner.transform.
res = updated.transform(df)
res

(86615, 26)


  X.drop(X.columns.difference(col_list), 1, inplace=True)


array([[ 4.01121765,  1.81184762,  1.62674106, ...,  0.04752681,
         0.68875619, -0.50950693],
       [ 3.83859736,  2.83975844,  2.20252454, ..., -0.27466897,
         0.15849956, -1.3424417 ],
       [ 3.43140218,  0.52655765,  1.00394109, ...,  0.70021238,
         0.25763468,  1.39043911],
       ...,
       [-3.87769923,  0.53981846,  0.18941393, ..., -0.1684134 ,
        -0.4750981 , -0.08973418],
       [-1.0439633 ,  0.14340274,  0.22350922, ..., -0.01787914,
        -0.09105072,  0.54060288],
       [-2.02093817,  0.94215341,  1.07270749, ..., -0.15060708,
        -0.17841879, -0.10473233]])

## Saving

In [34]:
# Persist the combined pipeline. pickle stores Cleaner and Encoder by
# reference, so whoever loads this file must define both classes first, along
# with the globals Encoder.transform reads (pre_ohe, col_list, cyclical).
with open('Pre_Pipeline.pkl', 'wb') as f:
  pickle.dump(updated, f)