In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
# Never truncate columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
# Name of the column whose per-bin mean is plotted at the end of the notebook
# (presumably the click label column).
# NOTE(review): 'cliked' looks like a misspelling of 'clicked' — confirm it
# matches the real column name in the training data before changing it.
target = 'cliked'

In [None]:
# Load the training frame from a local pickle.
# NOTE(review): unpickling executes arbitrary code — only load files from a
# trusted source.
train = pd.read_pickle("data/train.pkl")
# Test-set loading is currently disabled; nothing visible here uses `test`.
#test = pd.read_pickle("data/test.pkl")

In [None]:
import src.feature_engineering as fe

In [None]:
# Column labels removed from the training frame before any encoding is applied.
to_drop = [
    'offer_id',
    'context.booking.param10',
    'context.booking.param11',
    'context.booking.param12',
    'context.booking.param13',
    'context.booking.param14',
    'context.booking.param15',
    'context.booking.param16',
    'id',
    'schema',
    'context.param3',
    'context.leg.param12',
    'context.leg.param13',
    'context.leg.param14',
    'context.leg.param15',
]

In [None]:
# Ordered list of transformer instances; each one's `.transform` is applied in
# sequence to the training frame in the next cell.
pipeline = [
    fe.DatetimeEncoder()
]

In [None]:
# Work on an independent copy of `train` (so the raw frame stays untouched),
# drop the unused columns, then run every pipeline step in order.
df_transformed = train.copy().drop(columns=to_drop)
for encoder in pipeline:
    df_transformed = encoder.transform(df_transformed)

In [None]:
# Count how many columns of each dtype remain after the transformations.
df_transformed.dtypes.value_counts()

# Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class ValEncoder(BaseEstimator, TransformerMixin):
    """Clip one numeric column into a closed range.

    The transformer is stateless: ``fit`` learns nothing and exists only so
    that ``fit_transform`` (inherited from ``TransformerMixin``) and
    scikit-learn pipelines, which both call ``fit``, work with this class.

    Parameters
    ----------
    col : str, default 'context.booking.param5'
        Name of the column to clip.
    lower : scalar, default 0
        Values below this bound are replaced by it.
    upper : scalar, default 10
        Values above this bound are replaced by it.
    """

    def __init__(self, col='context.booking.param5', lower=0, upper=10):
        # Parameters are stored unmodified so that BaseEstimator.get_params /
        # set_params can round-trip them.
        self.col = col
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (no state is learned)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``col`` clipped to ``[lower, upper]``.

        The input frame is not modified.
        """
        X_tr = X.copy()
        X_tr[self.col] = X_tr[self.col].clip(self.lower, self.upper)
        return X_tr

In [None]:
# Sanity check: summary statistics of the column after ValEncoder's transform.
ValEncoder().transform(df_transformed)['context.booking.param5'].describe()

In [None]:
class TrigonometricTransformer(BaseEstimator, TransformerMixin):
    """Add sine/cosine encodings of a month column and a day column.

    For each of the two columns, ``<col>_SIN`` and ``<col>_COS`` are appended,
    computed from the angle ``value * 2 * pi / period`` with a period of 12
    for the month column and 31 for the day column. The source columns are
    kept.

    The transformer is stateless: ``fit`` learns nothing and exists only so
    that ``fit_transform`` (inherited from ``TransformerMixin``) and
    scikit-learn pipelines, which both call ``fit``, work with this class.

    Parameters
    ----------
    month_col : str, default 'data_month'
        Name of the month column.
    day_col : str, default 'data_day'
        Name of the day-of-month column.
    """

    def __init__(self, month_col='data_month', day_col='data_day'):
        self.month_col = month_col
        self.day_col = day_col

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (no state is learned)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four trigonometric columns added.

        The input frame is not modified. New columns are appended in the
        order month SIN, month COS, day SIN, day COS.
        """
        X_tr = X.copy()
        for col, period in ((self.month_col, 12), (self.day_col, 31)):
            # Vectorized over the whole column instead of a per-row lambda.
            # Assumes the column has a numeric dtype — TODO confirm against
            # the upstream datetime encoder.
            angle = X_tr[col] * 2 * np.pi / period
            X_tr[f"{col}_SIN"] = np.sin(angle)
            X_tr[f"{col}_COS"] = np.cos(angle)
        return X_tr

In [None]:
# Preview the first rows of the sine encodings added for month and day.
TrigonometricTransformer().transform(df_transformed)[['data_month_SIN', 'data_day_SIN']].head()

In [None]:
class Quantilizer(BaseEstimator, TransformerMixin):
    """Replace numeric columns by the index of the quantile bin they fall in.

    ``fit`` computes ``bins + 1`` evenly spaced quantiles of every input
    column and stores them as bin edges, with the outermost edges widened to
    -inf / +inf so values outside the fitted range still land in the first or
    last bin. ``transform`` maps each value to its integer bin code (stored as
    float) and keeps missing values as NaN.

    Parameters
    ----------
    input_cols : list or str
        Column name(s) to discretize. A single name is wrapped into a list.
    bins : int, default 100
        Number of quantile bins requested. Fewer bins result when several
        quantiles coincide, because duplicate edges are dropped at transform
        time.
    """

    def __init__(self, input_cols: list, bins: int = 100):
        # Accept a single column name as a convenience.
        self.input_cols = input_cols if isinstance(input_cols, list) else [input_cols]
        self.bins = bins
        self._bin_edges = dict()

    def fit(self, X, y=None):
        """Learn the quantile bin edges of every input column.

        Raises
        ------
        ValueError
            If ``bins`` is smaller than 1.
        """
        if self.bins < 1:
            raise ValueError(f"bins must be >= 1, got {self.bins!r}")

        # linspace yields exactly bins + 1 points ending at exactly 1.0; a
        # float-stepped arange can overshoot 1 and make quantile() fail.
        quantiles = np.linspace(0, 1, self.bins + 1)

        # Build into a fresh dict so edges from a previous fit never linger.
        bin_edges = {}
        for col in self.input_cols:
            # One quantile() call per column instead of one per grid point.
            edges = X[col].quantile(quantiles).tolist()
            edges[0] = -np.inf
            edges[-1] = np.inf
            bin_edges[col] = edges
        self._bin_edges = bin_edges
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every input column replaced by bin codes.

        The input frame is not modified. Rows that are NaN in the input stay
        NaN in the output. A ``KeyError`` is raised for a column that was not
        seen by ``fit``.
        """
        X_tr = X.copy()
        for col in self.input_cols:
            binned = pd.cut(X_tr[col], self._bin_edges[col], duplicates="drop")
            # Cast explicitly so the output dtype does not depend on how NumPy
            # promotes the small-integer category codes when mixed with NaN.
            codes = binned.cat.codes.astype("float64")
            X_tr[col] = np.where(X[col].isna(), np.nan, codes)
        return X_tr

In [None]:
# Discretize 'context.booking.param5' into (at most) 5 quantile bins.
quantilizer = Quantilizer(['context.booking.param5'], bins = 5)

# `t` holds the bin code of every row; it is used as the grouping key below.
t = quantilizer.fit_transform(df_transformed)['context.booking.param5']

In [None]:
# Plot the mean of the target column within each quantile bin; the trailing
# semicolon suppresses the Axes repr in the cell output.
df_transformed.groupby(t)[target].mean().plot();