# 002 feature engineering and validation

## import modules

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from plotly import express as px
from sklearn.model_selection import KFold

import mlflow
from hydra.experimental import initialize, compose
from omegaconf import DictConfig
from pathlib import Path

import category_encoders as ce
from xfeat import TargetEncoder

import os
import sys
# Extend the module search path with '../src' before the three local imports
# below (presumably utils, feature and preprocess live there — the path is
# relative to the notebook's working directory).
sys.path.append('../src')
import utils
import feature
import preprocess

# Let pandas render up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

## config

In [2]:
# hydra
# Build `cfg` by composing exp_002.yaml inside a hydra `initialize` context
# that points at the ../config directory. Every later cell that takes `cfg`
# depends on this cell having run.
with initialize(config_path='../config'):
    cfg = compose(config_name="exp_002.yaml")

## load dataset

In [None]:
# Unpack the three frames returned by the project loader for this config.
train, test, submission = utils.load_dataset(cfg)

# Preview the first rows of each frame, in the same order they were unpacked.
display(train.head(), test.head(), submission.head())

In [None]:
# Toy single-column frames for trying out the categorical encoders.
# Both contain a missing value; the test frame also holds 'ange', which never
# appears in the train frame.
df_tr = pd.DataFrame({'cat': ['mao', 'rena', np.nan, 'mao', 'rena', 'mao', 'ririmu']})
df_te = pd.DataFrame({'cat': ['mao', 'ririmu', np.nan, 'rena', 'ange']})

display(df_tr, df_te)

In [None]:
# Fit the one-hot encoding block on the toy train frame only, then apply it to
# both toy frames.
cols = ['cat']
enc = feature.OneHotEncodingBlock(cols=cols)
enc.fit(df_tr)

enc_tr, enc_te = enc.transform(df_tr), enc.transform(df_te)

# Put each raw column next to its encoded output for a side-by-side check.
df_tr_ = pd.concat([df_tr[cols], enc_tr], axis="columns")
df_te_ = pd.concat([df_te[cols], enc_te], axis="columns")
display(df_tr_, df_te_)

## preprocessing

In [None]:
# Run the project's preprocessing step on both frames.
# NOTE: this rebinds `train` and `test`, so re-running the cell passes the
# already-preprocessed frames back into to_preprocess; reload the dataset
# first if a clean re-run is needed.
train, test = preprocess.to_preprocess(cfg, train, test)

## feature engineering

In [None]:
# Build the feature matrices. This call had been commented out, which left
# X_train undefined at the preview below on a fresh kernel, and X_test
# undefined for the encoder cells further down the notebook.
# NOTE(review): assumes feature.to_features(train, test) returns the pair
# (X_train, X_test), as the previously commented-out line did — confirm.
X_train, X_test = feature.to_features(train, test)

# Target column(s) are selected by name from the hydra config.
y_train = train[cfg['training']['targets']]

X_train.head()

## cross-validation (CV)

In [None]:
# Rebind X_train to the frame returned by utils.get_group_k_fold for `train`
# (presumably `train` with group-k-fold assignments attached — confirm in
# utils). Any X_train built by an earlier cell is replaced here.
X_train = utils.get_group_k_fold(cfg, train)
X_train.head()

In [None]:
# Target Encoding
# Configure the project's TargetEncodingBlock against the 'aroma' target with
# a seeded, shuffled 3-fold splitter, then encode the train and test frames.
target = 'aroma'
cols = ['variety_OE', 'region_OE']
group = [['variety_OE', 'processing_method_OE', 'region_OE']]
splitter = KFold(n_splits=3, shuffle=True, random_state=42)

encoder = feature.TargetEncodingBlock(
    cols=cols,
    group_cols=group,
    target=target,
    splitter=splitter,
)
output_tr = encoder.fit_transform(X_train, y_train)
# The test-side transform takes the train frame and target as well as X_test.
output_te = encoder.transform(X_train, y_train, X_test)
display(output_tr, output_te)

In [None]:
# aggregation
class GroupingBlock(feature.BaseBlock):
    """Group-wise aggregation features.

    ``fit`` groups a frame by ``cat_cols`` and aggregates every column in
    ``target_cols`` with every entry of ``methods``, producing a lookup table.
    ``transform`` left-joins that table onto new data using ``cat_cols`` as
    the join key and returns only the aggregated columns.

    Parameters
    ----------
    cat_cols : list of str
        Columns used as the group-by key and as the join key.
    target_cols : list of str
        Columns aggregated within each group.
    methods : list
        Aggregations handed to the pandas group-by ``agg`` call
        (e.g. ``'mean'``); callables are named by their ``__name__``.
    """

    def __init__(self, cat_cols, target_cols, methods):
        self.cat_cols = cat_cols
        self.target_cols = target_cols
        self.methods = methods

        # Lookup table (aggregates + key columns); populated by fit().
        self.df = None
        # Key columns of the most recent _agg() call; populated by _agg().
        self.a_cat = None

    def fit(self, input_df, y=None):
        """Build the per-group lookup table from ``input_df``.

        ``y`` is accepted for interface compatibility and is not used.
        Returns ``self`` so calls can be chained.
        """
        parts = [self._agg(input_df, target_col) for target_col in self.target_cols]
        table = pd.concat(parts, axis=1)
        # Every _agg() call groups by the same keys, so the key columns saved
        # by the last call line up row-for-row with the concatenated parts.
        table[self.cat_cols] = self.a_cat[self.cat_cols]
        self.df = table
        return self

    def transform(self, input_df):
        """Return the fitted aggregates looked up for each row of ``input_df``.

        Rows whose key was not seen during ``fit`` get NaN. The result comes
        straight from ``pd.merge``, so it carries the merge's own index rather
        than ``input_df.index``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.df is None:
            raise RuntimeError("GroupingBlock.transform was called before fit.")
        merged = pd.merge(
            input_df[self.cat_cols], self.df, on=self.cat_cols, how="left"
        )
        return merged.drop(columns=self.cat_cols)

    @staticmethod
    def _method_name(method):
        """Return a stable label for an aggregation (its ``__name__`` if it has one)."""
        return getattr(method, "__name__", method)

    def _agg(self, input_df, target_col):
        """Aggregate one target column per group and return only the new columns.

        Side effect: stores the group key columns in ``self.a_cat``.
        """
        grouped = input_df.groupby(self.cat_cols, as_index=False).agg(
            {target_col: self.methods}
        )
        key_label = "_and_".join(self.cat_cols)
        feature_names = [
            f"agg_{self._method_name(method)}_{key_label}_by_{target_col}"
            for method in self.methods
        ]
        # Replace the (column, method) header produced by agg with flat names.
        grouped.columns = self.cat_cols + feature_names
        self.a_cat = grouped[self.cat_cols]
        return grouped.drop(columns=self.cat_cols)

In [None]:
# Aggregate processing_method_OE by countryof_origin_OE with the mean, fitted
# on X_train and then looked up for both frames.
cat_cols = ['countryof_origin_OE']
target_cols = ['processing_method_OE']
methods = ['mean']

enc = GroupingBlock(cat_cols=cat_cols, target_cols=target_cols, methods=methods)
enc.fit(X_train)
output_tr, output_te = enc.transform(X_train), enc.transform(X_test)
output_te.head(10)