In [10]:
import collections
import functools
import itertools
import math
import os
import statistics
import typing as t
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, EFeaturesSelectionAlgorithm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from kaggle_toolbox.environment import Environment
from kaggle_toolbox.features.transform import contiguous_to_categorical
from kaggle_toolbox.path import format_path
from kaggle_toolbox.prediction import PredDict
from kaggle_toolbox.trainer import train_kfold_model
from kaggle_toolbox.typing import ensure_list
from kaggle_toolbox.validation import analyze_val_strategy, build_fold_result_df
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from textstat import textstat
from tqdm.notebook import tqdm

# Register tqdm's pandas integration so that `progress_apply` (used in build_features) is available.
tqdm.pandas()

In [2]:
# Names of the target columns; the same names key TARGET_TO_LVL1_OOF_PATH_DICT below.
TARGET_LIST = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]
# First entry of TARGET_LIST ('cohesion').
TARGET = TARGET_LIST[0]

# SEED feeds the stratified split and the CatBoost models; NUM_FOLDS / FOLD_LIST describe the CV layout.
SEED = 42
NUM_FOLDS = 5
FOLD_LIST = [0, 1, 2, 3, 4]

# Environment name, read from the __KGLTBX_ENVIRONMENT variable ('laptop' when it is unset).
ENVIRONMENT = os.getenv('__KGLTBX_ENVIRONMENT', 'laptop')
_env = Environment(ENVIRONMENT)

# Every `_env.param(...)` call below lists one candidate value per environment name.
# Presumably it returns the value matching ENVIRONMENT — TODO confirm against kaggle_toolbox.
# NOTE(review): the 'laptop' root is '/kaggle', identical to the 'kaggle' root — confirm this is intended.
ROOT_DIR = _env.param(
    kaggle=Path('/kaggle'),
    colab=Path('/content/drive/MyDrive'),
    laptop=Path('/kaggle'))
DATA_DIR = _env.param(
    kaggle=ROOT_DIR / 'input',
    colab=ROOT_DIR / 'data',
    laptop=ROOT_DIR / 'data')
# Directory containing the competition's train.csv (read by _read_data).
FP_ELL_DATASET_DIR = _env.param(
    kaggle=DATA_DIR / 'feedback-prize-english-language-learning',
    colab=DATA_DIR / 'fp-ell',
    laptop=DATA_DIR / 'fp-ell')
# Directory where the level-2 CatBoost model files are written.
MODEL_DIR = _env.param(
    kaggle=ROOT_DIR / 'working',
    colab=ROOT_DIR / 'models/fp-ell',
    laptop=ROOT_DIR / 'models')
# Directory holding the level-1 OOF CSVs listed below; level-2 OOF predictions are written here too.
OOF_DIR = _env.param(
    kaggle=ROOT_DIR / 'working',
    colab=ROOT_DIR / 'oof/fp-ell',
    laptop=ROOT_DIR / 'oof')

# Per-target CSV of level-1 out-of-fold predictions; _read_data merges each one into the training frame.
TARGET_TO_LVL1_OOF_PATH_DICT = {
    'cohesion': OOF_DIR / 'cohesion-v1-layer_norm-ep_4-valfreq_0p25-pooler_att-full.csv',
    'syntax': OOF_DIR / 'syntax-v1-layer_norm-ep_3-valfreq_0p25-full.csv',
    'vocabulary': OOF_DIR / 'vocabulary-v1-layer_norm-ep_3-valfreq_0p25-std_init.csv',
    'phraseology': OOF_DIR / 'phraseology-v1-layer_norm-ep_3-valfreq_0p25-std_init-full.csv',
    'grammar': OOF_DIR / 'grammar-v1-lnorm-ep_4-valfreq_0p25-sqzr_cat_9_to_12-full.csv',
    'conventions': OOF_DIR / 'conventions-v1-layer_norm-ep_3-valfreq_0p25-full.csv',
}

In [3]:
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with hand-crafted level-2 features.

    The input must contain a ``full_text`` column and one ``{target}_lvl1_score``
    column for every entry of ``TARGET_LIST``. ``tqdm.pandas()`` must have been
    called beforehand, because the features are computed with ``progress_apply``.

    Added columns, in order:

    * ``{a}_{b}_lvl1_score_l1`` -- absolute difference between the level-1 scores
      of every unordered pair of *distinct* targets;
    * ``lvl1_mean`` / ``lvl1_std`` -- row-wise mean and sample standard deviation
      of the level-1 scores;
    * ``num_*`` -- occurrence counts of punctuation / whitespace substrings;
    * raw ``textstat`` counts (syllables, words, characters, ...);
    * ``ratio_*`` -- each ``num_*`` count divided by ``char_count``;
    * ``textstat`` readability indices.

    Args:
        df: Frame with the columns described above. It is not modified.

    Returns:
        A new frame holding the original columns plus the features listed above.
    """
    df = df.copy()
    text_srs = df['full_text']
    lvl1_col_list = [f'{target}_lvl1_score' for target in TARGET_LIST]

    # Score-based.
    # Start the inner loop at i + 1: pairing a target with itself would only
    # produce a constant all-zero column.
    for i, target_lhs in enumerate(TARGET_LIST):
        for target_rhs in TARGET_LIST[i + 1:]:
            df[f'{target_lhs}_{target_rhs}_lvl1_score_l1'] = (
                df[f'{target_lhs}_lvl1_score'] - df[f'{target_rhs}_lvl1_score']).abs()
    df['lvl1_mean'] = df.progress_apply(
        lambda row: statistics.mean([row[col] for col in lvl1_col_list]), axis=1)
    df['lvl1_std'] = df.progress_apply(
        lambda row: statistics.stdev([row[col] for col in lvl1_col_list]), axis=1)

    # Custom simple: (feature suffix, substring to count); order fixes the column order.
    count_spec_list = [
        ('commas', ','),
        ('dots', '.'),
        ('colons', ':'),
        ('semicolons', ';'),
        ('ellipsis', '...'),
        ('newlines', '\n'),
        ('spaces', ' '),
    ]
    for suffix, needle in count_spec_list:
        # `needle=needle` binds the current value; a plain closure would see the last one.
        df[f'num_{suffix}'] = text_srs.progress_apply(
            lambda text, needle=needle: text.count(needle))

    # TextStat simple
    df['syllable_count'] = text_srs.progress_apply(
        lambda text: textstat.syllable_count(text))
    df['lexicon_count'] = text_srs.progress_apply(
        lambda text: textstat.lexicon_count(text, removepunct=True))
    df['char_count'] = text_srs.progress_apply(
        lambda text: textstat.char_count(text, ignore_spaces=True))
    df['letter_count'] = text_srs.progress_apply(
        lambda text: textstat.letter_count(text, ignore_spaces=True))
    df['polysyllabcount'] = text_srs.progress_apply(
        lambda text: textstat.polysyllabcount(text))
    df['monosyllabcount'] = text_srs.progress_apply(
        lambda text: textstat.monosyllabcount(text))

    # Custom complex: counts normalised by the number of characters.
    # NOTE(review): a text with char_count == 0 would yield inf/NaN here -- confirm inputs are never empty.
    for suffix, _ in count_spec_list:
        df[f'ratio_{suffix}'] = df[f'num_{suffix}'] / df['char_count']

    # TextStat complex
    df['flesch_reading_ease'] = text_srs.progress_apply(
        lambda text: textstat.flesch_reading_ease(text))
    df['flesch_kincaid_grade'] = text_srs.progress_apply(
        lambda text: textstat.flesch_kincaid_grade(text))
    df['gunning_fog'] = text_srs.progress_apply(
        lambda text: textstat.gunning_fog(text))
    df['smog_index'] = text_srs.progress_apply(
        lambda text: textstat.smog_index(text))
    df['automated_readability_index'] = text_srs.progress_apply(
        lambda text: textstat.automated_readability_index(text))
    df['coleman_liau_index'] = text_srs.progress_apply(
        lambda text: textstat.coleman_liau_index(text))
    df['linsear_write_formula'] = text_srs.progress_apply(
        lambda text: textstat.linsear_write_formula(text))
    df['dale_chall_readability_score'] = text_srs.progress_apply(
        lambda text: textstat.dale_chall_readability_score(text))
    df['text_standard'] = text_srs.progress_apply(
        lambda text: textstat.text_standard(text, float_output=True))
    df['spache_readability'] = text_srs.progress_apply(
        lambda text: textstat.spache_readability(text))
    df['mcalpine_eflaw'] = text_srs.progress_apply(
        lambda text: textstat.mcalpine_eflaw(text))
    df['reading_time'] = text_srs.progress_apply(
        lambda text: textstat.reading_time(text, ms_per_char=14.69))

    return df

In [4]:
def _read_data(
        dataset_dir_path: Path,
        target_list: t.List[str],
        target_to_lvl1_oof_path_dict: t.Dict[str, Path],
        num_folds: int,
        seed: int) -> pd.DataFrame:
    """Load the training set, assign CV folds, attach level-1 OOF scores and build features.

    Args:
        dataset_dir_path: Directory containing ``train.csv``.
        target_list: Target column names; used for stratification and to pick the OOF files.
        target_to_lvl1_oof_path_dict: Maps each target to the CSV with its level-1 OOF predictions.
            Each CSV's ``id`` column is renamed to ``text_id`` and its ``{target}_score``
            column to ``{target}_lvl1_score`` before merging.
        num_folds: Number of folds for ``MultilabelStratifiedKFold``.
        seed: Random state of the fold splitter.

    Returns:
        The merged frame with a ``fold`` column, after passing through ``build_features``.
    """
    train_df = pd.read_csv(dataset_dir_path / 'train.csv')

    # Stratify on the categorical version of the targets.
    splitter = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    strat_label_arr = contiguous_to_categorical(train_df[target_list].values)
    for fold_idx, (_, valid_idx_arr) in enumerate(splitter.split(X=train_df, y=strat_label_arr)):
        # Each row receives the index of the fold in which it is a validation sample.
        train_df.loc[valid_idx_arr, 'fold'] = fold_idx

    # Attach one level-1 OOF file per target, joined on `text_id` (pandas' default merge type).
    for target in target_list:
        lvl1_oof_df = pd.read_csv(target_to_lvl1_oof_path_dict[target])
        lvl1_oof_df = lvl1_oof_df.rename({
            'id': 'text_id',
            f'{target}_score': f'{target}_lvl1_score',
        }, axis=1)
        train_df = train_df.merge(
            lvl1_oof_df,
            left_on='text_id',
            right_on='text_id')

    return build_features(train_df)

# Build the global feature frame; the trainer classes defined below read `all_df` directly.
all_df = _read_data(
    dataset_dir_path=FP_ELL_DATASET_DIR,
    target_list=TARGET_LIST,
    target_to_lvl1_oof_path_dict=TARGET_TO_LVL1_OOF_PATH_DICT,
    num_folds=NUM_FOLDS,
    seed=SEED)

# Last expression of the cell: its result (a per-fold summary table) is what the cell displays.
analyze_val_strategy(all_df, target_list=TARGET_LIST, num_folds=NUM_FOLDS)

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

  0%|          | 0/3911 [00:00<?, ?it/s]

Unnamed: 0,fold,num_samples,cohesion_mean,syntax_mean,vocabulary_mean,phraseology_mean,grammar_mean,conventions_mean
0,0,782,3.077366,2.971867,3.205243,3.065857,2.959719,3.035166
1,1,783,3.12516,3.007024,3.226054,3.111111,3.015964,3.079183
2,2,782,3.140665,3.068414,3.258312,3.138747,3.069693,3.116368
3,3,782,3.131074,3.048593,3.245524,3.125959,3.042839,3.074169
4,4,782,3.161125,3.045396,3.243606,3.142583,3.076087,3.100384


In [5]:
# Show the first three rows to eyeball the merged level-1 scores and the engineered features.
all_df.head(3)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,fold,cohesion_lvl1_score,...,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,text_standard,spache_readability,mcalpine_eflaw,reading_time
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,1.0,2.85574,...,6.57,7.6,5.8,6.31,8.0,5.99,6.0,3.34,20.2,16.31
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0.0,2.784366,...,15.47,11.9,16.2,5.93,11.2,2.45,12.0,6.4,58.5,30.82
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,4.0,2.891909,...,7.22,9.4,6.8,6.09,6.625,5.95,7.0,3.64,23.6,19.73


#### SVR

In [6]:
class _SVRTrainer:
    """Fold-level trainer that fits a scikit-learn ``SVR`` on the global ``all_df`` frame.

    Instances are callables taking a fold index, so they can be passed as
    ``train_model_fn`` to ``train_kfold_model``. Subclasses must set ``target``
    and may narrow the feature set by overriding ``_is_feature_included``.
    """

    # Columns of `all_df` that are never used as model inputs (ids, raw text, labels, fold index).
    _FEATURES_TO_EXCLUDE_SET = {
        'text_id',
        'full_text',
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
        'fold',
    }

    # Name of the label column to regress on; defined by subclasses.
    target: str

    def __init__(self, model_param_dict: t.Dict[str, t.Any]):
        """Store the keyword arguments later forwarded to the ``SVR`` constructor."""
        self._model_param_dict = model_param_dict

    def _is_feature_included(self, feature_name: str) -> bool:
        """Return True when ``feature_name`` should be used as a model input."""
        return feature_name not in self._FEATURES_TO_EXCLUDE_SET

    def _get_model(self) -> SVR:
        """Create a fresh, unfitted ``SVR`` from the stored parameters."""
        return SVR(**self._model_param_dict)

    def _convert_df_to_xy(self, df: pd.DataFrame) -> t.Tuple[np.ndarray, np.ndarray, t.List[str]]:
        """Split ``df`` into a feature matrix, a label vector and the feature names used."""
        feature_name_list = [
            col for col in t.cast(t.List[str], df.columns)
            if self._is_feature_included(col)
        ]
        x = df[feature_name_list].values
        y = df[self.target].values
        return x, y, feature_name_list  # type: ignore

    def __call__(self, fold: int) -> t.Tuple[float, PredDict]:
        """Train on every fold except ``fold`` and evaluate on ``fold``.

        Returns:
            The validation RMSE and a ``PredDict`` built from ``(text_id, [prediction])`` pairs.
        """
        train_df, valid_df = all_df[all_df['fold'] != fold], all_df[all_df['fold'] == fold]
        train_x, train_y, _ = self._convert_df_to_xy(train_df)
        valid_x, valid_y, _ = self._convert_df_to_xy(valid_df)

        model = self._get_model()
        model.fit(X=train_x, y=train_y)
        valid_y_pred = model.predict(valid_x)

        score = math.sqrt(mean_squared_error(y_true=valid_y, y_pred=valid_y_pred))

        # Wrap every prediction with `ensure_list`, the same per-sample format
        # `_CatboostTrainer` hands to PredDict, so all trainers return one shape.
        return score, PredDict(zip(
            valid_df['text_id'].tolist(),
            [ensure_list(x) for x in valid_y_pred.tolist()]))


class _ConventionsSVRTrainer(_SVRTrainer):
    """SVR trainer for the ``conventions`` target that ignores the pairwise ``*_l1`` features."""

    target = 'conventions'

    def _is_feature_included(self, feature_name: str) -> bool:
        """Reject columns ending in ``_l1``; otherwise defer to the base-class filter."""
        if feature_name.endswith('_l1'):
            return False
        return super()._is_feature_included(feature_name)


# score_list, oof_pred_dict= train_kfold_model(
#         train_model_fn=_ConventionsSVRTrainer({}),
#         fold_list=FOLD_LIST)
# if OOF_PATH is not None:
#     oof_pred_dict.save_to_csv(
#         OOF_PATH,
#         score_col_name_list=[f'{target}_score' for target in TARGET_LIST])
# build_fold_result_df(fold_list=FOLD_LIST, score_list=score_list)

#### XGBoost

In [7]:
def grid_search(
        train_model_fn: t.Callable[[t.Dict[str, t.Any], int], t.Tuple[float, PredDict]],
        param_plan: t.Dict[str, t.List[t.Any]],
        fold_list: t.Optional[t.List[int]] = None) -> t.Tuple[t.Dict[str, t.Any], t.List[float], PredDict]:
    """Exhaustively evaluate every parameter combination and keep the best one.

    Each combination is trained with ``train_kfold_model``. The winner is the
    one with the lowest mean fold score; on ties the earliest combination wins.

    Args:
        train_model_fn: Callable ``(param_dict, fold) -> (score, predictions)``.
        param_plan: Maps each parameter name to the list of values to try.
        fold_list: Folds to train on; defaults to the notebook-level ``FOLD_LIST``.

    Returns:
        ``(best_param_dict, best_score_list, best_pred_dict)`` of the winning combination.

    Raises:
        ValueError: If ``param_plan`` yields no combination (some value list is empty).
    """
    if fold_list is None:
        fold_list = FOLD_LIST
    param_name_list = list(param_plan.keys())
    param_comb_list = list(itertools.product(*param_plan.values()))
    if not param_comb_list:
        raise ValueError('param_plan yields no parameter combinations: every value list must be non-empty.')

    best_param_dict, best_score_list, best_pred_dict = None, None, None
    best_mean_score = math.inf  # Cached mean of `best_score_list`; avoids recomputing it per iteration.
    it = tqdm(param_comb_list)
    for param_value_tuple in it:
        param_dict = dict(zip(param_name_list, param_value_tuple))
        param_str = ', '.join([f'{k} = {v}' for k, v in param_dict.items()])
        if best_score_list is not None:
            it.set_description(f'Best score: {best_mean_score:.4f}. Params: {param_str}')
        else:
            it.set_description(f'Params: {param_str}')
        iter_score_list, iter_pred_dict = train_kfold_model(
            train_model_fn=functools.partial(train_model_fn, param_dict),
            fold_list=fold_list)
        iter_mean_score = statistics.mean(iter_score_list)
        # The first combination is always accepted; later ones must be strictly better (lower).
        if best_score_list is None or best_mean_score > iter_mean_score:
            best_param_dict = param_dict
            best_score_list = iter_score_list
            best_pred_dict = iter_pred_dict
            best_mean_score = iter_mean_score
            it.set_description(f'Best score: {best_mean_score:.4f}. Params: {param_str}')
    # The loop ran at least once, so these hold; the asserts narrow the Optional types.
    assert best_param_dict is not None
    assert best_score_list is not None
    assert best_pred_dict is not None
    return best_param_dict, best_score_list, best_pred_dict


In [8]:
class _XGBoostTrainer:
    """Fold-level trainer that fits an ``XGBRegressor`` on the global ``all_df`` frame.

    Instances are callables taking a fold index, so they can be passed as
    ``train_model_fn`` to ``train_kfold_model``. Subclasses must set ``target``
    and may narrow the feature set by overriding ``_is_feature_included``.
    Feature importances of every trained fold are kept and can be read back
    with ``get_feature_importance``.
    """

    # Columns of `all_df` that are never used as model inputs (ids, raw text, labels, fold index).
    _FEATURES_TO_EXCLUDE_SET = {
        'text_id',
        'full_text',
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
        'fold',
    }

    # Name of the label column to regress on; defined by subclasses.
    target: str

    def __init__(self, model_param_dict: t.Dict[str, t.Any]):
        """Store the ``XGBRegressor`` keyword arguments and reset the importance cache."""
        self._model_param_dict = model_param_dict
        self._fold_to_feature_importance: t.Dict[int, t.Dict[str, float]] = {}

    def get_feature_importance(self, fold: int) -> t.Dict[str, float]:
        """Return ``{feature name: importance}`` recorded when ``fold`` was trained.

        Raises:
            KeyError: If ``fold`` has not been trained by this instance yet.
        """
        return self._fold_to_feature_importance[fold]

    def _is_feature_included(self, feature_name: str) -> bool:
        """Return True when ``feature_name`` should be used as a model input."""
        return feature_name not in self._FEATURES_TO_EXCLUDE_SET

    def _get_model(self) -> XGBRegressor:
        """Create a fresh, unfitted regressor.

        The defaults are merged with the user parameters instead of being passed
        next to ``**self._model_param_dict``: that way a caller-supplied
        ``tree_method`` or ``objective`` overrides the default rather than raising
        ``TypeError`` for a duplicated keyword argument.
        """
        # NOTE(review): 'gpu_hist' is reported as deprecated by recent XGBoost releases
        # (replaced by tree_method='hist' plus device='cuda') -- confirm against the installed version.
        model_param_dict = {
            'tree_method': 'gpu_hist',
            'objective': 'reg:squarederror',
            **self._model_param_dict,
        }
        return XGBRegressor(**model_param_dict)

    def _convert_df_to_xy(self, df: pd.DataFrame) -> t.Tuple[np.ndarray, np.ndarray, t.List[str]]:
        """Split ``df`` into a feature matrix, a label vector and the feature names used."""
        feature_name_list = [
            col for col in t.cast(t.List[str], df.columns)
            if self._is_feature_included(col)
        ]
        x = df[feature_name_list].values
        y = df[self.target].values
        return x, y, feature_name_list  # type: ignore

    def __call__(self, fold: int) -> t.Tuple[float, PredDict]:
        """Train on every fold except ``fold`` and evaluate on ``fold``.

        Returns:
            The validation RMSE and a ``PredDict`` built from ``(text_id, [prediction])`` pairs.
        """
        train_df, valid_df = all_df[all_df['fold'] != fold], all_df[all_df['fold'] == fold]
        train_x, train_y, feature_name_list = self._convert_df_to_xy(train_df)
        valid_x, valid_y, _ = self._convert_df_to_xy(valid_df)

        booster = self._get_model()
        booster.fit(X=train_x, y=train_y, eval_set=[(valid_x, valid_y)])
        valid_y_pred = booster.predict(valid_x)

        self._fold_to_feature_importance[fold] = dict(zip(feature_name_list, booster.feature_importances_))

        score = math.sqrt(mean_squared_error(y_true=valid_y, y_pred=valid_y_pred))

        # Wrap every prediction with `ensure_list`, the same per-sample format
        # `_CatboostTrainer` hands to PredDict, so all trainers return one shape.
        return score, PredDict(zip(
            valid_df['text_id'].tolist(),
            [ensure_list(x) for x in valid_y_pred.tolist()]))


class _VocabularyXGBoostTrainer(_XGBoostTrainer):
    """XGBoost trainer for the ``vocabulary`` target that ignores the pairwise ``*_l1`` features."""

    target = 'vocabulary'

    def _is_feature_included(self, feature_name: str) -> bool:
        """Reject columns ending in ``_l1``; otherwise defer to the base-class filter."""
        if feature_name.endswith('_l1'):
            return False
        return super()._is_feature_included(feature_name)


# param_dict, score_list, oof_pred_dict = grid_search(
#     train_model_fn=lambda param_dict, fold: _VocabularyXGBoostTrainer(param_dict)(fold),
#     param_plan={
#         # 'gamma': [1e-3, 1e-4, 1e-5],
#         'max_depth': [2, 3, 4, 5],
#         'n_estimators': [300, 600, 900, 1200],
#         'learning_rate': [1e-2, 75e-3, 5e-2, 25e-3],
#     })
# print(f'Best params: {param_dict}')
# score_list, oof_pred_dict= train_kfold_model(
#         train_model_fn=_VocabularyXGBoostTrainer({
#             'max_depth': 3,
#             'n_estimators': 1000,
#             'learning_rate': 5e-2,
#         }),
#         fold_list=[0])
# if OOF_PATH is not None:
#     oof_pred_dict.save_to_csv(
#         OOF_PATH,
#         score_col_name_list=[f'{target}_score' for target in TARGET_LIST])
# build_fold_result_df(fold_list=FOLD_LIST, score_list=score_list)

#### CatBoost

In [11]:
class _CatboostTrainer:
    """Fold-level CatBoost trainer operating on the global ``all_df`` frame.

    Instances are callables taking a fold index, so they can be passed as
    ``train_model_fn`` to ``train_kfold_model``. Subclasses set ``target`` and
    usually override ``_get_model``. An optional whitelist restricts the
    features; it can be given up front or filled in by ``select_features_for_fold``.
    """

    # Columns of `all_df` that are never used as model inputs (ids, raw text, labels, fold index).
    _FEATURES_TO_EXCLUDE_SET = {
        'text_id',
        'full_text',
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
        'fold',
    }

    # Name of the label column to regress on; defined by subclasses.
    target: str

    def __init__(
            self,
            explicit_feature_whitelist: t.Optional[t.Set[str]] = None,
            plot: bool = True,
            model_path_template: t.Optional[Path] = None):
        """
        Args:
            explicit_feature_whitelist: When given, only these (non-excluded) columns are used.
            plot: Forwarded as ``plot=`` to CatBoost's ``fit`` and ``select_features``.
            model_path_template: When given, every fold model is saved to
                ``format_path(model_path_template, fold=fold)``.
        """
        self._explicit_feature_whitelist = explicit_feature_whitelist
        self._plot = plot
        self._model_path_template = model_path_template

    @property
    def explicit_feature_whitelist(self) -> t.Optional[t.Set[str]]:
        """Feature whitelist currently in effect, or ``None`` when no whitelist is set."""
        return self._explicit_feature_whitelist

    def _is_feature_included(self, feature_name: str) -> bool:
        """Return True when ``feature_name`` is not excluded and passes the whitelist (if any)."""
        if feature_name in self._FEATURES_TO_EXCLUDE_SET:
            return False
        whitelist = self._explicit_feature_whitelist
        if whitelist is None:
            return True
        return feature_name in whitelist

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh, unfitted GPU regressor optimising RMSE, seeded with ``SEED``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
        )

    def _convert_df_to_xy(self, df: pd.DataFrame) -> t.Tuple[np.ndarray, np.ndarray, t.List[str]]:
        """Split ``df`` into a feature matrix, a label vector and the feature names used."""
        feature_name_list = list(filter(self._is_feature_included, t.cast(t.List[str], df.columns)))
        features = df[feature_name_list].values
        labels = df[self.target].values
        return features, labels, feature_name_list  # type: ignore

    def _select_features_for_fold(
            self,
            fold: int,
            num_features: int,
            steps: int,
            algo: EFeaturesSelectionAlgorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange) -> t.Set[str]:
        """Run CatBoost feature selection on one fold and return the names of the kept features.

        Every currently included feature is a candidate; the held-out fold serves as the eval set.
        """
        fit_df = all_df[all_df['fold'] != fold]
        holdout_df = all_df[all_df['fold'] == fold]
        fit_x, fit_y, candidate_name_list = self._convert_df_to_xy(fit_df)
        holdout_x, holdout_y, _ = self._convert_df_to_xy(holdout_df)

        selector = self._get_model()
        # Candidates are addressed by column position, so the result is mapped back to names below.
        candidate_idx_list = list(range(len(candidate_name_list)))
        summary = selector.select_features(
            fit_x,
            fit_y,
            eval_set=(holdout_x, holdout_y),
            features_for_select=candidate_idx_list,
            num_features_to_select=num_features,
            steps=steps,
            algorithm=algo,
            logging_level='Silent',
            plot=self._plot)

        return {candidate_name_list[idx] for idx in summary['selected_features']}

    def select_features_for_fold(
            self,
            fold: int,
            num_features: int,
            steps: int,
            algo: EFeaturesSelectionAlgorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange):
        """Select features on ``fold`` and install the result as this trainer's whitelist."""
        selected_name_set = self._select_features_for_fold(
            fold=fold,
            num_features=num_features,
            steps=steps,
            algo=algo)
        self._explicit_feature_whitelist = selected_name_set

    def __call__(self, fold: int) -> t.Tuple[float, PredDict]:
        """Train on every fold except ``fold``, evaluate on ``fold`` and optionally save the model.

        Returns:
            The validation RMSE and a ``PredDict`` built from ``(text_id, prediction list)`` pairs.
        """
        fit_df = all_df[all_df['fold'] != fold]
        holdout_df = all_df[all_df['fold'] == fold]
        fit_x, fit_y, _ = self._convert_df_to_xy(fit_df)
        holdout_x, holdout_y, _ = self._convert_df_to_xy(holdout_df)

        model = self._get_model()
        model.fit(
            X=fit_x,
            y=fit_y,
            eval_set=(holdout_x, holdout_y),
            plot=self._plot,
            logging_level='Silent')
        holdout_pred = model.predict(holdout_x)

        if self._model_path_template is not None:
            model_path = format_path(self._model_path_template, fold=fold)
            model.save_model(str(model_path))

        rmse = math.sqrt(mean_squared_error(y_true=holdout_y, y_pred=holdout_pred))

        text_id_list = holdout_df['text_id'].tolist()
        pred_list = [ensure_list(x) for x in holdout_pred.tolist()]
        return rmse, PredDict(zip(text_id_list, pred_list))


# class _CatboostFeatureSelector:

#     def __init__(self, trainer: _CatboostTrainer, num_features: int, num_steps: int):
#         self._trainer = trainer
#         self._num_features = num_features
#         self._num_steps = num_steps

#     def __call__(self, fold: int) -> t.Tuple[float, PredDict]:
#         self._trainer.select_features_for_fold(fold=fold, num_features=self._num_features, steps=self._num_steps)
#         return self._trainer(fold)


class _CohesionCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``cohesion`` target."""

    target = 'cohesion'

    # Feature-selection settings: the training cell passes them to `select_features_for_fold`
    # as `fold`, `num_features` and `steps`.
    FS_FOLD = 0
    FS_NUM_FEATURES = 20
    FS_STEPS = 3

    # def _is_feature_included(self, feature_name: str) -> bool:
    #     return super()._is_feature_included(feature_name) and not feature_name.endswith('_l1')

    def _get_model(self) -> CatBoostRegressor:
        """Create a GPU RMSE regressor seeded with ``SEED`` and ``l2_leaf_reg=12.0``."""
        model_kwargs = dict(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )
        return CatBoostRegressor(**model_kwargs)


class _SyntaxCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``syntax`` target.

    NOTE(review): unlike the sibling trainers, this class defines no ``FS_FOLD`` /
    ``FS_NUM_FEATURES`` / ``FS_STEPS`` attributes, so the training cell (which reads
    ``trainer.FS_*``) cannot be pointed at it without changes.
    """

    target = 'syntax'

    def _get_model(self) -> CatBoostRegressor:
        """Create a GPU RMSE regressor seeded with ``SEED`` and ``max_depth=7``."""
        model_kwargs = dict(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            max_depth=7,
        )
        return CatBoostRegressor(**model_kwargs)


class _VocabularyCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``vocabulary`` target."""

    target = 'vocabulary'

    # Feature-selection settings, named like the `trainer.FS_*` attributes the training cell
    # passes to `select_features_for_fold` (fold, number of features, steps).
    FS_FOLD = 0
    FS_NUM_FEATURES = 18
    FS_STEPS = 1

    def _get_model(self) -> CatBoostRegressor:
        """Create a GPU RMSE regressor seeded with ``SEED`` and ``l2_leaf_reg=12.0``."""
        model_kwargs = dict(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )
        return CatBoostRegressor(**model_kwargs)


class _GrammarCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``grammar`` target."""

    target = 'grammar'

    # Feature-selection settings, named like the `trainer.FS_*` attributes the training cell
    # passes to `select_features_for_fold` (fold, number of features, steps).
    FS_FOLD = 0
    FS_NUM_FEATURES = 23
    FS_STEPS = 3

    def _get_model(self) -> CatBoostRegressor:
        """Create a GPU RMSE regressor seeded with ``SEED`` and ``l2_leaf_reg=12.0``."""
        model_kwargs = dict(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )
        return CatBoostRegressor(**model_kwargs)


class _PhraseologyCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``phraseology`` target.

    Author's note kept from the original docstring: "No improvement."
    (presumably relative to the level-1 score for this target -- TODO confirm).
    """
    target = 'phraseology'

    # Feature-selection settings, named like the `trainer.FS_*` attributes the training cell
    # passes to `select_features_for_fold` (fold, number of features, steps).
    FS_FOLD = 1
    FS_NUM_FEATURES = 21
    FS_STEPS = 3

    # The former `_is_feature_included` override only delegated to the base class,
    # so the inherited implementation is used directly.

    def _get_model(self) -> CatBoostRegressor:
        """Create a GPU RMSE regressor seeded with ``SEED`` and ``l2_leaf_reg=15.0``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=15.0)


class _ConventionsCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``conventions`` target.

    Author's note kept from the original docstring: "No improvement."
    (presumably relative to the level-1 score for this target -- TODO confirm).
    """
    target = 'conventions'

    # Feature-selection settings, named like the `trainer.FS_*` attributes the training cell
    # passes to `select_features_for_fold` (fold, number of features, steps).
    FS_FOLD = 2
    FS_NUM_FEATURES = 20
    FS_STEPS = 3

    def _get_model(self) -> CatBoostRegressor:
        """Create a GPU RMSE regressor seeded with ``SEED`` and ``l2_leaf_reg=16.0``."""
        model_kwargs = dict(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=16.0,
        )
        return CatBoostRegressor(**model_kwargs)


# Create the output directories up front: otherwise a missing directory would only
# surface when the first model / the OOF file is written, i.e. after training.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
OOF_DIR.mkdir(parents=True, exist_ok=True)

trainer = _CohesionCatboostTrainer(model_path_template=MODEL_DIR / 'lvl2-catboost-cohesion-cv1-fold_{fold}.cbm')
# Pick the feature whitelist once, on a single fold, and reuse it for every fold below.
# NOTE(review): the selection uses fold FS_FOLD's validation rows as its eval set, so the
# score reported for that fold may be optimistic -- confirm this is acceptable.
trainer.select_features_for_fold(fold=trainer.FS_FOLD, num_features=trainer.FS_NUM_FEATURES, steps=trainer.FS_STEPS)
score_list, oof_pred_dict = train_kfold_model(
    train_model_fn=trainer,
    fold_list=FOLD_LIST)
oof_pred_dict.save_to_csv(
    OOF_DIR / f'lvl2-catboost-{trainer.target}-cv1.csv',
    score_col_name_list=[f'{trainer.target}_score'])
print(f'Mean score: {statistics.mean(score_list):.4f}')
print(f'Features: {trainer.explicit_feature_whitelist}')
# Last expression of the cell: the per-fold score table is what the cell displays.
build_fold_result_df(fold_list=FOLD_LIST, score_list=score_list)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Mean score: 0.4760
Features: {'lexicon_count', 'syntax_lvl1_score', 'reading_time', 'linsear_write_formula', 'vocabulary_lvl1_score', 'num_dots', 'syntax_grammar_lvl1_score_l1', 'ratio_newlines', 'conventions_lvl1_score', 'char_count', 'letter_count', 'vocabulary_phraseology_lvl1_score_l1', 'num_spaces', 'ratio_colons', 'syllable_count', 'num_newlines', 'ratio_dots', 'lvl1_mean', 'cohesion_lvl1_score', 'num_colons'}


Unnamed: 0,fold,score
0,0,0.463983
1,1,0.486141
2,2,0.485832
3,3,0.465618
4,4,0.478464
