In [None]:
import functools
import itertools
import math
import os
import statistics
import typing as t
from pathlib import Path

import kaggle_toolbox.features.generation as features
import kaggle_toolbox.nlp.features as text_features
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, EFeaturesSelectionAlgorithm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from kaggle_toolbox.environment import Environment
from kaggle_toolbox.features.transform import contiguous_to_categorical
from kaggle_toolbox.path import format_path
from kaggle_toolbox.prediction import PredDict
from kaggle_toolbox.progress import NotebookProgressBar
from kaggle_toolbox.trainer import train_kfold_model
from kaggle_toolbox.typing import ensure_list
from kaggle_toolbox.validation import analyze_val_strategy, build_fold_result_df
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from textstat import textstat
from tqdm.notebook import tqdm

tqdm.pandas()

In [None]:
# Names of the target columns read from train.csv.
TARGET_LIST = [
    'cohesion',
    'syntax',
    'vocabulary',
    'phraseology',
    'grammar',
    'conventions',
]
# Default target: the first entry of TARGET_LIST.
TARGET = TARGET_LIST[0]

# Seed shared by the fold splitter and the boosters.
SEED = 42
NUM_FOLDS = 5
# Derived from NUM_FOLDS so the two can never drift apart.
# Narrow this list by hand to train only a subset of folds.
FOLD_LIST = list(range(NUM_FOLDS))

# Name of the runtime environment, read from the `__KGLTBX_ENVIRONMENT`
# environment variable; falls back to 'laptop' when the variable is unset.
ENVIRONMENT = os.getenv('__KGLTBX_ENVIRONMENT', 'laptop')
# Presumably `_env.param(...)` returns the keyword value matching the active
# environment -- confirm against kaggle_toolbox.environment.Environment.
_env = Environment(ENVIRONMENT)

# Root of the working tree for each environment.
# NOTE(review): the 'laptop' root is the same '/kaggle' path as the 'kaggle' one.
ROOT_DIR = _env.param(
    kaggle=Path('/kaggle'),
    colab=Path('/content/drive/MyDrive'),
    laptop=Path('/kaggle'))
# Directory that holds the input datasets.
DATA_DIR = _env.param(
    kaggle=ROOT_DIR / 'input',
    colab=ROOT_DIR / 'data',
    laptop=ROOT_DIR / 'data')
# Directory of the competition dataset; `train.csv` is read from here.
FP_ELL_DATASET_DIR = _env.param(
    kaggle=DATA_DIR / 'feedback-prize-english-language-learning',
    colab=DATA_DIR / 'fp-ell',
    laptop=DATA_DIR / 'fp-ell')
# Directory under which trained model files are written.
MODEL_DIR = _env.param(
    kaggle=ROOT_DIR / 'working',
    colab=ROOT_DIR / 'models/fp-ell',
    laptop=ROOT_DIR / 'models')
# Directory holding out-of-fold prediction CSVs (level-1 inputs are read from
# here and the predictions produced by this notebook are written here).
OOF_DIR = _env.param(
    kaggle=ROOT_DIR / 'working',
    colab=ROOT_DIR / 'oof/fp-ell',
    laptop=ROOT_DIR / 'oof')

# Per-target path of the level-1 out-of-fold prediction CSV that is merged
# into the training frame as the `<target>_lvl1_score` column.
TARGET_TO_LVL1_OOF_PATH_DICT = {
    'cohesion': OOF_DIR / 'cohesion-v1-layer_norm-ep_4-valfreq_0p25-pooler_att-full.csv',
    'syntax': OOF_DIR / 'syntax-v1-layer_norm-ep_3-valfreq_0p25-full.csv',
    'vocabulary': OOF_DIR / 'vocabulary-v1-layer_norm-ep_3-valfreq_0p25-std_init.csv',
    'phraseology': OOF_DIR / 'phraseology-v1-layer_norm-ep_3-valfreq_0p25-std_init-full.csv',
    'grammar': OOF_DIR / 'grammar-v1-lnorm-ep_4-valfreq_0p25-sqzr_cat_9_to_12-full.csv',
    'conventions': OOF_DIR / 'conventions-v1-layer_norm-ep_3-valfreq_0p25-full.csv',
}

#### Feature generation

In [None]:
# Names of the level-1 score columns, one per target; these are supplied to the
# feature generators as initial features by `build_features`.
_LVL1_SCORE_FEATURE_LIST = [f'{target}_lvl1_score' for target in TARGET_LIST]
# Feature generators handed to `text_features.generate_text_features`.
# The `Div` ratio features reference earlier features by name (`num_*`,
# `char_count`), so they are listed after the generators producing those
# inputs -- presumably generators are evaluated in list order; keep the order.
_FEATURE_GENERATOR_LIST = [
    # Score-based: pairwise L1 distances, mean and standard deviation of the level-1 scores
    *features.L1Distance.pairwise_from_feature_list(_LVL1_SCORE_FEATURE_LIST),
    features.Mean(name='lvl1_mean', feature_list=_LVL1_SCORE_FEATURE_LIST),
    features.Stdev(name='lvl1_std', feature_list=_LVL1_SCORE_FEATURE_LIST),
    # Custom simple: raw counts of punctuation / whitespace substrings in the text
    text_features.SubstrCount(name='num_commas', substr=','),
    text_features.SubstrCount(name='num_dots', substr='.'),
    text_features.SubstrCount(name='num_colons', substr=':'),
    text_features.SubstrCount(name='num_semicolons', substr=';'),
    text_features.SubstrCount(name='num_ellipsis', substr='...'),
    text_features.SubstrCount(name='num_newlines', substr='\n'),
    text_features.SubstrCount(name='num_spaces', substr=' '),
    # TextStat simple: count statistics computed by textstat
    text_features.Func(name='syllable_count', func=textstat.syllable_count),
    text_features.Func(name='lexicon_count', func=functools.partial(textstat.lexicon_count, removepunct=True)),
    text_features.Func(name='char_count', func=functools.partial(textstat.char_count, ignore_spaces=True)),
    text_features.Func(name='letter_count', func=functools.partial(textstat.letter_count, ignore_spaces=True)),
    text_features.Func(name='polysyllabcount', func=functools.partial(textstat.polysyllabcount)),
    text_features.Func(name='monosyllabcount', func=functools.partial(textstat.monosyllabcount)),
    # Custom complex: each substring count divided by `char_count`
    features.Div(name='ratio_commas', lhs_feature='num_commas', rhs_feature='char_count'),
    features.Div(name='ratio_dots', lhs_feature='num_dots', rhs_feature='char_count'),
    features.Div(name='ratio_colons', lhs_feature='num_colons', rhs_feature='char_count'),
    features.Div(name='ratio_semicolons', lhs_feature='num_semicolons', rhs_feature='char_count'),
    features.Div(name='ratio_ellipsis', lhs_feature='num_ellipsis', rhs_feature='char_count'),
    features.Div(name='ratio_newlines', lhs_feature='num_newlines', rhs_feature='char_count'),
    features.Div(name='ratio_spaces', lhs_feature='num_spaces', rhs_feature='char_count'),
    # TextStat complex: readability indices computed by textstat
    text_features.Func(name='flesch_reading_ease', func=textstat.flesch_reading_ease),
    text_features.Func(name='flesch_kincaid_grade', func=textstat.flesch_kincaid_grade),
    text_features.Func(name='gunning_fog', func=textstat.gunning_fog),
    text_features.Func(name='smog_index', func=textstat.smog_index),
    text_features.Func(name='automated_readability_index', func=textstat.automated_readability_index),
    text_features.Func(name='coleman_liau_index', func=textstat.coleman_liau_index),
    text_features.Func(name='linsear_write_formula', func=textstat.linsear_write_formula),
    text_features.Func(name='dale_chall_readability_score', func=textstat.dale_chall_readability_score),
    text_features.Func(name='text_standard', func=functools.partial(textstat.text_standard, float_output=True)),  # type: ignore
    text_features.Func(name='spache_readability', func=textstat.spache_readability),
    text_features.Func(name='mcalpine_eflaw', func=textstat.mcalpine_eflaw),
    text_features.Func(name='reading_time', func=functools.partial(textstat.reading_time, ms_per_char=14.69)),
]


def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every generated feature attached as a column.

    The essay text is taken from the ``full_text`` column, and the
    ``<target>_lvl1_score`` columns (one per entry of ``TARGET_LIST``) are
    handed to the generators as initial features. The input frame itself is
    left untouched.

    Args:
        df: Frame holding ``full_text`` and all ``<target>_lvl1_score`` columns.

    Returns:
        A new frame with one column per entry of the dictionary returned by
        ``text_features.generate_text_features``.
    """
    result_df = df.copy()

    # Seed the generators with the level-1 scores, read from the copy.
    lvl1_score_arr_dict = {}
    for target in TARGET_LIST:
        score_col = f'{target}_lvl1_score'
        lvl1_score_arr_dict[score_col] = result_df[score_col].values

    generated_arr_dict = text_features.generate_text_features(
        generator_list=_FEATURE_GENERATOR_LIST,
        text_seq=result_df['full_text'].tolist(),
        progress_bar=NotebookProgressBar(),
        init_feature_array_dict=lvl1_score_arr_dict)  # type: ignore

    for name, arr in generated_arr_dict.items():
        result_df[name] = arr

    return result_df

In [None]:
def _read_data(
        dataset_dir_path: Path,
        target_list: t.List[str],
        target_to_lvl1_oof_path_dict: t.Dict[str, Path],
        num_folds: int,
        seed: int) -> pd.DataFrame:
    """Load the training set, assign folds, attach level-1 scores and build features.

    Args:
        dataset_dir_path: Directory containing ``train.csv``.
        target_list: Names of the target columns in ``train.csv``.
        target_to_lvl1_oof_path_dict: For every target, the path of the CSV
            holding its level-1 out-of-fold predictions. Each CSV must have an
            ``id`` column and a ``<target>_score`` column.
        num_folds: Number of stratified folds.
        seed: Random state for the fold splitter.

    Returns:
        The training frame with an integer ``fold`` column, one
        ``<target>_lvl1_score`` column per target and all generated features.
        Rows whose ``text_id`` is absent from any of the OOF files are dropped
        (inner join).

    Raises:
        KeyError: If an OOF file lacks the ``id`` or ``<target>_score`` column.
        pandas.errors.MergeError: If ``text_id`` is duplicated on either side
            of a merge (which would otherwise silently multiply rows).
    """
    all_df = pd.read_csv(dataset_dir_path / 'train.csv')
    target_arr = contiguous_to_categorical(all_df[target_list].values)

    # Collect fold ids in an integer array first: assigning through `.loc` to a
    # column that does not exist yet would create it as float (NaN-filled).
    fold_arr = np.full(len(all_df), -1, dtype=np.int64)
    mskf = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold_, (_, v_) in enumerate(mskf.split(X=all_df, y=target_arr)):
        fold_arr[v_] = fold_
    all_df['fold'] = fold_arr

    for target in target_list:
        score_col = f'{target}_lvl1_score'
        oof_df = pd.read_csv(target_to_lvl1_oof_path_dict[target]).rename(
            columns={
                'id': 'text_id',
                f'{target}_score': score_col,
            })
        # Merge only the key and the score: any other column in the OOF file
        # would otherwise end up as a model feature or collide with existing
        # columns such as `fold`.
        all_df = all_df.merge(
            oof_df[['text_id', score_col]],
            on='text_id',
            validate='one_to_one')

    all_df = build_features(all_df)

    return all_df

# Build the full training frame once; `_CatboostTrainer.__call__` reads this
# module-level `all_df` directly.
all_df = _read_data(
    dataset_dir_path=FP_ELL_DATASET_DIR,
    target_list=TARGET_LIST,
    target_to_lvl1_oof_path_dict=TARGET_TO_LVL1_OOF_PATH_DICT,
    num_folds=NUM_FOLDS,
    seed=SEED)

# Report on the fold assignment (helper from kaggle_toolbox.validation).
analyze_val_strategy(all_df, target_list=TARGET_LIST, num_folds=NUM_FOLDS)

In [None]:
# Preview the first three rows of the merged frame, including the generated feature columns.
all_df.head(3)

#### CatBoost

In [None]:
class _XY(t.NamedTuple):
    """Model inputs and target extracted from a data frame by ``_convert_df_to_xy``."""
    # Values of the selected feature columns (`df[feature_name_list].values`).
    x: np.ndarray
    # Values of the target column (`df[target].values`).
    y: np.ndarray
    # Names of the columns of `x`, in the order they appear in `x`.
    feature_name_list: t.List[str]


def _convert_df_to_xy(df: pd.DataFrame, target: str, feature_whitelist: t.Set[str]) -> _XY:
    """Split ``df`` into a feature matrix and a target vector.

    Args:
        df: Source frame; must contain the ``target`` column.
        target: Name of the column used as ``y``.
        feature_whitelist: Names of the columns allowed as features.

    Returns:
        An ``_XY`` whose ``x`` holds the whitelisted columns in the order they
        occur in ``df.columns`` (not in whitelist order), together with the
        matching ``feature_name_list`` and the target values as ``y``.
    """
    selected_columns: t.List[str] = []
    for column in t.cast(t.List[str], df.columns):
        if column in feature_whitelist:
            selected_columns.append(column)

    feature_matrix = df[selected_columns].values
    target_vector = t.cast(np.ndarray, df[target].values)
    return _XY(x=feature_matrix, y=target_vector, feature_name_list=selected_columns)


class _CatboostFeaturizer:
    """Turns data frames into ``_XY`` tuples for a fixed target and feature set."""

    def __init__(self, target: str, feature_whitelist: t.Set[str]):
        """Remember the target column name and the set of allowed feature columns."""
        self._feature_whitelist = feature_whitelist
        self._target = target

    def get_xy_from_df(self, df: pd.DataFrame) -> _XY:
        """Extract the whitelisted features and the target values from ``df``."""
        xy = _convert_df_to_xy(df, self._target, self._feature_whitelist)
        return xy


class _CatboostFeatureSelector:
    """Runs CatBoost feature selection for one target on a train/validation split."""

    # Identifier, raw text, target and fold columns: never offered to the model as features.
    _FEATURES_TO_EXCLUDE_SET = {
        'text_id',
        'full_text',
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
        'fold',
    }

    def __init__(
            self,
            target: str,
            num_features: int,
            num_steps: int,
            plot: bool = True,
            algorithm: EFeaturesSelectionAlgorithm = EFeaturesSelectionAlgorithm.RecursiveByLossFunctionChange):
        """Store the selection settings.

        Args:
            target: Name of the target column.
            num_features: Number of features CatBoost is asked to keep.
            num_steps: Number of selection steps.
            plot: Forwarded to ``model.select_features`` as ``plot``.
            algorithm: Feature selection algorithm forwarded to CatBoost.
        """
        self._target = target
        self._num_features = num_features
        self._num_steps = num_steps
        self._plot = plot
        self._algorithm = algorithm

    def select_features(
            self,
            model: CatBoostRegressor,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame) -> _CatboostFeaturizer:
        """Run ``model.select_features`` and return a featurizer for the candidate columns.

        The model is fitted as a side effect (``train_final_model=True``).

        Args:
            model: Regressor on which selection (and the final fit) is run.
            train_df: Training rows.
            valid_df: Validation rows, used as the evaluation set.

        Returns:
            A featurizer covering every candidate column, i.e. all columns of
            ``train_df`` outside ``_FEATURES_TO_EXCLUDE_SET``.
        """
        candidate_feature_set = set(train_df.columns) - self._FEATURES_TO_EXCLUDE_SET
        train_xy, valid_xy = (
            _convert_df_to_xy(split_df, target=self._target, feature_whitelist=candidate_feature_set)
            for split_df in (train_df, valid_df)
        )
        num_candidates = len(train_xy.feature_name_list)

        # The returned selection summary is not needed; only the fitted model is used.
        model.select_features(
            train_xy.x,
            train_xy.y,
            eval_set=(valid_xy.x, valid_xy.y),
            features_for_select=list(range(num_candidates)),
            num_features_to_select=self._num_features,
            steps=self._num_steps,
            algorithm=self._algorithm,
            logging_level='Silent',
            train_final_model=True,
            plot=self._plot)

        # The featurizer keeps the full candidate set, not just the selected
        # subset: the model was fitted above on arrays laid out with every
        # candidate column, so later `predict` calls are fed the same layout.
        # NOTE(review): presumably eliminated features are simply ignored by
        # the final model -- confirm against the CatBoost documentation.
        return _CatboostFeaturizer(
            target=self._target,
            feature_whitelist=candidate_feature_set)


class _CatboostTrainer:
    """Callable that trains and evaluates a CatBoost regressor on a single fold.

    Concrete subclasses set ``target`` and may override ``_get_model`` and
    ``_get_feature_selector``. Training data is read from the module-level
    ``all_df`` frame.
    """
    target: str  # name of the target column; assigned by each subclass

    def __init__(
            self,
            plot: bool = True,
            model_path_template: t.Optional[Path] = None):
        """Store the trainer settings.

        Args:
            plot: Whether feature selection should render its plot.
            model_path_template: Path template formatted with ``target`` and
                ``fold`` to obtain the file each fold's model is saved to;
                ``None`` disables saving.
        """
        self._plot = plot
        self._model_path_template = model_path_template

        self._fold_to_feature_whitelist_dict: t.Dict[int, t.List[str]] = {}

    @property
    def fold_to_feature_whitelist_dict(self) -> t.Dict[int, t.List[str]]:
        """Feature names fed to the model, keyed by fold (filled in by ``__call__``)."""
        return self._fold_to_feature_whitelist_dict

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh, unfitted GPU regressor optimising RMSE."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',)

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the feature selector, honouring the ``plot`` flag given to the constructor."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=20,
            num_steps=3,
            plot=self._plot)

    def __call__(self, fold: int) -> t.Tuple[float, PredDict]:
        """Train on every fold except ``fold`` and validate on ``fold``.

        Args:
            fold: Id of the fold held out for validation.

        Returns:
            The validation RMSE and the validation predictions keyed by ``text_id``.
        """
        train_df, valid_df = all_df[all_df['fold'] != fold], all_df[all_df['fold'] == fold]
        feature_selector = self._get_feature_selector()

        booster = self._get_model()
        # Feature selection also fits `booster` (train_final_model=True in the selector).
        featurizer = feature_selector.select_features(model=booster, train_df=train_df, valid_df=valid_df)
        train_xy, valid_xy = featurizer.get_xy_from_df(train_df), featurizer.get_xy_from_df(valid_df)
        valid_y_pred = booster.predict(valid_xy.x)

        self._fold_to_feature_whitelist_dict[fold] = train_xy.feature_name_list
        if self._model_path_template is not None:
            model_path = Path(str(format_path(self._model_path_template, target=self.target, fold=fold)))  # type: ignore
            # The template may point into a sub-directory that does not exist
            # yet; `save_model` is not expected to create it.
            model_path.parent.mkdir(parents=True, exist_ok=True)
            booster.save_model(str(model_path))
        score = math.sqrt(mean_squared_error(y_true=valid_xy.y, y_pred=valid_y_pred))

        return score, PredDict(zip(valid_df['text_id'].tolist(), [ensure_list(x) for x in valid_y_pred.tolist()]))


class _CohesionCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``cohesion`` target."""
    target = 'cohesion'

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh GPU RMSE regressor with ``l2_leaf_reg=12.0``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the selector (20 features, 3 steps), honouring the constructor's ``plot`` flag."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=20,
            num_steps=3,
            plot=self._plot)


class _SyntaxCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``syntax`` target."""
    target = 'syntax'

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh GPU RMSE regressor with ``l2_leaf_reg=12.0``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the selector (20 features, 3 steps), honouring the constructor's ``plot`` flag."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=20,
            num_steps=3,
            plot=self._plot)


class _VocabularyCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``vocabulary`` target."""
    target = 'vocabulary'

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh GPU RMSE regressor with ``l2_leaf_reg=12.0``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the selector (20 features, 3 steps), honouring the constructor's ``plot`` flag."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=20,
            num_steps=3,
            plot=self._plot)


class _PhraseologyCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``phraseology`` target."""
    target = 'phraseology'

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh GPU RMSE regressor with ``l2_leaf_reg=12.0``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the selector (20 features, 3 steps), honouring the constructor's ``plot`` flag."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=20,
            num_steps=3,
            plot=self._plot)


class _GrammarCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``grammar`` target."""
    target = 'grammar'

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh GPU RMSE regressor with ``l2_leaf_reg=12.0``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
        )

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the selector (20 features, 3 steps), honouring the constructor's ``plot`` flag."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=20,
            num_steps=3,
            plot=self._plot)


class _ConventionsCatboostTrainer(_CatboostTrainer):
    """CatBoost trainer for the ``conventions`` target."""
    target = 'conventions'

    def _get_model(self) -> CatBoostRegressor:
        """Create a fresh GPU RMSE regressor with ``l2_leaf_reg=12.0`` and ``learning_rate=0.025``."""
        return CatBoostRegressor(
            task_type='GPU',
            random_seed=SEED,
            loss_function='RMSE',
            l2_leaf_reg=12.0,
            learning_rate=0.025
        )

    def _get_feature_selector(self) -> _CatboostFeatureSelector:
        """Create the selector (22 features, 3 steps), honouring the constructor's ``plot`` flag."""
        return _CatboostFeatureSelector(
            target=self.target,
            num_features=22,
            num_steps=3,
            plot=self._plot)


# Train one model per fold for the target of the chosen trainer class.
# NOTE(review): the trainer class is picked by hand here; the `TARGET` constant
# from the configuration cell is not consulted in this cell.
trainer = _VocabularyCatboostTrainer(
    model_path_template=MODEL_DIR / 'fp-ell-models-boosting/lvl2-catboost-{target}-cv1-fold_{fold}.cbm')
# `trainer` is invoked once per fold id and returns (score, predictions) for that fold.
score_list, oof_pred_dict= train_kfold_model(
        train_model_fn=trainer,
        fold_list=FOLD_LIST)
# Persist the out-of-fold predictions under OOF_DIR in a `<target>_score` column.
oof_pred_dict.save_to_csv(
    OOF_DIR / f'lvl2-catboost-{trainer.target}-cv1.csv',
    score_col_name_list=[f'{trainer.target}_score'])
print(f'Mean score: {statistics.mean(score_list):.4f}')
# Per-fold score table, displayed as the cell output.
build_fold_result_df(fold_list=FOLD_LIST, score_list=score_list)