In [1]:
import math
import typing as t
from enum import Enum
from pathlib import Path

import kaggle_toolbox.features.transform as feature_transforms
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from kaggle_toolbox.validation import analyze_val_strategy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Columns of train.csv that are used as regression targets throughout the notebook.
TARGET_LIST = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Cross-validation setup: number of folds and the seed handed to the fold splitter.
NUM_FOLDS = 5
SEED = 42

# Directory layout; every path below is derived from ROOT_DIR.
ROOT_DIR = Path('/kaggle')
DATA_DIR = ROOT_DIR.joinpath('data')
FP_ELL_DATASET_DIR = DATA_DIR.joinpath('fp-ell')  # directory that holds train.csv
OOF_DIR = ROOT_DIR.joinpath('oof')  # directory that holds the out-of-fold prediction CSVs


class ModelId(str, Enum):
    """Identifiers of the base models whose out-of-fold predictions are blended.

    Members are the keys of ``OOF_PATH_DICT``; a member's string value is the
    prefix of that model's prediction columns (``<value>_<target>_score``).

    NOTE(review): judging by the OOF file names, the scheme appears to be
    ``<author initial>_<backbone size>_<mode>`` with ``b``/``l`` = base/large
    and ``s``/``m`` = solo-target/multi-target -- confirm with the authors.
    """
    a_b_s = 'a_b_s'
    d_b_s = 'd_b_s'
    d_l_s = 'd_l_s'
    k_b_m = 'k_b_m'
    a_b_m = 'a_b_m'


# Where each model's out-of-fold prediction CSV lives, keyed by model id.
OOF_PATH_DICT = {
    model_id: OOF_DIR / file_name
    for model_id, file_name in (
        (ModelId.a_b_s, 'cv_preds_andrei_solo_target_base_5_folds.csv'),
        (ModelId.d_b_s, 'cv_preds_dima_solo_target_base_10_folds.csv'),
        (ModelId.d_l_s, 'cv_preds_dima_solo_target_large_10_folds.csv'),
        (ModelId.k_b_m, 'koj-awp-v3-base.csv'),
        (ModelId.a_b_m, 'v1-layer_norm-5fold.csv'),
    )
}

In [3]:
def _read_data(
        dataset_dir_path: Path,
        num_folds: int,
        seed: int) -> pd.DataFrame:
    """Load the training set, assign CV folds and attach every model's OOF scores.

    Steps:
      1. Read ``train.csv`` from ``dataset_dir_path``.
      2. Write a ``fold`` column using ``MultilabelStratifiedKFold`` over the
         categorical encoding of the ``TARGET_LIST`` columns. The column is
         created through ``.loc`` assignment, so it shows up as float
         (``1.0``, ``0.0``, ...) in the frame.
      3. Merge the out-of-fold predictions of each model in ``OOF_PATH_DICT``
         on ``text_id``, renaming the prediction columns to
         ``<model id>_<target>_score``.

    The merges use pandas' default inner join, so an essay missing from any
    OOF file is dropped from the result.

    Args:
        dataset_dir_path: Directory containing ``train.csv``.
        num_folds: Number of CV folds.
        seed: Random state for the shuffled fold split.

    Returns:
        One row per essay with targets, ``fold`` and all models' score columns.
    """

    def _merge_oof(
            left_df: pd.DataFrame,
            model_id: ModelId,
            column_map: t.Dict[str, str]) -> pd.DataFrame:
        # Read one model's OOF file, rename its columns and join it on text_id.
        oof_df = pd.read_csv(OOF_PATH_DICT[model_id]).rename(column_map, axis=1)
        return left_df.merge(oof_df, left_on='text_id', right_on='text_id')

    all_df = pd.read_csv(dataset_dir_path / 'train.csv')
    target_arr = feature_transforms.contiguous_to_categorical(all_df[TARGET_LIST].values)

    splitter = MultilabelStratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold_idx, (_, valid_idx) in enumerate(splitter.split(X=all_df, y=target_arr)):
        all_df.loc[valid_idx, 'fold'] = fold_idx

    # Solo-target files: key column is `id`, score columns are named after the bare target.
    for model_id in (ModelId.a_b_s, ModelId.d_b_s, ModelId.d_l_s):
        column_map = {'id': 'text_id'}
        column_map.update({
            target: f'{model_id.value}_{target}_score'
            for target in TARGET_LIST
        })
        all_df = _merge_oof(all_df, model_id, column_map)

    # a_b_m file: key column is `id`, score columns already carry a `_score` suffix.
    a_b_m_map = {'id': 'text_id'}
    a_b_m_map.update({
        f'{target}_score': f'{ModelId.a_b_m.value}_{target}_score'
        for target in TARGET_LIST
    })
    all_df = _merge_oof(all_df, ModelId.a_b_m, a_b_m_map)

    # k_b_m file: only the bare target columns are renamed; it is joined on `text_id` as-is.
    k_b_m_map = {
        target: f'{ModelId.k_b_m.value}_{target}_score'
        for target in TARGET_LIST
    }
    all_df = _merge_oof(all_df, ModelId.k_b_m, k_b_m_map)

    return all_df

# Per-essay table: labels, CV fold ids and every model's out-of-fold scores.
score_df = _read_data(FP_ELL_DATASET_DIR, num_folds=NUM_FOLDS, seed=SEED)

# Per-fold sample counts and target means, to check the folds are balanced.
analyze_val_strategy(score_df, target_list=TARGET_LIST, num_folds=NUM_FOLDS)

Unnamed: 0,fold,num_samples,cohesion_mean,syntax_mean,vocabulary_mean,phraseology_mean,grammar_mean,conventions_mean
0,0,782,3.077366,2.971867,3.205243,3.065857,2.959719,3.035166
1,1,783,3.12516,3.007024,3.226054,3.111111,3.015964,3.079183
2,2,782,3.140665,3.068414,3.258312,3.138747,3.069693,3.116368
3,3,782,3.131074,3.048593,3.245524,3.125959,3.042839,3.074169
4,4,782,3.161125,3.045396,3.243606,3.142583,3.076087,3.100384


In [4]:
# Peek at the merged frame: targets, fold id and the per-model score columns.
score_df.head(n=3)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,fold,a_b_s_cohesion_score,...,a_b_m_vocabulary_score,a_b_m_phraseology_score,a_b_m_grammar_score,a_b_m_conventions_score,k_b_m_cohesion_score,k_b_m_syntax_score,k_b_m_vocabulary_score,k_b_m_phraseology_score,k_b_m_grammar_score,k_b_m_conventions_score
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,1.0,2.85574,...,3.142851,3.163987,3.148065,3.0148,3.050259,2.99133,3.215425,3.227505,3.242692,2.957901
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0.0,2.784366,...,2.858511,2.634341,2.465478,2.662711,2.783803,2.646464,2.897536,2.759491,2.462135,2.704659
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,4.0,2.891909,...,3.046087,2.960622,2.939112,2.984555,2.945666,2.97167,3.056777,3.002545,3.035044,3.195591


In [10]:
def _get_xy(df: pd.DataFrame, target: str) -> t.Tuple[np.ndarray, np.ndarray]:
    """Return the five models' scores for ``target`` together with its labels.

    Columns of the feature matrix are ordered d_b_s, d_l_s, a_b_s, a_b_m,
    k_b_m. In the visible part of the notebook this helper is only referenced
    from commented-out code; ``LinearFeatureGenerator`` is its configurable
    counterpart.

    Args:
        df: Frame holding ``<model id>_<target>_score`` columns and ``target``.
        target: Name of the target column.

    Returns:
        ``(x, y)`` where ``x`` has one column per model and ``y`` is the
        ``target`` column's values.
    """
    model_order = (
        ModelId.d_b_s,
        ModelId.d_l_s,
        ModelId.a_b_s,
        ModelId.a_b_m,
        ModelId.k_b_m,
    )
    feature_columns = [f'{model_id.value}_{target}_score' for model_id in model_order]
    features = df[feature_columns].values
    labels = t.cast(np.ndarray, df[target].values)
    return features, labels


class LinearFeatureGenerator:
    """Callable that extracts a chosen set of models' scores as regression features."""

    def __init__(self, model_id_list: t.List[ModelId]) -> None:
        """Remember which models (and in which order) supply the feature columns."""
        self._model_id_list = model_id_list

    def __call__(self, df: pd.DataFrame, target: str) -> t.Tuple[np.ndarray, np.ndarray]:
        """Build ``(x, y)`` for one target.

        Args:
            df: Frame holding ``<model id>_<target>_score`` columns and ``target``.
            target: Name of the target column.

        Returns:
            ``x`` with one column per configured model (in the configured
            order) and ``y`` with the values of the ``target`` column.
        """
        score_columns = [
            f'{model_id.value}_{target}_score'
            for model_id in self._model_id_list
        ]
        features = df[score_columns].values
        labels = t.cast(np.ndarray, df[target].values)
        return features, labels


def _get_pred(x: np.ndarray, weight_list: t.List[float]) -> np.ndarray:
    return np.stack([x[:, i] * m_w for i, m_w in enumerate(weight_list)], axis=0).sum(axis=0)


def train_linear_model(
        df: pd.DataFrame,
        feature_generator: LinearFeatureGenerator,
        fold_list: t.List[int],
        baseline_weight_list: t.List[float],
        fit_intercept: bool = False,
        restrict_to_positive: bool = True,
        coef_decimal_places: int = 4):
    """Fit per-target linear blends of model scores and compare them with a fixed baseline.

    For every target in ``TARGET_LIST`` and every fold in ``fold_list`` a
    ``LinearRegression`` is fitted on the other folds and evaluated on the
    held-out fold, next to the fixed ``baseline_weight_list`` blend. For each
    fold, whichever of the two has the lower validation RMSE contributes both
    its predictions and its coefficients. The kept coefficients are averaged
    over folds, rounded and printed along with the pooled RMSE of the baseline
    and of the kept predictions. The last printed line averages those RMSEs
    over all targets.

    NOTE(review): the model-vs-baseline choice is made with the validation
    fold's own labels, so the printed ``model_cv`` can never be worse than the
    baseline on any fold and is an optimistic estimate.

    Only ``model.coef_`` is collected; with ``fit_intercept=True`` the fitted
    intercept is not part of the reported coefficients.

    Args:
        df: Frame with a ``fold`` column plus whatever ``feature_generator`` reads.
        feature_generator: Called as ``feature_generator(frame, target=...)``
            and expected to return ``(x, y)``.
        fold_list: Fold ids to hold out, one at a time.
        baseline_weight_list: Fixed blend weights, aligned with the columns of ``x``.
        fit_intercept: Forwarded to ``LinearRegression(fit_intercept=...)``.
        restrict_to_positive: Forwarded to ``LinearRegression(positive=...)``.
        coef_decimal_places: Decimal places kept in the printed coefficients.

    Returns:
        None. Results are printed.
    """

    def _rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        # Root of sklearn's mean squared error.
        return math.sqrt(mean_squared_error(y_true, y_pred))

    rounding_scale = 10 ** coef_decimal_places
    fold_column = df['fold']
    baseline_rmse_list, model_rmse_list = [], []

    for target in TARGET_LIST:
        kept_coef_list: t.List[np.ndarray] = []
        baseline_pred_chunks, kept_pred_chunks, true_chunks = [], [], []

        for fold in fold_list:
            train_df = df[fold_column != fold]
            valid_df = df[fold_column == fold]
            train_x, train_y = feature_generator(train_df, target=target)
            valid_x, valid_y = feature_generator(valid_df, target=target)

            model = LinearRegression(fit_intercept=fit_intercept, positive=restrict_to_positive)
            model.fit(train_x, train_y)
            valid_model_pred = model.predict(valid_x)
            model_rmse = _rmse(valid_y, valid_model_pred)
            valid_baseline_pred = _get_pred(valid_x, baseline_weight_list)
            baseline_rmse = _rmse(valid_y, valid_baseline_pred)

            # Keep the learned blend only where it beats the baseline on this fold.
            if model_rmse < baseline_rmse:
                kept_coef = np.array(list(model.coef_))
                kept_pred = valid_model_pred
            else:
                kept_coef = np.array(baseline_weight_list)
                kept_pred = valid_baseline_pred

            kept_coef_list.append(kept_coef)
            baseline_pred_chunks.append(valid_baseline_pred)
            kept_pred_chunks.append(kept_pred)
            true_chunks.append(valid_y)

        final_coef = np.stack(kept_coef_list, axis=0).mean(axis=0)

        # Pool the held-out predictions of all folds before scoring.
        target_true_arr = np.concatenate(true_chunks)
        target_baseline_rmse_cv = _rmse(target_true_arr, np.concatenate(baseline_pred_chunks))
        target_model_rmse_cv = _rmse(target_true_arr, np.concatenate(kept_pred_chunks))

        baseline_rmse_list.append(target_baseline_rmse_cv)
        model_rmse_list.append(target_model_rmse_cv)

        # round(...) without ndigits yields plain Python numbers, which keeps the printed list clean.
        final_coef_list = [round(x * rounding_scale) / rounding_scale for x in final_coef]

        print(f'{target.rjust(16)}: baseline_cv = {target_baseline_rmse_cv:.4f}, model_cv = {target_model_rmse_cv:.4f}, coef_sum = {sum(final_coef_list)}, coef = {final_coef_list}')

    baseline_rmse_cv = np.mean(baseline_rmse_list)
    model_rmse_cv = np.mean(model_rmse_list)
    print(f'{"all targets".rjust(16)}: baseline_cv = {baseline_rmse_cv:.6f}, model_cv = {model_rmse_cv:.6f}, improvement = {baseline_rmse_cv - model_rmse_cv:.6f}')


In [11]:
# Blend all five models; print the averaged coefficients with 6 decimal places.
train_linear_model(
    df=score_df,
    feature_generator=LinearFeatureGenerator(model_id_list=[
        ModelId.d_b_s,
        ModelId.d_l_s,
        ModelId.a_b_s,
        ModelId.a_b_m,
        ModelId.k_b_m,
    ]),
    fold_list=[*range(NUM_FOLDS)],
    # Baseline weights follow the order of model_id_list above.
    baseline_weight_list=[0.25, 0.25, 0.25, 0.125, 0.125],
    coef_decimal_places=6,
)

        cohesion: baseline_cv = 0.4727, model_cv = 0.4724, coef_sum = 1.000944, coef = [0.170975, 0.372307, 0.331224, 0.05, 0.076438]
          syntax: baseline_cv = 0.4369, model_cv = 0.4369, coef_sum = 0.999497, coef = [0.222712, 0.326085, 0.242893, 0.126667, 0.08114]
      vocabulary: baseline_cv = 0.4062, model_cv = 0.4062, coef_sum = 1.0, coef = [0.25, 0.25, 0.25, 0.125, 0.125]
     phraseology: baseline_cv = 0.4458, model_cv = 0.4458, coef_sum = 1.0, coef = [0.25, 0.25, 0.25, 0.125, 0.125]
         grammar: baseline_cv = 0.4607, model_cv = 0.4601, coef_sum = 1.000119, coef = [0.184956, 0.407007, 0.254322, 0.088933, 0.064901]
     conventions: baseline_cv = 0.4370, model_cv = 0.4365, coef_sum = 0.9991450000000001, coef = [0.181257, 0.436733, 0.194493, 0.114384, 0.072278]
     all targets: baseline_cv = 0.443213, model_cv = 0.442960, improvement = 0.000253


In [12]:
# Same five-model blend as the previous cell, printed with the default coefficient rounding.
train_linear_model(
    df=score_df,
    feature_generator=LinearFeatureGenerator(model_id_list=[
        ModelId.d_b_s,
        ModelId.d_l_s,
        ModelId.a_b_s,
        ModelId.a_b_m,
        ModelId.k_b_m,
    ]),
    fold_list=[*range(NUM_FOLDS)],
    # Baseline weights follow the order of model_id_list above.
    baseline_weight_list=[0.25, 0.25, 0.25, 0.125, 0.125],
)

        cohesion: baseline_cv = 0.4727, model_cv = 0.4724, coef_sum = 1.0009000000000001, coef = [0.171, 0.3723, 0.3312, 0.05, 0.0764]
          syntax: baseline_cv = 0.4369, model_cv = 0.4369, coef_sum = 0.9995, coef = [0.2227, 0.3261, 0.2429, 0.1267, 0.0811]
      vocabulary: baseline_cv = 0.4062, model_cv = 0.4062, coef_sum = 1.0, coef = [0.25, 0.25, 0.25, 0.125, 0.125]
     phraseology: baseline_cv = 0.4458, model_cv = 0.4458, coef_sum = 1.0, coef = [0.25, 0.25, 0.25, 0.125, 0.125]
         grammar: baseline_cv = 0.4607, model_cv = 0.4601, coef_sum = 1.0001, coef = [0.185, 0.407, 0.2543, 0.0889, 0.0649]
     conventions: baseline_cv = 0.4370, model_cv = 0.4365, coef_sum = 0.9992000000000001, coef = [0.1813, 0.4367, 0.1945, 0.1144, 0.0723]
     all targets: baseline_cv = 0.443213, model_cv = 0.442960, improvement = 0.000253


In [7]:
# Blend only the three solo-target models against an equal-weight baseline.
# NOTE(review): the stored output of this cell is stale -- its execution count (7)
# precedes the current definition of train_linear_model (10), and the printed lines
# lack `coef_sum` and show 6-decimal coefficients. Re-run the cell to refresh it.
train_linear_model(
    df=score_df,
    feature_generator=LinearFeatureGenerator(model_id_list=[
        ModelId.d_b_s,
        ModelId.d_l_s,
        ModelId.a_b_s,
    ]),
    fold_list=[*range(NUM_FOLDS)],
    baseline_weight_list=[1/3, 1/3, 1/3],
)

        cohesion: baseline_cv = 0.4720, model_cv = 0.4718, coef = [0.258709, 0.394697, 0.347128]
          syntax: baseline_cv = 0.4369, model_cv = 0.4368, coef = [0.314046, 0.357548, 0.328136]
      vocabulary: baseline_cv = 0.4062, model_cv = 0.4062, coef = [0.333333, 0.333333, 0.333333]
     phraseology: baseline_cv = 0.4458, model_cv = 0.4458, coef = [0.332206, 0.348713, 0.319228]
         grammar: baseline_cv = 0.4603, model_cv = 0.4600, coef = [0.251352, 0.444014, 0.305037]
     conventions: baseline_cv = 0.4370, model_cv = 0.4367, coef = [0.268295, 0.439319, 0.291995]
     all targets: baseline_cv = 0.443045, model_cv = 0.442900, improvement = 0.000146
