#### Imports

In [49]:
import collections
import math
import numpy as np
import pickle
import typing as t
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm

In [2]:
# Enable the `progress_apply` method that the voting helpers in this notebook call on DataFrames.
tqdm.pandas()

#### Directories

In [46]:
# Root folder holding the datasets and the pickled models used by this notebook.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
ROOT_DIR = Path('/root/data')
DATASET_ROOT_DIR_PATH = ROOT_DIR / 'datasets'
# Folder passed to `_load_df`, which reads TEST_DB.csv and the per-fold prediction CSVs from it.
DATASET_DIR_PATH = DATASET_ROOT_DIR_PATH / 'fp-0p925'
# Folder passed to the training helpers as the destination for pickled models.
MODEL_DIR_PATH = ROOT_DIR / 'models'

In [4]:
# List the dataset directory to check which CSV files are available.
!ls -la $DATASET_DIR_PATH

total 559684
drwxrwxr-x 2 root root      4096 Aug 16 18:18  .
drwxrwxr-x 5 root root      4096 Aug 11 14:04  ..
-rw-rw-r-- 1 root root  13197552 Jul 30 22:29  TEST_DB.csv
-rw-rw-r-- 1 root root 181093973 Jul 30 12:29  TRAIN_DB.csv
-rw-rw-r-- 1 root root    192850 Aug 16 02:34  microsoft-deberta-v3-base-0p925-meta-fold_0.csv
-rw-rw-r-- 1 root root    192660 Aug 16 02:35  microsoft-deberta-v3-base-0p925-meta-fold_1.csv
-rw-rw-r-- 1 root root    192681 Aug 16 02:36  microsoft-deberta-v3-base-0p925-meta-fold_2.csv
-rw-rw-r-- 1 root root    192628 Aug 16 02:37  microsoft-deberta-v3-base-0p925-meta-fold_3.csv
-rw-rw-r-- 1 root root    192673 Aug 16 02:38  microsoft-deberta-v3-base-0p925-meta-fold_4.csv
-rw-r--r-- 1 root root    192590 Aug 15 20:00  microsoft-deberta-v3-base-0p925-meta.csv
-rw-r--r-- 1 root root    123699 Aug 12 12:35  predicted_test_by_v5DBlS42.csv
-rw-r--r-- 1 root root  36381617 Aug 11 14:00 'prediction[0].csv'
-rw-r--r-- 1 root root  37668899 Aug 11 14:01 'prediction[1].c

#### Data loading

In [5]:
def _load_df(data_dir: Path) -> pd.DataFrame:
    test_df = pd.read_csv(data_dir / 'TEST_DB.csv')
    for fold in range(5):
        fold_df = pd.read_csv(data_dir / f'microsoft-deberta-v3-base-0p925-meta-fold_{fold}.csv').rename({
            'score_ineffective': f'score_ineffective_{fold}',
            'score_adequate': f'score_adequate_{fold}',
            'score_effective': f'score_effective_{fold}',
        }, axis=1)
        test_df = test_df.merge(fold_df, left_on='discourse_id', right_on='id')
    return test_df

# Test set joined with the score columns of every fold model.
test_df = _load_df(DATASET_DIR_PATH)

  test_df = test_df.merge(fold_df, left_on='discourse_id', right_on='id')


In [11]:
# Check which columns the fold merges produced.
test_df.columns

Index(['discourse_id', 'essay_id', 'discourse_text', 'discourse_type',
       'discourse_effectiveness', 'essay', 'before', 'after', 'id_x',
       'score_ineffective_0', 'score_adequate_0', 'score_effective_0', 'id_y',
       'score_ineffective_1', 'score_adequate_1', 'score_effective_1', 'id_x',
       'score_ineffective_2', 'score_adequate_2', 'score_effective_2', 'id_y',
       'score_ineffective_3', 'score_adequate_3', 'score_effective_3', 'id',
       'score_ineffective_4', 'score_adequate_4', 'score_effective_4'],
      dtype='object')

In [6]:
# Peek at the first rows of the merged frame.
test_df.head(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay,before,after,id_x,score_ineffective_0,...,score_adequate_2,score_effective_2,id_y,score_ineffective_3,score_adequate_3,score_effective_3,id,score_ineffective_4,score_adequate_4,score_effective_4
0,ba3f708db030,02A3E737A10F,Students shouldn't have to participate in one ...,Position,Adequate,Students shouldn't have to participate in one ...,NO,\nStudents have other things to do at home. Ma...,ba3f708db030,0.109951,...,0.872251,0.055061,ba3f708db030,0.072376,0.823252,0.104372,ba3f708db030,0.143035,0.797386,0.059579
1,efeb7a805421,02A3E737A10F,Students have other things to do at home,Claim,Adequate,Students shouldn't have to participate in one ...,Students shouldn't have to participate in one ...,. Many students have a lot of things to do at ...,efeb7a805421,0.068727,...,0.855797,0.086394,efeb7a805421,0.044972,0.806221,0.148807,efeb7a805421,0.037542,0.841991,0.120467
2,03205305e7bd,02A3E737A10F,Many students have a lot of things to do at ho...,Evidence,Effective,Students shouldn't have to participate in one ...,Students shouldn't have to participate in one ...,I disagree with the principal that every stude...,03205305e7bd,0.068968,...,0.672169,0.2843,03205305e7bd,0.02464,0.506623,0.468738,03205305e7bd,0.03856,0.650531,0.310909


#### Loss

In [7]:
def _get_row_loss(row: t.Dict[str, t.Any]) -> float:
    (
        disc_eff,
        score_ineff,
        score_adeq,
        score_eff,
    ) = (
        str(row['discourse_effectiveness']),
        float(row['score_ineffective']),
        float(row['score_adequate']),
        float(row['score_effective']),
    )
    if disc_eff == 'Ineffective':
        return -math.log(score_ineff)
    if disc_eff == 'Adequate':
        return -math.log(score_adeq)
    if disc_eff == 'Effective':
        return -math.log(score_eff)
    raise ValueError(f'Unknown disc_eff = {disc_eff}')


def get_loss(df: pd.DataFrame) -> float:
    """Average the per-row loss from `_get_row_loss` over all rows of ``df``."""
    per_row_loss = df.apply(_get_row_loss, axis=1)
    return per_row_loss.mean()

#### Mean

In [8]:
def _mean(xs):
    return sum(xs) / len(xs)


def _ensemble_mean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with each class score averaged over the five folds.

    Adds ``score_ineffective``, ``score_adequate`` and ``score_effective``;
    the input frame is left untouched.
    """
    folds = range(5)
    averaged_df = df.copy()

    ineffective_by_fold = [averaged_df[f'score_ineffective_{fold}'] for fold in folds]
    averaged_df['score_ineffective'] = _mean(ineffective_by_fold)

    adequate_by_fold = [averaged_df[f'score_adequate_{fold}'] for fold in folds]
    averaged_df['score_adequate'] = _mean(adequate_by_fold)

    effective_by_fold = [averaged_df[f'score_effective_{fold}'] for fold in folds]
    averaged_df['score_effective'] = _mean(effective_by_fold)

    return averaged_df

# Baseline: loss of the plain average over the five fold models.
get_loss(_ensemble_mean(test_df))

0.6297654462679665

#### Voting

In [9]:
class _VoteGetter:

    def __init__(self, fold: int):
        self._fold = fold

    def __call__(self, row: t.Dict[str, t.Any]) -> int:
        (
            score_eff,
            score_adq,
            score_ineff,
        ) = (
            float(row[f'score_effective_{self._fold}']),
            float(row[f'score_adequate_{self._fold}']),
            float(row[f'score_ineffective_{self._fold}']),
        )
        if score_eff > score_adq and score_eff > score_ineff:
            return 2
        if score_adq > score_eff and score_adq > score_ineff:
            return 1
        if score_ineff > score_adq and score_ineff > score_eff:
            return 0
        raise RuntimeError(f'ineff = {score_ineff:.4f} adq = {score_adq:.4f} eff = {score_eff:.4f}')


def _get_num_vote_opt(row: t.Dict[str, t.Any]) -> int:
    return len({row[f'vote_{fold}'] for fold in range(5)})


def _vote_main(df: pd.DataFrame):
    """Print how many rows have 1, 2 or 3 distinct fold votes.

    Works on a copy of ``df``; relies on ``progress_apply`` being registered
    and on ``df`` having an ``id`` column to count.
    """
    votes_df = df.copy()
    for fold in range(5):
        vote_getter = _VoteGetter(fold=fold)
        votes_df[f'vote_{fold}'] = votes_df.progress_apply(vote_getter, axis=1)
    votes_df['num_vote_opt'] = votes_df.progress_apply(_get_num_vote_opt, axis=1)
    rows_per_option_count = votes_df.groupby('num_vote_opt')['id'].count()
    print(rows_per_option_count)


def _vote_to_score_col_name_template(vote: int) -> str:
    if vote == 0:
        return 'score_ineffective_{fold}'
    if vote == 1:
        return 'score_adequate_{fold}'
    if vote == 2:
        return 'score_effective_{fold}'
    raise ValueError(f'Invalid `vote`: {vote}.')


def _get_best_vote(row: t.Dict[str, t.Any]) -> int:
    vote_list = [row[f'vote_{fold}'] for fold in range(5)]
    vote_counter = collections.Counter(vote_list)
    return max(vote_counter, key=vote_counter.get)


def _get_best_fold(row: t.Dict[str, t.Any]) -> int:
    """Among folds that voted for ``row['vote_best']``, return the most confident one.

    Confidence is that fold's score for the winning class; the first such fold
    wins when scores are equal.
    """
    def _winning_class_score(fold: int):
        column = _vote_to_score_col_name_template(row['vote_best']).format(fold=fold)
        return row[column]

    agreeing_folds = []
    for fold in range(5):
        if row[f'vote_{fold}'] == row['vote_best']:
            agreeing_folds.append(fold)
    return max(agreeing_folds, key=_winning_class_score)


def _ensemble_vote_max(df: pd.DataFrame) -> pd.DataFrame:
    """Ensemble by copying the scores of the single most confident majority fold.

    For each row the folds vote, the majority vote is chosen, and the three
    class scores of the fold selected by `_get_best_fold` become the ensemble
    scores. Returns a copy of ``df`` with the vote columns, the final
    ``score_*`` columns and a per-row ``loss`` column.
    """
    def _ineffective_of_best_fold(row):
        return row[f'score_ineffective_{row["vote_best_fold"]}']

    def _adequate_of_best_fold(row):
        return row[f'score_adequate_{row["vote_best_fold"]}']

    def _effective_of_best_fold(row):
        return row[f'score_effective_{row["vote_best_fold"]}']

    result_df = df.copy()
    for fold in range(5):
        vote_getter = _VoteGetter(fold=fold)
        result_df[f'vote_{fold}'] = result_df.progress_apply(vote_getter, axis=1)
    result_df['vote_best'] = result_df.progress_apply(_get_best_vote, axis=1)
    result_df['vote_best_fold'] = result_df.progress_apply(_get_best_fold, axis=1)
    result_df['score_ineffective'] = result_df.progress_apply(_ineffective_of_best_fold, axis=1)
    result_df['score_adequate'] = result_df.progress_apply(_adequate_of_best_fold, axis=1)
    result_df['score_effective'] = result_df.progress_apply(_effective_of_best_fold, axis=1)
    result_df['loss'] = result_df.progress_apply(_get_row_loss, axis=1)
    return result_df


def _ensemble_vote_mean(df: pd.DataFrame) -> pd.DataFrame:
    """Ensemble by averaging only the folds that agree with the majority vote.

    For each row the folds vote, the majority vote is chosen, and each class
    score is averaged over the folds whose vote equals it. Returns a copy of
    ``df`` with the vote columns, the final ``score_*`` columns and a per-row
    ``loss`` column.
    """
    def _majority_folds(row):
        return [fold for fold in range(5) if row[f'vote_{fold}'] == row['vote_best']]

    def _mean_ineffective(row):
        return _mean([row[f'score_ineffective_{fold}'] for fold in _majority_folds(row)])

    def _mean_adequate(row):
        return _mean([row[f'score_adequate_{fold}'] for fold in _majority_folds(row)])

    def _mean_effective(row):
        return _mean([row[f'score_effective_{fold}'] for fold in _majority_folds(row)])

    result_df = df.copy()
    for fold in range(5):
        vote_getter = _VoteGetter(fold=fold)
        result_df[f'vote_{fold}'] = result_df.progress_apply(vote_getter, axis=1)
    result_df['vote_best'] = result_df.progress_apply(_get_best_vote, axis=1)
    result_df['score_ineffective'] = result_df.progress_apply(_mean_ineffective, axis=1)
    result_df['score_adequate'] = result_df.progress_apply(_mean_adequate, axis=1)
    result_df['score_effective'] = result_df.progress_apply(_mean_effective, axis=1)
    result_df['loss'] = result_df.progress_apply(_get_row_loss, axis=1)
    return result_df


# Loss when only the folds that agree with the majority vote are averaged.
get_loss(_ensemble_vote_mean(test_df))

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

0.6307805088892932

#### Decision tree

In [47]:
# Integer class index for each effectiveness label. `_compute_loss` reads
# prediction columns 0, 1 and 2 as ineffective, adequate and effective,
# which is the same order as this mapping.
_DISC_EFF_STR_TO_INT = {
    'Ineffective': 0,
    'Adequate': 1,
    'Effective': 2,
}


def _build_x(df: pd.DataFrame) -> np.ndarray:
    """Build the stacking feature matrix, one row per row of ``df``.

    Per fold (0-4) six values are emitted: the ineffective, adequate and
    effective scores, followed by the differences effective - adequate,
    effective - ineffective and adequate - ineffective. A one-hot encoding of
    ``discourse_type`` over the sorted types present in ``df`` is appended.
    """
    known_disc_types = sorted(list(df['discourse_type'].unique()))
    feature_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        features = []
        for fold in range(5):
            ineff = float(row[f'score_ineffective_{fold}'])
            adq = float(row[f'score_adequate_{fold}'])
            eff = float(row[f'score_effective_{fold}'])
            features += [ineff, adq, eff, eff - adq, eff - ineff, adq - ineff]
        # The one-hot layout depends on which types occur in `df` itself.
        features += [int(row['discourse_type'] == dt) for dt in known_disc_types]
        feature_rows.append(features)
    return np.array(feature_rows)


def _get_feature_name_list(df: pd.DataFrame) -> t.List[str]:
    disc_type_list = sorted(list(df['discourse_type'].unique()))
    feature_name_list = []
    for fold in range(5):
        feature_name_list.extend([
            f'score_ineffective_{fold}',
            f'score_adequate_{fold}',
            f'score_effective_{fold}',
            f'score_eff_adq_diff_{fold}',
            f'score_eff_ineff_diff_{fold}',
            f'score_adq_ineff_diff_{fold}',
        ])
    for dt in disc_type_list:
        feature_name_list.append(dt)
    return feature_name_list


def _build_y(df: pd.DataFrame) -> np.ndarray:
    """Encode ``discourse_effectiveness`` as integers via `_DISC_EFF_STR_TO_INT`, in row order."""
    labels = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        labels.append(_DISC_EFF_STR_TO_INT[row['discourse_effectiveness']])
    return np.array(labels)


def _compute_loss(df: pd.DataFrame, y_hat: np.ndarray) -> float:
    """Score predicted probabilities against the labels in ``df``.

    Columns 0, 1 and 2 of ``y_hat`` are written to a copy of ``df`` as the
    ineffective, adequate and effective scores, and `get_loss` is applied.
    """
    scored_df = df.copy()
    score_columns = ('score_ineffective', 'score_adequate', 'score_effective')
    for column_index, column_name in enumerate(score_columns):
        scored_df[column_name] = y_hat[:, column_index]
    return get_loss(scored_df)


def _train_decision_tree_classifier(
        df: pd.DataFrame,
        save_model_to_dir: Path):
    """Fit decision trees of depth 4, 5 and 6 on ``df`` and pickle each one.

    For every depth the loss on the same rows the tree was fitted on is
    printed, so the printed numbers are in-sample. Each model is written to
    ``save_model_to_dir / 'lvl4-decision_tree-max_depth_{max_depth}.pkl'``.

    NOTE(review): `_build_x` is defined a second time in a later cell of this
    notebook with a different feature set. Calling this function after that
    cell has run will use the later definition.

    Args:
        df: Frame with the per-fold score columns, ``discourse_type`` and
            ``discourse_effectiveness``.
        save_model_to_dir: Output directory; created if it does not exist.
    """
    # Create the directory up front so a missing folder fails before any fitting is done.
    save_model_to_dir.mkdir(parents=True, exist_ok=True)
    # Features and labels do not depend on max_depth, so build them once.
    x, y = _build_x(df), _build_y(df)
    for max_depth in range(4, 7):
        model = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
        model.fit(x, y)
        y_hat = model.predict_proba(x)
        print(f'max_depth = {max_depth} loss = {_compute_loss(df, y_hat):.5f}')
        with open(save_model_to_dir / f'lvl4-decision_tree-max_depth_{max_depth}.pkl', 'wb') as f:
            pickle.dump(model, f)

# NOTE(review): the trees are fitted and scored on the same `test_df` rows, so the printed losses are in-sample.
_train_decision_tree_classifier(test_df, MODEL_DIR_PATH)

  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

max_depth = 4 loss = 0.60146


  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

max_depth = 5 loss = 0.56726


  0%|          | 0/2669 [00:00<?, ?it/s]

  0%|          | 0/2669 [00:00<?, ?it/s]

max_depth = 6 loss = 0.52285


#### Logistic Regression

In [51]:
# Integer class index for each effectiveness label (same contents as the
# mapping defined in the decision-tree cell).
_DISC_EFF_STR_TO_INT = {
    'Ineffective': 0,
    'Adequate': 1,
    'Effective': 2,
}


def _build_x(df: pd.DataFrame) -> np.ndarray:
    """Build the score-only feature matrix, one row per row of ``df``.

    Each row holds 15 values: the ineffective, adequate and effective scores
    of folds 0-4, in that order. No discourse-type features are included.

    NOTE(review): this redefines the `_build_x` from the decision-tree cell,
    which produces a different feature layout.
    """
    feature_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        features = []
        for fold in range(5):
            ineff = float(row[f'score_ineffective_{fold}'])
            adq = float(row[f'score_adequate_{fold}'])
            eff = float(row[f'score_effective_{fold}'])
            features += [ineff, adq, eff]
        feature_rows.append(features)
    return np.array(feature_rows)


def _build_y(df: pd.DataFrame) -> np.ndarray:
    """Return the integer class label of every row, mapped through `_DISC_EFF_STR_TO_INT`."""
    encoded_labels = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        label_text = row['discourse_effectiveness']
        encoded_labels.append(_DISC_EFF_STR_TO_INT[label_text])
    return np.array(encoded_labels)


def _compute_loss(df: pd.DataFrame, y_hat: np.ndarray) -> float:
    """Return `get_loss` for ``df`` after attaching ``y_hat`` as its score columns.

    ``y_hat[:, 0]``, ``y_hat[:, 1]`` and ``y_hat[:, 2]`` become
    ``score_ineffective``, ``score_adequate`` and ``score_effective`` on a
    copy of ``df``.
    """
    with_scores = df.copy()
    ineffective, adequate, effective = y_hat[:, 0], y_hat[:, 1], y_hat[:, 2]
    with_scores['score_ineffective'] = ineffective
    with_scores['score_adequate'] = adequate
    with_scores['score_effective'] = effective
    return get_loss(with_scores)


def _train_logistic_regression_classifier(
        df: pd.DataFrame,
        save_model_to_dir: Path):
    """Fit one logistic regression per discourse type and pickle each model.

    Discourse types are processed in sorted order. For each type the loss on
    the rows the model was fitted on is printed, so the numbers are
    in-sample. The model is written to
    ``save_model_to_dir / 'lvl4-logistic_regression-dt_{type}.pkl'``, where
    ``{type}`` is the discourse type lower-cased with spaces replaced by
    underscores.

    NOTE(review): `_compute_loss` reads prediction columns 0, 1 and 2, which
    presumably requires every discourse-type subset to contain all three
    effectiveness classes - confirm for small subsets.

    Args:
        df: Frame with the per-fold score columns, ``discourse_type`` and
            ``discourse_effectiveness``.
        save_model_to_dir: Output directory; created if it does not exist.
    """
    # Create the directory up front so a missing folder fails before any fitting is done.
    save_model_to_dir.mkdir(parents=True, exist_ok=True)
    for disc_type in sorted(list(df['discourse_type'].unique())):
        dt_df = df[df['discourse_type'] == disc_type]
        x, y = _build_x(dt_df), _build_y(dt_df)
        model = LogisticRegression()
        model.fit(x, y)
        y_hat = model.predict_proba(x)
        print(f'loss = {_compute_loss(dt_df, y_hat):.5f}')
        with open(save_model_to_dir / f'lvl4-logistic_regression-dt_{disc_type.replace(" ", "_").lower()}.pkl', 'wb') as f:
            pickle.dump(model, f)

# Fits one logistic regression per discourse type; each printed loss is measured on that model's own training rows.
_train_logistic_regression_classifier(test_df, MODEL_DIR_PATH)

  0%|          | 0/854 [00:00<?, ?it/s]

  0%|          | 0/854 [00:00<?, ?it/s]

loss = 0.64965


  0%|          | 0/251 [00:00<?, ?it/s]

  0%|          | 0/251 [00:00<?, ?it/s]

loss = 0.58624


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

loss = 0.60955


  0%|          | 0/884 [00:00<?, ?it/s]

  0%|          | 0/884 [00:00<?, ?it/s]

loss = 0.62814


  0%|          | 0/170 [00:00<?, ?it/s]

  0%|          | 0/170 [00:00<?, ?it/s]

loss = 0.72910


  0%|          | 0/309 [00:00<?, ?it/s]

  0%|          | 0/309 [00:00<?, ?it/s]

loss = 0.55135


  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

loss = 0.79478
