In [None]:
# Mount Google Drive inside the Colab VM so the files stored there can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive/')

In [None]:
import os

# Jump to the project folder with one absolute path. The original chain of
# relative chdir calls ("drive/" -> "My Drive" -> ...) was resolved against the
# current working directory, so running the cell a second time failed because
# the kernel was already inside the project folder.
PROJECT_DIR = os.path.join('/content/drive', 'My Drive', 'Kaggle', 'Feedback3')
os.chdir(PROJECT_DIR)

In [None]:
# Folder that receives the training log and the fitted ensemble models.
OUTPUT_DIR = './fb3-ensemble/'
# exist_ok=True makes the call a no-op when the folder is already there, which
# replaces the separate (and racy) os.path.exists() check.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import mean_squared_error

In [None]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    The RMSE is computed with NumPy instead of
    ``mean_squared_error(..., squared=False)``: the ``squared`` argument was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the old call raises
    ``TypeError`` on current releases.

    Args:
        y_trues: 2-D array-like, one row per sample and one column per target.
        y_preds: 2-D array-like of predictions with matching rows and at least
            as many columns as ``y_trues``.

    Returns:
        A tuple ``(mcrmse_score, scores)``: ``scores`` is a list with the RMSE
        of each target column and ``mcrmse_score`` is the mean of that list.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)

    scores = []
    for i in range(y_trues.shape[1]):
        residual = y_trues[:, i] - y_preds[:, i]
        scores.append(float(np.sqrt(np.mean(residual ** 2))))  # RMSE of column i
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Score predictions with MCRMSE.

    Returns the pair ``(mcrmse_score, scores)`` produced by ``MCRMSE``.
    """
    overall, per_target = MCRMSE(y_trues, y_preds)
    return (overall, per_target)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the notebook logger, writing to the console and to ``{filename}.log``.

    Args:
        filename: Path of the log file without the ``.log`` extension.

    Returns:
        The ``logging.Logger`` registered under ``__name__``, set to INFO level,
        with exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object for the same name, so every re-run
    # of this cell used to stack another pair of handlers and each message was
    # emitted once more per re-run. Detach and close the previous handlers first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Value used for both generators and, as a string, for the
            ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)


seed_everything(seed=42)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# The competition CSVs are read from the current working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
submission = pd.read_csv('./sample_submission.csv')

# Report the shape of each frame and preview its first rows.
for frame_name, frame in (('train', train), ('test', test), ('submission', submission)):
    print(f"{frame_name}.shape: {frame.shape}")
    display(frame.head())

# Load Out-of-folds

In [None]:
deberta_family_dir = './deberta_family_oof'
# Keep regular, non-hidden files only. os.listdir also returns sub-directories
# and hidden entries (for example a `.ipynb_checkpoints` folder), which would
# otherwise be sorted in with the OOF files and handed to read_pickle.
oof_files = sorted(
    name for name in os.listdir(deberta_family_dir)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(deberta_family_dir, name))
)

print(oof_files)

target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Ten OOF frames are loaded, as before; fail early with a readable message
# instead of an IndexError when the folder holds fewer files than that.
n_oof_models = 10
if len(oof_files) < n_oof_models:
    raise FileNotFoundError(
        f"expected at least {n_oof_models} OOF files in {deberta_family_dir}, "
        f"found {len(oof_files)}: {oof_files}"
    )
# NOTE: unpickling can execute arbitrary code -- only load OOF files you trust.
oofs = [
    pd.read_pickle(os.path.join(deberta_family_dir, name))
    for name in oof_files[:n_oof_models]
]

# Preprocess

In [None]:
# Give each model's prediction columns a unique suffix: pred_<target> -> <target><i>.
for i, oof in enumerate(oofs[:10]):
    suffixed_names = {f"pred_{target}": f"{target}{i}" for target in target_cols}
    oof.rename(columns=suffixed_names, inplace=True)
    display(oof.head(3))

In [None]:
# Frame 0 keeps the essay text, fold and ground-truth columns; they are removed
# from frames 1..9 so the later merge on text_id does not duplicate them.
cols_to_drop = ['full_text', 'fold'] + target_cols
for i in range(1, 10):
    # errors='ignore' keeps this cell re-runnable: without it a second run
    # raises KeyError because the columns have already been dropped.
    oofs[i] = oofs[i].drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Join the ten OOF frames on text_id, one after another, starting from frame 0.
oof = oofs[0]
for other_oof in oofs[1:10]:
    oof = oof.merge(other_oof, on=['text_id'])

In [None]:
# Preview the first ten rows of the merged frame.
oof.iloc[:10]

In [None]:
# Feature columns: text_id, then the ten per-model predictions of each target,
# grouped target by target.
targets = ['text_id']
for col in target_cols:
    targets.extend(f'{col}{model_idx}' for model_idx in range(0, 10))
# Label columns: text_id plus the ground-truth targets.
y_cols = ['text_id', *target_cols]

X = oof.loc[:, targets]
Y = oof.loc[:, y_cols]

# Train

In [None]:
import pickle
from joblib import dump, load


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

In [None]:
# Six independent regressors built from the same settings.
# Booster choices: gbtree, gblinear, dart.
xgb_params = dict(objective="reg:squarederror", random_state=42, booster='dart')
models = [xgb.XGBRegressor(**xgb_params) for _ in range(6)]
# models = [LinearRegression() for _ in range(6)]
model_name_base = 'XGBRegressor'

In [None]:
# Fit one stacking model per target on the ten base-model OOF predictions,
# save it under OUTPUT_DIR and report its RMSE.
# NOTE(review): every model is scored on the same rows it was fitted on, so the
# printed numbers are in-sample and optimistic -- validate on held-out folds
# before using them to compare model families.
total_mcsmse = 0
n_scored = 0

for col, model in zip(target_cols, models):
    x_cols = [f'{col}{i}' for i in range(0, 10)]
    y_cols = [col]

    Xx = X[x_cols]
    Yy = Y[y_cols]

    model = model.fit(Xx, Yy)
    # One prediction pass is enough: the original called predict() three times
    # on the same fitted model and blended the identical outputs with weights
    # 0.1 / 0.2 / 0.7 (sum 1.0), which is a no-op. The unused model.score()
    # call was dropped for the same reason.
    preds = model.predict(Xx)

    dump(model, f'{OUTPUT_DIR}{model_name_base}_{col}.pkl')

    mcrmse_score, _ = get_score(Yy.to_numpy(), preds.reshape(-1, 1))
    print(f"{col} -> mcrmse_score={mcrmse_score}")
    total_mcsmse += mcrmse_score
    n_scored += 1


# Average over the targets actually scored rather than a hard-coded 6.
print('Total MCRMSE Score:', total_mcsmse / n_scored)

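| Model | MCRMSE |
| ----- | ------ |
| LinearRegression | 0.44499491163317245 |
| RandomForestRegressor | 0.17236466948046347 |
| XGBRegressor | 0.41306675952329736 |

Note: the training cell above scores each model on the same OOF rows it was fitted on, so these are in-sample scores; the much lower RandomForestRegressor value is likely overfitting rather than a better ensemble.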


-----

1. LinearRegression

cohesion -> mcrmse_score=0.47525212701691993

syntax -> mcrmse_score=0.4387305454780827

vocabulary -> mcrmse_score=0.4078519873744786

phraseology -> mcrmse_score=0.447128036166492

grammar -> mcrmse_score=0.4622838507361844

conventions -> mcrmse_score=0.4387229230268769

Total MCRMSE Score: 0.44499491163317245

2. RandomForestRegressor

cohesion -> mcrmse_score=0.18427490983107225

syntax -> mcrmse_score=0.16950429329718303

vocabulary -> mcrmse_score=0.15831520752852293

phraseology -> mcrmse_score=0.17482226061765166

grammar -> mcrmse_score=0.1795295031339569

conventions -> mcrmse_score=0.16885819873788274

Total MCRMSE Score: 0.17236466948046347

3. XGBRegressor

cohesion -> mcrmse_score=0.44134209921795237

syntax -> mcrmse_score=0.40534709617943

vocabulary -> mcrmse_score=0.37747949472382825

phraseology -> mcrmse_score=0.4189509214088031

grammar -> mcrmse_score=0.4281746852031484

conventions -> mcrmse_score=0.4071062604066224

Total MCRMSE Score: 0.41306675952329736

In [None]:
for col in target_cols:
    x_cols = [f'{col}{i}' for i in range(0, 10)]
    y_cols = [col]

    Xx = X[x_cols]
    Yy = Y[y_cols]

    clf = load(f'{OUTPUT_DIR}{model_name_base}_{col}.pkl')