In [1]:
import os
import sys
sys.path.append('..')

import pandas as pd

from src.constants import get_constants
from src.features.config import CYEConfigPreProcessor, CYEConfigTransformer
from src.features.preprocessing import CYEDataPreProcessor, CYETargetTransformer

from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

In [2]:
# Project-level constants; this cell reads the train/test file paths and the
# target column name from it.
cst = get_constants()

# Scaling mode handed to both the feature pre-processor config and the
# target-transformer config, so features and target are configured consistently.
scale = 'none'

# Configuration for the feature pre-processor.
config = CYEConfigPreProcessor(delna_thr=50, fill_mode='mean', scale=scale)

dpp = CYEDataPreProcessor(config=config)
                    
# NOTE: `config` is rebound here. From this line on it holds the
# target-transformer config, not the pre-processor config used for `dpp` above.
config = CYEConfigTransformer(scale=scale)
# Two separate target transformers built from the same config: one is fitted
# on the training data, the other on the test data (see below).
tt_train = CYETargetTransformer(config=config)
tt_test = CYETargetTransformer(config=config)

# Training data, indexed by the 'ID' column.
df_train = pd.read_csv(os.path.join('..', cst.file_data_train), index_col='ID')

# Split the frame into features and target.
X_train, y_train = df_train.drop(columns=cst.target_column), df_train[cst.target_column]
# The target transformer is fitted with the RAW features, so this line must run
# before X_train is overwritten by the pre-processed version on the next line.
y_train = tt_train.fit_transform(X_train, y_train)
# Fit the feature pre-processor on the training features and transform them.
X_train = dpp.fit_transform(X_train)

# Test data: same file layout, indexed by 'ID'; no target is read here.
X_test = pd.read_csv(os.path.join('..', cst.file_data_test), index_col='ID')
# tt_test is fitted on the raw test features only.
# NOTE(review): presumably this lets it invert predictions made for the test
# rows - confirm against CYETargetTransformer.
tt_test.fit(X_test)
# Re-use the pre-processor fitted on the training set (transform only, no refit).
X_test = dpp.transform(X_test)

In [3]:
def compute_score(estimator1, n_splits=5, n_bins=10, random_state=42):
    """Cross-validate a regressor on the training set and return its RMSE.

    Reads the notebook-level globals ``X_train``, ``y_train`` and ``tt_train``.
    Out-of-fold predictions are obtained with a shuffled ``StratifiedKFold``
    whose strata are quantile bins of the target. Predictions and targets
    are both passed through ``tt_train.inverse_transform`` before the error
    is computed.

    Parameters
    ----------
    estimator1 : estimator
        Unfitted estimator handed to ``cross_val_predict``.
    n_splits : int, default 5
        Number of cross-validation folds.
    n_bins : int, default 10
        Number of quantile bins used to stratify the continuous target.
    random_state : int or None, default 42
        Seed for the fold shuffling. A fixed seed makes the reported score
        reproducible across runs; pass ``None`` for an unseeded shuffle.

    Returns
    -------
    float
        Root mean squared error of the out-of-fold predictions. The value is
        also printed.
    """
    # Quantile-bin the continuous target so StratifiedKFold has classes to
    # stratify on. duplicates='drop' avoids a ValueError when many identical
    # target values produce repeated bin edges.
    bins_train = pd.qcut(y_train, q=n_bins, labels=False, duplicates='drop')
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Materialise the splits so `cv` is a plain list rather than a one-shot generator.
    cv = list(skf.split(X=X_train, y=bins_train))

    y_pred = cross_val_predict(
        estimator=estimator1,
        X=X_train.to_numpy(),
        y=y_train.to_numpy(),
        cv=cv,
    )

    y_pred = tt_train.inverse_transform(y_pred)
    # Disabled experiment kept from the original: clip negative predictions to 0.
    # y_pred = [y if y > 0 else 0 for y in y_pred]
    y_true = tt_train.inverse_transform(y_train)

    # Take the square root explicitly instead of passing squared=False, which
    # is deprecated/removed in recent scikit-learn releases.
    rmse = metrics.mean_squared_error(y_true=y_true, y_pred=y_pred) ** 0.5

    print(rmse)

    return rmse


def submit(estimator2, score, model):
    """Fit an estimator on the full training set and write a submission CSV.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test``,
    ``tt_test`` and ``cst``. The file is written to
    ``../<cst.path_submissions>/<model>_<score:.3f>.csv`` with the test index
    as first column and the predictions in a column named ``Yield``.

    Parameters
    ----------
    estimator2 : estimator
        Unfitted estimator; it is fitted in place on the whole training set.
    score : float
        Validation score, formatted with three decimals into the file name.
    model : str
        Model label used as the file-name prefix.

    Returns
    -------
    None
    """
    estimator2.fit(X=X_train.to_numpy(), y=y_train.to_numpy())

    y_pred = estimator2.predict(X_test.to_numpy())
    # NOTE(review): the model is trained on targets transformed by tt_train,
    # but predictions are inverted with tt_test (fitted on the test features).
    # Confirm this is the intended pairing in CYETargetTransformer.
    submission = pd.Series(
        tt_test.inverse_transform(y_pred),
        index=X_test.index,
        name='Yield',
    )

    dir_submissions = os.path.join('..', cst.path_submissions)
    # Create the output folder on first use so the write cannot fail on a
    # fresh checkout.
    os.makedirs(dir_submissions, exist_ok=True)
    file_submission = os.path.join(dir_submissions, f'{model}_{score:.3f}.csv')

    submission.to_csv(file_submission, index=True)

In [4]:
from xgboost import XGBRegressor

# Two independent, identically configured models: the first is only used for
# the cross-validated score, the second is fitted on all training data for the
# submission file.
estimator1, estimator2 = XGBRegressor(), XGBRegressor()

score = compute_score(estimator1)
submit(estimator2, score, 'xgboost')

549.382070775091


In [5]:
from lightgbm import LGBMRegressor

# Two independent, identically configured models (library logging silenced):
# the first is only used for the cross-validated score, the second is fitted
# on all training data for the submission file.
estimator1, estimator2 = LGBMRegressor(verbosity=-1), LGBMRegressor(verbosity=-1)

score = compute_score(estimator1)
submit(estimator2, score, 'lightgbm')

486.2431065125051


In [6]:
from catboost import CatBoostRegressor

# Two independent, identically configured models (training output silenced):
# the first is only used for the cross-validated score, the second is fitted
# on all training data for the submission file.
estimator1, estimator2 = CatBoostRegressor(verbose=0), CatBoostRegressor(verbose=0)

score = compute_score(estimator1)
submit(estimator2, score, 'catboost')

485.6435998255843


In [7]:
# from lce import LCERegressor

# estimator1 = LCERegressor(verbose=0)
# scores = compute_score(estimator1)

# estimator2 = LCERegressor()
# submit(estimator2, scores, 'lce')

In [8]:
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.augmentations import RegressionSMOTE

# Augmentation passed to the supervised fit below.
augmentations = RegressionSMOTE(p=0.2)

# Shared by the pre-training and the supervised training step.
batch_size = 32

# Step 1: unsupervised pre-training on the training features only.
# NOTE(review): the saved output of this step shows the loss turning negative
# and growing without bound, which suggests the pre-training objective is
# degenerate on this data (possibly related to scale='none' and/or the small
# batch size) - verify before relying on `from_unsupervised`.
unsupervised = TabNetPretrainer()
unsupervised.fit(X_train.to_numpy(),
                 pretraining_ratio=0.8,
                 batch_size=batch_size)

# Step 2: supervised regression, warm-started from the pre-trained network.
# The target is reshaped to a single column (2-D) for the regressor.
# NOTE(review): no eval_set is passed and the saved log only reports the
# training loss, so `patience` / `eval_metric` appear to have no effect and
# training runs for the full `max_epochs` - confirm and add a validation set
# if early stopping is intended.
estimator = TabNetRegressor()
estimator.fit(X_train=X_train.to_numpy(),
              y_train=y_train.to_numpy().reshape(-1, 1),
              max_epochs=1000,
              patience=50,
              eval_metric=['rmse'],
              from_unsupervised=unsupervised,
              batch_size=batch_size,
              drop_last=False,
              augmentations=augmentations)

# Step 3: predict on the test set. The regressor was trained on a one-column
# 2-D target, so the predictions are flattened to 1-D before further use.
Yield = estimator.predict(X_test.to_numpy()).reshape(-1)
# Map predictions back to the original target space, exactly as `submit` does
# for the other models, so every submission file is on the same scale.
Yield = tt_test.inverse_transform(Yield)

# Same file layout as before: an 'ID' column followed by a 'Yield' column.
submission = (
    pd.Series(Yield, index=X_test.index, name='Yield')
    .rename_axis('ID')
    .reset_index()
)

dir_submissions = os.path.join('..', cst.path_submissions)
# Create the output folder on first use so the write cannot fail on a fresh checkout.
os.makedirs(dir_submissions, exist_ok=True)
file_submission = os.path.join(dir_submissions, 'tabnet.csv')
submission.to_csv(file_submission, index=False)



epoch 0  | loss: 285.18937|  0:00:01s
epoch 1  | loss: -9391.97593|  0:00:02s
epoch 2  | loss: -258616.92487|  0:00:03s
epoch 3  | loss: -1847177.95104|  0:00:04s
epoch 4  | loss: -6659789.2125|  0:00:06s
epoch 5  | loss: -14732337.34167|  0:00:07s
epoch 6  | loss: -27992642.98333|  0:00:08s
epoch 7  | loss: -48436227.56667|  0:00:09s
epoch 8  | loss: -78244256.93333|  0:00:10s
epoch 9  | loss: -116384121.33333|  0:00:11s
epoch 10 | loss: -171937436.13333|  0:00:13s
epoch 11 | loss: -242939380.13333|  0:00:14s
epoch 12 | loss: -313002109.33333|  0:00:15s
epoch 13 | loss: -417165738.93333|  0:00:16s
epoch 14 | loss: -592685269.6|  0:00:17s
epoch 15 | loss: -803891354.13333|  0:00:19s
epoch 16 | loss: -1020150362.66667|  0:00:20s
epoch 17 | loss: -1226558673.06667|  0:00:21s
epoch 18 | loss: -1556071488.0|  0:00:22s
epoch 19 | loss: -1881055416.53333|  0:00:23s
epoch 20 | loss: -2253375886.93333|  0:00:25s
epoch 21 | loss: -2766060157.86667|  0:00:26s
epoch 22 | loss: -3233978700.8|  0:0



epoch 0  | loss: 681851.10362|  0:00:01s
epoch 1  | loss: 546853.23286|  0:00:02s
epoch 2  | loss: 329230.45368|  0:00:03s
epoch 3  | loss: 260995.99073|  0:00:04s
epoch 4  | loss: 237075.21398|  0:00:05s
epoch 5  | loss: 226454.4224|  0:00:06s
epoch 6  | loss: 199779.69617|  0:00:07s
epoch 7  | loss: 233886.61017|  0:00:08s
epoch 8  | loss: 231854.42664|  0:00:09s
epoch 9  | loss: 196661.68586|  0:00:10s
epoch 10 | loss: 188047.6741|  0:00:11s
epoch 11 | loss: 229706.87491|  0:00:12s
epoch 12 | loss: 230703.80347|  0:00:13s
epoch 13 | loss: 224113.21704|  0:00:14s
epoch 14 | loss: 223951.1149|  0:00:15s
epoch 15 | loss: 205907.08728|  0:00:16s
epoch 16 | loss: 204659.99026|  0:00:17s
epoch 17 | loss: 231701.86858|  0:00:18s
epoch 18 | loss: 218227.69826|  0:00:19s
epoch 19 | loss: 179469.22653|  0:00:20s
epoch 20 | loss: 222239.18504|  0:00:21s
epoch 21 | loss: 224472.81391|  0:00:22s
epoch 22 | loss: 213245.71839|  0:00:23s
epoch 23 | loss: 228386.68674|  0:00:24s
epoch 24 | loss: 20

epoch 201| loss: 115504.3306|  0:03:29s
epoch 202| loss: 120781.14149|  0:03:30s
epoch 203| loss: 118630.19991|  0:03:31s
epoch 204| loss: 127131.28353|  0:03:32s
epoch 205| loss: 149263.8467|  0:03:33s
epoch 206| loss: 116625.92419|  0:03:34s
epoch 207| loss: 130439.50354|  0:03:35s
epoch 208| loss: 139969.39746|  0:03:36s
epoch 209| loss: 129561.21263|  0:03:37s
epoch 210| loss: 136504.10975|  0:03:38s
epoch 211| loss: 91926.60098|  0:03:39s
epoch 212| loss: 128164.03439|  0:03:40s
epoch 213| loss: 104261.2215|  0:03:42s
epoch 214| loss: 135797.78092|  0:03:43s
epoch 215| loss: 110895.80922|  0:03:44s
epoch 216| loss: 145426.49153|  0:03:45s
epoch 217| loss: 134089.88205|  0:03:46s
epoch 218| loss: 155134.53189|  0:03:47s
epoch 219| loss: 106706.7148|  0:03:48s
epoch 220| loss: 137606.81769|  0:03:49s
epoch 221| loss: 144251.42202|  0:03:50s
epoch 222| loss: 108157.11101|  0:03:51s
epoch 223| loss: 109340.09465|  0:03:52s
epoch 224| loss: 119493.29486|  0:03:53s
epoch 225| loss: 1264

epoch 404| loss: 56398.35672|  0:07:02s
epoch 405| loss: 106870.53173|  0:07:03s
epoch 406| loss: 66844.30644|  0:07:04s
epoch 407| loss: 83671.06445|  0:07:05s
epoch 408| loss: 74959.71155|  0:07:06s
epoch 409| loss: 54008.63439|  0:07:07s
epoch 410| loss: 104606.97625|  0:07:08s
epoch 411| loss: 105629.29451|  0:07:09s
epoch 412| loss: 81561.36546|  0:07:10s
epoch 413| loss: 88357.18743|  0:07:11s
epoch 414| loss: 77634.88025|  0:07:12s
epoch 415| loss: 76522.46725|  0:07:13s
epoch 416| loss: 96167.55749|  0:07:14s
epoch 417| loss: 118493.84023|  0:07:15s
epoch 418| loss: 90607.93007|  0:07:16s
epoch 419| loss: 72107.90636|  0:07:17s
epoch 420| loss: 48897.66972|  0:07:19s
epoch 421| loss: 59641.70855|  0:07:20s
epoch 422| loss: 62744.59202|  0:07:21s
epoch 423| loss: 76099.21266|  0:07:22s
epoch 424| loss: 54490.31595|  0:07:23s
epoch 425| loss: 70423.547|  0:07:24s
epoch 426| loss: 78762.55945|  0:07:25s
epoch 427| loss: 56305.38085|  0:07:26s
epoch 428| loss: 70176.97886|  0:07:27

epoch 610| loss: 37283.71599|  0:10:35s
epoch 611| loss: 75151.97125|  0:10:36s
epoch 612| loss: 65663.08783|  0:10:37s
epoch 613| loss: 35081.61171|  0:10:38s
epoch 614| loss: 52336.26593|  0:10:39s
epoch 615| loss: 73622.40927|  0:10:40s
epoch 616| loss: 55524.90021|  0:10:41s
epoch 617| loss: 52786.31616|  0:10:42s
epoch 618| loss: 90909.3557|  0:10:43s
epoch 619| loss: 40081.23666|  0:10:44s
epoch 620| loss: 57637.29919|  0:10:45s
epoch 621| loss: 62083.39448|  0:10:46s
epoch 622| loss: 41927.64467|  0:10:47s
epoch 623| loss: 63697.45995|  0:10:48s
epoch 624| loss: 74953.74212|  0:10:50s
epoch 625| loss: 52018.96336|  0:10:51s
epoch 626| loss: 92338.47279|  0:10:52s
epoch 627| loss: 57674.78975|  0:10:53s
epoch 628| loss: 80750.42488|  0:10:54s
epoch 629| loss: 34702.34298|  0:10:55s
epoch 630| loss: 56623.40083|  0:10:56s
epoch 631| loss: 64785.58325|  0:10:57s
epoch 632| loss: 48544.34288|  0:10:58s
epoch 633| loss: 54125.82542|  0:10:59s
epoch 634| loss: 55499.46365|  0:11:00s
e

epoch 816| loss: 61388.07981|  0:14:09s
epoch 817| loss: 58904.3895|  0:14:10s
epoch 818| loss: 58870.23232|  0:14:11s
epoch 819| loss: 45229.36842|  0:14:12s
epoch 820| loss: 27779.31089|  0:14:13s
epoch 821| loss: 72082.04197|  0:14:14s
epoch 822| loss: 32822.83652|  0:14:15s
epoch 823| loss: 36078.85885|  0:14:17s
epoch 824| loss: 98694.79459|  0:14:18s
epoch 825| loss: 81534.40162|  0:14:19s
epoch 826| loss: 52259.123|  0:14:20s
epoch 827| loss: 64520.53663|  0:14:21s
epoch 828| loss: 53407.88136|  0:14:22s
epoch 829| loss: 66439.01652|  0:14:23s
epoch 830| loss: 100170.21892|  0:14:24s
epoch 831| loss: 52003.64369|  0:14:25s
epoch 832| loss: 38223.82664|  0:14:26s
epoch 833| loss: 67015.69428|  0:14:27s
epoch 834| loss: 36800.82016|  0:14:28s
epoch 835| loss: 98994.23653|  0:14:29s
epoch 836| loss: 40182.87905|  0:14:30s
epoch 837| loss: 53664.57265|  0:14:31s
epoch 838| loss: 63101.61889|  0:14:32s
epoch 839| loss: 104921.31692|  0:14:33s
epoch 840| loss: 67686.2163|  0:14:34s
ep