In [1]:
import os
import sys
sys.path.append('..')

import pandas as pd

from src.constants import get_constants
from src.features.config import CYEConfigPreProcessor, CYEConfigTransformer
from src.features.preprocessing import CYEPreProcessor, CYETargetTransformer

from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

In [2]:
# Project constants; the attributes read below are file_data_train and
# path_processed_data.
cst = get_constants()

# Value handed to the target-transformer configuration.
scale = 'none'

# Feature pre-processor and its configuration.
config = CYEConfigPreProcessor(delna_thr=0.5)
dpp = CYEPreProcessor(config=config)

# `config` is rebound here: from this point on it is the transformer
# configuration, shared by the train and test target transformers.
config = CYEConfigTransformer(scale=scale)
tt_train = CYETargetTransformer(config=config)
tt_test = CYETargetTransformer(config=config)

# Additional rows read from the processed-data folder.
# NOTE(review): judging by the file name these look like synthetic (GReaT/GPT-2)
# samples - confirm.
file_data_gen = 'TrainGReaTGPT2-9984.csv'
df_gen = pd.read_csv(os.path.join('..', cst.path_processed_data, file_data_gen))

# Stack the original training rows (read with 'ID' as index) on top of the
# extra rows and renumber from 0, discarding the original 'ID' index.
df_train = pd.concat(
    [pd.read_csv(os.path.join('..', cst.file_data_train), index_col='ID'), df_gen],
    ignore_index=True,
)

In [3]:
# Separate the target column from the features.
y_train = df_train[cst.target_column]
X_train = df_train.drop(columns=cst.target_column)

# The target transformer is fitted on the raw features, so it must run before
# X_train is replaced by the pre-processed frame.
y_train = tt_train.fit_transform(X_train, y_train)
X_train = dpp.fit_transform(X_train)

# Test data
X_test = pd.read_csv(os.path.join('..', cst.file_data_test), index_col='ID')
# The test-side target transformer is fitted on features only (no target).
tt_test.fit(X_test)
# NOTE(review): the saved output of this cell is a KeyError listing columns
# "not in index", so the cells below never ran in the saved state. Presumably
# the fitted pre-processor expects columns that the test frame does not
# produce - verify against CYEPreProcessor.transform and the concatenated
# generated data.
X_test = dpp.transform(X_test)

KeyError: "['DistrictGaya', 'DistrictRajgir', 'BlockNoors', 'BlockWazirgan', 'TransplantingIrrigationSourceElectric', 'PCropSolidOrgFertAppMethodFalse'] not in index"

In [None]:
def compute_score(estimator1, n_splits=5, n_bins=10, random_state=42):
    """Cross-validate ``estimator1`` on the training data and return the RMSE.

    The transformed target is cut into quantile bins so that the folds can be
    stratified on it. Out-of-fold predictions are mapped back with
    ``tt_train.inverse_transform`` and compared with the inverse-transformed
    training target.

    Relies on the notebook globals ``X_train``, ``y_train`` and ``tt_train``.

    Args:
        estimator1: Unfitted regressor accepted by ``cross_val_predict``.
        n_splits: Number of cross-validation folds.
        n_bins: Number of quantile bins used for stratification.
        random_state: Seed for the fold shuffling, so that the score (which
            ends up in the submission file name) is reproducible. Pass
            ``None`` to get a different split on every call.

    Returns:
        The root mean squared error of the out-of-fold predictions.
    """
    # duplicates='drop' keeps qcut from raising when several quantile edges
    # coincide (e.g. many identical target values); otherwise it is a no-op.
    bins_train = pd.qcut(y_train, q=n_bins, labels=False, duplicates='drop')
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv = skf.split(X=X_train, y=bins_train)

    y_pred = cross_val_predict(
        estimator=estimator1,
        X=X_train.to_numpy(),
        y=y_train.to_numpy(),
        cv=cv,
    )

    y_pred = tt_train.inverse_transform(y_pred)
    y_true = tt_train.inverse_transform(y_train)
    # Take the square root explicitly instead of passing squared=False, which
    # is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = metrics.mean_squared_error(y_true=y_true, y_pred=y_pred) ** 0.5

    print(rmse)

    return rmse


def submit(estimator2, score, model):
    """Fit ``estimator2`` on the full training set and write a submission CSV.

    Test predictions are mapped back with ``tt_test.inverse_transform`` and
    saved as a single ``Yield`` column indexed like ``X_test``. The file is
    written to ``../<cst.path_submissions>/<model>_<score>.csv`` with the score
    formatted to three decimals.

    Relies on the notebook globals ``X_train``, ``y_train``, ``X_test``,
    ``tt_test`` and ``cst``.

    Args:
        estimator2: Unfitted regressor exposing ``fit`` and ``predict``.
        score: Numeric score embedded in the file name.
        model: Model label used as the file-name prefix.
    """
    estimator2.fit(X=X_train.to_numpy(), y=y_train.to_numpy())

    Yield = estimator2.predict(X_test.to_numpy())
    submission = pd.Series(
        tt_test.inverse_transform(Yield),
        index=X_test.index,
        name='Yield',
    )

    dir_submissions = os.path.join('..', cst.path_submissions)
    # Create the target folder on first use so to_csv does not fail on a
    # fresh checkout.
    os.makedirs(dir_submissions, exist_ok=True)

    name_submission = f'{model}_{score:.3f}'
    file_submission = os.path.join(dir_submissions, f'{name_submission}.csv')

    submission.to_csv(file_submission, index=True)

In [None]:
from xgboost import XGBRegressor

# Two independent default-configured models: the first is only used for the
# cross-validated score, the second is fitted for the submission.
estimator1, estimator2 = XGBRegressor(), XGBRegressor()

# The score is computed first because it becomes part of the file name.
score = compute_score(estimator1)
submit(estimator2, score, 'xgboost')

In [None]:
from lightgbm import LGBMRegressor

# Two independent models built with the same settings: the first is only used
# for the cross-validated score, the second is fitted for the submission.
estimator1, estimator2 = LGBMRegressor(verbosity=-1), LGBMRegressor(verbosity=-1)

# The score is computed first because it becomes part of the file name.
score = compute_score(estimator1)
submit(estimator2, score, 'lightgbm')

In [None]:
from catboost import CatBoostRegressor

# Two independent models built with the same settings: the first is only used
# for the cross-validated score, the second is fitted for the submission.
estimator1, estimator2 = CatBoostRegressor(verbose=0), CatBoostRegressor(verbose=0)

# The score is computed first because it becomes part of the file name.
score = compute_score(estimator1)
submit(estimator2, score, 'catboost')

In [None]:
# from lce import LCERegressor

# estimator1 = LCERegressor(verbose=0)
# scores = compute_score(estimator1)

# estimator2 = LCERegressor()
# submit(estimator2, scores, 'lce')

In [None]:
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.augmentations import RegressionSMOTE

augmentations = RegressionSMOTE(p=0.2)

batch_size = 32

# Unsupervised pre-training on the training features only.
unsupervised = TabNetPretrainer()
unsupervised.fit(X_train.to_numpy(),
                 pretraining_ratio=0.8,
                 batch_size=batch_size)

# Supervised training starts from the pre-trained weights. The target is
# reshaped to a single column because it is passed as a 2-D array.
estimator = TabNetRegressor()
estimator.fit(X_train=X_train.to_numpy(),
              y_train=y_train.to_numpy().reshape(-1, 1),
              max_epochs=1000,
              patience=50,
              eval_metric=['rmse'],
              from_unsupervised=unsupervised,
              batch_size=batch_size,
              drop_last=False,
              augmentations=augmentations)

# The model was trained on the transformed target, so predictions have to be
# mapped back with the test-side target transformer before being written,
# exactly as submit() does for the other models. ravel() flattens the
# single-column prediction array to 1-D first.
Yield = estimator.predict(X_test.to_numpy()).ravel()
Yield = tt_test.inverse_transform(Yield)

# rename_axis keeps the 'ID' header that the submission file had before.
submission = pd.Series(Yield, index=X_test.index, name='Yield').rename_axis('ID')

dir_submissions = os.path.join('..', cst.path_submissions)
# Create the target folder on first use so to_csv does not fail.
os.makedirs(dir_submissions, exist_ok=True)

file_submission = os.path.join(dir_submissions, 'tabnet.csv')
submission.to_csv(file_submission, index=True)