In [50]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge


In [51]:
# Load the competition train and test tables; the first CSV column becomes the row index
rawtrain, rawtest = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
    )
)

In [52]:
# Summarise the training frame: number of rows and columns, dtype counts and memory usage
rawtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Columns: 286 entries, f0 to target
dtypes: float64(240), int64(46)
memory usage: 2.1 GB


In [53]:
# Same summary for the test frame, to compare its columns and dtypes with the training frame
rawtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 1000000 to 1499999
Columns: 285 entries, f0 to f284
dtypes: float64(240), int64(45)
memory usage: 1.1 GB


In [54]:
# Work on a copy so that rawtrain itself is never modified, then separate
# the predictor columns from the 'target' label column.
# NOTE(review): rawtrain.info() reports about 2.1 GB, and both .copy() and
# .drop() allocate new frames, so this cell roughly triples that footprint.
train = rawtrain.copy()
features = train.drop(columns=["target"])
y = train["target"]

In [55]:
# Independent copy of the predictor columns; X is the matrix the base models are trained on
X = features.copy()
# Shown as the cell result: (number of rows, number of predictor columns)
X.shape

(1000000, 285)

In [56]:
# Independent copy of the raw test frame; every fitted base model predicts on it
test = rawtest.copy()
# Shown as the cell result: (number of rows, number of columns)
test.shape

(500000, 285)

In [57]:
# Keyword arguments forwarded to LGBMRegressor(**lgbm_parameters).
# "metric" is the score reported on the eval_set during fitting.
# NOTE(review): the long decimal values look like the result of an automated
# hyper-parameter search — confirm their provenance before re-tuning.
lgbm_parameters = {
    "device": "gpu",
    "metric": "auc",
    "learning_rate": 0.08697465911179744,
    "max_depth": 2,
    "min_data_in_leaf": 592,
    "n_estimators": 4565,
    "num_leaves": 85,
    "reg_alpha": 4.2395421854514055,
    "reg_lambda": 3.4669998025411233,
}
# Keyword arguments forwarded to XGBRegressor(**xgboost_parameters, ...).
# The first three entries select the execution backend (4 threads, GPU
# histogram tree method on device 0); the rest are model hyper-parameters.
xgboost_parameters = {
    "n_jobs": 4,
    "tree_method": "gpu_hist",
    "gpu_id": 0,
    "learning_rate": 0.03237498960526522,
    "n_estimators": 10837,
    "gamma": 0.7820801389298118,
    "max_depth": 2,
    "min_child_weight": 9.149611138786955,
    "subsample": 0.7488960655884316,
    "lambda": 4.190627000051498,
    "alpha": 6.241101997471009,
}
# Keyword arguments forwarded to CatBoostRegressor(**catboost_parameters):
# GPU training with an RMSE loss, followed by the model hyper-parameters.
catboost_parameters = {
    "task_type": "GPU",
    "loss_function": "RMSE",
    "learning_rate": 0.03702505794236759,
    "iterations": 1805,
    "depth": 5,
    "l2_leaf_reg": 22.93190966999958,
    "bagging_temperature": 2.1402332674620075,
}

In [58]:
def fit_catboost(model, Xt, yt, Xv, yv):
    """Fit ``model`` on the training split with early stopping on the validation split.

    Parameters
    ----------
    model : estimator whose ``fit`` accepts the keywords used below
        (a CatBoostRegressor in this notebook).
    Xt, yt : training features and training target.
    Xv, yv : validation features and target, passed as the only ``eval_set`` entry.

    The value returned by ``model.fit`` is discarded; this function returns None.
    """
    fit_options = {
        "early_stopping_rounds": 400,  # patience, in boosting rounds
        "eval_set": [(Xv, yv)],
        "verbose": 0,  # no per-iteration logging
    }
    model.fit(Xt, yt, **fit_options)
def fit_lgbm(model, Xt, yt, Xv, yv):
    """Fit ``model`` on the training split with early stopping on the validation split.

    Parameters
    ----------
    model : estimator whose ``fit`` accepts the keywords used below
        (an LGBMRegressor in this notebook).
    Xt, yt : training features and training target.
    Xv, yv : validation features and target, passed as the only ``eval_set`` entry.

    The value returned by ``model.fit`` is discarded; this function returns None.
    """
    fit_options = {
        # NOTE(review): 399 differs from the 400 rounds used by fit_catboost
        # and fit_xgboost — confirm whether that is intentional.
        "early_stopping_rounds": 399,
        "eval_set": [(Xv, yv)],
        "verbose": -1,
    }
    model.fit(Xt, yt, **fit_options)
def fit_xgboost(model, Xt, yt, Xv, yv):
    """Fit ``model`` on the training split with early stopping on the validation split.

    Parameters
    ----------
    model : estimator whose ``fit`` accepts the keywords used below
        (an XGBRegressor in this notebook).
    Xt, yt : training features and training target.
    Xv, yv : validation features and target, passed as the only ``eval_set`` entry.

    The value returned by ``model.fit`` is discarded; this function returns None.
    """
    fit_options = {
        "early_stopping_rounds": 400,  # patience, in boosting rounds
        "eval_set": [(Xv, yv)],
        "verbose": False,  # no per-iteration logging
    }
    model.fit(Xt, yt, **fit_options)
# Registry of the base learners used by the stacking loop. Each entry holds:
#   name   - label printed in the progress output
#   create - zero-argument factory that builds a new estimator from its parameter dict
#   fit    - function (model, Xt, yt, Xv, yv) that trains the estimator
# Variants from earlier experiments (lgbm_0, lgbm_1, xgboost_1, xgboost_2,
# catboost_0) are disabled; their parameter dicts are not defined in this notebook.
model_types = [
    dict(
        name="lgbm",
        create=lambda: LGBMRegressor(**lgbm_parameters),
        fit=fit_lgbm,
    ),
    dict(
        name="xgboost",
        create=lambda: XGBRegressor(**xgboost_parameters, eval_metric="auc"),
        fit=fit_xgboost,
    ),
    dict(
        name="catboost",
        create=lambda: CatBoostRegressor(**catboost_parameters),
        fit=fit_catboost,
    ),
]

In [59]:
# Display the registry to confirm which base learners are enabled
model_types

[{'name': 'lgbm',
  'create': <function __main__.<lambda>()>,
  'fit': <function __main__.fit_lgbm(model, Xt, yt, Xv, yv)>},
 {'name': 'xgboost',
  'create': <function __main__.<lambda>()>,
  'fit': <function __main__.fit_xgboost(model, Xt, yt, Xv, yv)>},
 {'name': 'catboost',
  'create': <function __main__.<lambda>()>,
  'fit': <function __main__.fit_catboost(model, Xt, yt, Xv, yv)>}]

In [60]:
# Sanity check: X must still be a DataFrame because the stacking loop indexes it with .iloc
type(X)

pandas.core.frame.DataFrame

In [62]:
# Level 0 of the stack. For every base learner, run stratified K-fold CV and
# collect two things:
#   out_of_fold_predictions[:, i] - the prediction for each training row, made
#                                   by the fold model that did not train on it
#   meta_features[:, i]           - the test-set prediction averaged over the
#                                   fold models
# These two matrices are the inputs of the level-1 (meta) model.
spl = 5
# A fixed seed makes the fold assignment, and therefore every score and saved
# matrix, reproducible. The same seed is reused for each model type, so all
# base learners are evaluated on identical folds.
cv_seed = 42
out_of_fold_predictions = np.zeros((X.shape[0], len(model_types)))
meta_features = np.zeros((test.shape[0], len(model_types)))
for i, model_type in enumerate(model_types):
    kf = StratifiedKFold(n_splits=spl, shuffle=True, random_state=cv_seed)
    fold_aucs = []  # validation ROC AUC of each fold for this model type
    print("starting " + model_type["name"])
    meta_features_folds = np.zeros((test.shape[0], spl))
    # Stratify on the rounded target so that it is a discrete class label.
    # NOTE(review): presumably the target is already binary — confirm.
    fold_iter = tqdm(kf.split(X, y.round()), total=spl, desc=model_type["name"])
    for i_fold, (train_idx, valid_idx) in enumerate(fold_iter):
        Xt = X.iloc[train_idx, :]
        yt = y.iloc[train_idx]
        Xv = X.iloc[valid_idx, :]
        yv = y.iloc[valid_idx]
        # A fresh estimator per fold, so nothing carries over between folds.
        model = model_type["create"]()
        model_type["fit"](model, Xt, yt, Xv, yv)
        y_pred = model.predict(Xv)
        out_of_fold_predictions[valid_idx, i] = y_pred
        valid_auc = roc_auc_score(y_true=yv, y_score=y_pred)
        fold_aucs.append(valid_auc)
        meta_features_folds[:, i_fold] = model.predict(test)
        print("valid auc", valid_auc)
    meta_features[:, i] = meta_features_folds.mean(axis=1)
    print(
        model_type["name"], i, fold_aucs,
        " max ", max(fold_aucs),
        " min ", min(fold_aucs),
        " avg ", sum(fold_aucs) / len(fold_aucs),
    )

starting lgbm


0it [00:00, ?it/s]

Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3607]	valid_0's auc: 0.856422


1it [01:08, 68.69s/it]

valid mse 0.8564219760074373
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3689]	valid_0's auc: 0.856561


2it [02:18, 69.42s/it]

valid mse 0.8565612487384789
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3697]	valid_0's auc: 0.856293


3it [03:29, 69.97s/it]

valid mse 0.8562930234861057
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3569]	valid_0's auc: 0.855947


4it [04:37, 69.28s/it]

valid mse 0.855946922060459
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3614]	valid_0's auc: 0.856498


5it [05:46, 69.35s/it]


valid mse 0.856497690378677
lgbm 0 [0.8564219760074373, 0.8565612487384789, 0.8562930234861057, 0.855946922060459, 0.856497690378677]  max  0.8565612487384789  min  0.855946922060459  avg  0.8563441721342315
starting xgboost


1it [04:09, 249.29s/it]

valid mse 0.8553297071297215


2it [08:16, 248.25s/it]

valid mse 0.8558245410453107


3it [12:34, 252.62s/it]

valid mse 0.858173687955623


4it [16:52, 254.89s/it]

valid mse 0.8566814878016119


5it [20:40, 248.09s/it]


valid mse 0.8564715033540374
xgboost 1 [0.8553297071297215, 0.8558245410453107, 0.858173687955623, 0.8566814878016119, 0.8564715033540374]  max  0.858173687955623  min  0.8553297071297215  avg  0.8564961854572608
starting catboost


1it [00:16, 16.45s/it]

valid mse 0.8564693414520034


2it [00:32, 16.30s/it]

valid mse 0.8530762537594472


3it [00:48, 16.22s/it]

valid mse 0.8564425873268304


4it [01:04, 16.19s/it]

valid mse 0.8558494386687369


5it [01:21, 16.20s/it]

valid mse 0.8557494644746712
catboost 2 [0.8564693414520034, 0.8530762537594472, 0.8564425873268304, 0.8558494386687369, 0.8557494644746712]  max  0.8564693414520034  min  0.8530762537594472  avg  0.855517417136338





In [63]:
# Paths for the persisted stacking matrices. They end in ".npy" because
# np.save appends that suffix to any filename that lacks it; with another
# extension these variables would not name the files actually written.
out_of_fold_predictions_file = "./outoffoldpredictions.npy"
meta_features_file = "./meta_features.npy"

In [64]:
# Persist both stacking matrices so the meta-model stage can be rerun without
# retraining the base learners. Note that np.save appends ".npy" to a filename
# that does not already end with it.
np.save(out_of_fold_predictions_file, out_of_fold_predictions)
np.save(meta_features_file, meta_features)

In [65]:
# Level-1 (meta) model: a Ridge regression with default settings, fitted on the
# base learners' out-of-fold predictions
meta_model = Ridge()

In [66]:
# Cross-validate the Ridge meta-model on the base learners' out-of-fold
# predictions, and build the final test prediction as the average of the
# per-fold meta-model predictions on meta_features.
spl = 5
# Seeded shuffle, so the reported scores and the submission are reproducible.
kf = StratifiedKFold(n_splits=spl, shuffle=True, random_state=42)
test_pred_total = np.zeros(len(rawtest))
fold_aucs = []  # validation ROC AUC of each fold
model = meta_model  # the same estimator object is refit on every fold
for train_idx, valid_idx in tqdm(kf.split(out_of_fold_predictions, y.round()), total=spl):
    Xt = out_of_fold_predictions[train_idx, :]
    yt = y.iloc[train_idx]
    Xv = out_of_fold_predictions[valid_idx, :]
    yv = y.iloc[valid_idx]
    model.fit(Xt, yt)
    y_pred = model.predict(Xv)
    valid_auc = roc_auc_score(y_true=yv, y_score=y_pred)
    print("valid auc ", valid_auc)
    fold_aucs.append(valid_auc)
    # Each fold contributes 1/spl of its prediction, so the running total ends
    # up as the mean over the folds.
    test_pred = model.predict(meta_features) / spl
    test_pred_total += test_pred
print("valid auc", fold_aucs, " min ", min(fold_aucs), " max ", max(fold_aucs), " avg ", sum(fold_aucs) / len(fold_aucs))

2it [00:00,  5.47it/s]

valid mse  0.8565435610218366
valid mse  0.8563093866015019
valid mse  

4it [00:00,  7.86it/s]

0.8554663531082917
valid mse  0.8569798316323236
valid mse  0.8574441448691958


5it [00:00,  7.15it/s]

valid mse [0.8565435610218366, 0.8563093866015019, 0.8554663531082917, 0.8569798316323236, 0.8574441448691958]  min  0.8554663531082917  max  0.8574441448691958  avg  0.8565486554466298





In [67]:
# Write the submission file: one row per test record, keyed by the test index.
# NOTE(review): the identifier column is written as 'Id' — confirm that this
# matches the header of the competition's sample submission.
output = pd.DataFrame(
    {
        "Id": rawtest.index,
        "target": test_pred_total,
    }
)
output.to_csv("submission.csv", index=False)