In [12]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# Models, metrics, preprocessing, and plotting utilities
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge


In [13]:
# Load the competition train and test tables; the first CSV column becomes the row index
TRAIN_CSV = "../input/tabular-playground-series-oct-2021/train.csv"
TEST_CSV = "../input/tabular-playground-series-oct-2021/test.csv"

rawtrain = pd.read_csv(
    TRAIN_CSV,
    index_col=0,
)
rawtest = pd.read_csv(
    TEST_CSV,
    index_col=0,
)

In [14]:
# Summarise the training frame: index range, column count, dtypes and memory usage
rawtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Columns: 286 entries, f0 to target
dtypes: float64(240), int64(46)
memory usage: 2.1 GB


In [15]:
# Summarise the test frame: index range, column count, dtypes and memory usage
rawtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 1000000 to 1499999
Columns: 285 entries, f0 to f284
dtypes: float64(240), int64(45)
memory usage: 1.1 GB


In [16]:
# Work on a copy so the frame as loaded (rawtrain) is never modified
train = rawtrain.copy()
# Separate the predictors from the 'target' column that the models learn
features = train.drop(columns=['target'])
y = train['target']

In [17]:
# Independent copy of the feature columns, used as the model input matrix
X = features.copy()
# Show (n_rows, n_columns) as the cell output
X.shape

(1000000, 285)

In [18]:
# Independent copy of the test frame, used for the final predictions
test = rawtest.copy()
# Show (n_rows, n_columns) as the cell output
test.shape

(500000, 285)

In [19]:
# Keyword arguments for LGBMRegressor (unpacked in model_types).
# The many-digit values presumably come from an automated search — TODO confirm.
lgbm_parameters = {
    # runtime / evaluation
    "device": "gpu",
    "metric": "auc",
    # boosting schedule
    "learning_rate": 0.08697465911179744,
    # tree shape
    "max_depth": 2,
    "min_data_in_leaf": 592,
    "n_estimators": 4565,
    "num_leaves": 85,
    # regularisation
    "reg_alpha": 4.2395421854514055,
    "reg_lambda": 3.4669998025411233,
}
# Keyword arguments for XGBRegressor (unpacked in model_types, which also adds eval_metric).
# The many-digit values presumably come from an automated search — TODO confirm.
xgboost_parameters = {
    # runtime
    "n_jobs": 4,
    "tree_method": "gpu_hist",
    "gpu_id": 0,
    # boosting schedule
    "learning_rate": 0.03351805368773793,
    "n_estimators": 15398,
    # tree shape / sampling
    "gamma": 1.781011403322097,
    "max_depth": 1,
    "min_child_weight": 31.84691790156552,
    "subsample": 0.4120852520801601,
    # regularisation
    "lambda": 4.232621628428967,
    "alpha": 0.14624124916709164,
}
# Keyword arguments for CatBoostRegressor (unpacked in model_types).
# The many-digit values presumably come from an automated search — TODO confirm.
catboost_parameters = {
    # runtime / objective
    "task_type": "GPU",
    "loss_function": "RMSE",
    # boosting schedule
    "learning_rate": 0.04930064646089139,
    "iterations": 3238,
    # tree shape / regularisation
    "depth": 6,
    "l2_leaf_reg": 32.60575094913488,
    "bagging_temperature": 0.9250154053358728,
}

In [20]:
def fit_catboost(model, Xt, yt, Xv, yv):
    """Fit ``model`` on (Xt, yt), using (Xv, yv) as the early-stopping evaluation set.

    Forwards ``early_stopping_rounds=400`` and ``verbose=0`` to ``model.fit``.
    The result of ``model.fit`` is discarded, so this function returns None.
    """
    validation_sets = [(Xv, yv)]
    model.fit(
        Xt,
        yt,
        eval_set=validation_sets,
        early_stopping_rounds=400,
        verbose=0,
    )
def fit_lgbm(model, Xt, yt, Xv, yv):
    """Fit ``model`` on (Xt, yt), using (Xv, yv) as the early-stopping evaluation set.

    Forwards ``early_stopping_rounds=399`` and ``verbose=-1`` to ``model.fit``.
    The result of ``model.fit`` is discarded, so this function returns None.

    NOTE(review): the patience is 399 here but 400 in the sibling helpers —
    confirm whether the difference is intentional.
    NOTE(review): newer LightGBM releases reportedly expect early stopping and
    log control via callbacks instead of these ``fit`` keywords — verify against
    the installed version before upgrading.
    """
    validation_sets = [(Xv, yv)]
    model.fit(
        Xt,
        yt,
        eval_set=validation_sets,
        early_stopping_rounds=399,
        verbose=-1,
    )
def fit_xgboost(model, Xt, yt, Xv, yv):
    """Fit ``model`` on (Xt, yt), using (Xv, yv) as the early-stopping evaluation set.

    Forwards ``early_stopping_rounds=400`` and ``verbose=False`` to ``model.fit``.
    The result of ``model.fit`` is discarded, so this function returns None.

    NOTE(review): newer XGBoost releases reportedly take ``early_stopping_rounds``
    in the estimator constructor rather than in ``fit`` — verify against the
    installed version before upgrading.
    """
    validation_sets = [(Xv, yv)]
    model.fit(
        Xt,
        yt,
        eval_set=validation_sets,
        early_stopping_rounds=400,
        verbose=False,
    )
# Registry of base-model families for the stacking ensemble. Each entry has:
#   name   - label used in the progress/log output
#   create - zero-argument factory returning a fresh, unfitted estimator
#            (a lambda, so the parameter dict is looked up when a model is built)
#   fit    - helper called as fit(model, Xt, yt, Xv, yv)
# Entry order fixes the column order of the stacked prediction matrices.
# Disabled variants (their *_parameters_N dicts are not defined in the cells shown here):
#   lgbm_0, lgbm_1, xgboost_1, xgboost_2, catboost_0
model_types = [
    dict(
        name="lgbm",
        create=lambda: LGBMRegressor(**lgbm_parameters),
        fit=fit_lgbm,
    ),
    dict(
        name="xgboost",
        create=lambda: XGBRegressor(**xgboost_parameters, eval_metric="auc"),
        fit=fit_xgboost,
    ),
    dict(
        name="catboost",
        create=lambda: CatBoostRegressor(**catboost_parameters),
        fit=fit_catboost,
    ),
]

In [21]:
# Sanity check: X must be a pandas DataFrame, since the training loop indexes it with .iloc
type(X)

pandas.core.frame.DataFrame

In [22]:
# Train every base-model family with stratified K-fold cross-validation and
# collect out-of-fold (OOF) predictions, which later feed the stacking meta-model.
spl = 5

# Fixed seed: without random_state the shuffled folds (and so every score and the
# OOF matrix) change on each run. With an integer seed each call to kf.split
# yields the same folds, so all model families are evaluated on identical splits.
FOLD_SEED = 42
kf = StratifiedKFold(n_splits=spl, shuffle=True, random_state=FOLD_SEED)

# y.round() supplies the discrete labels that the folds are stratified on
strat_labels = y.round()

# base_models[i] collects the fitted per-fold models of model_types[i]
base_models = [[] for _ in model_types]
# Column i holds model_types[i]'s prediction for each row, made by the fold model
# that had that row in its validation split
out_of_fold_predictions = np.zeros((X.shape[0], len(base_models)))

for i, model_type in enumerate(model_types):
    # NOTE: these are ROC AUC scores (higher is better). The "mse" wording in the
    # log lines below is kept unchanged so output stays comparable with earlier runs.
    fold_scores = []
    print("starting " + model_type["name"])
    for train_idx, valid_idx in tqdm(kf.split(X, strat_labels)):
        Xt, yt = X.iloc[train_idx, :], y.iloc[train_idx]
        Xv, yv = X.iloc[valid_idx, :], y.iloc[valid_idx]

        # Fresh estimator per fold; the fit helper early-stops on (Xv, yv)
        model = model_type['create']()
        model_type['fit'](model, Xt, yt, Xv, yv)

        y_pred = model.predict(Xv)
        out_of_fold_predictions[valid_idx, i] = y_pred

        valid_score = roc_auc_score(y_true=yv, y_score=y_pred)
        fold_scores.append(valid_score)
        base_models[i].append(model)
        print("valid mse", valid_score)
    print(model_type["name"], i, fold_scores, " max ", max(fold_scores), " min ", min(fold_scores), " avg ", sum(fold_scores)/len(fold_scores))

starting lgbm


0it [00:00, ?it/s]

Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3940]	valid_0's auc: 0.857016


1it [01:03, 63.10s/it]

valid mse 0.8570162639666028
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3339]	valid_0's auc: 0.855593


2it [01:57, 58.27s/it]

valid mse 0.855592580627059
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3596]	valid_0's auc: 0.85618


3it [02:56, 58.39s/it]

valid mse 0.8561803882801273
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3643]	valid_0's auc: 0.856351


4it [03:55, 58.68s/it]

valid mse 0.856350630240308
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3153]	valid_0's auc: 0.856322


5it [04:48, 57.63s/it]


valid mse 0.8563220623134284
lgbm 0 [0.8570162639666028, 0.855592580627059, 0.8561803882801273, 0.856350630240308, 0.8563220623134284]  max  0.8570162639666028  min  0.855592580627059  avg  0.8562923850855052
starting xgboost


1it [04:02, 242.09s/it]

valid mse 0.8571136042581902


2it [08:25, 254.51s/it]

valid mse 0.8554379525315696


3it [12:48, 258.43s/it]

valid mse 0.8555675139534739


4it [16:28, 243.40s/it]

valid mse 0.8571747822657526


5it [20:37, 247.42s/it]


valid mse 0.8573325893142333
xgboost 1 [0.8571136042581902, 0.8554379525315696, 0.8555675139534739, 0.8571747822657526, 0.8573325893142333]  max  0.8573325893142333  min  0.8554379525315696  avg  0.8565252884646439
starting catboost


1it [00:29, 29.21s/it]

valid mse 0.8545909702846439


2it [00:59, 29.79s/it]

valid mse 0.8586106976668054


3it [01:25, 28.00s/it]

valid mse 0.8569553327092725


4it [01:53, 27.97s/it]

valid mse 0.855259553713714


5it [02:23, 28.67s/it]

valid mse 0.85656210013928
catboost 2 [0.8545909702846439, 0.8586106976668054, 0.8569553327092725, 0.855259553713714, 0.85656210013928]  max  0.8586106976668054  min  0.8545909702846439  avg  0.8563957309027433





In [23]:
# Build the test-set inputs for the meta-model: one column per model family,
# each being the mean of that family's per-fold model predictions on `test`.
family_test_predictions = []
for bms in base_models:
    fold_predictions = np.column_stack([model.predict(test) for model in bms])
    family_test_predictions.append(fold_predictions.mean(axis=1))
meta_features = np.column_stack(family_test_predictions)


In [24]:
# Paths of the saved stacking inputs.
# np.save appends ".npy" to any file name that does not already end with it, so
# the previous "*.np" values never matched a file on disk (the files written were
# "*.np.npy"). The constants now name the files actually written: on-disk names
# are unchanged and np.load(<constant>) works.
out_of_fold_predictions_file = "./outoffoldpredictions.np.npy"
meta_features_file = "./meta_features.np.npy"

In [25]:
# Persist both stacking inputs so the meta-model stage can be re-run without retraining
np.save(
    file=out_of_fold_predictions_file,
    arr=out_of_fold_predictions,
)
np.save(
    file=meta_features_file,
    arr=meta_features,
)

In [26]:
# Stacking meta-model: Ridge regression with the library's default settings
meta_model = Ridge()

In [27]:
# Cross-validate the Ridge meta-model on the out-of-fold predictions and build
# the final test prediction as the average of the per-fold fits.
spl = 10

# Fixed seed: without random_state the shuffled folds, the reported scores and
# the resulting submission change on every run.
META_FOLD_SEED = 42
kf = StratifiedKFold(n_splits=spl, shuffle=True, random_state=META_FOLD_SEED)

# Running average of the per-fold test predictions (one slot per test row)
test_pred_total = np.zeros(len(rawtest))

# NOTE: these are ROC AUC scores (higher is better). The "mse" wording in the
# log lines below is kept unchanged so output stays comparable with earlier runs.
fold_scores = []
for train_idx, valid_idx in tqdm(kf.split(out_of_fold_predictions, y.round())):
    Xt, yt = out_of_fold_predictions[train_idx, :], y.iloc[train_idx]
    Xv, yv = out_of_fold_predictions[valid_idx, :], y.iloc[valid_idx]

    # The same estimator object is refitted on each fold's training rows
    meta_model.fit(Xt, yt)
    y_pred = meta_model.predict(Xv)

    valid_score = roc_auc_score(y_true=yv, y_score=y_pred)
    print("valid mse ", valid_score)
    fold_scores.append(valid_score)

    # Each fold contributes 1/spl of its test prediction, so the total is the fold mean
    test_pred_total += meta_model.predict(meta_features) / spl
print("valid mse", fold_scores, " min ", min(fold_scores), " max ", max(fold_scores), " avg ", sum(fold_scores)/len(fold_scores))

3it [00:00,  8.50it/s]

valid mse  0.8562427227355109
valid mse  0.85674632441917
valid mse  0.857503704746558


5it [00:00,  8.94it/s]

valid mse  0.8551704521057022
valid mse  0.8545824411409766


7it [00:00, 10.04it/s]

valid mse  0.8567552895856748
valid mse  0.8566074610494361
valid mse  0.8575261510961009


10it [00:01,  9.89it/s]

valid mse  0.8575522463201501
valid mse  0.8581326940550908
valid mse [0.8562427227355109, 0.85674632441917, 0.857503704746558, 0.8551704521057022, 0.8545824411409766, 0.8567552895856748, 0.8566074610494361, 0.8575261510961009, 0.8575522463201501, 0.8581326940550908]  min  0.8545824411409766  max  0.8581326940550908  avg  0.8566819487254371





In [28]:
# Write the averaged meta-model predictions, keyed by the test row ids, to submission.csv
# NOTE(review): the id column header is hard-coded as 'Id' — confirm it matches the
# header the competition's sample submission expects.
submission_columns = {
    'Id': rawtest.index,
    'target': test_pred_total,
}
output = pd.DataFrame(submission_columns)
output.to_csv(
    'submission.csv',
    index=False,
)