In [1]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge


In [2]:
# Read the competition's train and test tables; the first CSV column becomes
# the DataFrame index in both.
rawtrain, rawtest = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/tabular-playground-series-oct-2021/train.csv",
        "../input/tabular-playground-series-oct-2021/test.csv",
    )
)

In [3]:
# Summarise the training frame: index range, column count, dtypes and memory footprint.
rawtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Columns: 286 entries, f0 to target
dtypes: float64(240), int64(46)
memory usage: 2.1 GB


In [4]:
# Same summary for the test frame (index range, column count, dtypes, memory).
rawtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 1000000 to 1499999
Columns: 285 entries, f0 to f284
dtypes: float64(240), int64(45)
memory usage: 1.1 GB


In [5]:
# Work on a copy of the raw training frame and separate it into the
# predictor columns (`features`) and the label column (`y`).
train = rawtrain.copy()
features = train.drop(columns=['target'])
y = train['target']

In [6]:
# X is an independent copy of the predictor frame; the bare `X.shape`
# expression displays its (rows, columns) as the cell output.
X = features.copy()
X.shape

(1000000, 285)

In [7]:
# `test` is an independent copy of the raw test frame; the bare `test.shape`
# expression displays its (rows, columns) as the cell output.
test = rawtest.copy()
test.shape

(500000, 285)

In [8]:
# Hyper-parameter sets for the three gradient-boosting libraries, one key per
# line. Each set asks its library for GPU training ('gpu' / 'gpu_hist' / "GPU").

# LightGBM: evaluated with AUC on the validation set.
lgbm_parameters = {
    "device": 'gpu',
    "metric": "auc",
    'learning_rate': 0.08697465911179744,
    'max_depth': 2,
    'min_data_in_leaf': 592,
    'n_estimators': 4565,
    'num_leaves': 85,
    'reg_alpha': 4.2395421854514055,
    'reg_lambda': 3.4669998025411233,
}

# XGBoost: 'lambda' / 'alpha' are the L2 / L1 regularisation keys as spelled here.
xgboost_parameters = {
    'n_jobs': 4,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'learning_rate': 0.03237498960526522,
    'n_estimators': 10837,
    'gamma': 0.7820801389298118,
    'max_depth': 2,
    'min_child_weight': 9.149611138786955,
    'subsample': 0.7488960655884316,
    'lambda': 4.190627000051498,
    'alpha': 6.241101997471009,
}

# CatBoost: trained with an RMSE loss.
catboost_parameters = {
    'task_type': "GPU",
    "loss_function": "RMSE",
    'bagging_temperature': 9.961385165409107,
    'depth': 15,
    'iterations': 1685,
    'l2_leaf_reg': 1.4634054932859328,
    'learning_rate': 0.03030510670682149,
}

In [9]:
def fit_catboost(model, Xt, yt, Xv, yv):
    """Fit `model` on (Xt, yt), watching (Xv, yv) with a 400-round early-stopping patience and no logging."""
    model.fit(
        Xt,
        yt,
        eval_set=[(Xv, yv)],
        early_stopping_rounds=400,
        verbose=0,
    )


def fit_lgbm(model, Xt, yt, Xv, yv):
    """Fit `model` on (Xt, yt), watching (Xv, yv) with a 399-round early-stopping patience and verbose=-1."""
    model.fit(
        Xt,
        yt,
        eval_set=[(Xv, yv)],
        early_stopping_rounds=399,
        verbose=-1,
    )


def fit_xgboost(model, Xt, yt, Xv, yv):
    """Fit `model` on (Xt, yt), watching (Xv, yv) with a 400-round early-stopping patience and verbose off."""
    model.fit(
        Xt,
        yt,
        eval_set=[(Xv, yv)],
        early_stopping_rounds=400,
        verbose=False,
    )


# Registry of base learners for the stack. Each entry carries a display name,
# a zero-argument factory that builds a fresh estimator, and the fit helper
# that knows the library's early-stopping call. Commented entries are
# disabled variants.
model_types = [
    dict(
        name="lgbm",
        create=lambda: LGBMRegressor(**lgbm_parameters),
        fit=fit_lgbm,
    ),
    #{'name': "lgbm_0", 'create': lambda: LGBMRegressor(**lgbm_parameters_0), "fit": fit_lgbm},
    #{'name': "lgbm_1", "create": lambda: LGBMRegressor(**lgbm_parameters_1), 'fit': fit_lgbm},
    #{"name": "xgboost_2", "create": lambda: XGBRegressor(**xgboost_parameters_2), "fit": fit_xgboost},
    #{"name": "xgboost_1", "create": lambda: XGBRegressor(**xgboost_parameters_1), "fit": fit_xgboost},
    dict(
        name='xgboost',
        create=lambda: XGBRegressor(**xgboost_parameters, eval_metric="auc"),
        fit=fit_xgboost,
    ),
    #{"name": "catboost", "create": lambda: CatBoostRegressor(**catboost_parameters), "fit": fit_catboost},
    #{'name': 'catboost_0', "create": lambda: CatBoostRegressor(**catboost_parameters_0), "fit": fit_catboost},
]

In [10]:
# Sanity check: display the container type of X before it is sliced with .iloc below.
type(X)

pandas.core.frame.DataFrame

In [11]:
# Stage 1 of the stack: train every registered base model with K-fold CV.
#
# Produces:
#   base_models[i]                 -- the fitted fold models of model type i
#   out_of_fold_predictions[:, i]  -- model type i's prediction for each training
#                                     row, made by the fold model that did not
#                                     train on that row
#
# The validation score is ROC AUC (roc_auc_score), so it is named and printed
# as AUC rather than MSE.
spl = 5
base_models = [[] for _ in model_types]
out_of_fold_predictions = np.zeros((X.shape[0], len(base_models)))
for i, model_type in enumerate(model_types):
    # A fixed random_state makes the shuffled folds repeatable across runs and
    # identical for every model type, so the out-of-fold columns are comparable.
    kf = StratifiedKFold(n_splits=spl, shuffle=True, random_state=42)
    fold_aucs = []
    print("starting " + model_type["name"])
    # The rounded target serves as the stratification label; `total` lets tqdm
    # show real progress, since kf.split() is a generator without a length.
    for train_idx, valid_idx in tqdm(kf.split(X, y.round()), total=spl):
        Xt = X.iloc[train_idx, :]
        yt = y.iloc[train_idx]
        Xv = X.iloc[valid_idx, :]
        yv = y.iloc[valid_idx]
        # Fresh estimator per fold; the fit helper applies early stopping on (Xv, yv).
        model = model_type['create']()
        model_type['fit'](model, Xt, yt, Xv, yv)
        y_pred = model.predict(Xv)
        out_of_fold_predictions[valid_idx, i] = y_pred
        valid_auc = roc_auc_score(y_true=yv, y_score=y_pred)
        fold_aucs.append(valid_auc)
        base_models[i].append(model)
        print("valid auc", valid_auc)
    print(model_type["name"], i, fold_aucs,
          " max ", max(fold_aucs),
          " min ", min(fold_aucs),
          " avg ", sum(fold_aucs) / len(fold_aucs))

starting lgbm


0it [00:00, ?it/s]

Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3635]	valid_0's auc: 0.857082


1it [00:49, 49.44s/it]

valid mse 0.8570823353287693
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3225]	valid_0's auc: 0.856275


2it [01:35, 47.46s/it]

valid mse 0.8562754263695488
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3386]	valid_0's auc: 0.855308


3it [02:22, 47.12s/it]

valid mse 0.8553080488593432
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3995]	valid_0's auc: 0.856875


4it [03:15, 49.61s/it]

valid mse 0.8568745076332241
Training until validation scores don't improve for 399 rounds
Early stopping, best iteration is:
[3790]	valid_0's auc: 0.8559


5it [04:07, 49.43s/it]


valid mse 0.8559001751164748
lgbm 0 [0.8570823353287693, 0.8562754263695488, 0.8553080488593432, 0.8568745076332241, 0.8559001751164748]  max  0.8570823353287693  min  0.8553080488593432  avg  0.856288098661472
starting xgboost


1it [04:07, 247.32s/it]

valid mse 0.8555027255425145


2it [08:33, 258.36s/it]

valid mse 0.857009543010279


3it [12:30, 248.43s/it]

valid mse 0.8571999741394556


4it [17:21, 265.58s/it]

valid mse 0.8573775159565047


5it [22:16, 267.34s/it]

valid mse 0.8555095921989753
xgboost 1 [0.8555027255425145, 0.857009543010279, 0.8571999741394556, 0.8573775159565047, 0.8555095921989753]  max  0.8573775159565047  min  0.8555027255425145  avg  0.8565198701695458





In [12]:
# Test-set inputs for the meta-model: one column per base-model type, each
# column being the row-wise mean of that type's fold models' test predictions.
per_type_test_predictions = []
for fold_models in base_models:
    fold_predictions = np.column_stack(
        [fold_model.predict(test) for fold_model in fold_models]
    )
    per_type_test_predictions.append(fold_predictions.mean(axis=1))
meta_features = np.column_stack(per_type_test_predictions)


In [13]:
# np.save appends ".npy" to any file name that does not already end in it.
# The suffix is therefore spelled out here so these variables hold the exact
# on-disk names and can be passed straight to np.load. The files land where
# the earlier ".np" spelling put them.
out_of_fold_predictions_file = "./outoffoldpredictions.np.npy"
meta_features_file = "./meta_features.np.npy"

In [14]:
# Write both stacking matrices (training-side and test-side) to disk.
for array_path, array in (
    (out_of_fold_predictions_file, out_of_fold_predictions),
    (meta_features_file, meta_features),
):
    np.save(array_path, array)

In [15]:
# Second-level learner of the stack: a Ridge regressor with its default settings.
meta_model = Ridge()

In [16]:
# Stage 2 of the stack: cross-validate the Ridge meta-model on the
# out-of-fold base predictions, and build the final test prediction as the
# average of the per-fold meta-model predictions on `meta_features`.
#
# The validation score is ROC AUC (roc_auc_score), so it is named and printed
# as AUC rather than MSE.
spl = 10
# Fixed random_state: the shuffled folds, and therefore the blended
# prediction, are repeatable from run to run.
kf = StratifiedKFold(n_splits=spl, shuffle=True, random_state=42)
test_pred_total = np.zeros(len(rawtest))
fold_aucs = []
# The rounded target serves as the stratification label; `total` lets tqdm
# show real progress, since kf.split() is a generator without a length.
for train_idx, valid_idx in tqdm(kf.split(out_of_fold_predictions, y.round()), total=spl):
    # The single Ridge instance is refitted from scratch on every fold.
    model = meta_model
    Xt = out_of_fold_predictions[train_idx, :]
    yt = y.iloc[train_idx]
    Xv = out_of_fold_predictions[valid_idx, :]
    yv = y.iloc[valid_idx]
    model.fit(Xt, yt)
    y_pred = model.predict(Xv)
    valid_auc = roc_auc_score(y_true=yv, y_score=y_pred)
    print("valid auc ", valid_auc)
    fold_aucs.append(valid_auc)
    # Each fold contributes 1/spl of the final prediction (a running mean).
    test_pred = model.predict(meta_features) / spl
    test_pred_total += test_pred
print("valid auc", fold_aucs,
      " min ", min(fold_aucs),
      " max ", max(fold_aucs),
      " avg ", sum(fold_aucs) / len(fold_aucs))

3it [00:00,  6.78it/s]

valid mse  0.8561004949989155
valid mse  0.8579434127688537
valid mse  0.8562437123364612


5it [00:00,  9.10it/s]

valid mse  0.856633865111164
valid mse  0.8558580679660885
valid mse  0.8575860175512737


9it [00:00, 11.57it/s]

valid mse  0.8579214974604521
valid mse  0.8547688429549656
valid mse  0.8562612491303673
valid mse  0.8567675703969928


10it [00:01,  9.86it/s]

valid mse [0.8561004949989155, 0.8579434127688537, 0.8562437123364612, 0.856633865111164, 0.8558580679660885, 0.8575860175512737, 0.8579214974604521, 0.8547688429549656, 0.8562612491303673, 0.8567675703969928]  min  0.8547688429549656  max  0.8579434127688537  avg  0.8566084730675534





In [17]:
# Write the blended test predictions as a two-column CSV: the test frame's
# index next to the averaged meta-model prediction, without the DataFrame's
# own index.
# NOTE(review): the id header is spelled 'Id' here -- confirm it matches the
# competition's sample submission.
submission_columns = {'Id': rawtest.index, 'target': test_pred_total}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)