In [1]:
import os
import cv2
import glob
import time
import random

import numpy as np
import pandas as pd
from sklearn import (svm, impute, metrics, ensemble, linear_model, decomposition,
                     tree, preprocessing, model_selection)
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('../input/tabularsep21-kfolddataset/train_10folds.csv', index_col=False)
df_train.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f112,f113,f114,f115,f116,f117,f118,claim,nan_count,kfold
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1,1,3
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0,0,8
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1,5,6
3,3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1,2,6
4,4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,10002.0,0.23049,1,8,9


In [3]:
submission_data = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', index_col=False)
test.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
1,957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
2,957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
3,957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
4,957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


In [4]:
features = [c for c in df_train.columns if c not in ('id','claim', 'kfold')]
len(features)

119

In [5]:
test['nan_count'] = test.isnull().sum(axis=1)
test.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,nan_count
0,957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,...,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357,1
1,957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,...,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125,0
2,957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,...,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797,1
3,957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,...,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291,0
4,957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,...,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796,0


In [6]:
df_train.isnull().sum() # I already filled nan with modes when preparing kfold data

id           0
f1           0
f2           0
f3           0
f4           0
            ..
f117         0
f118         0
claim        0
nan_count    0
kfold        0
Length: 122, dtype: int64

In [7]:
modes = df_train[features].mode().iloc[0]
test = test.fillna(modes)
test.isnull().sum()

id           0
f1           0
f2           0
f3           0
f4           0
            ..
f115         0
f116         0
f117         0
f118         0
nan_count    0
Length: 120, dtype: int64

In [16]:
df_train['row_min'] = df_train[features].min(axis=1)
df_train['row_max'] = df_train[features].max(axis=1)
df_train['row_mean'] = df_train[features].mean(axis=1)
df_train['row_std'] = df_train[features].std(axis=1)

test['row_min'] = test.min(axis=1)
test['row_max'] = test.max(axis=1)
test['row_mean'] = test.mean(axis=1)
test['row_std'] = test.std(axis=1)

features += ['row_min', 'row_max', 'row_mean', 'row_std']
len(features)

127

In [18]:
test.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f114,f115,f116,f117,f118,nan_count,row_min,row_max,row_mean,row_std
0,957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,...,81606.0,1.194,198040000000000.0,2017.1,0.46357,1,-22.189,6.0462e+16,1049201000000000.0,7669420000000000.0
1,957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,...,90114.0,1.1507,4.388e+16,6638.9,0.28125,0,-15.691,4.388e+16,819276500000000.0,5591113000000000.0
2,957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,...,37739.0,1.1548,171810000000000.0,5844.0,0.13797,1,-595310000000.0,397780000000000.0,9016036000000.0,52993880000000.0
3,957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,...,26849.0,1.149,2.1388e+17,6173.3,0.3291,0,-34322000000000.0,2.1388e+17,4056629000000000.0,2.732284e+16
4,957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,...,50336.0,1.2488,2.1513e+17,2250.1,0.33796,0,-394570000.0,2.1513e+17,3723466000000000.0,2.72897e+16


In [38]:
scaler = preprocessing.StandardScaler()
xtest = scaler.fit_transform(test[features])

def cross_validate_model(model, best_params, train, test):
    valid_preds = []
    true_yvalid = []
    roc_scores = []
    test_preds = np.zeros((xtest.shape[0]))
    folds = 2
    for fold in range(folds):
        xtrain = train[train.kfold != fold].reset_index(drop=True)
        xvalid = train[train.kfold == fold].reset_index(drop=True)
        
        ytrain = xtrain.claim
        yvalid = xvalid.claim
        
        xtrain = scaler.transform(xtrain[features])
        xvalid = scaler.transform(xvalid[features])
        
        classifier = model(**best_params)
        classifier = classifier.fit(xtrain, ytrain)
        preds = classifier.predict_proba(xvalid)[:,1]
        
        valid_preds.extend(preds)
        
        test_preds += classifier.predict_proba(xtest)[:, 1]
        
        roc_score = roc_auc_score(yvalid, preds)
        roc_scores.append(roc_score)
        print(f'ROC AUC score: {roc_score}')
        
    print(f'Overall ROC AUC score is: {np.mean(roc_scores)} and std: {np.std(roc_scores)}')
    
    return valid_preds, test_preds/folds #for 10 fold

In [34]:
xgb_params = {
    'n_estimators' : 3600,
    'reg_lambda' : 3,
    'reg_alpha' : 26,
    'subsample' : 0.6000000000000001,
    'colsample_bytree' : 0.6000000000000001,
    'max_depth' : 9,
    'min_child_weight' : 5,
    'gamma' : 13.054739572819486,
    'learning_rate': 0.01,
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'gpu_id' : '0',
    'predictor' : 'gpu_predictor',
    
}



In [26]:
'''valid_preds = {}
true_yvalid = []
roc_scores = []
test_preds = np.zeros((xtest.shape[0]))
folds = 2
for fold in range(folds):
    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist() # to keep all ids of the valid data
    
    ytrain = xtrain.claim
    yvalid = xvalid.claim

    xtrain = scaler.transform(xtrain[features])
    xvalid = scaler.transform(xvalid[features])

    model = XGBClassifier(**xgb_params)
    model.fit(xtrain, ytrain)
    preds = model.predict_proba(xvalid)[:,1]

    valid_preds.update(dict(zip(valid_ids, preds)))
    true_yvalid.extend(yvalid)

    test_preds += model.predict_proba(xtest)[:, 1]

    roc_score = roc_auc_score(yvalid, preds)
    roc_scores.append(roc_score)
    print(f'ROC AUC score: {roc_score}')

print(f'Overall ROC AUC score is: {np.mean(roc_scores)} and std: {np.std(roc_scores)}')

valid_preds = pd.DataFrame.from_dict(valid_preds, orient='index').reset_index()
valid_preds.columns = ['id', 'pred_1']
valid_preds.to_csv('valid_pred1.csv', index=False)

submission_data.columns = ['id', 'pred1']
submission_data.to_csv('test_pred1.csv', index=False)
'''

ROC AUC score: 0.8142691475976023
ROC AUC score: 0.8129822212556952
Overall ROC AUC score is: 0.8136256844266487 and std: 0.0006434631709535399


In [52]:
L1valid_pred1 = pd.DataFrame()
L1test_pred1 = pd.DataFrame()

In [39]:
valid_preds, test_preds = cross_validate_model(XGBClassifier,
                                              xgb_params,
                                              df_train,
                                              test)
L1valid_pred1['xgb'] = valid_preds
L1test_pred1['xgb'] = test_preds

ROC AUC score: 0.8142691484694369
ROC AUC score: 0.8129822210377324
Overall ROC AUC score is: 0.8136256847535847 and std: 0.0006434637158522216


In [60]:

catb_params = {
    'iterations': 15585, 
    'objective': 'CrossEntropy', 
    'bootstrap_type': 'Bernoulli', 
    'od_wait': 1144, 
    'learning_rate': 0.023575206684596582, 
    'reg_lambda': 36.30433203563295, 
    'random_strength': 43.75597655616195, 
    'depth': 7, 
    'min_data_in_leaf': 11, 
    'leaf_estimation_iterations': 1, 
    'subsample': 0.8227911142845009,
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0
}



In [51]:
catb_params['random_state'] = 42
valid_preds, test_preds = cross_validate_model(CatBoostClassifier,
                                              catb_params,
                                              df_train,
                                              test)
L1valid_pred1['catb_1'] = valid_preds
L1test_pred1['catb_1'] = test_preds

ROC AUC score: 0.8159486678473571
ROC AUC score: 0.8164027319233889
Overall ROC AUC score is: 0.816175699885373 and std: 0.00022703203801588412


NameError: name 'L1valid_pred1' is not defined

In [54]:
catb_params['random_state'] = 0
valid_preds, test_preds = cross_validate_model(CatBoostClassifier,
                                              catb_params,
                                              df_train,
                                              test)
L1valid_pred1['catb_2'] = valid_preds
L1test_pred1['catb_2'] = test_preds

ROC AUC score: 0.8161567429426484
ROC AUC score: 0.8160446822997195
Overall ROC AUC score is: 0.8161007126211839 and std: 5.603032146445086e-05


In [None]:
lgbm_params = {
    "objective": "binary",
    "learning_rate": 0.008,
    #'device': 'gpu',
    'n_estimators': 3205,
    'num_leaves': 184,
    'min_child_samples': 63,
    #'feature_fraction': 0.6864594334728974,
    #'bagging_fraction': 0.9497327922401265,
    'bagging_freq': 1,
    'reg_alpha': 19,
    'reg_lambda': 19,
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}
lgbm_params['random_state'] = 42
valid_preds, test_preds = cross_validate_model(CatBoostClassifier,
                                              lgbm_params,
                                              df_train,
                                              test)
L1valid_pred1['lgbm_1'] = valid_preds
L1test_pred1['lgbm_1'] = test_preds

gbm_params['random_state'] = 0
valid_preds, test_preds = cross_validate_model(CatBoostClassifier,
                                              lgbm_params,
                                              df_train,
                                              test)
L1valid_pred1['lgbm_2'] = valid_preds
L1test_pred1['lgbm_2'] = test_preds

In [None]:
L1['claim'] = df_train['claim']

In [None]:
#create new train dataset from valid predictions
#df = pd.read_csv('../input/train10fold/train-folds (1).csv')
#df_test = pd.read_csv('../input/30-days-of-ml/test.csv')

df1 = pd.read_csv('../input/lavel1blendingdataset/valid_pred1.csv')
df1.columns = ['id', 'pred1']
df2 = pd.read_csv('../input/lavel1blendingdataset/valid_pred2.csv')
df2.columns = ['id', 'pred2']
df3 = pd.read_csv('../input/lavel1blendingdataset/valid_pred3.csv')
df3.columns = ['id', 'pred3']
df4 = pd.read_csv('../input/lavel1blendingdataset/valid_pred4.csv')
df4.columns = ['id', 'pred4']
df5 = pd.read_csv('../input/lavel1blendingdataset/valid_pred5.csv')
df5.columns = ['id', 'pred5']

df_test1 = pd.read_csv('../input/lavel1blendingdataset/test_pred1.csv')
df_test1.columns = ['id', 'pred1']
df_test2 = pd.read_csv('../input/lavel1blendingdataset/test_pred2.csv')
df_test2.columns = ['id', 'pred2']
df_test3 = pd.read_csv('../input/lavel1blendingdataset/test_pred3.csv')
df_test3.columns = ['id', 'pred3']
df_test4 = pd.read_csv('../input/lavel1blendingdataset/test_pred4.csv')
df_test4.columns = ['id', 'pred4']
df_test5 = pd.read_csv('../input/lavel1blendingdataset/test_pred5.csv')
df_test5.columns = ['id', 'pred5']

df = df.merge(df1, on='id', how='left')
df = df.merge(df2, on='id', how='left')
df = df.merge(df3, on='id', how='left')
df = df.merge(df4, on='id', how='left')
df = df.merge(df5, on='id', how='left')

df_test = df_test.merge(df_test1, on='id', how='left')
df_test = df_test.merge(df_test2, on='id', how='left')
df_test = df_test.merge(df_test3, on='id', how='left')
df_test = df_test.merge(df_test4, on='id', how='left')
df_test = df_test.merge(df_test5, on='id', how='left')

df.head()