In [2]:
#!pip install optuna 
import optuna
from optuna.samplers import TPESampler

import multiprocessing
# Warning Libraries 
import warnings
warnings.filterwarnings("ignore")
# warnings.simplefilter(action='ignore', category=FutureWarning)

# Scientific and Data Manipulation Libraries 
# import pandas as pd
import datatable as dt
import numpy as np
import math
import gc
import os


# Data Preprocessing, Machine Learning and Metrics Libraries 
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.impute                   import SimpleImputer
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble                 import VotingClassifier
from sklearn.metrics                  import f1_score, log_loss, accuracy_score,roc_auc_score, roc_curve

# model visualization
import shap


from sklearn.model_selection import KFold, RepeatedKFold, train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# Boosting Algorithms 
from xgboost                          import XGBClassifier
from catboost                         import CatBoostClassifier
from lightgbm                         import LGBMClassifier



# Data Visualization Libraries 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px

2021-10-03 19:50:58.296508: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
%%time

# read data
train = dt.fread('../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
test = dt.fread('../input/tabular-playground-series-oct-2021/test.csv').to_pandas()

sub = dt.fread('../input/tabular-playground-series-oct-2021/sample_submission.csv').to_pandas()

# Looks at the first 5 rows of the Train and Test data
display('Train Head :',train.head())
display('Test Head :',test.head())


# Displays Information of Columns of Train and Test data
# display(train.info())
# display(test.info())


# Display Descriptive Statistics of Train and Test data
display('Train Description :',train.describe().T)
display('Test  Description :',test.describe().T)


# Displays Correlation between Features through HeatMap - Ligther Color means Higher Correlation
# sns.heatmap(train.corr(), annot = True)

'Train Head :'

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,False,True,False,False,False,False,False,False,False,True
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,False,True,False,False,False,False,False,False,False,True
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,False,False,False,True,True,False,False,False,False,True
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,False,False,False,False,True,False,False,False,False,True
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,False,True,True,False,True,False,False,True,False,True


'Test Head :'

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
0,1000000,0.178216,0.435617,0.01023,0.202074,0.39017,0.324221,0.221722,0.738894,0.582588,...,True,False,False,False,False,False,True,True,True,False
1,1000001,0.18125,0.476455,0.022413,0.283146,0.59802,0.349508,0.283467,0.721575,0.26899,...,False,False,False,False,False,False,False,False,False,False
2,1000002,0.159721,0.451202,0.259649,0.365274,0.594634,0.413502,0.249318,0.642339,0.411104,...,False,False,False,False,False,False,True,False,False,False
3,1000003,0.182424,0.520976,0.095344,0.327742,0.74183,0.358711,0.270077,0.601662,0.297742,...,False,False,False,False,False,True,True,False,False,False
4,1000004,0.229329,0.336513,0.023511,0.300913,0.668738,0.481586,0.54566,0.667849,0.546045,...,False,False,False,False,True,False,False,True,False,False


'Train Description :'

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1000000.0,499999.500000,288675.278933,0.000000,249999.750000,499999.500000,749999.250000,999999.000000
f0,1000000.0,0.214334,0.053320,0.041790,0.181676,0.204498,0.229684,1.000000
f1,1000000.0,0.460218,0.101316,0.022016,0.389215,0.453893,0.526023,0.959019
f2,1000000.0,0.129253,0.120805,0.000381,0.017692,0.095496,0.177717,0.994818
f3,1000000.0,0.277598,0.063163,0.000000,0.235342,0.264669,0.305837,0.979797
...,...,...,...,...,...,...,...,...
f237,1000000.0,0.020571,0.073547,0.000114,0.005004,0.006707,0.008481,1.000000
f238,1000000.0,0.171748,0.057980,0.015935,0.138110,0.156161,0.188783,1.000000
f239,1000000.0,0.228530,0.138197,0.001260,0.197519,0.202545,0.362163,0.987972
f240,1000000.0,0.187746,0.061176,0.011777,0.142875,0.170167,0.199849,0.937160


'Test  Description :'

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,500000.0,1.250000e+06,144337.711634,1000000.000000,1.125000e+06,1.250000e+06,1.374999e+06,1.499999e+06
f0,500000.0,2.136951e-01,0.056670,0.000000,1.807110e-01,2.037990e-01,2.295450e-01,9.693840e-01
f1,500000.0,4.550073e-01,0.102219,0.019280,3.838607e-01,4.481490e-01,5.182550e-01,9.562660e-01
f2,500000.0,1.250876e-01,0.123786,0.000315,1.753550e-02,9.445710e-02,1.766100e-01,9.710670e-01
f3,500000.0,2.774628e-01,0.064343,0.045517,2.339050e-01,2.642560e-01,3.068022e-01,9.988440e-01
...,...,...,...,...,...,...,...,...
f237,500000.0,2.192792e-02,0.076789,0.000203,5.213860e-03,6.993050e-03,8.808765e-03,9.695230e-01
f238,500000.0,1.752554e-01,0.062176,0.010355,1.394168e-01,1.579690e-01,1.901240e-01,9.758930e-01
f239,500000.0,2.307407e-01,0.129118,0.002734,1.989418e-01,2.030720e-01,2.083130e-01,9.832200e-01
f240,500000.0,1.862095e-01,0.056404,0.089677,1.420630e-01,1.695040e-01,2.163220e-01,9.013870e-01


CPU times: user 25.6 s, sys: 4.09 s, total: 29.7 s
Wall time: 44.4 s


In [6]:
# data - pandas dataframe
def missing_value_describe(data):
    """Print a summary of the missing values found in ``data``.

    The report contains the number of rows holding at least one missing
    value, the number of columns holding at least one missing value and,
    when there are any, the percentage of missing entries per affected
    column sorted from highest to lowest.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Percentage of missing entries per column.
    missing_value_stats = data.isnull().sum() / len(data) * 100
    # Vectorised count of affected columns (the builtin sum() would iterate
    # the Series one element at a time).
    missing_value_col_count = int((missing_value_stats > 0).sum())
    # Keep only the affected columns, worst first; .iloc makes the positional
    # slice explicit.
    missing_value_stats = missing_value_stats.sort_values(ascending=False).iloc[:missing_value_col_count]
    print("Number of rows with at least 1 missing values:", data.isna().any(axis=1).sum())
    print("Number of columns with missing values:", missing_value_col_count)
    if missing_value_col_count != 0:
        # print out column names with missing value percentage
        print("\nMissing percentage (descending):")
        print(missing_value_stats)
    else:
        print("No missing data!!!")



# missing_value_describe prints its report and returns None, so it is called
# directly: wrapping it in display() only renders a stray `None`.
print("Check Missing Values for train dataset")
missing_value_describe(train)
print('\n-----\n')

print("Check Missing Values for test dataset")
missing_value_describe(test)

Check Missing Values for train dataset
Number of rows with at least 1 missing values: 0
Number of columns with missing values: 0
No missing data!!!


None


-----

Check Missing Values for test dataset
Number of rows with at least 1 missing values: 0
Number of columns with missing values: 0
No missing data!!!


None

In [7]:
# Count how many columns of the training frame have each dtype.
train.dtypes.value_counts()

float64    240
bool        46
int32        1
dtype: int64

In [4]:
# Label column; it is cast to uint8 before the bool columns are collected,
# so it no longer has bool dtype when the feature list below is built.
TARGET = 'target'
train[TARGET] = train[TARGET].astype('uint8')

# Names of the columns that still have bool dtype.
feature_cols_discont = list(train.select_dtypes(include='bool').columns)

print(len(feature_cols_discont))

feature_cols_discont
# col 22, 43, 242 to 284 bool type

45


['f22',
 'f43',
 'f242',
 'f243',
 'f244',
 'f245',
 'f246',
 'f247',
 'f248',
 'f249',
 'f250',
 'f251',
 'f252',
 'f253',
 'f254',
 'f255',
 'f256',
 'f257',
 'f258',
 'f259',
 'f260',
 'f261',
 'f262',
 'f263',
 'f264',
 'f265',
 'f266',
 'f267',
 'f268',
 'f269',
 'f270',
 'f271',
 'f272',
 'f273',
 'f274',
 'f275',
 'f276',
 'f277',
 'f278',
 'f279',
 'f280',
 'f281',
 'f282',
 'f283',
 'f284']

In [None]:
# Frequency of every (f284, f283) value combination in the training data.
train.value_counts(subset=['f284', 'f283'])

In [5]:
# Names of the float64 columns. select_dtypes is run once only: the value of
# a non-final expression in a cell is discarded, so a separate leading len(...)
# call would just repeat the work without showing anything.
feature_cols_cont = train.select_dtypes(include=['float64']).columns.tolist()
len(feature_cols_cont)

240

In [6]:
# Apply the same downcast to both tables: float columns to float32,
# bool columns to uint8.
for _frame in (train, test):
    _frame[feature_cols_cont] = _frame[feature_cols_cont].astype('float32')
    _frame[feature_cols_discont] = _frame[feature_cols_discont].astype('uint8')

In [7]:
# Re-check the per-dtype column counts of the training frame after the dtype casts.
train.dtypes.value_counts()

float32    240
uint8       46
int32        1
dtype: int64

In [8]:
# Combined feature list: bool-derived columns first, then the float columns.
features = [*feature_cols_discont, *feature_cols_cont]

In [9]:
# ML Dataset
# Feature columns in training order (everything except the label and the id).
# The same selection is applied to the test table so both matrices share one
# column layout, and a missing column fails fast with a KeyError.
feature_names = train.columns.drop([TARGET, 'id'])
X = train[feature_names].values
y = train[TARGET]

# test dataset - converted to a plain ndarray like X, so the scaler receives
# the same kind of input at fit time and at transform time (fitting on an
# array and transforming a DataFrame triggers scikit-learn's feature-name
# warning, which is hidden here because warnings are filtered).
X_test = test[feature_names].values

# Scaling features: statistics are learned on the training matrix only and
# then applied unchanged to the test matrix.
scaler = RobustScaler()  # alternatives: StandardScaler(), MinMaxScaler(), MaxAbsScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [14]:
%%time

test_preds=None
scores = []
# kf = RepeatedKFold(n_splits = 10 , n_repeats=3, random_state = 42)
n_splits = 10
kf = StratifiedKFold(n_splits = n_splits , shuffle=True, random_state = 42)
for fold, (train_index , valid_index) in enumerate(kf.split(X , y)):
    
    print("-" * 50)
    print(f"Fold {fold + 1}")

    X_train, X_val = X[train_index] , X[valid_index]
    y_train, y_val = y[train_index] , y[valid_index]
    
#     X_test = test[features]
        
    eval_set = [(X_val, y_val)]
    xgb_params = {'eval_metric':'auc',
                  'tree_method':'gpu_hist', 
                  'n_estimators': 2000,
#                   'max_depth': 8,
#                   'min_child_weight':1,
                  'subsample': 0.7, }
    model = CatBoostClassifier(eval_metric="AUC", task_type="GPU")
#     model = XGBClassifier(**xgb_params) 
    model.fit(X_train, y_train, eval_set = eval_set, verbose = False)
    
    train_preds = model.predict_proba(X_train)[:,1]
    val_preds = model.predict_proba(X_val)[:,1]
    
    auc = roc_auc_score(y_val, val_preds)
    
    print("AUC Score : ",auc)
    scores.append(auc)
    if test_preds is None:
        test_preds = model.predict_proba(X_test)[:,1] 
    else:
        test_preds += model.predict_proba(X_test)[:,1] 
    gc.collect()

print("-" * 50)
test_preds /= n_splits
print("mean score : ", np.mean(scores), np.std(scores))

# xgboost: 'subsample': 0.7 -> mean score :  0.8509375563534972 0.0016472301588207994
# mean score :  0.8554502464194721 0.00163149966477719

--------------------------------------------------
Fold 1


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

AUC Score :  0.8594474238133059
--------------------------------------------------
Fold 2
AUC Score :  0.855403471529494
--------------------------------------------------
Fold 3
AUC Score :  0.8532520566632752
--------------------------------------------------
Fold 4
AUC Score :  0.8547738165247734
--------------------------------------------------
Fold 5
AUC Score :  0.8545781223368287
--------------------------------------------------
Fold 6
AUC Score :  0.8572071192020811
--------------------------------------------------
Fold 7
AUC Score :  0.8546632008576058
--------------------------------------------------
Fold 8
AUC Score :  0.854643269239237
--------------------------------------------------
Fold 9
AUC Score :  0.8549809139504103
--------------------------------------------------
Fold 10
AUC Score :  0.8555530700777093
--------------------------------------------------
mean score :  0.8554502464194721 0.00163149966477719
CPU times: user 14min 7s, sys: 2min 37s, total: 16min 4

In [46]:
# Build an XGBClassifier from xgb_params (no fit is called here) and show its repr.
# NOTE(review): this rebinds `model`, replacing the CatBoost model left over from the last CV fold.
# NOTE(review): the repr recorded below lists max_depth=10 / n_estimators=100, which does not match
# xgb_params as defined in the CV cell - presumably output from an earlier run; re-execute to confirm.
model = XGBClassifier(**xgb_params)
model
# max_depth, number of leaves, min_child_weight (presumably the parameters to tune next)

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, eval_metric='auc',
              gamma=None, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=None,
              max_delta_step=None, max_depth=10, min_child_weight=1,
              missing=nan, monotone_constraints=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, random_state=None,
              reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
              subsample=0.7, tree_method='gpu_hist', validate_parameters=None,
              verbosity=None)

In [15]:
# Write the fold-averaged test probabilities into every column after the first one
# (presumably the single prediction column next to the id - confirm against the sample file).
sub.iloc[:, 1:] = test_preds
# Save the submission file without the DataFrame index.
sub.to_csv("submission.csv", index=False)

# Show (rows, columns) of the submission frame.
sub.shape

(500000, 2)