In [None]:
#!pip install optuna 
import optuna
from optuna.samplers import TPESampler

import multiprocessing
# Warning Libraries 
import warnings
warnings.filterwarnings("ignore")
# warnings.simplefilter(action='ignore', category=FutureWarning)

# Scientific and Data Manipulation Libraries 
import pandas as pd
import numpy as np
import math
import gc
import os


# Data Preprocessing, Machine Learning and Metrics Libraries 
from sklearn.preprocessing            import LabelEncoder, OneHotEncoder 
from sklearn.impute                   import SimpleImputer
from sklearn.preprocessing            import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.ensemble                 import VotingClassifier
from sklearn.metrics                  import f1_score, log_loss, accuracy_score,roc_auc_score, roc_curve

# model visualization
import shap


from sklearn.model_selection import KFold, RepeatedKFold, train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# Boosting Algorithms 
from xgboost                          import XGBClassifier
from catboost                         import CatBoostClassifier
from lightgbm                         import LGBMClassifier



# Data Visualization Libraries 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px

# EDA

In [None]:
# Load the competition files: training set, test set and sample submission.
csv_paths = (
    '../input/tabular-playground-series-sep-2021/train.csv',
    '../input/tabular-playground-series-sep-2021/test.csv',
    '../input/tabular-playground-series-sep-2021/sample_solution.csv',
)
train, test, sub = (pd.read_csv(csv_path) for csv_path in csv_paths)

# Preview the first 5 rows of the train and test data.
train_head = train.head()
display('Train Head :', train_head)
test_head = test.head()
display('Test Head :', test_head)

# Column information for the train and test data.
train.info()
test.info()

# Descriptive statistics for the train and test data.
train_description = train.describe()
display('Train Description :', train_description)
test_description = test.describe()
display('Test  Description :', test_description)

# Optional: correlation between features as a heat map (lighter colour means higher correlation).
# sns.heatmap(train.corr(), annot = True)

# Missing Values

In [None]:
def missing_value_describe(data):
    """Print a missing-value report for a pandas DataFrame.

    The report states how many rows contain at least one missing value, how
    many columns contain missing values and, if there are any, the percentage
    of missing entries for each affected column in descending order.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is only printed to stdout.
    """
    # Percentage of missing entries per column.
    pct_missing = data.isnull().sum() / len(data) * 100
    n_cols_missing = sum(pct_missing > 0)
    # After a descending sort the affected columns are the leading entries.
    affected = pct_missing.sort_values(ascending=False).iloc[:n_cols_missing]

    n_rows_missing = data.isna().any(axis=1).sum()
    print("Number of rows with at least 1 missing values:", n_rows_missing)
    print("Number of columns with missing values:", n_cols_missing)

    if n_cols_missing == 0:
        print("No missing data!!!")
        return

    # Column names together with their missing-value percentage.
    print("\nMissing percentage (desceding):")
    print(affected)



# missing_value_describe() prints its report and returns None, so it is called
# directly: wrapping it in display() would only append a stray "None" after
# each report.
print("Check Missing Values for train dataset")
missing_value_describe(train)
print('\n-----\n')

print("Check Missing Values for test dataset")
missing_value_describe(test)

## Check Duplicate Values

In [None]:
# Show every row that has an exact duplicate within the same frame.
# keep=False marks all copies of a duplicated row, not only the repeats.
train_duplicates = train[train.duplicated(keep=False)]
display(train_duplicates)

test_duplicates = test[test.duplicated(keep=False)]
display(test_duplicates)

# Transformation

In [None]:
# ML dataset: feature matrix and target.
X = train.drop(columns=['claim', 'id'])
y = train['claim']

# Test features. errors='ignore' keeps this cell re-runnable: after the first
# run `test` no longer has an 'id' column and a plain drop would raise KeyError.
test = test.drop(columns='id', errors='ignore')

# Large number of missing values -> impute each column with its training mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')  # alternatives: 'most_frequent', 'constant', 'median'

# Feature scaling.
scaler = RobustScaler()  # alternatives: StandardScaler(), MinMaxScaler(), MaxAbsScaler()

# Both transformers are fitted on the training features only and then applied
# unchanged to the test features. The scaler output replaces any intermediate
# container, so the imputed arrays are passed straight through instead of
# being wrapped in throw-away DataFrames.
imputed_X = scaler.fit_transform(imputer.fit_transform(X))
imputed_test = scaler.transform(imputer.transform(test))

In [None]:
# Sanity-check that the transformed matrices line up before modelling.
assert imputed_X.shape[0] == y.shape[0], "feature/target row counts differ"
assert imputed_X.shape[1] == imputed_test.shape[1], "train/test feature counts differ"

imputed_X.shape , y.shape, imputed_test.shape

# Baseline Model ( XGBClassifier + OOF )

In [None]:
# Stratified K-fold cross-validation of an XGBoost classifier.
# Every fold's model is scored on its validation part (ROC AUC) and also
# predicts the test set; the test predictions are averaged over the folds.
test_preds = None
scores = []
# kf = RepeatedKFold(n_splits = 10 , n_repeats=3, random_state = 42)
n_splits = 10
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# The hyper-parameters are the same for every fold, so the dict is built once
# instead of being re-created inside the loop.
xgb_params = {'eval_metric': 'auc',
              'tree_method': 'gpu_hist',
              'lambda': 0.22982527008250872,
              'alpha': 0.1198217268647637,
              'colsample_bytree': 0.3,
              'subsample': 0.6,
              'learning_rate': 0.012,
              'n_estimators': 3276,
              'max_depth': 11,
              'random_state': 2020,
              'min_child_weight': 268}

for fold, (tr_index, val_index) in enumerate(kf.split(imputed_X, y)):

    print("-" * 50)
    print(f"Fold {fold + 1}")

    X_train, X_val = imputed_X[tr_index], imputed_X[val_index]
    # The splitter yields positions, so index the target positionally; plain
    # y[...] is label-based and only matches while y keeps a default RangeIndex.
    y_train, y_val = y.iloc[tr_index], y.iloc[val_index]

    eval_set = [(X_val, y_val)]
#     model = CatBoostClassifier(eval_metric="AUC", task_type="GPU")
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    # Only validation predictions are needed for the score; the former
    # prediction over the whole training fold was never read and is dropped.
    val_preds = model.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, val_preds)

    print("AUC Score : ", auc)
    scores.append(auc)

    # Accumulate the positive-class probabilities for the test set.
    fold_test_preds = model.predict_proba(imputed_test)[:, 1]
    if test_preds is None:
        test_preds = fold_test_preds
    else:
        test_preds += fold_test_preds

print("-" * 50)
# Turn the accumulated sum into the mean over all folds.
test_preds /= n_splits
print("mean score : ", np.mean(scores), np.std(scores))

# Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: hold-out ROC AUC of a CatBoost classifier.

    A stratified 75/25 split of ``imputed_X`` / ``y`` is created with a fixed
    seed, so every trial is scored on the same validation rows. The trial
    samples the loss function, column sampling rate, tree depth, boosting
    type and bootstrap type; the remaining settings are fixed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the fitted model on the validation part of the split.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        imputed_X, y, test_size=0.25, random_state=42, stratify=y
    )

    ctb_params = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }

    # `objective` is CatBoost's alias for `loss_function`, so the loss is taken
    # from the sampled parameters only. Passing a hard-coded loss_function as
    # well would name the same setting twice with potentially different values.
    # NOTE(review): CatBoost documents colsample_bylevel (rsm) on GPU as
    # supported for pairwise modes only - confirm this search space trains with
    # task_type="GPU"; otherwise tune on CPU or drop that parameter.
    model = CatBoostClassifier(
        eval_metric="AUC",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=42,
        border_count=64,
        **ctb_params
    )

    model.fit(train_x, train_y, verbose=False)

    # Positive-class probabilities on the held-out validation rows.
    valid_preds = model.predict_proba(valid_x)[:, 1]
    return roc_auc_score(valid_y, valid_preds)

In [None]:
%%time
n_trials = 10
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=1))
# Trials run one after another: the sampler is seeded for reproducibility,
# which parallel trials (n_jobs > 1) cannot guarantee, and every trial trains
# on the GPU, so concurrent trials would compete for the same device.
study.optimize(objective, n_trials=n_trials, n_jobs=1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# Prediction 

In [None]:
# Write the test predictions into every column after the first one (the first
# column of the sample submission is left untouched), then save the file
# without the index column.
sub.iloc[:, 1:] = test_preds
sub.to_csv("submission.csv", index=False)

# Shape of the submission frame as a quick check.
sub.shape

In [None]:
# Display the final submission frame.
sub