In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import copy #Avoid memory referencing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import warnings

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns


In [None]:
# Load the training set, the test set and the sample submission (in that order).
train_df, test_df, submission_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
]

In [None]:
# Extra feature: how many values are missing in each row.
# The id column (and, for the training set, the target) is excluded from the count.
train_df['num_nulls'] = train_df.drop(columns=['id', 'claim']).isnull().sum(axis='columns')
test_df['num_nulls'] = test_df.drop(columns=['id']).isnull().sum(axis='columns')

In [None]:
# Feature matrix: every training column except the row id and the target.
X = train_df.drop(['id', 'claim'], axis='columns')
# Target vector.
Y = train_df.loc[:, 'claim']


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from sklearn.impute import SimpleImputer

# Every column except the target ('claim') and the row identifier ('id') is a model feature.
features = [col for col in train_df.columns if col not in ['claim', 'id']]

# Preprocessing chain applied to all feature columns:
#   1. replace missing values with the column median,
#   2. map each column onto a uniform distribution using 64 quantiles,
#   3. cut the result into 64 equal-width bins, encoded as ordinal codes.
pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median',missing_values=np.nan)),
        # random_state pins the row subsample QuantileTransformer draws when it
        # estimates quantiles on large inputs, so repeated runs yield the same bins.
        ("scaler", QuantileTransformer(n_quantiles=64,output_distribution='uniform',random_state=0)),
        ('bin', KBinsDiscretizer(n_bins=64, encode='ordinal',strategy='uniform'))
        ])

# Fit on the training features, then reuse the fitted transforms on the test features.
# NOTE: both frames are overwritten in place, so running this cell a second time
# would re-fit the pipeline on already-binned values. Reload the CSVs (and rebuild
# 'num_nulls') before re-running it.
train_df[features] = pipe.fit_transform(train_df[features])
test_df[features] = pipe.transform(test_df[features])
test_df.head()

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier 
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
import optuna


In [None]:
# Flag for the hyper-parameter search; it is not read by any of the visible cells.
OPTUNA_OPTIMIZATION = True

def objective(trial):
    """Optuna objective: train one XGBClassifier and score it on a hold-out split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters below.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on the validation split.
    """
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform
    # calls; ranges and scales (linear vs. log) are unchanged.
    params = {
            'n_estimators':trial.suggest_int("n_estimators", 1000, 20000),
            'learning_rate' : trial.suggest_float('learning_rate', 0.001, 1),
            'subsample': trial.suggest_float('subsample', 0.1, 1),
            'colsample_bytree':trial.suggest_float('colsample_bytree', 0.1, 1),
            'max_depth': trial.suggest_categorical('max_depth', [1,3,5,7,9,11,13,15,17,20]),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
            # Fixed (not tuned): it is therefore absent from trial.params.
            'tree_method': 'gpu_hist'
        }

    model = XGBClassifier(**params)

    # Same 80/20 split for every trial (fixed seed), so trial scores are comparable.
    training_df, validation_df =  train_test_split(train_df, test_size=0.2, shuffle=True, random_state=1)

    x_train = training_df.drop(['claim', 'id'],axis=1)
    y_train = training_df['claim']
    x_valid = validation_df.drop(['claim', 'id'],axis=1)
    y_valid = validation_df['claim']

    # Stop adding trees once the validation metric has not improved for 100 rounds.
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # constructor rather than fit() - confirm against the installed version.
    model.fit(
        x_train , y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=0
    )

    return roc_auc_score(y_valid,  model.predict_proba(x_valid)[:,1])

In [None]:
# Create a study that maximizes the value returned by `objective`, then run 100 trials.
study = optuna.create_study(study_name='XG_boost', direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
# Render Optuna's optimization-history plot for the finished study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Render Optuna's hyper-parameter importance plot for the finished study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Keep the winning hyper-parameters for the final fit and report the best trial.
params = study.best_params
print(f"Best Trial: {study.best_trial.value}")
print(f"Best Params: {study.best_trial.params}")

In [None]:
# `params` only contains the values Optuna suggested. 'tree_method' was a fixed
# entry inside the objective, so it is missing here; re-apply it so the final
# model is trained with the same configuration the search evaluated.
model = XGBClassifier(**{**params, 'tree_method': 'gpu_hist'})
# The study object is no longer needed once the best parameters are extracted.
del study

# Re-create the 80/20 split used inside the objective (same seed).
training_df, validation_df =  train_test_split(train_df, test_size=0.2, shuffle=True, random_state=1)

x_train = training_df.drop(['claim', 'id'],axis=1)
y_train = training_df['claim']
x_valid = validation_df.drop(['claim', 'id'],axis=1)
y_valid = validation_df['claim']

# Stop adding trees once the validation metric has not improved for 100 rounds.
model.fit(
    x_train , y_train,
    eval_set=[(x_valid, y_valid)],
    early_stopping_rounds=100,
    verbose=0
)

In [None]:
# Build the test feature matrix the same way x_train was built: drop the row id
# (the test set has no 'claim' column).
X_test = test_df.drop(['id'], axis=1)
# Use positive-class probabilities rather than hard labels, matching how the model
# was scored with ROC AUC during validation.
test_preds = model.predict_proba(X_test)[:, 1]

In [None]:
# Put the predictions into the 'claim' column and write the file without the index.
submission_df = submission_df.assign(claim=test_preds)
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Show the first rows of the submission frame as a final sanity check.
submission_df.head()