04b: Version without PCA

2021-10-25: 
  - Clean-up
  - Using useful features only to avoid overfitting (if possible)
  - Testing MLFlow integration
  - Testing 12 clusters and poly degree 3 incl poly for clusters
  - Started new approach for evaluation metrics (not fully implemented yet)
  - Changed to conda environment (with pip only dependencies)

In [1]:
# The rest of the modules are loaded when required.
# To ensure a standalone character (for easier reusability).

import os # for detecting CPU cores
import configparser # to load standard config and parameters
import pandas as pd
import numpy as np
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px

import mlflow
import mlflow.xgboost
from urllib.parse import urlparse

# Debugging
from icecream import ic

warnings.filterwarnings('ignore')
%load_ext watermark
%matplotlib inline

In [2]:
# importing the tensorflow package
import tensorflow as tf

# Checking GPU support
print(tf.test.is_built_with_cuda()) # True
print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None)) # True
print(tf.config.list_physical_devices('GPU'))

True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import os
import configparser

# Load external config file
config = configparser.ConfigParser()
config.read("../resources/config.ini")

PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "02_baseline_models" # for Telegram messages

# Set global randome state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 16


In [4]:
train_df = pd.read_csv(PATH_DATA_RAW+'train.csv',index_col=0)
test_df = pd.read_csv(PATH_DATA_RAW+'test.csv',index_col=0)
sample_df = pd.read_csv(PATH_DATA_RAW+'sample_submission.csv')

In [5]:
train_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,1,0,0,0,0,0,0,0,1
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,1,0,0,0,0,0,0,0,1
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,0,0,1,1,0,0,0,0,1
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0,0,0,1,0,0,0,0,1
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0,1,1,0,1,0,0,1,0,1


In [6]:
memory_usage = train_df.memory_usage(deep=True) / 1024 ** 2
start_mem = memory_usage.sum()
start_mem

2189.63623046875

In [7]:
feature_cols = train_df.drop(['target'], axis=1).columns.tolist()
cnt_features = train_df.drop(['target'], axis=1).select_dtypes(exclude=['int64']).columns.tolist()
cat_features = train_df.drop(['target'], axis=1).select_dtypes(exclude=['float64']).columns.tolist()

ic(len(feature_cols))
ic(len(cnt_features))
ic(len(cat_features));

ic| len(feature_cols): 285
ic| len(cnt_features): 240
ic| len(cat_features): 45


In [30]:
#useful_features = useful_features = ["f22", "f179", "f69", "f58", "f214", "f78", "f136", "f156", "f8", "f3", "f77", "f200", "f92", "f185", "f142", "f115", "f284"]
useful_features = pd.read_csv(PATH_DATA_INT+'features_selected_6way_140.csv')['Feature'].head(13).tolist()
ic(len(useful_features));

ic| len(useful_features): 13


In [31]:
feature_cols = useful_features

In [10]:
train_df[cnt_features] = train_df[cnt_features].astype('float32')
train_df[cat_features] = train_df[cat_features].astype('uint8')

test_df[cnt_features] = test_df[cnt_features].astype('float32')
test_df[cat_features] = test_df[cat_features].astype('uint8')

In [32]:
memory_usage = train_df.memory_usage(deep=True) / 1024 ** 2
end_mem = memory_usage.sum()
end_mem

1005.9515151977539

In [33]:
print("Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.2f}% reduction)".format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))

Mem. usage decreased from 2189.64 MB to 1005.95 MB (54.06% reduction)


# Feature Engineering

## KMeans Clustering

In [34]:
from sklearn.cluster import KMeans

n_clusters_1 = 6
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters_1)]
kmeans = KMeans(n_clusters=n_clusters_1, n_init=50, max_iter=500, random_state=rnd_state)

ic(n_clusters_1)
#ic(cluster_cols);

ic| n_clusters_1: 6


6

In [35]:
# cluster distance instead of cluster number

# train
X_cd = kmeans.fit_transform(train_df[useful_features])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=train_df.index)
train = train_df.join(X_cd)

# test
X_cd = kmeans.transform(test_df[useful_features])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=test_df.index)
test = test_df.join(X_cd)

In [36]:
feature_cols += cluster_cols
train[feature_cols].head()

Unnamed: 0_level_0,f92,f8,f78,f77,f69,f58,f3,f22,f214,f179,f156,f136,f12,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0.618696,0.534873,0.198157,0.372637,0.199588,0.075109,0.223581,1,0.006978,0.112764,0.864594,0.822459,0.200924,0.189223,1.050896,1.017506,1.130244,0.323917,0.527002
1,0.688726,0.281971,0.23261,0.352968,0.228739,0.241071,0.213657,1,0.164553,0.008115,0.866689,0.820548,0.231828,0.305138,1.039516,1.045487,1.146268,0.284001,0.559371
2,0.512464,0.536272,0.261242,0.318504,0.164643,0.078052,0.207116,0,0.010954,0.011306,0.401814,0.819017,0.247791,1.115489,0.556324,0.49365,0.182968,1.14458,1.016904
3,0.459905,0.483958,0.17846,0.465532,0.179141,0.177084,0.22358,1,0.010339,0.012912,0.874224,0.819892,0.287755,0.203052,1.052789,1.020134,1.137029,0.329684,0.541285
4,0.511809,0.077115,0.189286,0.345664,0.149717,0.085606,0.548819,0,0.003894,0.188779,0.863533,0.82181,0.265285,1.119016,0.56953,0.502388,0.703449,1.150751,1.221849


In [37]:
ic(len(feature_cols))
#ic(feature_cols);

ic| len(feature_cols): 19


19

## Polynomial Features

- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [38]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True)

X_poly = poly.fit_transform(train[feature_cols])
T_poly = poly.transform(test[feature_cols])

poly_cols = [f"poly{i+1}" for i in range(X_poly.shape[1])]

X_poly_df = pd.DataFrame(X_poly, columns=poly_cols, index=train.index)
T_poly_df = pd.DataFrame(T_poly, columns=poly_cols, index=test.index)

train = pd.concat([train, X_poly_df], axis=1)
test = pd.concat([test, T_poly_df], axis=1)


In [39]:
#feature_cols += ["poly79", "poly82", "poly66", "poly32", "poly81", "poly80", "poly42", "poly78", "poly59"]
feature_cols += poly_cols
train.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,poly182,poly183,poly184,poly185,poly186,poly187,poly188,poly189,poly190,poly191
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,1.069293,1.187769,0.340403,0.553824,1.15003,0.329587,0.536227,0.366105,0.59564,0.170705
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,1.0868,1.191563,0.295224,0.581475,1.198408,0.296919,0.584815,0.325541,0.641189,0.158862
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0.274629,0.101789,0.636757,0.565727,0.090322,0.565021,0.501994,0.209421,0.18606,1.163927
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,1.073985,1.197051,0.347088,0.569859,1.159921,0.336322,0.552183,0.37486,0.615457,0.178453
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0.286125,0.400636,0.655388,0.69588,0.353404,0.578123,0.613842,0.809494,0.859508,1.406043


## Mutual Information

In [40]:
from sklearn.feature_selection import mutual_info_regression

x = train.iloc[:5000,:][feature_cols] #.copy()
y = train.iloc[:5000,:]['target'] #.copy()
mi_scores = mutual_info_regression(x, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [41]:
import plotly.figure_factory as ff
import plotly.express as px

top = 20
fig = px.bar(mi_scores, x=mi_scores.values[:top], y=mi_scores.index[:top])
fig.update_layout(
    title=f"Top {top} Strong Relationships Between Feature Columns and Target Column",
    xaxis_title="Relationship with Target",
    yaxis_title="Feature Columns",
    yaxis={'categoryorder':'total ascending'},
    colorway=["blue"]
)
fig.show()

In [42]:
ic(len(feature_cols))

ic| len(feature_cols): 210


210

# Model Training

## Functions

In [43]:
# TODO: New approach for evaluation metrics (not fully implemented yet)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score

def evaluation_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    return rmse, mae, r2

## Creating Kfolds

In [44]:
from sklearn.model_selection import StratifiedKFold

folds = 5
train["kfold"] = -1
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=rnd_state)

# Adding folds to dataset
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(train,train["target"])):
    train.loc[valid_indicies, "kfold"] = fold

In [45]:
train['kfold'].value_counts()

3    200000
2    200000
4    200000
0    200000
1    200000
Name: kfold, dtype: int64

In [48]:
print("--")
# mlflow.autolog()
# mlflow.xgboost.autolog()
mlflow.start_run()
run = mlflow.active_run()

# Log text to a file under the run's root artifact directory
mlflow.log_text("Test", "comments.txt")

# Print run id
print("run_id: {}; status: {}\n".format(run.info.run_id, run.info.status))

--
run_id: d453540400dd4b3b9d059b400aed34f1; status: RUNNING



In [49]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import time

final_test_predictions = []
scores = []

for fold in range(folds):
    x_train = train[train.kfold != fold].copy()
    x_valid = train[train.kfold == fold].copy()
    x_test = test[feature_cols].copy()

    y_train = x_train["target"]
    y_valid = x_valid["target"]

    x_train = x_train[feature_cols]
    x_valid = x_valid[feature_cols]

    xgb_params = {
        "eval_metric": "auc",
        "objective": "binary:logistic",
        "tree_method": "gpu_hist",
        "gpu_id": 0,
        #"n_jobs": n_cpu,
        "predictor": "gpu_predictor",
        "n_estimators": 10000,
        "learning_rate": 0.01063045229441343,
        "gamma": 0.24652519525750877,
        "max_depth": 4,
        "seed": rnd_state,
        "min_child_weight": 366,
        "subsample": 0.6423040816299684,
        "colsample_bytree": 0.7751264493218339,
        "colsample_bylevel": 0.8675692743597421,
        "use_label_encoder": False,
        "lambda": 0,
        "alpha": 10,
    }

    model = XGBClassifier(**xgb_params)
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
    scores.append(auc)

    preds_test = model.predict_proba(x_test)[:, 1]
    final_test_predictions.append(preds_test)


print("AVG AUC:", np.mean(scores))

KeyboardInterrupt: 

In [None]:
# MLFlow: Get metrics
# TODO: Test mlflow.autolog()
mlflow.log_params(params)
#mlflow.log_metric("rmse", rmse)
#mlflow.log_metric("r2", r2)
#mlflow.log_metric("mae", mae)
mlflow.log_metric("AVG AUC", np.mean(scores))

#features =", ".join(str(elem) for elem in feature_cols)
features = str(feature_cols)

with open("features.txt", 'w') as f:
    f.write(features)
    
mlflow.log_artifact("features.txt", artifact_path="features")

tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

# Register Experiment
if tracking_url_type_store != "file":
    # Register the model
    # There are other ways to use the Model Registry, which depends on the use case,
    # please refer to the doc for more information:
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="xgboost-model",
        registered_model_name="XGBClassifier",
    )

else:
    print('Store type: file')
    mlflow.xgboost.log_model(model, "model")


Store type: file


In [None]:
# End run and get status
mlflow.end_run()
run = mlflow.get_run(run.info.run_id)
print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))
print("--")

# Check for any active runs
print("Active run: {}".format(mlflow.active_run()))

run_id: eed7c979be864de3838955b2ddf60c40; status: FINISHED
--
Active run: None


#  Submit results

- 2021-10-20_submission_kmean-pca-fs17-xbg-nop.csv: 0.8569970265985022 | **0.85626**
- 2021-10-20_submission_kmean-3pca-fs17-xbg-nop.csv: 0.8568950821125828 | 0.85619
- 2021-10-20_submission_kmean-ALLpca-fs13-xbg-nop.csv: 0.8567778822323093 | not submitted
- 2021-10-20_submission_13fs-kmean-2pca-4nf-xbg-nop.csv: 0.8568443738048714 | not submitted
- 2021-10-20_submission_49fs-kmean-2pca-4nf-xbg-nop.csv: 0.8567888578426363 | 0.85599
- 2021-10-20_submission_13fs-kmean-1pca-4nf-poly-xbg-nop.csv: 0.8568639280432698 | 0.85615
- 2021-10-25_submission_f64d06226bb94f2fb2b33350b7a40692.csv: 0.8566269081302578 | 0.85600

In [None]:
import numpy as np
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
now = now.strftime("%Y-%m-%d")

objective = str(run.info.run_id)

curr_submission_fn = f"{now}_submission_{objective}.csv"

sample_df['target'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_df.to_csv(PATH_SUB + curr_submission_fn, index=False)

print(curr_submission_fn)


2021-10-25_submission_f64d06226bb94f2fb2b33350b7a40692.csv


In [None]:
#!kaggle competitions submit tabular-playground-series-oct-2021 -f {PATH_SUB+curr_submission_fn} -m {curr_submission_fn}

Successfully submitted to Tabular Playground Series - Oct 2021


  0%|          | 0.00/9.36M [00:00<?, ?B/s]
  0%|          | 8.00k/9.36M [00:00<07:18, 22.4kB/s]
  1%|          | 96.0k/9.36M [00:00<00:42, 229kB/s] 
  1%|▏         | 136k/9.36M [00:01<01:11, 135kB/s] 
  2%|▏         | 168k/9.36M [00:01<01:04, 149kB/s]
  2%|▏         | 192k/9.36M [00:01<01:31, 105kB/s]
  2%|▏         | 216k/9.36M [00:01<01:18, 122kB/s]
  3%|▎         | 240k/9.36M [00:01<01:08, 140kB/s]
  3%|▎         | 264k/9.36M [00:01<01:00, 157kB/s]
  3%|▎         | 304k/9.36M [00:02<00:47, 200kB/s]
  4%|▎         | 336k/9.36M [00:02<00:43, 216kB/s]
  4%|▍         | 368k/9.36M [00:02<00:39, 239kB/s]
  4%|▍         | 408k/9.36M [00:02<00:34, 273kB/s]
  5%|▍         | 448k/9.36M [00:02<00:30, 303kB/s]
  5%|▌         | 480k/9.36M [00:02<00:30, 305kB/s]
  5%|▌         | 512k/9.36M [00:02<00:30, 306kB/s]
  6%|▌         | 552k/9.36M [00:02<00:29, 315kB/s]
  6%|▋         | 600k/9.36M [00:02<00:26, 342kB/s]
  7%|▋         | 640k/9.36M [00:03<00:28, 326kB/s]
  7%|▋         | 688k/9.36M [00:




 76%|███████▌  | 7.08M/9.36M [00:21<00:06, 349kB/s]
 76%|███████▌  | 7.12M/9.36M [00:21<00:06, 359kB/s]
 76%|███████▋  | 7.16M/9.36M [00:21<00:06, 341kB/s]
 77%|███████▋  | 7.20M/9.36M [00:21<00:06, 361kB/s]
 77%|███████▋  | 7.24M/9.36M [00:21<00:05, 373kB/s]
 78%|███████▊  | 7.28M/9.36M [00:21<00:05, 370kB/s]
 78%|███████▊  | 7.32M/9.36M [00:22<00:06, 345kB/s]
 79%|███████▊  | 7.37M/9.36M [00:22<00:05, 377kB/s]
 79%|███████▉  | 7.41M/9.36M [00:22<00:05, 353kB/s]
 80%|███████▉  | 7.45M/9.36M [00:22<00:05, 357kB/s]
 80%|███████▉  | 7.48M/9.36M [00:22<00:05, 357kB/s]
 80%|████████  | 7.52M/9.36M [00:22<00:05, 334kB/s]
 81%|████████  | 7.56M/9.36M [00:22<00:05, 349kB/s]
 81%|████████  | 7.60M/9.36M [00:22<00:05, 359kB/s]
 82%|████████▏ | 7.64M/9.36M [00:23<00:04, 362kB/s]
 82%|████████▏ | 7.68M/9.36M [00:23<00:04, 360kB/s]
 82%|████████▏ | 7.72M/9.36M [00:23<00:04, 365kB/s]
 83%|████████▎ | 7.76M/9.36M [00:23<00:04, 360kB/s]
 83%|████████▎ | 7.80M/9.36M [00:23<00:04, 353kB/s]
 84%|███████

# Optimizing Hyperparameter


In [50]:
import time
import optuna
from optuna.samplers import TPESampler
from catboost.utils import eval_metric
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [51]:

def objective(trial):
    scores = []
    #ic(scores)

    for fold in range(folds):
        time.sleep(2)
        x_train = train[train.kfold != fold].copy()
        x_valid = train[train.kfold == fold].copy()
        # x_test = test[feature_cols].copy()

        y_train = x_train["target"]
        y_valid = x_valid["target"]

        x_train = x_train[feature_cols]
        x_valid = x_valid[feature_cols]

        params = {
            # "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
            "objective": trial.suggest_categorical("objective", ["CrossEntropy"]),
            # "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
            "iterations": trial.suggest_int("iterations", 1400, 3400),
            "depth": trial.suggest_int("depth", 4, 6),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
            "boosting_type": trial.suggest_categorical(
                "boosting_type", ["Ordered", "Plain"]
            ),
        }

        # if params["bootstrap_type"] == "Bayesian":
        #    params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)

        # elif params["bootstrap_type"] == "Bernoulli":
        #    params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

        #print("Fitting...")
        model = CatBoostClassifier(
            **params, 
            random_seed=rnd_state,
            task_type="GPU", 
            devices="0:1", 
            eval_metric="AUC"
        )
        
        model.fit(
            x_train,
            y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=False,
            early_stopping_rounds=100,
        )

        #print("Predicting...")
        preds_train = model.predict_proba(x_train)[:, 1]
        preds_valid = model.predict_proba(x_valid)[:, 1]
        auc_train = roc_auc_score(y_train, preds_train)
        auc = roc_auc_score(y_valid, preds_valid)
        print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
        scores.append(auc)

    return np.mean(scores)


In [52]:

sampler = TPESampler(seed=rnd_state)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)


[32m[I 2021-10-25 20:54:28,830][0m A new study created in memory with name: no-name-febd97c0-8372-4c43-a6bf-36e8d3e32c46[0m


KeyboardInterrupt: 