04c: Version without PCA

2021-10-25: 
  - Clean-up
  - Using useful features only to avoid overfitting (if possible)
  - Testing 6 clusters and poly degree 2 incl poly for clusters

In [64]:
# The rest of the modules are loaded when required.
# To ensure a standalone character (for easier reusability).

import os # for detecting CPU cores
import configparser # to load standard config and parameters
import pandas as pd
import numpy as np
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px

import mlflow
import mlflow.xgboost
from urllib.parse import urlparse

# Debugging
from icecream import ic

warnings.filterwarnings('ignore')
%load_ext watermark
%matplotlib inline

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [65]:
# importing the tensorflow package
import tensorflow as tf

# Checking GPU support
print(tf.test.is_built_with_cuda()) # True
print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None)) # True
print(tf.config.list_physical_devices('GPU'))

True
True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [66]:
import os
import configparser

# Load external config file
config = configparser.ConfigParser()
config.read("../resources/config.ini")

PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "02_baseline_models" # for Telegram messages

# Set global randome state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 16


In [67]:
#train_df = pd.read_csv(PATH_DATA_RAW+'train.csv',index_col=0)
#test_df = pd.read_csv(PATH_DATA_RAW+'test.csv',index_col=0)

train_df = pd.read_pickle(PATH_DATA_INT+'train-opt.pkl')
test_df = pd.read_pickle(PATH_DATA_INT+'test-opt.pkl')

sample_df = pd.read_csv(PATH_DATA_RAW+'sample_submission.csv')

In [68]:
train_df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,0,1,0,0,0,0,0,0,0,1
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,0,1,0,0,0,0,0,0,0,1
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,0,0,0,1,1,0,0,0,0,1
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,0,0,0,0,1,0,0,0,0,1
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,0,1,1,0,1,0,0,1,0,1


In [69]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 287 entries, id to target
dtypes: float32(240), int32(1), int8(46)
memory usage: 963.2 MB


In [70]:
feature_cols = train_df.drop(['target','id'], axis=1).columns.tolist()
cnt_features = train_df.drop(['target', 'id'], axis=1).select_dtypes(exclude=['int32', 'int8']).columns.tolist()
cat_features = train_df.drop(['target', 'id'], axis=1).select_dtypes(exclude=['float32']).columns.tolist()

ic(len(feature_cols))
ic(len(cnt_features))
ic(len(cat_features));

ic| len(feature_cols): 285
ic| len(cnt_features): 240
ic| len(cat_features): 45


In [71]:
# useful_features = useful_features = ["f22", "f179", "f69", "f58", "f214", "f78", "f136", "f156", "f8", "f3", "f77", "f200", "f92", "f185", "f142", "f115", "f284"]
useful_features_set1 = (
    pd.read_csv(PATH_DATA_INT + "features_selected_6way_140.csv")["Feature"]
    .head(16)
    .tolist()
)

useful_features_set2 = [
    "f1",
    "f103",
    "f107",
    "f112",
    "f119",
    "f12",
    "f125",
    "f127",
    "f13",
    "f130",
    "f134",
    "f136",
    "f138",
    "f139",
    "f14",
    "f141",
    "f143",
    "f144",
    "f150",
    "f152",
    "f154",
    "f156",
    "f16",
    "f163",
    "f169",
    "f17",
    "f173",
    "f179",
    "f18",
    "f187",
    "f19",
    "f192",
    "f195",
    "f198",
    "f2",
    "f20",
    "f200",
    "f201",
    "f211",
    "f213",
    "f214",
    "f22",
    "f222",
    "f227",
    "f231",
    "f239",
    "f241",
    "f243",
    "f247",
    "f252",
    "f258",
    "f26",
    "f266",
    "f27",
    "f29",
    "f3",
    "f33",
    "f4",
    "f40",
    "f42",
    "f43",
    "f44",
    "f48",
    "f5",
    "f52",
    "f53",
    "f56",
    "f58",
    "f6",
    "f60",
    "f62",
    "f63",
    "f64",
    "f65",
    "f69",
    "f7",
    "f71",
    "f72",
    "f73",
    "f74",
    "f75",
    "f77",
    "f78",
    "f8",
    "f82",
    "f83",
    "f85",
    "f86",
    "f90",
    "f92",
    "f93",
    "f95",
    "f96",
    "f98",
    "f99",
]


In [72]:
#feature_cols = useful_features_set2
ic(len(feature_cols))


ic| len(feature_cols): 285


285

# Feature Engineering

In [73]:
train_df['mean_numeric'] = train_df[cnt_features].mean(axis=1)
train_df['std_numeric'] = train_df[cnt_features].std(axis=1)
#train_df['min_numeric'] = train_df[cnt_features].min(axis=1)
#train_df['max_numeric'] = train_df[cnt_features].max(axis=1)
train_df['sum_categoricals'] = train_df[cat_features].sum(axis=1)

test_df['mean_numeric'] = test_df[cnt_features].mean(axis=1)
test_df['std_numeric'] = test_df[cnt_features].std(axis=1)
#test_df['min_numeric'] = test_df[cnt_features].min(axis=1)
#test_df['max_numeric'] = test_df[cnt_features].max(axis=1)
test_df['sum_categoricals'] = test_df[cat_features].sum(axis=1)

In [74]:
feature_cols += ['mean_numeric', 'std_numeric','sum_categoricals']
ic(len(feature_cols));

ic| len(feature_cols): 288


In [75]:
train_df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f279,f280,f281,f282,f283,f284,target,mean_numeric,std_numeric,sum_categoricals
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,0,0,0,0,0,0,1,0.233355,0.208569,17
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,0,0,0,0,0,0,1,0.22884,0.2085,16
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,1,1,0,0,0,0,1,0.220909,0.196083,13
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,0,1,0,0,0,0,1,0.230246,0.211794,17
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,0,1,0,0,1,0,1,0.216252,0.20208,18


## KMeans Clustering

In [76]:
from sklearn.cluster import KMeans

n_clusters_1 = 6
cluster_cols = [f"cluster1{i+1}" for i in range(n_clusters_1)]
#kmeans = KMeans(n_clusters=n_clusters_1, n_init=50, max_iter=500, random_state=rnd_state)
kmeans = KMeans(n_clusters=n_clusters_1, init="k-means++", max_iter=500, random_state=rnd_state)

ic(n_clusters_1)
#ic(cluster_cols);

ic| n_clusters_1: 6


6

In [77]:
# cluster distance instead of cluster number

# train
X_cd = kmeans.fit_transform(train_df[useful_features_set1])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=train_df.index)
train = train_df.join(X_cd)

# test
X_cd = kmeans.transform(test_df[useful_features_set1])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=test_df.index)
test = test_df.join(X_cd)

In [78]:
feature_cols += cluster_cols
ic(len(feature_cols));

ic| len(feature_cols): 294


In [79]:
train[feature_cols].head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f284,mean_numeric,std_numeric,sum_categoricals,cluster11,cluster12,cluster13,cluster14,cluster15,cluster16
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,0.233355,0.208569,17,0.739584,1.242293,0.585885,1.158385,0.418936,1.083924
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,0.22884,0.2085,16,0.564936,1.149203,0.279204,1.038267,0.531407,1.131821
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,0.220909,0.196083,13,1.019494,0.197808,1.119335,0.502227,1.196503,0.657885
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0.230246,0.211794,17,0.595213,1.163702,0.322403,1.050408,0.555903,1.143232
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0,0.216252,0.20208,18,1.224616,0.708701,1.123572,0.51246,1.21039,0.680379


## Polynomial Features

- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [80]:
from sklearn.preprocessing import PolynomialFeatures




In [81]:
poly = PolynomialFeatures(degree=2, interaction_only=True)

X_poly = poly.fit_transform(train[useful_features_set1])
T_poly = poly.transform(test[useful_features_set1])

poly_cols = [f"usefulfset1_poly{i+1}" for i in range(X_poly.shape[1])]

X_poly_df = pd.DataFrame(X_poly, columns=poly_cols, index=train.index)
T_poly_df = pd.DataFrame(T_poly, columns=poly_cols, index=test.index)

train = pd.concat([train, X_poly_df], axis=1)
test = pd.concat([test, T_poly_df], axis=1)


In [82]:
#feature_cols += ["poly79", "poly82", "poly66", "poly32", "poly81", "poly80", "poly42", "poly78", "poly59"]
feature_cols += poly_cols
train.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,usefulfset1_poly8,usefulfset1_poly9,usefulfset1_poly10,usefulfset1_poly11,usefulfset1_poly12,usefulfset1_poly13,usefulfset1_poly14,usefulfset1_poly15,usefulfset1_poly16,usefulfset1_poly17
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,0.223581,1.0,0.006978,0.112764,0.864594,0.822459,0.200924,0.112203,0.57364,0.559151
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,0.213657,1.0,0.164553,0.008115,0.866689,0.820548,0.231828,0.111834,0.162314,0.145737
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,0.207116,0.0,0.010954,0.011306,0.401814,0.819017,0.247791,0.110486,0.163786,0.144596
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,0.22358,1.0,0.010339,0.012912,0.874224,0.819892,0.287755,0.361132,0.162876,0.146811
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,0.548819,0.0,0.003894,0.188779,0.863533,0.82181,0.265285,0.113454,0.162357,0.148517


In [83]:
poly = PolynomialFeatures(degree=1, interaction_only=True)

X_poly = poly.fit_transform(train[cluster_cols])
T_poly = poly.transform(test[cluster_cols])

poly_cols = [f"cluster_poly{i+1}" for i in range(X_poly.shape[1])]

X_poly_df = pd.DataFrame(X_poly, columns=poly_cols, index=train.index)
T_poly_df = pd.DataFrame(T_poly, columns=poly_cols, index=test.index)

train = pd.concat([train, X_poly_df], axis=1)
test = pd.concat([test, T_poly_df], axis=1)



In [84]:
feature_cols += poly_cols
ic(len(feature_cols));

ic| len(feature_cols): 318


In [85]:
train.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,usefulfset1_poly15,usefulfset1_poly16,usefulfset1_poly17,cluster_poly1,cluster_poly2,cluster_poly3,cluster_poly4,cluster_poly5,cluster_poly6,cluster_poly7
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,0.112203,0.57364,0.559151,1.0,0.739584,1.242293,0.585885,1.158385,0.418936,1.083924
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,0.111834,0.162314,0.145737,1.0,0.564936,1.149203,0.279204,1.038267,0.531407,1.131821
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,0.110486,0.163786,0.144596,1.0,1.019494,0.197808,1.119335,0.502227,1.196503,0.657885
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,0.361132,0.162876,0.146811,1.0,0.595213,1.163702,0.322403,1.050408,0.555903,1.143232
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,0.113454,0.162357,0.148517,1.0,1.224616,0.708701,1.123572,0.51246,1.21039,0.680379


## Mutual Information

In [86]:
from sklearn.feature_selection import mutual_info_regression

x = train.iloc[:5000,:][feature_cols] #.copy()
y = train.iloc[:5000,:]['target'] #.copy()
mi_scores = mutual_info_regression(x, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [87]:
import plotly.figure_factory as ff
import plotly.express as px

top = 20
fig = px.bar(mi_scores, x=mi_scores.values[:top], y=mi_scores.index[:top])
fig.update_layout(
    title=f"Top {top} Strong Relationships Between Feature Columns and Target Column",
    xaxis_title="Relationship with Target",
    yaxis_title="Feature Columns",
    yaxis={'categoryorder':'total ascending'},
    colorway=["blue"]
)
fig.show()

In [88]:
ic(len(feature_cols));

ic| len(feature_cols): 318


# Train Model

In [89]:
# Clear memory
import gc
gc.collect()

2498

## Functions

In [90]:
# TODO: New approach for evaluation metrics (not fully implemented yet)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score

def evaluation_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    return rmse, mae, r2

## Creating Kfolds

In [91]:
from sklearn.model_selection import StratifiedKFold

folds = 5
train["kfold"] = -1
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=rnd_state)

# Adding folds to dataset
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(train,train["target"])):
    train.loc[valid_indicies, "kfold"] = fold

In [92]:
train['kfold'].value_counts()

3    200000
2    200000
4    200000
0    200000
1    200000
Name: kfold, dtype: int64

## Baseline Model

In [93]:
print("--")
# mlflow.autolog()
# mlflow.xgboost.autolog()
mlflow.start_run()
run = mlflow.active_run()

# Log text to a file under the run's root artifact directory
mlflow.log_text("Test", "comments.txt")

# Print run id
print("run_id: {}; status: {}\n".format(run.info.run_id, run.info.status))

--
run_id: 5121f4ce57374a46b5a95235f95d80d2; status: RUNNING



In [94]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import time

final_test_predictions = []
scores = []

for fold in range(folds):
    x_train = train[train.kfold != fold]  # .copy()
    x_valid = train[train.kfold == fold]  # .copy()
    x_test = test[feature_cols]  # .copy()

    y_train = x_train["target"]
    y_valid = x_valid["target"]

    x_train = x_train[feature_cols]
    x_valid = x_valid[feature_cols]

    params = {
        "eval_metric": "auc",
        "objective": "binary:logistic",
        "tree_method": "gpu_hist",
        "gpu_id": 0,
        # "n_jobs": n_cpu,
        "predictor": "gpu_predictor",
        "n_estimators": 10000,
        "learning_rate": 0.01063045229441343,
        "gamma": 0.24652519525750877,
        "max_depth": 4,
        "seed": rnd_state,
        "min_child_weight": 366,
        "subsample": 0.6423040816299684,
        "colsample_bytree": 0.7751264493218339,
        "colsample_bylevel": 0.8675692743597421,
        "use_label_encoder": False,
        "lambda": 0,
        "alpha": 10,
    }

    model = XGBClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc_valid = roc_auc_score(y_valid, preds_valid)
    scores.append(auc_valid)

    preds_test = model.predict_proba(x_test)[:, 1]
    final_test_predictions.append(preds_test)

    print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc_valid:.6f}", " AVG AUC:", np.mean(scores))

print("AVG AUC:", np.mean(scores))


Fold 0 , train: 0.869683 , valid: 0.858583
Fold 1 , train: 0.870319 , valid: 0.855754
Fold 2 , train: 0.869954 , valid: 0.857577
Fold 3 , train: 0.870421 , valid: 0.856007
Fold 4 , train: 0.870162 , valid: 0.856647
AVG AUC: 0.8569138102302041


In [95]:
# MLFlow: Get metrics
# TODO: Test mlflow.autolog()
mlflow.log_params(params)
# mlflow.log_metric("rmse", rmse)
# mlflow.log_metric("r2", r2)
# mlflow.log_metric("mae", mae)
mlflow.log_metric("AVG AUC", np.mean(scores))

# Save feature list for mlflow
# features =", ".join(str(elem) for elem in feature_cols)
features = str(feature_cols)

with open("features.txt", "w") as f:
    f.write(features)

mlflow.log_artifact("features.txt", artifact_path="features")

"""
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

# Register Experiment
if tracking_url_type_store != "file":
    # Register the model
    # There are other ways to use the Model Registry, which depends on the use case,
    # please refer to the doc for more information:
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="xgboost-model",
        registered_model_name="XGBClassifier",
    )

else:
    print('Store type: file')
    mlflow.xgboost.log_model(model, "model")
"""

print("Store type: file")
mlflow.xgboost.log_model(model, "model")


Store type: file


In [96]:
# End run and get status
mlflow.end_run()

print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))
print("--")

# Check for any active runs
print("Active run: {}".format(mlflow.active_run()))

run_id: 5121f4ce57374a46b5a95235f95d80d2; status: RUNNING
--
Active run: None


##  Submit Baseline Results

- 2021-10-20_submission_kmean-pca-fs17-xbg-nop.csv: 0.8569970265985022 | **0.85626**
- 2021-10-20_submission_kmean-3pca-fs17-xbg-nop.csv: 0.8568950821125828 | 0.85619
- 2021-10-20_submission_kmean-ALLpca-fs13-xbg-nop.csv: 0.8567778822323093 | not submitted
- 2021-10-20_submission_13fs-kmean-2pca-4nf-xbg-nop.csv: 0.8568443738048714 | not submitted
- 2021-10-20_submission_49fs-kmean-2pca-4nf-xbg-nop.csv: 0.8567888578426363 | 0.85599
- 2021-10-20_submission_13fs-kmean-1pca-4nf-poly-xbg-nop.csv: 0.8568639280432698 | 0.85615
- 2021-10-25_submission_f64d06226bb94f2fb2b33350b7a40692.csv: 0.8566269081302578 | 0.85600
- 2021-10-26_submission_e4bb062717c147ee84b6c5367c25d019.csv: 0.8543231035426082 | 0.85343
- 2021-10-26_submission_2eebfcd694b246da9f6f22da55753bf8.csv: 0.8567097047281612 | 0.85596
- 2021-10-26_submission_5121f4ce57374a46b5a95235f95d80d2.csv: 0.8569138102302041 | 0.85615

In [97]:
import numpy as np
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
now = now.strftime("%Y-%m-%d")

objective = str(run.info.run_id)

curr_submission_fn = f"{now}_submission_{objective}.csv"

sample_df['target'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_df.to_csv(PATH_SUB + curr_submission_fn, index=False)

print(curr_submission_fn)


2021-10-26_submission_5121f4ce57374a46b5a95235f95d80d2.csv


In [101]:
!kaggle competitions submit tabular-playground-series-oct-2021 -f {PATH_SUB+curr_submission_fn} -m {curr_submission_fn}

Successfully submitted to Tabular Playground Series - Oct 2021



  0%|          | 0.00/9.36M [00:00<?, ?B/s]
  0%|          | 8.00k/9.36M [00:00<08:07, 20.1kB/s]
  1%|          | 88.0k/9.36M [00:00<00:44, 216kB/s] 
  2%|▏         | 192k/9.36M [00:00<00:22, 431kB/s] 
  3%|▎         | 264k/9.36M [00:01<00:39, 244kB/s]
  3%|▎         | 312k/9.36M [00:01<00:37, 256kB/s]
  4%|▍         | 360k/9.36M [00:01<00:32, 292kB/s]
  4%|▍         | 408k/9.36M [00:01<00:30, 308kB/s]
  5%|▍         | 448k/9.36M [00:01<00:29, 316kB/s]
  5%|▌         | 488k/9.36M [00:01<00:28, 331kB/s]
  6%|▌         | 528k/9.36M [00:01<00:26, 344kB/s]
  6%|▌         | 568k/9.36M [00:02<00:26, 343kB/s]
  6%|▋         | 608k/9.36M [00:02<00:26, 341kB/s]
  7%|▋         | 648k/9.36M [00:02<00:25, 353kB/s]
  7%|▋         | 688k/9.36M [00:02<00:25, 361kB/s]
  8%|▊         | 728k/9.36M [00:02<00:24, 367kB/s]
  8%|▊         | 768k/9.36M [00:02<00:25, 359kB/s]
  8%|▊         | 808k/9.36M [00:02<00:25, 353kB/s]
  9%|▉         | 848k/9.36M [00:02<00:26, 335kB/s]
  9%|▉         | 888k/9.36M [00:

## Optimize Hyperparameter


In [118]:
# Clear memory
import gc
gc.collect()

465

In [119]:
import time
import optuna
from optuna.samplers import TPESampler
from catboost.utils import eval_metric
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [120]:
def objective(trial):
    time.sleep(2)
    scores = []
    # ic(scores)

    for fold in range(folds):
        x_train = train[train.kfold != fold]#.copy()
        x_valid = train[train.kfold == fold]#.copy()
        # x_test = test[feature_cols].copy()

        y_train = x_train["target"]
        y_valid = x_valid["target"]

        x_train = x_train[feature_cols]
        x_valid = x_valid[feature_cols]

        # param_list
        # XGBoost Parameters
        # tree_method_list = trial.suggest_categorical("tree_method",["exact", "approx", "hist"])
        tree_method_list = trial.suggest_categorical("tree_method", ["gpu_hist"])

        # booster: Consider only tree booster because it always outperforms the linear booster and thus the later is rarely used
        # booster_list = trial.suggest_categorical("booster", ["gbtree", "dart"])
        booster_list = trial.suggest_categorical("booster", ["gbtree"])

        n_estimators = trial.suggest_int("n_estimators", 3000, 15000) # 1000 - 10000
        max_depth = trial.suggest_int("max_depth", 4, 12) # 1-12
        min_child_weight = trial.suggest_int("min_child_weight", 100, 500) # 1, 25
        alpha_list = trial.suggest_int("alpha", 9, 25) # 1, 25
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1) # 0.01, 0.3, 0.01
        gamma_list = trial.suggest_float("gamma", 0.2, 0.4) # 0.01, 0.3, 0.01

        # scale_pos_weight: A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
        scale_pos_weight = trial.suggest_int("scale_pos_weight", 1, 30)

        subsample_list = trial.suggest_float("subsample", 0.1, 1.0) # 0.4, 1.0, 0.1
        colsample_bytree_list = trial.suggest_float("colsample_bytree", 0.1, 1.0) # 0.4, 1.0, 0.1
        colsample_bylevel_list = trial.suggest_float("colsample_bytree", 0.1, 1.0) # 0.5, 1.0, 0.1
        reg_lambda_list = trial.suggest_int("reg_lambda", 0, 5)

        # Classification
        # objective_list = trial.suggest_categorical("objective", ["binary:logistic","reg:logistic","binary:hinge","binary:logitraw"])
        objective_list = trial.suggest_categorical("objective", ["binary:logistic"])
        eval_metric_list = trial.suggest_categorical("eval_metric", ["auc"])

        params = {
            "objective": objective_list,
            "eval_metric": eval_metric_list,
            "booster": booster_list,
            "random_state": rnd_state,
            "tree_method": tree_method_list,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_child_weight": min_child_weight,
            "learning_rate": learning_rate,
            "gamma": gamma_list,
            "scale_pos_weight": scale_pos_weight,
            "subsample": subsample_list,
            "colsample_bytree": colsample_bytree_list,
            "colsample_bylevel": colsample_bylevel_list,
            "reg_lambda": reg_lambda_list,
            "alpha": alpha_list,
            "verbosity": 1,
            "use_label_encoder": False,
            "gpu_id": 0,
            # "n_jobs": n_cpu,
            "predictor": "gpu_predictor",
        }

        model = XGBClassifier(**params)
        model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

        preds_train = model.predict_proba(x_train)[:, 1]
        preds_valid = model.predict_proba(x_valid)[:, 1]
        auc_train = roc_auc_score(y_train, preds_train)
        auc_valid = roc_auc_score(y_valid, preds_valid)
        scores.append(auc_valid)

        avg_auc_score = np.mean(scores)

        # preds_test = model.predict_proba(x_test)[:, 1]
        # final_test_predictions.append(preds_test)

        print(f'Fold: {fold}, AUC Train: {auc_train:.6f}, AUC Valid: {auc_valid:.6f}, AUC Valid AVG: {avg_auc_score:.6f}')

    return avg_auc_score


In [121]:

sampler = TPESampler(seed=rnd_state)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)


[32m[I 2021-10-27 09:01:00,887][0m A new study created in memory with name: no-name-51636e7c-05f1-4524-a396-5405c44b9047[0m


Fold: 0, AUC Train: 0.990014, AUC Valid: 0.855864, AUC Valid AVG: 0.855864
