04c: Version without PCA

2021-10-25: 
  - Clean-up
  - Using useful features only to avoid overfitting (if possible)
  - Testing 6 clusters and poly degree 2 incl poly for clusters

In [1]:
# The rest of the modules are loaded when required.
# To ensure a standalone character (for easier reusability).

import os # for detecting CPU cores
import configparser # to load standard config and parameters
import pandas as pd
import numpy as np
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px

import mlflow
import mlflow.xgboost
from urllib.parse import urlparse

# Debugging
from icecream import ic

warnings.filterwarnings('ignore')
%load_ext watermark
%matplotlib inline

In [2]:
# importing the tensorflow package
import tensorflow as tf

# Checking GPU support
print(tf.test.is_built_with_cuda())  # True
print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))  # True
# print(tf.config.list_physical_devices('GPU'))


True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


In [3]:
import os
import configparser

# Load external config file
config = configparser.ConfigParser()
config.read("../resources/config.ini")

PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "02_baseline_models" # for Telegram messages

# Set global randome state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 8


In [4]:
import urllib, requests #for Telegram notifications

def send_telegram_message(message):
    """Sending messages to Telegram bot via requests.get()."""
    
    message = f"{FILENAME_NB}:\n{message}"

    # Using "try and except" to ensure that the notebook execution will not be stopped only because of problems with the bot.
    # Example: No network connection.
    # ISSUE: Be careful, an error messages will leak your Telegram Bot Token when uploaded to GitHub.
    try:
        url = 'https://api.telegram.org/bot%s/sendMessage?chat_id=%s&text=%s'%(token, chat_id, urllib.parse.quote_plus(message))
        _ = requests.get(url, timeout=10)
    
    except Exception as e:
        print('\n\nSending message to Telegram Bot was not successful.\n\n')
        print(e)
        
    return None

In [5]:
#train_df = pd.read_csv(PATH_DATA_RAW+'train.csv',index_col=0)
#test_df = pd.read_csv(PATH_DATA_RAW+'test.csv',index_col=0)

train_df = pd.read_pickle(PATH_DATA_INT+'train-opt.pkl')
test_df = pd.read_pickle(PATH_DATA_INT+'test-opt.pkl')

sample_df = pd.read_csv(PATH_DATA_RAW+'sample_submission.csv')

In [6]:
train_df.head()

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,...,0,1,0,0,0,0,0,0,0,1
1,1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,...,0,1,0,0,0,0,0,0,0,1
2,2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,...,0,0,0,1,1,0,0,0,0,1
3,3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,...,0,0,0,0,1,0,0,0,0,1
4,4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,...,0,1,1,0,1,0,0,1,0,1


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Columns: 287 entries, id to target
dtypes: float32(240), int32(1), int8(46)
memory usage: 963.2 MB


In [8]:
feature_cols = train_df.drop(['target','id'], axis=1).columns.tolist()
cnt_features = train_df.drop(['target', 'id'], axis=1).select_dtypes(exclude=['int32', 'int8']).columns.tolist()
cat_features = train_df.drop(['target', 'id'], axis=1).select_dtypes(exclude=['float32']).columns.tolist()

ic(len(feature_cols))
ic(len(cnt_features))
ic(len(cat_features));

ic| len(feature_cols): 285
ic| len(cnt_features): 240
ic| len(cat_features): 45


In [9]:
useful_features_set1 = (
    #pd.read_csv(PATH_DATA_INT + "features_selected_6way_all.csv")["feature"]
    pd.read_csv("features_selected_6way_all.csv")["feature"]
    .head(16)
    .tolist()
)

useful_features_set2 = (
    #pd.read_csv(PATH_DATA_INT + "features_selected_shap_all.csv")["feature"]
    pd.read_csv("features_selected_shap_all.csv")["feature"]
    .head(20)
    .tolist()
)

In [10]:
useful_features_set1 += useful_features_set2
useful_features_set = list(set(useful_features_set1))

In [11]:
# Only working with useful features
#feature_cols = useful_features_set
ic(len(feature_cols))


ic| len(feature_cols): 285


285

# Feature Engineering

In [12]:
train_df['mean_numeric'] = train_df[cnt_features].mean(axis=1)
train_df['std_numeric'] = train_df[cnt_features].std(axis=1)
train_df['min_numeric'] = train_df[cnt_features].min(axis=1)
train_df['max_numeric'] = train_df[cnt_features].max(axis=1)
train_df['sum_categoricals'] = train_df[cat_features].sum(axis=1)

test_df['mean_numeric'] = test_df[cnt_features].mean(axis=1)
test_df['std_numeric'] = test_df[cnt_features].std(axis=1)
test_df['min_numeric'] = test_df[cnt_features].min(axis=1)
test_df['max_numeric'] = test_df[cnt_features].max(axis=1)
test_df['sum_categoricals'] = test_df[cat_features].sum(axis=1)

In [13]:
feature_cols += ['mean_numeric', 'std_numeric','sum_categoricals']
ic(len(feature_cols));

ic| len(feature_cols): 288


In [14]:
train_df[feature_cols].head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f278,f279,f280,f281,f282,f283,f284,mean_numeric,std_numeric,sum_categoricals
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,0,0,0,0,0,0,0.233355,0.208569,17
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,0,0,0,0,0,0,0.22884,0.2085,16
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,1,1,0,0,0,0,0.220909,0.196083,13
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0,1,0,0,0,0,0.230246,0.211794,17
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,1,0,1,0,0,1,0,0.216252,0.20208,18


## KMeans Clustering

In [15]:
from sklearn.cluster import KMeans

n_clusters_1 = 6
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters_1)]
#kmeans = KMeans(n_clusters=n_clusters_1, n_init=50, max_iter=500, random_state=rnd_state)
kmeans = KMeans(n_clusters=n_clusters_1, init="k-means++", max_iter=500, random_state=rnd_state)

ic(n_clusters_1);
#ic(cluster_cols);

ic| n_clusters_1: 6


In [16]:
# cluster distance instead of cluster number

# train
X_cd = kmeans.fit_transform(train_df[useful_features_set])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=train_df.index)
train = train_df.join(X_cd)

# test
X_cd = kmeans.transform(test_df[useful_features_set])
X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=test_df.index)
test = test_df.join(X_cd)

In [17]:
feature_cols += cluster_cols
ic(len(feature_cols));

ic| len(feature_cols): 294


In [18]:
train[feature_cols].head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f284,mean_numeric,std_numeric,sum_categoricals,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,0.233355,0.208569,17,1.198423,0.550193,1.144446,1.14117,1.519762,0.733143
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,0.22884,0.2085,16,1.119533,0.344957,1.060747,1.057851,1.457725,0.596446
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,0.220909,0.196083,13,0.583228,1.130408,1.484626,0.447675,1.097732,1.030424
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0.230246,0.211794,17,1.144171,0.414889,1.086924,1.083061,1.476763,0.645714
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0,0.216252,0.20208,18,0.676058,1.149557,1.525712,0.567068,1.152543,1.246617


## Polynomial Features

- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [19]:
from sklearn.preprocessing import PolynomialFeatures


In [20]:
poly = PolynomialFeatures(degree=2, interaction_only=True)

X_poly = poly.fit_transform(train[useful_features_set])
T_poly = poly.transform(test[useful_features_set])

#X_poly = poly.fit_transform(train[feature_cols])
#T_poly = poly.transform(test[feature_cols])

poly_cols = [f"usefulf_cols_poly{i+1}" for i in range(X_poly.shape[1])]

X_poly_df = pd.DataFrame(X_poly, columns=poly_cols, index=train.index)
T_poly_df = pd.DataFrame(T_poly, columns=poly_cols, index=test.index)

train = pd.concat([train, X_poly_df], axis=1)
test = pd.concat([test, T_poly_df], axis=1)


In [21]:
#feature_cols += ["poly79", "poly82", "poly66", "poly32", "poly81", "poly80", "poly42", "poly78", "poly59"]
feature_cols += poly_cols
train.head()
train[feature_cols].head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,usefulf_cols_poly343,usefulf_cols_poly344,usefulf_cols_poly345,usefulf_cols_poly346,usefulf_cols_poly347,usefulf_cols_poly348,usefulf_cols_poly349,usefulf_cols_poly350,usefulf_cols_poly351,usefulf_cols_poly352
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0.138217,0.051206,0.22342,0.096293,0.105989,0.462448,0.199313,0.171325,0.073841,0.32218
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0.075958,0.062661,0.23347,0.095083,0.065589,0.244381,0.099527,0.201601,0.082104,0.305914
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0.101347,0.049371,0.075936,0.060192,0.140097,0.215482,0.170805,0.104971,0.083207,0.127979
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0.091528,0.033751,0.165336,0.088043,0.086367,0.423088,0.225298,0.156014,0.083079,0.406979
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0.017511,0.042983,0.196092,0.078494,0.014597,0.066591,0.026656,0.163455,0.065429,0.298492


In [22]:
poly = PolynomialFeatures(degree=2, interaction_only=True)

X_poly = poly.fit_transform(train[cluster_cols])
T_poly = poly.transform(test[cluster_cols])

poly_cols = [f"cluster_poly{i+1}" for i in range(X_poly.shape[1])]

X_poly_df = pd.DataFrame(X_poly, columns=poly_cols, index=train.index)
T_poly_df = pd.DataFrame(T_poly, columns=poly_cols, index=test.index)

train = pd.concat([train, X_poly_df], axis=1)
test = pd.concat([test, T_poly_df], axis=1)

feature_cols += poly_cols
ic(len(feature_cols));


ic| len(feature_cols): 668


In [23]:
train[feature_cols].head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,cluster_poly13,cluster_poly14,cluster_poly15,cluster_poly16,cluster_poly17,cluster_poly18,cluster_poly19,cluster_poly20,cluster_poly21,cluster_poly22
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0.629666,0.627863,0.836162,0.40337,1.306007,1.739286,0.839043,1.734306,0.836641,1.114203
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0.365912,0.364913,0.502853,0.205748,1.122112,1.546277,0.632678,1.542055,0.630951,0.869454
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,1.678233,0.506055,1.240884,1.1648,0.664629,1.629721,1.529795,0.491427,0.461295,1.131129
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0.450952,0.44935,0.612692,0.2679,1.177204,1.605128,0.701842,1.599423,0.699348,0.953567
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,1.753893,0.651877,1.324913,1.433057,0.865182,1.758448,1.901978,0.65357,0.706916,1.436779


## PCA

In [24]:
from sklearn.decomposition import PCA

pca = PCA()
X_pca = pca.fit_transform(train[useful_features_set])
T_pca = pca.transform(test[useful_features_set])

pca_cols = [f"PC{i+1}" for i in range(X_pca.shape[1])]

X_pca = pd.DataFrame(X_pca, columns=pca_cols, index=train.index)
T_pca = pd.DataFrame(T_pca, columns=pca_cols, index=test.index)

train = pd.concat([train, X_pca], axis=1)
test = pd.concat([test, T_pca], axis=1)


In [25]:
feature_cols += pca_cols
ic(len(feature_cols));

ic| len(feature_cols): 694


In [26]:
train[feature_cols].head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,-0.007449,0.041742,0.008015,0.008794,-0.056749,0.055197,-0.055726,-0.022102,-0.01423,-0.013506
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0.021862,0.02681,0.01957,0.02759,-0.010346,0.050788,-0.068806,-0.020384,-0.014595,-0.012869
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0.037561,-0.004055,-0.061239,-0.0338,-0.057732,-0.008028,-0.072553,-0.017812,-0.015997,-0.00809
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,-0.036675,0.139179,-0.058465,-0.032822,-0.02724,-0.037572,-0.056214,-0.018262,0.235932,0.003494
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,-0.003084,0.025693,-0.024514,-0.051906,-0.014463,0.037563,0.269032,-0.031772,-0.014744,-0.009106


## Mutual Information

In [27]:
from sklearn.feature_selection import mutual_info_regression

x = train.iloc[:50000,:][feature_cols] #.copy()
y = train.iloc[:50000,:]['target'] #.copy()
mi_scores = mutual_info_regression(x, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)


In [28]:
import plotly.figure_factory as ff
import plotly.express as px

top = 20
fig = px.bar(mi_scores, x=mi_scores.values[:top], y=mi_scores.index[:top])
fig.update_layout(
    title=f"Top {top} Strong Relationships Between Feature Columns and Target Column",
    xaxis_title="Relationship with Target",
    yaxis_title="Feature Columns",
    yaxis={'categoryorder':'total ascending'},
    colorway=["blue"]
)
fig.show()


In [29]:
ic(len(feature_cols));

ic| len(feature_cols): 694


# Train Model

In [30]:
# Clear memory
import gc
gc.collect()

282

## Functions

In [31]:
# TODO: New approach for evaluation metrics (not fully implemented yet)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score

def evaluation_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)

    print("RMSE: %s" % rmse)
    print("MAE: %s" % mae)
    print("R2: %s" % r2)

    return rmse, mae, r2

## Creating Kfolds

In [32]:
from sklearn.model_selection import StratifiedKFold

folds = 7
train["kfold"] = -1
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=rnd_state)

# Adding folds to dataset
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(train,train["target"])):
    train.loc[valid_indicies, "kfold"] = fold

In [33]:
train['kfold'].value_counts()

0    142858
5    142857
2    142857
3    142857
6    142857
1    142857
4    142857
Name: kfold, dtype: int64

## Baseline Model

In [51]:
print("--")
# mlflow.autolog()
# mlflow.xgboost.autolog()
mlflow.start_run()
run = mlflow.active_run()

# Log text to a file under the run's root artifact directory
mlflow.log_text("Test", "comments.txt")

# Print run id
print("run_id: {}; status: {}\n".format(run.info.run_id, run.info.status))

--
run_id: cca52d823b44460d95a4bf475a4c2cb6; status: RUNNING



In [52]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

final_test_predictions = []
scores = []

for fold in range(folds):
    gc.collect()
    x_train = train[train.kfold != fold]  # .copy()
    x_valid = train[train.kfold == fold]  # .copy()
    x_test = test[feature_cols]  # .copy()

    y_train = x_train["target"]
    y_valid = x_valid["target"]

    x_train = x_train[feature_cols]
    x_valid = x_valid[feature_cols]

    params = {
        "eval_metric": "auc",
        "objective": "binary:logistic",
        "tree_method": "gpu_hist",
        #"tree_method": "hist",
        "gpu_id": 0,
        #"n_jobs": n_cpu,
        "predictor": "gpu_predictor",
        "n_estimators": 10000,
        "learning_rate": 0.01063045229441343,
        "gamma": 0.24652519525750877,
        "max_depth": 4,
        "seed": rnd_state,
        "min_child_weight": 366,
        "subsample": 0.6423040816299684,
        "colsample_bytree": 0.7751264493218339,
        "colsample_bylevel": 0.8675692743597421,
        "use_label_encoder": False,
        "lambda": 0,
        "alpha": 10,
    }

    model = XGBClassifier(**params)
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False, early_stopping_rounds=200)

    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc_valid = roc_auc_score(y_valid, preds_valid)
    scores.append(auc_valid)

    preds_test = model.predict_proba(x_test)[:, 1]
    final_test_predictions.append(preds_test)

    print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc_valid:.6f}", " AVG AUC:", np.mean(scores))

baseline_auc_score = np.mean(scores)

print("AVG AUC:", baseline_auc_score)


Fold 0 , train: 0.869491 , valid: 0.859991  AVG AUC: 0.8599908229307615
Fold 1 , train: 0.870464 , valid: 0.854134  AVG AUC: 0.8570626300763118
Fold 2 , train: 0.870029 , valid: 0.856947  AVG AUC: 0.8570239635437513
Fold 3 , train: 0.870213 , valid: 0.856027  AVG AUC: 0.8567746176216864
Fold 4 , train: 0.869919 , valid: 0.856494  AVG AUC: 0.8567184226501089
Fold 5 , train: 0.869793 , valid: 0.856146  AVG AUC: 0.8566230921088716
Fold 6 , train: 0.869839 , valid: 0.856803  AVG AUC: 0.8566487584507287
AVG AUC: 0.8566487584507287


In [53]:
# MLFlow: Get metrics
# TODO: Test mlflow.autolog()
mlflow.log_params(params)
# mlflow.log_metric("rmse", rmse)
# mlflow.log_metric("r2", r2)
# mlflow.log_metric("mae", mae)
mlflow.log_metric("AVG AUC", np.mean(scores))

# Save feature list for mlflow
# features =", ".join(str(elem) for elem in feature_cols)
features = str(feature_cols)

with open("features.txt", "w") as f:
    f.write(features)

mlflow.log_artifact("features.txt", artifact_path="features")

"""
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

# Register Experiment
if tracking_url_type_store != "file":
    # Register the model
    # There are other ways to use the Model Registry, which depends on the use case,
    # please refer to the doc for more information:
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="xgboost-model",
        registered_model_name="XGBClassifier",
    )

else:
    print('Store type: file')
    mlflow.xgboost.log_model(model, "model")
"""

print("Store type: file")
mlflow.xgboost.log_model(model, "model")


Store type: file


In [54]:
# End run and get status
mlflow.end_run()

print("run_id: {}; status: {}".format(run.info.run_id, run.info.status))
print("--")

# Check for any active runs
print("Active run: {}".format(mlflow.active_run()))

run_id: cca52d823b44460d95a4bf475a4c2cb6; status: RUNNING
--
Active run: None


In [55]:
message = f'XGB Baseline model finished. AVG AUV: {baseline_auc_score}'
send_telegram_message(message)

##  Submit Baseline Results

- 2021-10-20_submission_kmean-pca-fs17-xbg-nop.csv: 0.8569970265985022 | **0.85626**
- 2021-10-20_submission_kmean-3pca-fs17-xbg-nop.csv: 0.8568950821125828 | 0.85619
- 2021-10-20_submission_kmean-ALLpca-fs13-xbg-nop.csv: 0.8567778822323093 | not submitted
- 2021-10-20_submission_13fs-kmean-2pca-4nf-xbg-nop.csv: 0.8568443738048714 | not submitted
- 2021-10-20_submission_49fs-kmean-2pca-4nf-xbg-nop.csv: 0.8567888578426363 | 0.85599
- 2021-10-20_submission_13fs-kmean-1pca-4nf-poly-xbg-nop.csv: 0.8568639280432698 | 0.85615
- 2021-10-25_submission_f64d06226bb94f2fb2b33350b7a40692.csv: 0.8566269081302578 | 0.85600
- 2021-10-26_submission_e4bb062717c147ee84b6c5367c25d019.csv: 0.8543231035426082 | 0.85343
- 2021-10-26_submission_2eebfcd694b246da9f6f22da55753bf8.csv: 0.8567097047281612 | 0.85596
- 2021-10-26_submission_5121f4ce57374a46b5a95235f95d80d2.csv: 0.8569138102302041 | 0.85615
- 2021-10-29_submission_d76c94d9ea504d61bccb4762b4ace306.csv: 0.8567225314111455 | 0.85598
- 2021-10-30_submission_SHAP_features_only_dc19bfab65cc4e70b5626ef59a2eca00.csv: 0.8432140469452438 | 0.84184
- 2021-10-30_submission_SHAP+6way_features_sep_202590310d454132aa124c58635f507a.csv: 0.8567704500416857 | 0.85591

In [56]:
import numpy as np
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
now = now.strftime("%Y-%m-%d")

mlflow_run_id = str(run.info.run_id)
objective = "SHAP+6way_features_sep"

curr_submission_fn = f"{now}_submission_{objective}_{mlflow_run_id}.csv"

sample_df['target'] = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_df.to_csv(PATH_SUB + curr_submission_fn, index=False)

print(curr_submission_fn)


2021-10-30_submission_SHAP+6way_features_sep_cca52d823b44460d95a4bf475a4c2cb6.csv


In [57]:
!kaggle competitions submit tabular-playground-series-oct-2021 -f {PATH_SUB+curr_submission_fn} -m {curr_submission_fn}

Successfully submitted to Tabular Playground Series - Oct 2021



  0%|          | 0.00/9.36M [00:00<?, ?B/s]
  0%|          | 8.00k/9.36M [00:00<09:08, 17.9kB/s]
  1%|          | 112k/9.36M [00:00<00:36, 265kB/s]  
  2%|▏         | 208k/9.36M [00:00<00:22, 424kB/s]
  3%|▎         | 280k/9.36M [00:00<00:23, 412kB/s]
  4%|▎         | 344k/9.36M [00:01<00:24, 383kB/s]
  4%|▍         | 400k/9.36M [00:01<00:24, 387kB/s]
  5%|▍         | 448k/9.36M [00:01<00:24, 384kB/s]
  5%|▌         | 496k/9.36M [00:01<00:24, 382kB/s]
  6%|▌         | 544k/9.36M [00:01<00:25, 366kB/s]
  6%|▌         | 592k/9.36M [00:01<00:24, 373kB/s]
  7%|▋         | 632k/9.36M [00:01<00:24, 372kB/s]
  7%|▋         | 672k/9.36M [00:01<00:24, 372kB/s]
  7%|▋         | 712k/9.36M [00:02<00:24, 368kB/s]
  8%|▊         | 752k/9.36M [00:02<00:24, 366kB/s]
 10%|▉         | 920k/9.36M [00:02<00:12, 723kB/s]
 10%|█         | 0.98M/9.36M [00:02<00:17, 516kB/s]
 11%|█         | 1.04M/9.36M [00:02<00:18, 462kB/s]
 12%|█▏        | 1.09M/9.36M [00:02<00:20, 424kB/s]
 12%|█▏        | 1.14M/9.36M [

## Optimize Hyperparameter


In [62]:
# Clear memory
import gc
gc.collect()

189

In [63]:
import time
import optuna
from optuna.samplers import TPESampler
from catboost.utils import eval_metric
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import time

In [64]:
def objective(trial):
    scores = []
    # ic(scores)

    for fold in range(folds):
        gc.collect()
        x_train = train[train.kfold != fold]#.copy()
        x_valid = train[train.kfold == fold]#.copy()
        # x_test = test[feature_cols].copy()

        y_train = x_train["target"]
        y_valid = x_valid["target"]

        x_train = x_train[feature_cols]
        x_valid = x_valid[feature_cols]

        # param_list
        # XGBoost Parameters
        # tree_method_list = trial.suggest_categorical("tree_method",["exact", "approx", "hist"])
        tree_method_list = trial.suggest_categorical("tree_method", ["gpu_hist"])

        # booster: Consider only tree booster because it always outperforms the linear booster and thus the later is rarely used
        # booster_list = trial.suggest_categorical("booster", ["gbtree", "dart"])
        booster_list = trial.suggest_categorical("booster", ["gbtree"])

        n_estimators = trial.suggest_int("n_estimators", 3000, 15000) # 1000 - 10000
        max_depth = trial.suggest_int("max_depth", 4, 12) # 1-12
        min_child_weight = trial.suggest_int("min_child_weight", 100, 500) # 1, 25
        alpha_list = trial.suggest_int("alpha", 9, 25) # 1, 25
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1) # 0.01, 0.3, 0.01
        gamma_list = trial.suggest_float("gamma", 0.2, 0.4) # 0.01, 0.3, 0.01

        # scale_pos_weight: A value greater than 0 should be used in case of high class imbalance as it helps in faster convergence.
        scale_pos_weight = trial.suggest_int("scale_pos_weight", 1, 30)

        subsample_list = trial.suggest_float("subsample", 0.1, 1.0) # 0.4, 1.0, 0.1
        colsample_bytree_list = trial.suggest_float("colsample_bytree", 0.1, 1.0) # 0.4, 1.0, 0.1
        colsample_bylevel_list = trial.suggest_float("colsample_bytree", 0.1, 1.0) # 0.5, 1.0, 0.1
        reg_lambda_list = trial.suggest_int("reg_lambda", 0, 5)

        # Classification
        # objective_list = trial.suggest_categorical("objective", ["binary:logistic","reg:logistic","binary:hinge","binary:logitraw"])
        objective_list = trial.suggest_categorical("objective", ["binary:logistic"])
        eval_metric_list = trial.suggest_categorical("eval_metric", ["auc"])

        params = {
            "objective": objective_list,
            "eval_metric": eval_metric_list,
            "booster": booster_list,
            "random_state": rnd_state,
            "tree_method": tree_method_list,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_child_weight": min_child_weight,
            "learning_rate": learning_rate,
            "gamma": gamma_list,
            "scale_pos_weight": scale_pos_weight,
            "subsample": subsample_list,
            "colsample_bytree": colsample_bytree_list,
            "colsample_bylevel": colsample_bylevel_list,
            "reg_lambda": reg_lambda_list,
            "alpha": alpha_list,
            "verbosity": 1,
            "use_label_encoder": False,
            "gpu_id": 0,
            #"n_jobs": n_cpu,
            "predictor": "gpu_predictor",
        }

        model = XGBClassifier(**params)
        model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False, early_stopping_rounds=200)

        preds_train = model.predict_proba(x_train)[:, 1]
        preds_valid = model.predict_proba(x_valid)[:, 1]
        auc_train = roc_auc_score(y_train, preds_train)
        auc_valid = roc_auc_score(y_valid, preds_valid)
        scores.append(auc_valid)

        avg_auc_score = np.mean(scores)

        # preds_test = model.predict_proba(x_test)[:, 1]
        # final_test_predictions.append(preds_test)

        print(f'Fold: {fold}, AUC Train: {auc_train:.6f}, AUC Valid: {auc_valid:.6f}, AUC Valid AVG: {avg_auc_score:.6f}')

        # Stop earlier when avg score is below (adjusted) baseline score
        baseline_auc_score = 0.8566487584507287
        if avg_auc_score < baseline_auc_score-0.0003:
            print('-----')
            print("Early stop. AVG score is below baseline score. No improvement expected.")
            print('-----')
            
            return np.nan
    
    print('-----')
    
    return avg_auc_score


In [65]:

sampler = TPESampler(seed=rnd_state)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=200)


[32m[I 2021-10-31 11:36:40,674][0m A new study created in memory with name: no-name-c80ff474-1a41-4cd2-bdea-60c43b8270cd[0m


Fold: 0, AUC Train: 0.912337, AUC Valid: 0.858935, AUC Valid AVG: 0.858935


[33m[W 2021-10-31 11:51:27,067][0m Trial 0 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.916648, AUC Valid: 0.853304, AUC Valid AVG: 0.856120
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.913188, AUC Valid: 0.858764, AUC Valid AVG: 0.858764


[33m[W 2021-10-31 12:04:27,037][0m Trial 1 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.907696, AUC Valid: 0.853241, AUC Valid AVG: 0.856002
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.870817, AUC Valid: 0.859059, AUC Valid AVG: 0.859059


[33m[W 2021-10-31 12:09:42,142][0m Trial 2 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.874269, AUC Valid: 0.853257, AUC Valid AVG: 0.856158
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.861899, AUC Valid: 0.856816, AUC Valid AVG: 0.856816


[33m[W 2021-10-31 12:12:22,336][0m Trial 3 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.864087, AUC Valid: 0.850772, AUC Valid AVG: 0.853794
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.900299, AUC Valid: 0.858424, AUC Valid AVG: 0.858424


[33m[W 2021-10-31 12:21:49,894][0m Trial 4 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.907343, AUC Valid: 0.852926, AUC Valid AVG: 0.855675
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.870818, AUC Valid: 0.857951, AUC Valid AVG: 0.857951


[33m[W 2021-10-31 12:27:20,739][0m Trial 5 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.869839, AUC Valid: 0.852214, AUC Valid AVG: 0.855082
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.875341, AUC Valid: 0.859652, AUC Valid AVG: 0.859652
Fold: 1, AUC Train: 0.879658, AUC Valid: 0.854161, AUC Valid AVG: 0.856906
Fold: 2, AUC Train: 0.879747, AUC Valid: 0.856748, AUC Valid AVG: 0.856854
Fold: 3, AUC Train: 0.880350, AUC Valid: 0.855783, AUC Valid AVG: 0.856586
Fold: 4, AUC Train: 0.879113, AUC Valid: 0.856282, AUC Valid AVG: 0.856525
Fold: 5, AUC Train: 0.878840, AUC Valid: 0.855878, AUC Valid AVG: 0.856417


[32m[I 2021-10-31 13:01:31,731][0m Trial 6 finished with value: 0.856431293086664 and parameters: {'tree_method': 'gpu_hist', 'booster': 'gbtree', 'n_estimators': 7664, 'max_depth': 6, 'min_child_weight': 432, 'alpha': 15, 'learning_rate': 0.03528410587186427, 'gamma': 0.30853921663164974, 'scale_pos_weight': 5, 'subsample': 0.8219772826786357, 'colsample_bytree': 0.16709557931179375, 'reg_lambda': 5, 'objective': 'binary:logistic', 'eval_metric': 'auc'}. Best is trial 6 with value: 0.856431293086664.[0m


Fold: 6, AUC Train: 0.879077, AUC Valid: 0.856515, AUC Valid AVG: 0.856431
-----
Fold: 0, AUC Train: 0.860781, AUC Valid: 0.856788, AUC Valid AVG: 0.856788


[33m[W 2021-10-31 13:04:32,034][0m Trial 7 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.863085, AUC Valid: 0.851565, AUC Valid AVG: 0.854176
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.903619, AUC Valid: 0.857905, AUC Valid AVG: 0.857905


[33m[W 2021-10-31 13:13:31,698][0m Trial 8 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.906434, AUC Valid: 0.852233, AUC Valid AVG: 0.855069
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.879902, AUC Valid: 0.857494, AUC Valid AVG: 0.857494


[33m[W 2021-10-31 13:18:25,433][0m Trial 9 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.887448, AUC Valid: 0.851565, AUC Valid AVG: 0.854529
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.891751, AUC Valid: 0.856944, AUC Valid AVG: 0.856944


[33m[W 2021-10-31 13:22:01,313][0m Trial 10 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.897111, AUC Valid: 0.851176, AUC Valid AVG: 0.854060
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.863874, AUC Valid: 0.857972, AUC Valid AVG: 0.857972


[33m[W 2021-10-31 13:25:44,647][0m Trial 11 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.866113, AUC Valid: 0.852573, AUC Valid AVG: 0.855273
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.938649, AUC Valid: 0.858656, AUC Valid AVG: 0.858656


[33m[W 2021-10-31 13:45:08,033][0m Trial 12 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.940390, AUC Valid: 0.852820, AUC Valid AVG: 0.855738
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.870513, AUC Valid: 0.858427, AUC Valid AVG: 0.858427


[33m[W 2021-10-31 13:48:39,431][0m Trial 13 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.874174, AUC Valid: 0.852832, AUC Valid AVG: 0.855630
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.872020, AUC Valid: 0.859227, AUC Valid AVG: 0.859227
Fold: 1, AUC Train: 0.876476, AUC Valid: 0.853744, AUC Valid AVG: 0.856486
Fold: 2, AUC Train: 0.876567, AUC Valid: 0.856531, AUC Valid AVG: 0.856501


[33m[W 2021-10-31 14:06:39,537][0m Trial 14 failed, because the objective function returned nan.[0m


Fold: 3, AUC Train: 0.874739, AUC Valid: 0.855392, AUC Valid AVG: 0.856224
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.875261, AUC Valid: 0.858156, AUC Valid AVG: 0.858156


[33m[W 2021-10-31 14:10:06,150][0m Trial 15 failed, because the objective function returned nan.[0m


Fold: 1, AUC Train: 0.879550, AUC Valid: 0.852443, AUC Valid AVG: 0.855300
-----
Early stop. AVG score is below baseline score. No improvement expected.
-----
Fold: 0, AUC Train: 0.879319, AUC Valid: 0.859293, AUC Valid AVG: 0.859293
