In [1]:
# The remaining modules are imported in the cells that use them,
# so that each section stays self-contained and is easier to reuse.

import os # for detecting CPU cores
import configparser # to load standard config and parameters
import pandas as pd
import numpy as np
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px

# Debugging
from icecream import ic

warnings.filterwarnings('ignore')
%load_ext watermark
%matplotlib inline

In [2]:
import os
import configparser

# Load external config file.
# ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means the config was not
# loaded. Fail here with a clear message instead of a KeyError further down.
CONFIG_FILE = "../resources/config.ini"
config = configparser.ConfigParser()
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Config file not found or unreadable: {CONFIG_FILE}")

# Data, report, model and submission folders
PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot (credentials come from the config file, not from the notebook)
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "02_baseline_models" # for Telegram messages

# Set global random state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)

Number of CPUs used: 16


In [3]:
# Read the competition CSVs from the raw-data folder. For train and test the
# first CSV column becomes the row index; the sample submission keeps the
# default index.
train_df, test_df = (
    pd.read_csv(PATH_DATA_RAW + csv_name, index_col=0)
    for csv_name in ('train.csv', 'test.csv')
)
sample_df = pd.read_csv(PATH_DATA_RAW + 'sample_submission.csv')

In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,0,1,0,0,0,0,0,0,0,1
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,0,1,0,0,0,0,0,0,0,1
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0,0,0,1,1,0,0,0,0,1
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,0,0,0,0,1,0,0,0,0,1
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,0,1,1,0,1,0,0,1,0,1


In [5]:
# Per-column memory footprint of the training frame in MB (deep=True also
# measures the contents of object columns). The total is kept as the
# baseline for the reduction report further down.
BYTES_PER_MB = 1024 ** 2
memory_usage = train_df.memory_usage(deep=True) / BYTES_PER_MB
start_mem = memory_usage.sum()
start_mem

2189.63623046875

In [6]:
# Split the predictors by dtype: every column that is not int64 counts as
# continuous, every column that is not float64 counts as categorical.
predictors = train_df.drop(columns=['target'])

feature_cols = list(predictors.columns)
cnt_features = list(predictors.select_dtypes(exclude=['int64']).columns)
cat_features = list(predictors.select_dtypes(exclude=['float64']).columns)

ic(len(feature_cols))
ic(len(cnt_features))
ic(len(cat_features));

ic| len(feature_cols): 285
ic| len(cnt_features): 240
ic| len(cat_features): 45


In [115]:
# Take the first 13 entries of the 'Feature' column from the pre-computed
# feature-selection file; these are the columns KMeans is fitted on below.
selected_features_df = pd.read_csv(PATH_DATA_INT+'features_selected_6way_140.csv')
useful_features = selected_features_df['Feature'].iloc[:13].tolist()
ic(len(useful_features));

ic| len(useful_features): 50


In [116]:
# Downcast both frames the same way: continuous columns to float32,
# categorical columns to uint8.
for frame in (train_df, test_df):
    frame[cnt_features] = frame[cnt_features].astype('float32')
    frame[cat_features] = frame[cat_features].astype('uint8')

In [117]:
# Per-column memory footprint (MB) of the training frame in its current
# state; the total is compared against `start_mem` in the next cell.
BYTES_PER_MB = 1024 ** 2
memory_usage = train_df.memory_usage(deep=True) / BYTES_PER_MB
end_mem = memory_usage.sum()
end_mem

1005.9515151977539

In [118]:
# Report the saving relative to the footprint stored in `start_mem`.
saved_pct = 100 * (start_mem - end_mem) / start_mem
print(
    "Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.2f}% reduction)".format(
        start_mem, end_mem, saved_pct
    )
)

Mem. usage decreased from 2189.64 MB to 1005.95 MB (54.06% reduction)


# Transform Continuous Data

- TODO: Test power transformation after clustering

In [119]:
# Disabled experiment: Yeo-Johnson power transform of the continuous features.
# The code is parked inside a bare string literal, so nothing below runs; the
# cell only echoes the string back as its output value.
# NOTE(review): if this is re-enabled, the final pd.concat should receive
# X_pt_df (the DataFrame built above) rather than X_pt, which is the raw
# array returned by fit_transform.
"""
from sklearn.preprocessing import PowerTransformer

power = PowerTransformer(method='yeo-johnson', standardize=True)
X_pt = power.fit_transform(train_df[cnt_features])

columns = train_df[cnt_features].columns
X_pt_df = pd.DataFrame(X_pt, columns=columns, index=train_df.index)
ic(X_pt_df.shape)

train_df = pd.concat([X_pt, train_df[cat_features], train_df[["target"]]], axis=1)
ic(train_df.shape);
"""

'\nfrom sklearn.preprocessing import PowerTransformer\n\npower = PowerTransformer(method=\'yeo-johnson\', standardize=True)\nX_pt = power.fit_transform(train_df[cnt_features])\n\ncolumns = train_df[cnt_features].columns\nX_pt_df = pd.DataFrame(X_pt, columns=columns, index=train_df.index)\nic(X_pt_df.shape)\n\ntrain_df = pd.concat([X_pt, train_df[cat_features], train_df[["target"]]], axis=1)\nic(train_df.shape);\n'

# Feature Engineering

## KMeans Clustering

In [120]:
from sklearn.cluster import KMeans

# One output column per centroid, named cluster1 .. cluster<n_clusters_1>.
n_clusters_1 = 6
cluster_cols = [f"cluster{idx}" for idx in range(1, n_clusters_1 + 1)]

# Seeded estimator; it is only configured here and fitted in the next cell.
kmeans = KMeans(
    n_clusters=n_clusters_1,
    n_init=50,
    max_iter=500,
    random_state=rnd_state,
)

ic(n_clusters_1)
ic(cluster_cols);

ic| n_clusters_1: 10
ic| cluster_cols: ['cluster1',
                   'cluster2',
                   'cluster3',
                   'cluster4',
                   'cluster5',
                   'cluster6',
                   'cluster7',
                   'cluster8',
                   'cluster9',
                   'cluster10']


In [121]:
# Use the distance to every centroid as features, instead of the label of
# the assigned cluster.

# train: fit the centroids on the training rows and append the distances
X_cd = pd.DataFrame(
    kmeans.fit_transform(train_df[useful_features]),
    index=train_df.index,
    columns=cluster_cols,
)
train = train_df.join(X_cd)

# test: reuse the centroids fitted above (transform only, no refit)
X_cd = pd.DataFrame(
    kmeans.transform(test_df[useful_features]),
    index=test_df.index,
    columns=cluster_cols,
)
test = test_df.join(X_cd)

In [122]:
# Register the cluster-distance columns as model features. Only names that
# are not already listed are appended, so re-running this cell does not put
# duplicate column names into `feature_cols`.
feature_cols += [col for col in cluster_cols if col not in feature_cols]
train.head()

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,cluster9,cluster10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.205979,0.410993,0.176775,0.223581,0.423543,0.47614,0.41359,0.612021,0.534873,0.147295,...,1.652053,1.175303,0.694005,1.248848,1.195834,1.536456,1.170066,1.589425,1.244636,1.322187
1,0.181004,0.473119,0.011734,0.213657,0.619678,0.441593,0.230407,0.686013,0.281971,0.238509,...,1.608058,1.245062,0.596556,1.157909,1.181327,1.596651,1.249077,1.530442,1.16491,1.259687
2,0.182583,0.307431,0.32595,0.207116,0.605699,0.309695,0.493337,0.751107,0.536272,0.286813,...,0.483898,1.276153,1.543081,1.202257,1.176569,0.791505,1.276879,0.668768,1.190399,1.110147
3,0.18024,0.494592,0.008367,0.22358,0.760618,0.439211,0.432055,0.776147,0.483958,0.260886,...,1.269966,0.754572,1.173282,0.600388,1.530689,1.251395,0.761569,1.166801,1.559918,0.783079
4,0.177172,0.495513,0.014263,0.548819,0.625396,0.562493,0.117158,0.561255,0.077115,0.158321,...,1.366215,1.682124,1.278935,1.619867,0.774782,1.351643,1.686304,1.27528,0.834744,1.693265


In [123]:
# Sanity check: number of feature columns currently registered.
ic(len(feature_cols))
#ic(feature_cols);

ic| len(feature_cols): 10


10

In [124]:
# Disabled plot: KDE of the cluster-distance columns. The code sits inside a
# bare string literal, so it is not executed; the cell only echoes the string.
"""
sns.set_style('darkgrid')
fig = plt.figure(figsize = (10,5))
sns.kdeplot(data=train[cluster_cols])
plt.show()
"""

"\nsns.set_style('darkgrid')\nfig = plt.figure(figsize = (10,5))\nsns.kdeplot(data=train[cluster_cols])\nplt.show()\n"

## Add New Features

In [125]:
# Disabled experiment: three ratio features built from pairs of
# cluster-distance columns. The code sits inside a bare string literal, so it
# is not executed; the cell only echoes the string.
# NOTE(review): if re-enabled, `feature_cols += new_features` appends again on
# every re-run of the cell, which would duplicate the new column names.
"""
def add_feature(df):
    df["new_f1"] = (df["cluster1"])/(df["cluster3"])
    df["new_f2"] = (df["cluster5"])/(df["cluster2"])
    df["new_f3"] = (df["cluster6"])/(df["cluster4"])
    return df

new_features = ["new_f1","new_f2","new_f3"]
train = add_feature(train)
test = add_feature(test)

feature_cols += new_features

ic(len(feature_cols))
#ic(feature_cols);
"""


'\ndef add_feature(df):\n    df["new_f1"] = (df["cluster1"])/(df["cluster3"])\n    df["new_f2"] = (df["cluster5"])/(df["cluster2"])\n    df["new_f3"] = (df["cluster6"])/(df["cluster4"])\n    return df\n\nnew_features = ["new_f1","new_f2","new_f3"]\ntrain = add_feature(train)\ntest = add_feature(test)\n\nfeature_cols += new_features\n\nic(len(feature_cols))\n#ic(feature_cols);\n'

## Mutual Information

In [126]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each feature and the target on the
# first rows only, to keep the estimate cheap.
MI_SAMPLE_ROWS = 5000
x = train.iloc[:MI_SAMPLE_ROWS][feature_cols].copy()
y = train.iloc[:MI_SAMPLE_ROWS]['target'].copy()

# `target` is a 0/1 class label, so the classification estimator is the
# matching one; mutual_info_regression would treat the labels as a continuous
# variable. The estimator adds small random noise to continuous features, so
# the seed is fixed to make the ranking reproducible between runs.
mi_scores = mutual_info_classif(x, y, random_state=rnd_state)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [127]:
import plotly.figure_factory as ff
import plotly.express as px

top = 20
# Slice once and plot only that slice. This avoids handing Plotly Express the
# full-length `mi_scores` as the data frame together with x/y arrays that
# were truncated to `top` entries (inputs of different lengths).
top_scores = mi_scores.head(top)
fig = px.bar(x=top_scores.values, y=top_scores.index)
fig.update_layout(
    # Use the number of bars actually drawn; there may be fewer than `top` features.
    title=f"Top {len(top_scores)} Strong Relationships Between Feature Columns and Target Column",
    xaxis_title="Relationship with Target",
    yaxis_title="Feature Columns",
    yaxis={'categoryorder':'total ascending'},
    colorway=["blue"]
)
fig.show()

# Baseline Score

In [128]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Baseline on `train_df` (the frame without the joined cluster-distance
# columns), evaluated on a stratified 67/33 hold-out split.
y = train_df["target"]
X = train_df.drop(columns=["target"])

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=rnd_state,
)

# Fixed hyperparameters for the baseline model (trained on GPU device 0:1).
baseline_params = dict(
    random_seed=rnd_state,
    verbose=False,
    task_type="GPU",
    devices="0:1",
    eval_metric="AUC",
    objective="CrossEntropy",
    learning_rate=0.03638707926890429,
    iterations=2800,
    depth=6,
    l2_leaf_reg=7.796036733526901,
    boosting_type="Plain",
)
clf = CatBoostClassifier(**baseline_params)
_ = clf.fit(X_train, y_train)

# ROC AUC is computed from column 1 of predict_proba (the positive class).
train_proba = clf.predict_proba(X_train)[:, 1]
valid_proba = clf.predict_proba(X_valid)[:, 1]
train_roc_auc_score = roc_auc_score(y_train, train_proba)
valid_roc_auc_scoce = roc_auc_score(y_valid, valid_proba)

print(train_roc_auc_score) # 0.8617102921465771 (fs18, no_new_f), same with fs16, and 3 new_f
print(valid_roc_auc_scoce) # 0.8558234199478256 (fs18, no_new_f), same with fs16, and 3 new_f


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

KeyboardInterrupt: 

# Creating K-Folds

In [None]:
from sklearn.model_selection import StratifiedKFold

folds = 5
train["kfold"] = -1  # -1 marks rows that have not been assigned to a fold yet
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=rnd_state)

# Adding folds to dataset.
# kf.split() yields positional row numbers, not index labels, so the
# assignment must be positional as well (`iloc`). Label-based `.loc` only
# gives the same result when the frame's index is exactly 0..n-1.
kfold_pos = train.columns.get_loc("kfold")
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train["target"])):
    train.iloc[valid_idx, kfold_pos] = fold

In [None]:
# Number of rows assigned to each fold.
train['kfold'].value_counts()

3    200000
2    200000
4    200000
0    200000
1    200000
Name: kfold, dtype: int64

# Optimizing Hyperparameters

In [None]:
import time
import optuna
from optuna.samplers import TPESampler
from catboost.utils import eval_metric
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [None]:

def objective(trial):
    """Optuna objective: mean validation ROC AUC of CatBoost across the folds.

    Samples one hyperparameter set from ``trial``, then fits a GPU
    ``CatBoostClassifier`` once per fold of ``train`` (as labelled by its
    ``kfold`` column), with early stopping evaluated on the held-out fold.
    The train and validation AUC of every fold are printed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold validation ROC AUC scores.

    Notes
    -----
    Reads the notebook-level names ``train``, ``folds``, ``feature_cols`` and
    ``rnd_state``.
    """
    # The search space does not depend on the fold, so it is sampled once per
    # trial instead of once per fold (Optuna returns the same value for a
    # repeated suggestion of the same name within a trial anyway).
    params = {
        # "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "objective": trial.suggest_categorical("objective", ["CrossEntropy"]),
        # "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "iterations": trial.suggest_int("iterations", 1400, 3400),
        "depth": trial.suggest_int("depth", 4, 6),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["Ordered", "Plain"]
        ),
    }

    # if params["bootstrap_type"] == "Bayesian":
    #    params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)

    # elif params["bootstrap_type"] == "Bernoulli":
    #    params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    scores = []
    for fold in range(folds):
        # Pause kept from the original code; its purpose is not documented
        # (presumably to let the GPU settle between fits - TODO confirm).
        time.sleep(2)

        # Select rows and columns in one step. The selections are new frames
        # and are never modified, so no extra .copy() of the data is needed.
        in_valid_fold = train["kfold"] == fold
        x_train = train.loc[~in_valid_fold, feature_cols]
        y_train = train.loc[~in_valid_fold, "target"]
        x_valid = train.loc[in_valid_fold, feature_cols]
        y_valid = train.loc[in_valid_fold, "target"]

        model = CatBoostClassifier(
            **params,
            random_seed=rnd_state,
            task_type="GPU",
            devices="0:1",
            eval_metric="AUC",
        )

        model.fit(
            x_train,
            y_train,
            eval_set=[(x_valid, y_valid)],
            verbose=False,
            early_stopping_rounds=100,
        )

        # Column 1 of predict_proba is the positive-class probability.
        preds_train = model.predict_proba(x_train)[:, 1]
        preds_valid = model.predict_proba(x_valid)[:, 1]
        auc_train = roc_auc_score(y_train, preds_train)
        auc = roc_auc_score(y_valid, preds_valid)
        print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
        scores.append(auc)

    return np.mean(scores)


In [None]:

# Seeded TPE sampler; the study maximises the value returned by `objective`
# (the mean validation AUC) over 100 trials.
sampler = TPESampler(seed=rnd_state)
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
)
study.optimize(objective, n_trials=100)


[32m[I 2021-10-19 23:51:55,498][0m A new study created in memory with name: no-name-e81100cb-ce96-41a1-82c0-17a224dea76e[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Fold 0 , train: 0.790784 , valid: 0.791549
Fold 1 , train: 0.791917 , valid: 0.788331
Fold 2 , train: 0.791483 , valid: 0.791858
Fold 3 , train: 0.791617 , valid: 0.788509


[32m[I 2021-10-20 00:00:50,526][0m Trial 0 finished with value: 0.7900205130103007 and parameters: {'objective': 'CrossEntropy', 'learning_rate': 0.04370861069626263, 'iterations': 3302, 'depth': 6, 'l2_leaf_reg': 6.387926357773329, 'boosting_type': 'Ordered'}. Best is trial 0 with value: 0.7900205130103007.[0m


Fold 4 , train: 0.791443 , valid: 0.789854
Fold 0 , train: 0.792900 , valid: 0.792324


KeyboardInterrupt: 

# Fit with best parameters

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Per-fold test predictions (averaged in the submission cell) and per-fold
# validation AUC scores.
final_test_predictions = []
scores = []

# The test matrix is identical for every fold, so it is built once here
# instead of being re-sliced and copied inside the loop.
x_test = test[feature_cols]

for fold in range(folds):
    # Select rows and columns in one step. The selections are new frames and
    # are never modified, so no extra .copy() of the data is needed.
    in_valid_fold = train["kfold"] == fold
    x_train = train.loc[~in_valid_fold, feature_cols]
    y_train = train.loc[~in_valid_fold, "target"]
    x_valid = train.loc[in_valid_fold, feature_cols]
    y_valid = train.loc[in_valid_fold, "target"]

    # Fixed hyperparameters (same values as the baseline cell).
    model = CatBoostClassifier(
        random_seed=rnd_state,
        verbose=False,
        task_type="GPU",
        devices="0:1",
        eval_metric="AUC",
        objective="CrossEntropy",
        learning_rate=0.03638707926890429,
        iterations=2800,
        depth=6,
        l2_leaf_reg=7.796036733526901,
        boosting_type="Plain",
    )

    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    # Column 1 of predict_proba is the positive-class probability.
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
    scores.append(auc)

    preds_test = model.predict_proba(x_test)[:, 1]
    final_test_predictions.append(preds_test)


print("AVG AUC:", np.mean(scores))

Fold 0 , train: 0.798539 , valid: 0.793618


KeyboardInterrupt: 

In [None]:
# Disabled alternative: the same per-fold train/validate/predict loop using
# XGBClassifier on GPU. The code sits inside a bare string literal, so it is
# not executed; the cell only echoes the string back as its output value.
"""
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

final_test_predictions = []
scores = []

for fold in range(folds):
    ic(fold)
    x_train = train[train.kfold != fold].copy()
    x_valid = train[train.kfold == fold].copy()
    x_test = test[feature_cols].copy()

    y_train = x_train["target"]
    y_valid = x_valid["target"]

    x_train = x_train[feature_cols]
    x_valid = x_valid[feature_cols]

    xgb_params = {
        "eval_metric": "auc",
        "objective": "binary:logistic",
        "tree_method": "gpu_hist",
        "gpu_id": 0,
        "predictor": "gpu_predictor",
        "n_estimators": 10000,
        "learning_rate": 0.01063045229441343,
        "gamma": 0.24652519525750877,
        "max_depth": 4,
        "seed": rnd_state,
        "min_child_weight": 366,
        "subsample": 0.6423040816299684,
        "colsample_bytree": 0.7751264493218339,
        "colsample_bylevel": 0.8675692743597421,
        "use_label_encoder": False,
        "lambda": 0,
        "alpha": 10,
    }

    print('Fitting...')
    model = XGBClassifier(**xgb_params)
    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    print('Predicting...')
    preds_train = model.predict_proba(x_train)[:, 1]
    preds_valid = model.predict_proba(x_valid)[:, 1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold", fold, ", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
    scores.append(auc)

    preds_test = model.predict_proba(x_test)[:, 1]
    final_test_predictions.append(preds_test)


print("AVG AUC:", np.mean(scores))
"""

'\nfrom xgboost import XGBClassifier\nfrom sklearn.metrics import roc_auc_score\n\nfinal_test_predictions = []\nscores = []\n\nfor fold in range(folds):\n    ic(fold)\n    x_train = train[train.kfold != fold].copy()\n    x_valid = train[train.kfold == fold].copy()\n    x_test = test[feature_cols].copy()\n\n    y_train = x_train["target"]\n    y_valid = x_valid["target"]\n\n    x_train = x_train[feature_cols]\n    x_valid = x_valid[feature_cols]\n\n    xgb_params = {\n        "eval_metric": "auc",\n        "objective": "binary:logistic",\n        "tree_method": "gpu_hist",\n        "gpu_id": 0,\n        "predictor": "gpu_predictor",\n        "n_estimators": 10000,\n        "learning_rate": 0.01063045229441343,\n        "gamma": 0.24652519525750877,\n        "max_depth": 4,\n        "seed": rnd_state,\n        "min_child_weight": 366,\n        "subsample": 0.6423040816299684,\n        "colsample_bytree": 0.7751264493218339,\n        "colsample_bylevel": 0.8675692743597421,\n        "use_

# Submit results

In [None]:
import numpy as np
from datetime import datetime

# Date stamp (YYYY-MM-DD) used as the file-name prefix.
now = datetime.now()
now = now.strftime("%Y-%m-%d")

# Short tag describing this run. It is deliberately NOT called `objective`:
# that name is the Optuna objective function defined earlier in this
# notebook, and rebinding it to a string would break any later
# `study.optimize(objective, ...)` call in the same kernel.
run_tag = "kmean-fs13-ctb-op"

curr_submission_fn = f"{now}_submission_{run_tag}.csv"

# Average the per-fold test predictions row by row (one column per fold).
sample_df['target'] = np.mean(np.column_stack(final_test_predictions), axis=1)
#sample_df['target'] = round(sample_df['target'],0)
sample_df.to_csv(PATH_SUB + curr_submission_fn, index=False)

print(curr_submission_fn)


2021-10-19_submission_kmean-fs13-ctb-op.csv


In [None]:
#!kaggle competitions submit tabular-playground-series-oct-2021 -f {PATH_SUB+curr_submission_fn} -m {curr_submission_fn}

Successfully submitted to Tabular Playground Series - Oct 2021



  0%|          | 0.00/13.5M [00:00<?, ?B/s]
  0%|          | 8.00k/13.5M [00:00<04:11, 56.1kB/s]
  1%|          | 104k/13.5M [00:00<00:27, 503kB/s]  
  1%|          | 168k/13.5M [00:05<08:47, 26.5kB/s]
  2%|▏         | 232k/13.5M [00:05<05:23, 43.0kB/s]
  2%|▏         | 272k/13.5M [00:05<04:07, 55.9kB/s]
  2%|▏         | 312k/13.5M [00:05<03:15, 70.7kB/s]
  3%|▎         | 352k/13.5M [00:05<02:36, 87.9kB/s]
  3%|▎         | 384k/13.5M [00:06<02:10, 105kB/s] 
  3%|▎         | 416k/13.5M [00:06<01:47, 127kB/s]
  3%|▎         | 456k/13.5M [00:06<01:26, 159kB/s]
  4%|▎         | 496k/13.5M [00:06<01:11, 190kB/s]
  4%|▍         | 536k/13.5M [00:06<01:02, 218kB/s]
  4%|▍         | 568k/13.5M [00:06<00:57, 236kB/s]
  4%|▍         | 600k/13.5M [00:06<00:56, 241kB/s]
  5%|▍         | 632k/13.5M [00:06<00:54, 249kB/s]
  5%|▍         | 664k/13.5M [00:06<00:51, 260kB/s]
  5%|▌         | 696k/13.5M [00:07<00:49, 269kB/s]
  5%|▌         | 728k/13.5M [00:07<00:51, 259kB/s]
  6%|▌         | 768k/13.5M