# History

2021-11-05:
- Initial version. New competition.

# Open tasks
- TODO: Add LogisticRegression classifier
- TODO: Add better function to submit best baseline results (stacked or non-stacked)
- TODO: Write function to write scores into a dataframe/csv-file for better documentation and tracking: Switch to MLflow
- TODO: Refactor write_scores_to_json() or replace it with df/csv-file approach

# Purpose

The objective of the **stage two** notebook is to calculate the **baseline score of un-optimized ML algorithms** as a baseline for future optimization, including feature selection/reduction, feature normalization/transformation, and hyper-parameters tuning. The workflow will use the optimized dataframes (reduced storage and memory usage). For this competition, we are focusing on the **ROC AUC score**.

# Setup Environment

## Import Basic Modules

In [1]:
import os # for cpu count
import configparser # to load standard config.ini
import pandas as pd
import matplotlib.pyplot as plt
import urllib, requests # for Telegram notifications
import json
import joblib
from datetime import datetime

import tensorflow as tf # to check GPU support

from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

%load_ext watermark
%matplotlib inline

## Define Parameters

In [2]:
# Load external config file
config = configparser.ConfigParser()
config.read("../src/config.ini")

PATH_DATA_RAW = config["PATHS"]["PATH_DATA_RAW"]
PATH_DATA_INT = config["PATHS"]["PATH_DATA_INT"]
PATH_DATA_PRO = config["PATHS"]["PATH_DATA_PRO"]
PATH_REPORTS = config["PATHS"]["PATH_REPORTS"]
PATH_MODELS = config["PATHS"]["PATH_MODELS"]
PATH_SUB = config["PATHS"]["PATH_SUB"]

# Telegram Bot
token = config["TELEGRAM"]["token"]
chat_id = config["TELEGRAM"]["chat_id"]
FILENAME_NB = "02_baseline_models" # for Telegram messages

# Set global randome state
rnd_state = 42

# Define available cpu cores
n_cpu = os.cpu_count()
print("Number of CPUs used:", n_cpu)


Number of CPUs used: 8


In [3]:
# Checking GPU support
print(tf.test.is_built_with_cuda())  # True
print(tf.test.is_gpu_available(cuda_only=False, min_cuda_compute_capability=None))  # True
# print(tf.config.list_physical_devices('GPU'))

True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True


## Global Functions

In [4]:
def send_telegram_message(message):
    """Sending messages to Telegram bot via requests.get()."""
    
    message = f"{FILENAME_NB}:\n{message}"

    # Using "try and except" to ensure that the notebook execution will not be stopped only because of problems with the bot.
    # Example: No network connection.
    # ISSUE: Be careful, an error messages will leak your Telegram Bot Token when uploaded to GitHub.
    try:
        url = 'https://api.telegram.org/bot%s/sendMessage?chat_id=%s&text=%s'%(token, chat_id, urllib.parse.quote_plus(message))
        _ = requests.get(url, timeout=10)
    
    except Exception as e:
        print('\n\nSending message to Telegram Bot was not successful.\n\n')
        print(e)
        
    return None
    

In [5]:
def calculate_train_scores(model, X_train, y_train, y_train_pred):
    # Training set performance
    train_accuracy = accuracy_score(y_train, y_train_pred)  # Calculate Accuracy
    train_mcc = matthews_corrcoef(y_train, y_train_pred)  # Calculate MCC
    train_f1 = f1_score(y_train, y_train_pred, average="weighted")  # Calculate F1-score
    train_rocauc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
    
    return train_accuracy, train_mcc, train_f1, train_rocauc


In [6]:
def calculate_valid_scores(model, X_valid, y_valid, y_valid_pred):
    # Validation set performance
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)  # Calculate Accuracy
    valid_mcc = matthews_corrcoef(y_valid, y_valid_pred)  # Calculate MCC
    valid_f1 = f1_score(y_valid, y_valid_pred, average="weighted")  # Calculate F1-score
    valid_rocauc = roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])

    return valid_accuracy, valid_mcc, valid_f1, valid_rocauc
    

In [7]:
def print_scores():
    print("Model performance for Training set")
    print("- Accuracy: %s" % train_accuracy)
    print("- MCC: %s" % train_mcc)
    print("- F1 score: %s" % train_f1)
    print("- ROC AUC score: %s" % train_rocauc)
    print("----------------------------------")
    print("Model performance for Validation set")
    print("- Accuracy: %s" % valid_accuracy)
    print("- MCC: %s" % valid_mcc)
    print("- F1 score: %s" % valid_f1)
    print("- ROC AUC score: %s" % valid_rocauc)

    return None
    

In [8]:
def write_scores_to_json(filename):
    dummy_scores_dict = {}
    dummy_scores_list = []

    dummy_scores_dict['Accuracy Train'] = train_accuracy
    dummy_scores_dict['MCC Train'] = train_mcc
    dummy_scores_dict['F1 Train'] = train_f1
    dummy_scores_dict['ROC AUC Train'] = train_rocauc
    dummy_scores_list.append(dummy_scores_dict)

    dummy_scores_dict = {}
    dummy_scores_dict['Accuracy Valid'] = valid_accuracy
    dummy_scores_dict['MCC Valid'] = valid_mcc
    dummy_scores_dict['F1 Valid'] = valid_f1
    dummy_scores_dict['ROC AUC Valid'] = valid_rocauc
    dummy_scores_list.append(dummy_scores_dict)

    # datetime object containing current date and time
    now = datetime.now()
    now = now.strftime("%Y-%m-%d")

    # Serializing and write json file
    json_object = json.dumps(dummy_scores_list, indent = 4) 
    filename =  now+'_'+filename
    with open(PATH_REPORTS + filename, "w") as outfile: 
        outfile.write(json_object)

    return None
    

# Load Data

In [10]:
train_df = pd.read_pickle(PATH_DATA_INT + "train-opt.pkl")  


In [11]:
train_df.shape

(600000, 102)

In [12]:
# Reducing sample size
# sample_size = 500000
# X = train_df[:sample_size]
# y = train_df[:sample_size]['target']
# assert y.index.tolist() == X.index.tolist()
# X = X.drop(['id','target'], axis=1)

# Using full dataset
#X = train_df.drop(["id", "target"], axis=1).to_numpy()
#y = train_df["target"].to_numpy()


# Using numpy arrays: https://vitalflux.com/pandas-dataframe-vs-numpy-array-what-to-use/
X = train_df.drop(["id", "target"], axis=1).values # using numpy array
y = train_df["target"].values # using numpy array


In [13]:
X.shape, y.shape

((600000, 100), (600000,))

# Run Baseline Classifiers

In [14]:
# Enable / disable baseline classifiers
# Do not forget to add/remove classifiers in the stacking section, accordingly
dummy_enabled = "yes"
xgbc_enabled = "yes"
lgbc_enabled = "yes"
ctbc_enabled = "yes"
rfc_enabled = "no" # Carefurl: tree grows big: .pkl file is around 1GB
dtc_enabled = "yes"
knnc_enabled = "no" # disable when sample size > 100.000
mlpc_enabled = "yes"

# Evaluation Metric
eval_metric = "AUC"


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=rnd_state, stratify=y)


In [16]:
X_train.shape, X_valid.shape

((402000, 100), (198000, 100))

In [17]:
# Sending for first bot message
message = "------------START------------"
send_telegram_message(message)


## Dummy Classifier

In [18]:
from sklearn.dummy import DummyClassifier

# Define model
duc = DummyClassifier(strategy="stratified")

if dummy_enabled == "yes":
    try:
        # Train model
        duc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = duc.predict(X_train)
        y_valid_pred = duc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(
            duc, X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(
            duc, X_valid, y_valid, y_valid_pred
        )

        print_scores()
        filename = "dummy_baseline_scores.json"
        write_scores_to_json(filename)

        #message = f"DummyClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        #send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {duc} failed: {e}\n"
        send_telegram_message(message)
        print(f"\n{e}\n")


Model performance for Training set
- Accuracy: 0.49971641791044774
- MCC: -0.0007158149923818674
- F1 score: 0.4997153862573974
- ROC AUC score: 0.4992864590238203
----------------------------------
Model performance for Validation set
- Accuracy: 0.5010555555555556
- MCC: 0.001968000804853239
- F1 score: 0.5010558271733098
- ROC AUC score: 0.4976358338031067


## XGBoost

In [19]:
import xgboost as xgb

if xgbc_enabled == "yes":
    try:
        # Define model
        xgbc = xgb.XGBClassifier(
            random_state=rnd_state,
            n_jobs=n_cpu,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            use_label_encoder=False,
            eval_metric=eval_metric.lower(),
        )

        # Train model
        xgbc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = xgbc.predict(X_train)
        y_valid_pred = xgbc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(
            xgbc, X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(
            xgbc, X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "xgbc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "xgbc_baseline_model.pkl"
        joblib.dump(xgbc, PATH_MODELS + filename)

        # Send messages
        message = f"XGBClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {xgbc} failed: {e}\n"
        send_telegram_message(message)
        print(f"\n{e}\n")


Model performance for Training set
- Accuracy: 0.739636815920398
- MCC: 0.47915750722924866
- F1 score: 0.7396045989898449
- ROC AUC score: 0.7908411173210788
----------------------------------
Model performance for Validation set
- Accuracy: 0.6898131313131313
- MCC: 0.3794717164213431
- F1 score: 0.6897757777983862
- ROC AUC score: 0.7302524575048538


## LightGBM

In [20]:
import lightgbm as lgb

if lgbc_enabled == "yes":
    try:
        # Define model
        lgbc = lgb.LGBMClassifier(
            random_state=rnd_state,
            n_jobs=n_cpu,
            eval_metric=eval_metric
        )

        # Train model
        lgbc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = lgbc.predict(X_train)
        y_valid_pred = lgbc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(lgbc,
            X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(lgbc,
            X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "lgbc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "lgbc_baseline_model.pkl"
        joblib.dump(lgbc, PATH_MODELS+filename)

        # Send messages
        message = f"LGBClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {lgbc} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')


Model performance for Training set
- Accuracy: 0.711634328358209
- MCC: 0.42313528003810374
- F1 score: 0.7115611478353585
- ROC AUC score: 0.7523186556056192
----------------------------------
Model performance for Validation set
- Accuracy: 0.6935707070707071
- MCC: 0.3869851339188961
- F1 score: 0.6934901492194795
- ROC AUC score: 0.7324083009279608


## Catboost

In [21]:
import catboost as ctb

if ctbc_enabled == "yes":
    try:
        # Define model
        ctbc = ctb.CatBoostClassifier(
            random_state=rnd_state,
            verbose=0,
            eval_metric=eval_metric,
            task_type="GPU"
        )

        # Train model
        ctbc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = ctbc.predict(X_train)
        y_valid_pred = ctbc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(ctbc,
            X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(ctbc,
            X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "ctbc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "ctbc_baseline_model.pkl"
        joblib.dump(ctbc, PATH_MODELS+filename)

        # Send messages
        message = (f"CatBoostClassifier finished. Validation AUC ROC Score: {valid_rocauc}")
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {ctbc} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')


Model performance for Training set
- Accuracy: 0.721455223880597
- MCC: 0.4427828186768833
- F1 score: 0.7213978211816185
- ROC AUC score: 0.751828573388908
----------------------------------
Model performance for Validation set
- Accuracy: 0.712570707070707
- MCC: 0.4250031524427747
- F1 score: 0.7125145062780892
- ROC AUC score: 0.7431837490569371


## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

if rfc_enabled == "yes":
    try:
        # Define model
        rfc = RandomForestClassifier(random_state=rnd_state, n_jobs=n_cpu)

        # Train model
        rfc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = rfc.predict(X_train)
        y_valid_pred = rfc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(rfc,
            X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(rfc,
            X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "rfc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "rfc_baseline_model.pkl"
        joblib.dump(rfc, PATH_MODELS+filename)

        # Send messages
        message = f"RandomForestClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {rfc} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')
		

##  Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

if dtc_enabled == "yes":
    try:
        # Define model
        dtc = DecisionTreeClassifier(max_depth=5, random_state=rnd_state)

        # Train model
        dtc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = dtc.predict(X_train)
        y_valid_pred = dtc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(dtc,
            X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(dtc,
            X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "dtc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "dtc_baseline_model.pkl"
        joblib.dump(dtc, PATH_MODELS+filename)

        # Send messages
        message = f"DecisionTreeClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {dtc} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')

Model performance for Training set
- Accuracy: 0.5946915422885573
- MCC: 0.18987298499929253
- F1 score: 0.5921751169391201
- ROC AUC score: 0.6269208288033784
----------------------------------
Model performance for Validation set
- Accuracy: 0.5906565656565657
- MCC: 0.18170968165367538
- F1 score: 0.5880974878412564
- ROC AUC score: 0.6216633988408427


## KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

if knnc_enabled == "yes":
    try:
        # Define model
        knnc = KNeighborsClassifier(n_neighbors=3, n_jobs=n_cpu)

        # Train model
        knnc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = knnc.predict(X_train)
        y_valid_pred = knnc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(knnc,
            X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(knnc,
            X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "knnc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "knnc_baseline_model.pkl"
        joblib.dump(knnc, PATH_MODELS+filename)

        # Send messages
        message = f"KNNClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {knnc} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')

## Neural Network

In [25]:
from sklearn.neural_network import MLPClassifier

if mlpc_enabled == "yes":
    try:
        # Define model
        mlpc = MLPClassifier(alpha=1, max_iter=200, random_state=rnd_state)

        # Train model
        mlpc.fit(X_train, y_train)

        # Make predictions
        y_train_pred = mlpc.predict(X_train)
        y_valid_pred = mlpc.predict(X_valid)

        # Training set performance
        train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(mlpc,
            X_train, y_train, y_train_pred
        )

        # Validation set performance
        valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(mlpc,
            X_valid, y_valid, y_valid_pred
        )

        # Print and write scores
        print_scores()
        filename = "mlpc_baseline_scores.json"
        write_scores_to_json(filename)

        # Store base model
        filename = "mlpc_baseline_model.pkl"
        joblib.dump(mlpc, PATH_MODELS+filename)

        # Send messages
        message = f"MLPClassifier finished. Validation AUC ROC Score: {valid_rocauc}"
        send_telegram_message(message)

    except Exception as e:
        message = f"\nFitting of {mlpc} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')

Model performance for Training set
- Accuracy: 0.6940273631840796
- MCC: 0.39256028607451837
- F1 score: 0.6928719489792595
- ROC AUC score: 0.7337734122917466
----------------------------------
Model performance for Validation set
- Accuracy: 0.6954191919191919
- MCC: 0.39549752057732285
- F1 score: 0.6942285070921432
- ROC AUC score: 0.7359253109205993


# Stacking

## StackingClassifier (sklearn)

- https://towardsdatascience.com/stacking-made-easy-with-sklearn-e27a0793c92b
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html

In [26]:
from sklearn.ensemble import StackingClassifier # only works with estimators from sklearn
from sklearn.linear_model import LogisticRegression

estimators = [
    # ("rfc", rfc),
    ("dtc", dtc),
    # ("knnc", knnc),
    ("mlpc", mlpc),
]

final_estimator = LogisticRegression(random_state=rnd_state)

# Build stack model
stack_skl_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator) 

try:
    # Train stacked model
    stack_skl_model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = stack_skl_model.predict(X_train)
    y_valid_pred = stack_skl_model.predict(X_valid)

    # Training set performance
    train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(
        stack_skl_model, X_train, y_train, y_train_pred
    )

    # Validation set performance
    valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(
        stack_skl_model, X_valid, y_valid, y_valid_pred
    )

    # Print and write scores
    print_scores()
    filename = "stack_sklearn_baseline_scores.json"
    write_scores_to_json(filename)

    # Store base model
    filename = "stack_sklearn_baseline_model.pkl"
    joblib.dump(stack_skl_model, PATH_MODELS+filename)

    # Send messages
    message = f"Stacking (sklearn) finished. Validation AUC ROC Score: {valid_rocauc}"
    send_telegram_message(message)

except Exception as e:
        message = f"\nFitting of {stack_skl_model} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')


## StackingCVClassifier (mlxtend)

- https://developer.ibm.com/articles/stack-machine-learning-models-get-better-results/
- http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/

In [None]:
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingCVClassifier

lr = LogisticRegression(random_state=rnd_state)

stack_mlx_model = StackingCVClassifier(
    classifiers=[xgbc, lgbc, ctbc],
    meta_classifier=lr,
    cv=5,
    use_features_in_secondary=True,
    store_train_meta_features=True,
    shuffle=True,
    random_state=rnd_state,
    verbose=1,
    n_jobs=n_cpu
)

try:
    stack_mlx_model.fit(X_train, y_train)
    y_valid_pred = stack_mlx_model.predict(X_valid)

    # Training set performance
    train_accuracy, train_mcc, train_f1, train_rocauc = calculate_train_scores(
        stack_mlx_model, X_train, y_train, y_train_pred
    )

    # Validation set performance
    valid_accuracy, valid_mcc, valid_f1, valid_rocauc = calculate_valid_scores(
        stack_mlx_model, X_valid, y_valid, y_valid_pred
    )

    # Print and write scores
    print_scores()
    filename = "stack_sklearn_baseline_scores.json"
    write_scores_to_json(filename)

    # Store base model
    filename = "stack_mlxtend_baseline_model.pkl"
    joblib.dump(stack_mlx_model, PATH_MODELS+filename)

    # Send messages
    message = f"Stacking (mlxtend) finished. Validation AUC ROC Score: {valid_rocauc}"
    send_telegram_message(message)

except Exception as e:
        message = f"\nFitting of {stack_mlx_model} failed: {e}\n"
        send_telegram_message(message)
        print(f'\n{e}\n')


Fitting 3 classifiers...
Fitting classifier1: xgbclassifier (1/3)


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:   48.3s finished


Fitting classifier2: lgbmclassifier (2/3)


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:  1.3min finished


Fitting classifier3: catboostclassifier (3/3)


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:  4.3min finished




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model performance for Training set
- Accuracy: 0.770744776119403
- MCC: 0.5432228202079334
- F1 score: 0.7703951329077331
- ROC AUC score: 0.8451570677452912
----------------------------------
Model performance for Validation set
- Accuracy: 0.765439393939394
- MCC: 0.5319586543941238
- F1 score: 0.765214664396255
- ROC AUC score: 0.8414691798204225


# Make final predications

In [None]:
test_df = pd.read_pickle(PATH_DATA_INT + "test-opt.pkl")
X_test = test_df.drop("id", axis=1).values
X_test.shape


(500000, 285)

In [None]:
# Train (best) model on "full" data set
# _ = stack_skl_model.fit(X, y)

In [None]:
# Make predictions
y_test_pred = stack_skl_model.predict(X_test)


# Submit (Best) Baseline Results

This section needs to be finalized.

In [None]:
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
now = now.strftime("%Y-%m-%d")

objective = "stack_mlx_model-baseline"

curr_submission_fn = f"{now}_submission_{objective}.csv"

my_submission = pd.DataFrame({"id": test_df["id"], "target": y_test_pred})
my_submission.to_csv(PATH_SUB + curr_submission_fn, index=False)

print(curr_submission_fn)


2021-11-05_submission_stack_mlx_model-baseline.csv


In [None]:
!kaggle competitions submit tabular-playground-series-nov-2021 -f {PATH_SUB+curr_submission_fn} -m {curr_submission_fn}

Successfully submitted to Tabular Playground Series - Nov 2021



  0%|          | 0.00/5.25M [00:00<?, ?B/s]
  0%|          | 8.00k/5.25M [00:00<01:44, 52.7kB/s]
  2%|▏         | 96.0k/5.25M [00:00<00:11, 451kB/s] 
  4%|▎         | 200k/5.25M [00:00<00:07, 682kB/s] 
  5%|▌         | 280k/5.25M [00:00<00:10, 485kB/s]
  6%|▋         | 344k/5.25M [00:00<00:11, 438kB/s]
  7%|▋         | 400k/5.25M [00:00<00:11, 433kB/s]
  8%|▊         | 448k/5.25M [00:01<00:12, 417kB/s]
  9%|▉         | 496k/5.25M [00:01<00:13, 383kB/s]
 10%|▉         | 536k/5.25M [00:01<00:12, 385kB/s]
 11%|█         | 584k/5.25M [00:01<00:12, 404kB/s]
 12%|█▏        | 632k/5.25M [00:01<00:12, 391kB/s]
 13%|█▎        | 672k/5.25M [00:01<00:12, 387kB/s]
 13%|█▎        | 712k/5.25M [00:01<00:12, 370kB/s]
 14%|█▍        | 752k/5.25M [00:01<00:12, 372kB/s]
 15%|█▍        | 792k/5.25M [00:02<00:12, 374kB/s]
 15%|█▌        | 832k/5.25M [00:02<00:12, 373kB/s]
 19%|█▊        | 0.98M/5.25M [00:02<00:06, 722kB/s]
 20%|█▉        | 1.05M/5.25M [00:02<00:07, 557kB/s]
 21%|██        | 1.11M/5.25M [

# Watermark

In [None]:
%load_ext watermark

In [None]:
%watermark

Last updated: 2021-10-14T20:08:15.118277+02:00

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.28.0

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 158 Stepping 13, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [None]:
%watermark --iversions

lightgbm: 2.3.1
pandas  : 1.0.5
requests: 2.26.0
sys     : 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]
joblib  : 1.0.0
catboost: 1.0.0
xgboost : 1.1.1
json    : 2.0.9

