In [1]:
import copy

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option("max_rows", 200)

from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_hist_gradient_boosting  # noqa

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, Perceptron
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

from sklearn.metrics import make_scorer, f1_score, roc_auc_score

print("import success")

import success


In [2]:
# import pre-processed datasets

# train data
data = pd.read_csv(r"../files/for_train.csv", index_col="respondent_id")

# target
labels = pd.read_csv(r"../files/training_set_labels.csv", index_col="respondent_id")
label_h1n1 = labels[["h1n1_vaccine"]]

# check whether rows are equal
print("data => rows: %s, cols: %s" % (data.shape[0], data.shape[1]))
print("labels => rows: %s, cols: %s" % (labels.shape[0], labels.shape[1]))

data => rows: 53415, cols: 104
labels => rows: 26707, cols: 2


In [3]:
target = label_h1n1.values.ravel()

train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size = 0.2,
    shuffle = True,
    stratify = target,
    random_state = 42
)

# size = 1000
size = len(y_train)
X_train_pruned = X_train.head(size).copy()
y_train_pruned = copy.deepcopy(y_train[:size])

In [4]:
# select models for pre-mature testing

# Reference: https://towardsdatascience.com/comprehensive-guide-to-multiclass-classification-with-sklearn-127cc500f362

models = [
    BernoulliNB(),
    GaussianNB(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    ExtraTreesClassifier(),
    KNeighborsClassifier(),
    LinearSVC(),
    LogisticRegression(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    XGBClassifier(),
    XGBRFClassifier(),
    HistGradientBoostingClassifier(),
    LGBMClassifier(),
    CatBoostClassifier(verbose=0),
    SVC(),
    GradientBoostingClassifier(),
    SGDClassifier(),
    Perceptron(),
]

In [5]:
# pre-mature testing to select a good model
def fit_predict(model):
    model.fit(X_train_pruned, y_train_pruned)
    predicted_vals = model.predict(X_test)
    # score = f1_score(y_test, predicted_vals, average="weighted")
    score = roc_auc_score(y_test, predicted_vals)

    return model, score

result = []
for model in models:
    result.append(fit_predict(model))
    print("Completed model: %s" % model)

# summary
print("\n--------------------------------------------\n")

result.sort(key = lambda i: i[-1], reverse=True)

for model, score in result:
    print("%s:\t%s" %(model, score))

Completed model: BernoulliNB()
Completed model: GaussianNB()
Completed model: DecisionTreeClassifier()
Completed model: ExtraTreeClassifier()
Completed model: ExtraTreesClassifier()
Completed model: KNeighborsClassifier()




Completed model: LinearSVC()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Completed model: LogisticRegression()
Completed model: RandomForestClassifier()
Completed model: AdaBoostClassifier()




Completed model: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)




Completed model: XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=8, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact', validate_parameters=1,
                verbosity=None)
Completed model: HistGradientBoostingClassifier()
Completed model: LGBMClassifier()
Completed model: <catboost.core.CatBoostClassifier object at 0x000002DFF6C7AEC8>
Completed model: SVC()
Completed model: GradientBoostingClassifier()
Completed model: SGDClassifier()
Completed model: Perceptron()

--------------------------------------------

<catboost.core.CatBoostClassifier object at 0x000002DFF6C7AEC8>:	0.7324576513446753


CatBoostClassifier():	0.7324576513446753
XGBClassifier():	0.729031643296415
HistGradientBoostingClassifier():	0.7259685713657436
LGBMClassifier():	0.7251508656120647
GradientBoostingClassifier():	0.7154656231642458
AdaBoostClassifier():	0.714123408751305
SGDClassifier():	0.7062110663054758
SVC():	0.700592258130722
XGBRFClassifier():	0.7005225191075499
Perceptron():	0.699123759540686
LogisticRegression():	0.6973833206455782
RandomForestClassifier():	0.6964025763647539
LinearSVC():	0.6926278732006337
ExtraTreesClassifier():	0.6917247214365819
DecisionTreeClassifier():	0.688407196313256
BernoulliNB():	0.6846388806572641
KNeighborsClassifier():	0.6705061524268866
GaussianNB():	0.6508811096253464
ExtraTreeClassifier():	0.6392658973035292

It seems ***CatBoostClassifier*** works best overall