In [37]:
import copy

import numpy as np
import pandas as pd
# Display configuration: show every column and up to 200 rows of a DataFrame.
# Fully-qualified option keys are used on purpose: the short pattern "max_rows"
# matches more than one option on recent pandas releases (e.g. both
# "display.max_rows" and "styler.render.max_rows"), which makes set_option
# raise OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)

from sklearn.experimental import enable_halving_search_cv
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.multiclass import OneVsOneClassifier

from sklearn.metrics import make_scorer, roc_auc_score, f1_score

print("import success")

import success


In [38]:
# Load the pre-processed datasets. Both CSV files are indexed by their "id" column.

# Feature table (holds a "type" column that is used later to separate train/test rows).
data = pd.read_csv(r"source/for_train.csv", index_col="id")

# Target labels.
labels = pd.read_csv(r"source/train_labels.csv", index_col="id")

# Report the (rows, cols) shape of each frame so the row counts can be compared.
print("data => rows: %s, cols: %s" % data.shape)
print("labels => rows: %s, cols: %s" % labels.shape)

data => rows: 74250, cols: 114
labels => rows: 59400, cols: 1


In [39]:
# Separate the combined frame into its "train" and "test" rows and drop the
# marker column, which is not a feature.
train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

# Align the labels with the training rows by id rather than by row position,
# so a differently ordered labels file cannot silently pair features with the
# wrong target. A training id without a label raises a KeyError here.
target = labels.loc[train.index].values.ravel()
assert len(target) == len(train), "expected exactly one label per training row"

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size = 0.2,
    shuffle = True,
    stratify = target,
    random_state = 42
)

# Number of training rows used for the quick model comparison.
# Lower it (e.g. size = 1000) for faster experiments.
size = len(y_train)
X_train_pruned = X_train.head(size).copy()
# y_train is a NumPy array here (target comes from .values.ravel()), so
# ndarray.copy() gives an independent copy without needing copy.deepcopy.
y_train_pruned = y_train[:size].copy()

In [40]:
# Candidate classifiers for the preliminary model comparison.

# Reference: https://towardsdatascience.com/comprehensive-guide-to-multiclass-classification-with-sklearn-127cc500f362

# Each entry is (estimator class, constructor keyword arguments); the estimators
# are instantiated in the order listed.
models = [
    estimator_cls(**init_kwargs)
    for estimator_cls, init_kwargs in (
        (BernoulliNB, {}),
        (GaussianNB, {}),
        (DecisionTreeClassifier, {}),
        (ExtraTreeClassifier, {}),
        (ExtraTreesClassifier, {}),
        (KNeighborsClassifier, {}),
        (LinearSVC, {"multi_class": 'crammer_singer'}),
        (LogisticRegression, {"multi_class": "multinomial"}),
        # (LogisticRegressionCV, {"multi_class": "multinomial"}),
    )
]

In [41]:
# Preliminary testing helper used to pick a promising model.
def fit_predict(model):
    """Fit ``model`` on the pruned training data and score it on the hold-out split.

    Reads ``X_train_pruned``, ``y_train_pruned``, ``X_test`` and ``y_test``
    from the notebook's global namespace.

    Returns:
        A ``(model, score)`` tuple, where ``model`` is the same object that was
        passed in (after ``fit`` has been called on it) and ``score`` is the
        weighted F1 score of its predictions on ``X_test``.
    """
    model.fit(X_train_pruned, y_train_pruned)
    return model, f1_score(y_test, model.predict(X_test), average="weighted")

# result = []
# for model in models:
#     result.append(fit_predict(model))
#     print("Completed model: %s" % model)
#
# # summary
# print("--------------------------------------------")
#
# result.sort(key = lambda i: i[-1], reverse=True)
#
# for model, score in result:
#     print("%s:\t%s" %(model, score))

Results are as follows:

ExtraTreesClassifier():	                        0.7893193452304608
KNeighborsClassifier():	                        0.7667426178771187
ExtraTreeClassifier():	                        0.75045228775177
DecisionTreeClassifier():	                    0.7443432891788146
LogisticRegression(multi_class='multinomial'):	0.7051223181663652
LinearSVC(multi_class='crammer_singer'):	    0.6933519410970397
BernoulliNB():	                                0.6790484625264323
GaussianNB():	                                0.37601736307413447

From these results, ExtraTreesClassifier() achieves the highest score of the
models compared above when each is used with its basic (default) settings.

In [42]:
# Estimators to evaluate with one-vs-one classification.
# (NuSVC() is currently left out of the comparison.)
svc_model = SVC()
gaussian_process_model = GaussianProcessClassifier(multi_class="one_vs_one")

models = [svc_model, gaussian_process_model]

In [None]:
# Preliminary testing helper for the one-vs-one comparison.
# NOTE: this redefines the `fit_predict` helper declared in an earlier cell;
# after this cell runs, the one-vs-one version is the one in effect.
def fit_predict(model, n_jobs=4):
    """Wrap ``model`` in a one-vs-one classifier, fit it and score it.

    Reads ``X_train_pruned``, ``y_train_pruned``, ``X_test`` and ``y_test``
    from the notebook's global namespace.

    Args:
        model: Base estimator handed to ``OneVsOneClassifier``.
        n_jobs: Number of parallel jobs forwarded to ``OneVsOneClassifier``.
            Defaults to 4, the value that was previously hard-coded.

    Returns:
        A ``(model, score)`` tuple. ``model`` is the estimator exactly as it was
        passed in (not the fitted one-vs-one wrapper) and ``score`` is the
        weighted F1 score of the wrapper's predictions on ``X_test``.
    """
    ovo = OneVsOneClassifier(estimator=model, n_jobs=n_jobs)
    ovo.fit(X_train_pruned, y_train_pruned)
    predicted_vals = ovo.predict(X_test)
    score = f1_score(y_test, predicted_vals, average="weighted")

    return model, score

# Evaluate every candidate in turn, reporting progress after each one finishes.
result = []
for model in models:
    outcome = fit_predict(model)
    result.append(outcome)
    print("Completed model: %s" % model)

# Summary: best score first.
print("--------------------------------------------")

# Each entry is a (model, score) pair; rank by the score in the last position.
result = sorted(result, key=lambda entry: entry[-1], reverse=True)

for model, score in result:
    print("%s:\t%s" % (model, score))

SVC():	0.6965628342500507
GaussianProcessClassifier(multi_class='one_vs_one'):	0.658912128474546
