In [1]:
import copy

import pandas as pd
# Show every column and up to 200 rows when a DataFrame is displayed.
# Option keys are spelled out in full: the abbreviated "max_rows" is resolved
# by pattern matching and is ambiguous on newer pandas releases (it also
# matches "styler.render.max_rows"), which makes set_option raise OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, Perceptron
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

from sklearn.metrics import make_scorer, f1_score

print("import success")

import success


In [2]:
# Load the pre-processed inputs; both files use the "id" column as the index.

# Feature table (the "train data" file).
data = pd.read_csv(r"source/for_train.csv", index_col="id")

# Target labels.
labels = pd.read_csv(r"source/train_labels.csv", index_col="id")

# Print the (rows, cols) of each frame so their row counts can be compared.
# DataFrame.shape is a 2-tuple, so it fills both %s placeholders directly.
print("data => rows: %s, cols: %s" % data.shape)
print("labels => rows: %s, cols: %s" % labels.shape)

data => rows: 74250, cols: 114
labels => rows: 59400, cols: 1


In [3]:
# Split the combined feature table into its "train" and "test" parts using the
# "type" marker column, which is dropped once it has served that purpose.
train = data[data.type.eq("train")].drop("type", axis=1)
test = data[data.type.eq("test")].drop("type", axis=1)

# Pair labels with features through the shared "id" index instead of relying on
# both CSV files listing rows in the same order. A missing id raises KeyError
# here rather than silently training on mismatched labels.
target = labels.loc[train.index].values.ravel()

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size = 0.2,
    shuffle = True,
    stratify = target,
    random_state = 42
)

# Number of training rows used when comparing models. It currently equals the
# full training split; lower it (e.g. 3000) to make slow estimators feasible.
size = len(y_train)
X_train_pruned = X_train.head(size).copy()
# y_train is a NumPy array, so a plain .copy() of the slice is sufficient.
y_train_pruned = y_train[:size].copy()

In [4]:
# select models for preliminary testing

# Reference: https://towardsdatascience.com/comprehensive-guide-to-multiclass-classification-with-sklearn-127cc500f362

# Candidate classifiers with native multiclass support, kept at their default
# settings apart from the arguments shown.
# Estimators that draw random numbers are seeded (42, the same value used for
# the train/test split) so that the ranking is reproducible between runs.
models = [
    BernoulliNB(),
    GaussianNB(),
    DecisionTreeClassifier(random_state=42),
    ExtraTreeClassifier(random_state=42),
    ExtraTreesClassifier(random_state=42),
    KNeighborsClassifier(),
    LinearSVC(multi_class='crammer_singer', random_state=42),
    LogisticRegression(multi_class="multinomial"),
    # LogisticRegressionCV(multi_class="multinomial"),
    RandomForestClassifier(criterion="entropy", random_state=42),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(random_state=42),
    XGBRFClassifier(random_state=42),
    HistGradientBoostingClassifier(random_state=42),
    LGBMClassifier(random_state=42),
    CatBoostClassifier(verbose=0, random_seed=42)
]

In [5]:
# preliminary testing to select a good model
def fit_predict(model):
    """Train `model` and score it on the hold-out split.

    The estimator is fitted in place on the module-level
    ``X_train_pruned`` / ``y_train_pruned`` data, then used to predict
    ``X_test``; the predictions are compared with ``y_test`` using the
    weighted F1 score.

    Returns:
        tuple: ``(model, score)`` - the estimator that was passed in and its
        weighted F1 score.
    """
    model.fit(X_train_pruned, y_train_pruned)
    weighted_f1 = f1_score(
        y_test,
        model.predict(X_test),
        average="weighted",
    )
    return model, weighted_f1

# Fit and score every candidate, reporting progress as each one finishes.
result = []
for model in models:
    result += [fit_predict(model)]
    print("Completed model: %s" % model)

# summary
print("--------------------------------------------")

# Rank the (model, score) pairs from best to worst weighted F1.
result = sorted(result, key=lambda pair: pair[1], reverse=True)

for model, score in result:
    print("%s:\t%s" % (model, score))

Completed model: <catboost.core.CatBoostClassifier object at 0x000002DD3EE240C8>
Completed model: RandomForestClassifier(criterion='entropy', n_jobs=4)
Completed model: LGBMClassifier(n_jobs=4)




Completed model: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
--------------------------------------------
RandomForestClassifier(criterion='entropy', n_jobs=4):	0.795930235349091
<catboost.core.CatBoostClassifier object at 0x000002DD3EE240C8>:	0.7815671070480033
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
           

#### Note that only the first 3000 rows were used to fit and test the different classifiers

Some of them are resource-hungry, so I had to limit the row count.

Results are as follows:

CatBoostClassifier()                            0.7235016674367357
RandomForestClassifier():	                    0.7220672374568503
LGBMClassifier():	                            0.720531724640514
XGBClassifier():	                            0.7194778397818585
HistGradientBoostingClassifier():	            0.7189712814704061
ExtraTreesClassifier():	                        0.7127838117079437
LogisticRegression(multi_class='multinomial'):	0.6924761104786776
LinearSVC(multi_class='crammer_singer'):	    0.6903410937095616
XGBRFClassifier():	                            0.6849082996886071
AdaBoostClassifier():	                        0.6768951452216663
DecisionTreeClassifier():	                    0.6695058355336658
ExtraTreeClassifier():	                        0.6680078146995786
KNeighborsClassifier():	                        0.6672741948775417
BernoulliNB():	                                0.6658124805827229
GaussianNB():	                                0.141613926068683

From this observation, it can be concluded that CatBoostClassifier()
performs best with basic settings.

However, the following top performers were given a re-match on a larger number of
rows (the full dataset), and the results are:
RandomForestClassifier(criterion='entropy', n_jobs=4):	0.795930235349091
CatBoostClassifier():	                                0.7815671070480033
XGBClassifier():	                                    0.7776228703571458
LGBMClassifier(n_jobs=4):	                            0.7705512503062181


The following classifiers did not work because of insufficient
system resources or configuration issues:

1. LogisticRegressionCV()

In [6]:
# These are for one-vs-one classification
# Base estimators for the one-vs-one comparison; the fit_predict helper defined
# for this section wraps each of them in OneVsOneClassifier before fitting.
# NuSVC stays disabled: the notes for this section list it among the
# classifiers that did not work on this system.
models = [
    # NuSVC(),
    SVC(),
    GaussianProcessClassifier(multi_class="one_vs_one")
]

In [7]:
# preliminary testing to select a good model
def fit_predict(model):
    """Score `model` under a one-vs-one multiclass scheme.

    The estimator is wrapped in ``OneVsOneClassifier`` (``n_jobs=1``), the
    wrapper is fitted on the module-level ``X_train_pruned`` /
    ``y_train_pruned`` data, and its predictions for ``X_test`` are compared
    with ``y_test`` using the weighted F1 score.

    Returns:
        tuple: ``(model, score)`` - the estimator object that was passed in
        (not the wrapper) and the weighted F1 score.
    """
    one_vs_one = OneVsOneClassifier(estimator=model, n_jobs=1)
    one_vs_one.fit(X_train_pruned, y_train_pruned)
    weighted_f1 = f1_score(
        y_test,
        one_vs_one.predict(X_test),
        average="weighted",
    )
    return model, weighted_f1

# result = []
# for model in models:
#     result.append(fit_predict(model))
#     print("Completed model: %s" % model)
#
# # summary
# print("--------------------------------------------")
#
# result.sort(key = lambda i: i[-1], reverse=True)
#
# for model, score in result:
#     print("%s:\t%s" %(model, score))

Results are as follows:

SVC():	                                                0.7086297175832617
GaussianProcessClassifier(multi_class='one_vs_one'):	0.6934369848273506

From this observation, it can be concluded that SVC()
performs best with basic settings

The following classifiers did not work because of insufficient
system resources or configuration issues:

1. NuSVC()
2. GaussianProcessClassifier(multi_class="one_vs_one") -> could not handle the large dataset; my system does not have enough resources

In [8]:
# These are for one-vs-rest classification
# Base estimators for the one-vs-rest comparison, kept at default settings
# apart from the arguments shown.
# Estimators that shuffle or subsample are seeded (42, the same value used for
# the train/test split) so their scores are reproducible between runs.
models = [
    GradientBoostingClassifier(random_state=42),
    GaussianProcessClassifier(multi_class="one_vs_rest"),
    LinearSVC(multi_class="ovr", random_state=42),
    LogisticRegression(multi_class="ovr"),
    # LogisticRegressionCV(multi_class="ovr"),
    SGDClassifier(random_state=42),
    Perceptron(random_state=42)
]

In [9]:
# preliminary testing to select a good model
def fit_predict(model):
    """Score `model` under a one-vs-rest multiclass scheme.

    The estimator is wrapped in ``OneVsRestClassifier`` (``n_jobs=1``), the
    wrapper is fitted on the module-level ``X_train_pruned`` /
    ``y_train_pruned`` data, and its predictions for ``X_test`` are compared
    with ``y_test`` using the weighted F1 score.

    Returns:
        tuple: ``(model, score)`` - the estimator object that was passed in
        (not the wrapper) and the weighted F1 score.
    """
    one_vs_rest = OneVsRestClassifier(estimator=model, n_jobs=1)
    one_vs_rest.fit(X_train_pruned, y_train_pruned)
    weighted_f1 = f1_score(
        y_test,
        one_vs_rest.predict(X_test),
        average="weighted",
    )
    return model, weighted_f1

# result = []
# for model in models:
#     result.append(fit_predict(model))
#     print("Completed model: %s" % model)
#
# # summary
# print("--------------------------------------------")
#
# result.sort(key = lambda i: i[-1], reverse=True)
#
# for model, score in result:
#     print("%s:\t%s" %(model, score))

Results are as follows:

GradientBoostingClassifier():	        0.7002652846652517
GaussianProcessClassifier():	        0.6966250634142289
LogisticRegression(multi_class='ovr'):	0.6908173731656669
LinearSVC():	                        0.6893327474872507
SGDClassifier():	                    0.6877661844945466
Perceptron():	                        0.6620726289681607

From this observation, it can be concluded that GradientBoostingClassifier()
performs best with basic settings

The following classifiers did not work because of insufficient
system resources or configuration issues:

1. LogisticRegressionCV(multi_class="ovr")

Overall, RandomForestClassifier() is chosen as the best classifier under the given conditions.


Results are as follows:

SVC() 0.7588050231479048

From this observation, it can be concluded that SVC()
performs best with basic settings

The following classifiers did not work because of insufficient
system resources or configuration issues:

1. NuSVC()
2. GaussianProcessClassifier(multi_class="one_vs_one")
