In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.utils import all_estimators

# every scikit-learn classifier as (name, class) pairs; iterated by getClassifiers() below
estimators = all_estimators(type_filter='classifier')

def getClassifiers(estimator_classes=None):
    """Instantiate every classifier that can be built without arguments.

    Parameters
    ----------
    estimator_classes : iterable of (str, callable), optional
        ``(name, constructor)`` pairs to try. When omitted, the module-level
        ``estimators`` list is used, so existing ``getClassifiers()`` calls
        behave as before.

    Returns
    -------
    tuple of (list, list)
        The names of the classifiers that could be constructed and, in the
        same order, the constructed instances.
    """
    if estimator_classes is None:
        estimator_classes = estimators

    all_clfs_names = []
    all_clfs_clf = []
    for name, ClassifierClass in estimator_classes:
        try:
            clf = ClassifierClass()
        except Exception:
            # best effort: anything that cannot be built with no arguments is skipped
            continue
        # only reached when construction succeeded, so both lists stay aligned
        all_clfs_names.append(name)
        all_clfs_clf.append(clf)
    return all_clfs_names, all_clfs_clf

# --- load the pre-processed data sets -------------------------------------------------------------
df = pd.read_csv(r'data/data_processed.csv')
df_continous = pd.read_csv(r'data/data_continous.csv')
df_categorical = pd.read_csv(r'data/data_categorical.csv')

# --- target variable ------------------------------------------------------------------------------
# job satisfaction scores as a plain array
JSat = df['JobSatisfaction'].values

# binary view of the target: a score above the threshold is "Satisfied" (1),
# a score at or below it is "Unsatisfied" (0); anything that compares as neither is kept as-is
JSat_threshold = 2

JSat_binary = [
    0 if score <= JSat_threshold else (1 if score > JSat_threshold else score)
    for score in JSat
]
print(JSat_binary)

# --- train / validation split by row position -----------------------------------------------------
# NOTE(review): the targets below are sliced from the raw JSat, not from JSat_binary — confirm
# whether the binary labels were meant to be used here.
train_rows = slice(None, 1200)
validation_rows = slice(1200, None)

# continous inputs
df_training_continous = df_continous.iloc[train_rows, :]
df_validation_continous = df_continous.iloc[validation_rows, :]
# categorical inputs
df_training_categorical = df_categorical.iloc[train_rows, :]
df_validation_categorical = df_categorical.iloc[validation_rows, :]
# outputs
JSat_training = JSat[train_rows]
JSat_validation = JSat[validation_rows]

# --- candidate classifiers ------------------------------------------------------------------------
all_clfs_names, all_clfs_clf = getClassifiers()

# RidgeClassifierCV: already covered by RidgeClassifier with this script
# ExtraTreeClassifier: should only be used with ensemble methods, out of scope for now
excluded_clf_names = ("RidgeClassifierCV", "ExtraTreeClassifier")
estimators_trimmed = [
    (clf_name, clf)
    for clf_name, clf in zip(all_clfs_names, all_clfs_clf)
    if clf_name not in excluded_clf_names
]

In [None]:
# Candidate hyper-parameters for every model family: the mother of config files :)
# Each entry is {'name': <classifier class name>, 'params': [<group>, ...]} where every group lists the
# alternative single-key dicts for ONE parameter. The groups are later combined with a cartesian product,
# so every combination sets each listed parameter exactly once.
all_possible_model_params = [
{'name':'AdaBoostClassifier', 'params': [[{"n_estimators":100}, {"n_estimators":50}, {"n_estimators":200}],[{"random_state":None}],[{"learning_rate":0.1}, {"learning_rate":1}, {"learning_rate":10}, {"learning_rate":100}]]}, # can implement estimator = any decision tree classifier with params... default = DecisionTreeClassifier(max_depth=1)
{'name':'BaggingClassifier', 'params': [[{"n_estimators":100}, {"n_estimators":50}, {"n_estimators":200}, {"n_estimators":10}, {"n_estimators":5}],[{"max_samples": 1.0},{"max_samples": 2.0},{"max_samples": 0.1},{"max_samples": 0.5},{"max_samples": 1},{"max_samples": 10}],[{"max_features": 1.0},{"max_features": 2.0},{"max_features": 2.5},{"max_features": 10}],[{"bootstrap": False}, {"bootstrap": True}]]}, # same as above
{'name':'BernoulliNB', 'params': [[{"alpha": 1.0}, {"alpha": 100.0}, {"alpha": 1e-12}, {"alpha": 1e-6}, {"alpha": 1e-3}], [{"binarize": None}, {"binarize": 0.0}, {"binarize": 1.0}, {"binarize": 2.5}, {"binarize": 5.0}], [{"fit_prior": True}, {"fit_prior": False}]]},
{'name':'CalibratedClassifierCV', 'params': [[{'method':'sigmoid'}, {'method':'isotonic'}] ,[{"cv": None}, {"cv": 1}, {"cv": 3}, {"cv": 10}]]},
{'name':'CategoricalNB', 'params': [[{"fit_prior": True}, {"fit_prior": False}],[{"alpha": 0}, {"alpha": 1}, {"alpha": 3}, {"alpha": 5}]]},
{'name':'ComplementNB', 'params': [[{"fit_prior": True}, {"fit_prior": False}],[{"norm": True}, {"norm": False}], [{"alpha": 0}, {"alpha": 1}, {"alpha": 3}, {"alpha": 5}]]},
{'name':'DecisionTreeClassifier', 'params': [[{"splitter": "best"}, {"splitter": "random"}],[{"criterion": "gini"}, {"criterion": "entropy"}, {"criterion": "log_loss"}], [{"min_samples_split": 1}, {"min_samples_split": 2}]]},
{'name':'DummyClassifier', 'params': [[{"strategy": "prior"}],[{"random_state": None}]]},
{'name':'ExtraTreesClassifier', 'params': [[{"n_estimators":100}, {"n_estimators":50}, {"n_estimators":200}],[{"criterion": "gini"}, {"criterion": "entropy"}, {"criterion": "log_loss"}], [{"min_samples_split": 2}, {"min_samples_split": 3}, {"min_samples_split": 5}, {"min_samples_split": 10}], [{"max_features": "sqrt"}, {"max_features": "log2"}, {"max_features": None}], [{"min_samples_leaf": 1}, {"min_samples_leaf": 3}, {"min_samples_leaf": 5}, {"min_samples_leaf": 10}], [{"bootstrap": True}, {"bootstrap": False}], [{"n_jobs": -1}], [{"ccp_alpha": 0}, {"ccp_alpha": 1}, {"ccp_alpha": 3}], [{"class_weight": "balanced"}, {"class_weight": "balanced_subsample"}]]},
{'name':'GaussianNB', 'params': [[{"var_smoothing": 1e-9}, {"var_smoothing": 1e-8}, {"var_smoothing": 1e-10}],[{"priors": None}]]},
{'name':'GaussianProcessClassifier', 'params': [[{"max_iter_predict": 100}, {"max_iter_predict": 400}, {"max_iter_predict": 1000}],[{"n_jobs": -1}], [{"multi_class": "one_vs_rest"}, {"multi_class": "one_vs_one"}]]},
{'name':'GradientBoostingClassifier', 'params': [[{"loss": "log_loss"}, {"loss": "exponential"}],[{"n_estimators": 100}, {"n_estimators": 10}, {"n_estimators": 1e4}, {"n_estimators": 1e5}, {"n_estimators": 1e10}, {"n_estimators": 5}], [{"subsample": 1}, {"subsample": 0.9}, {"subsample": 0.5}, {"subsample": 0.3}],[{"min_samples_split": 2}, {"min_samples_split": 4}, {"min_samples_split": 8}], [{"max_depth": None}, {"max_depth": 3}], [{"max_features": "auto"}, {"max_features": "sqrt"}, {"max_features": "log2"}, {"max_features": None}], [{"ccp_alpha": 0}, {"ccp_alpha": 1}, {"ccp_alpha": 10}, {"ccp_alpha": 10}]]},
{'name':'HistGradientBoostingClassifier', 'params': [[{"max_iter": 100}, {"max_iter": 1000}, {"max_iter": 500}],[{"max_leaf_nodes": 31}, {"max_leaf_nodes": None}], [{"min_samples_leaf": 20}, {"min_samples_leaf": 10}], [{"max_bins": 255}, {"max_bins": 200}, {"max_bins": 150}, {"max_bins": 75}]]},
{'name':'KNeighborsClassifier', 'params': [[{"n_neighbors": 5}],[{"weights": "uniform"}, {"weights": "distance"}], [{"leaf_size": 10}, {"leaf_size": 20}, {"leaf_size": 30}, {"leaf_size": 40}, {"leaf_size": 5}, {"leaf_size": 15}, {"leaf_size": 25}, {"leaf_size": 35}, {"leaf_size": 50}], [{"p": 2}, {"p": 1}, {"p": 3}, {"p": 4}], [{"n_jobs": -1}]]},
{'name':'LabelPropagation', 'params': [[{"kernel": "knn"}, {"kernel": "rbf"}],[{"gamma": 20}, {"gamma": 10}, {"gamma": 5}, {"gamma": 40}], [{"n_neighbors": 1}, {"n_neighbors": 3}, {"n_neighbors": 7}, {"n_neighbors": 10}, {"n_neighbors": 20}], [{"max_iter": 1000}, {"max_iter": 2000}, {"max_iter": 4000}, {"max_iter": 10000}], [{"n_jobs": -1}], [{"tol": 1e-3}, {"tol": 1e-2}, {"tol": 1e-4}]]},
{'name':'LabelSpreading', 'params': [[{"kernel": "knn"}, {"kernel": "rbf"}],[{"gamma": 20}, {"gamma": 10}, {"gamma": 5}, {"gamma": 40}], [{"n_neighbors": 1}, {"n_neighbors": 3}, {"n_neighbors": 7}, {"n_neighbors": 10}, {"n_neighbors": 20}], [{"max_iter": 300}, {"max_iter": 1000}, {"max_iter": 2000}, {"max_iter": 30}], [{"n_jobs": -1}], [{"alpha": 0.2}, {"alpha": 0.1}, {"alpha": 0}, {"alpha": 0.5}, {"alpha": 0.9}, {"alpha": 1}]]},
{'name':'LinearDiscriminantAnalysis', 'params': [[{"solver": "svd"}, {"solver": "lsqr"}, {"solver": "eigen"}],[{"shrinkage": None}, {"shrinkage": "auto"}]]},
{'name':'LinearSVC', 'params': [[{"C": 1}, {"C": 0.5}, {"C": 0.9}, {"C": 2}],[{"multi_class": "ovr"}, {"multi_class": "crammer_singer"}], [{"intercept_scaling": 1}, {"intercept_scaling": 2}, {"intercept_scaling": 3}, {"intercept_scaling": 5}], [{"class_weight": "balanced"}, {"class_weight": None}], [{"max_iter": 1000}, {"max_iter": 2000}, {"max_iter": 4000}, {"max_iter": 10000}]]},
{'name':'LogisticRegression', 'params': [[{"C":0.01}, {"C":0.1}, {"C":1}, {"C":10}],[{"solver":'lbfgs'}, {"solver":'liblinear'}, {"solver":'newton-cg'}, {"solver":'sag'}, {"solver":'saga'}], [{"intercept_scaling": 1}, {"intercept_scaling": 2}, {"intercept_scaling": 3}, {"intercept_scaling": 5}], [{"class_weight": "balanced"}, {"class_weight": None}], [{"n_jobs": -1}], [{"max_iter": 1000}, {"max_iter": 2000}, {"max_iter": 4000}, {"max_iter": 10000}]]},
{'name':'LogisticRegressionCV', 'params': [[{"cv": None}, {"cv": 1}, {"cv": 3}, {"cv": 5}, {"cv": 8}, {"cv": 1}],[{"Cs":0.01}, {"Cs":0.1}, {"Cs":1}, {"Cs":10}],[{"solver":'lbfgs'}, {"solver":'liblinear'}, {"solver":'newton-cg'}, {"solver":'sag'}, {"solver":'saga'}], [{"intercept_scaling": 1}, {"intercept_scaling": 2}, {"intercept_scaling": 3}, {"intercept_scaling": 5}], [{"class_weight": "balanced"}, {"class_weight": None}], [{"n_jobs": -1}], [{"max_iter": 1000}, {"max_iter": 2000}, {"max_iter": 4000}, {"max_iter": 10000}]]},
{'name':'MLPClassifier', 'params': [[{"hidden_layer_sizes": 100}, {"hidden_layer_sizes": 35}, {"hidden_layer_sizes": 200}, {"hidden_layer_sizes": 500}],[{"activation": "identity"}, {"activation": "logistic"}, {"activation": "tanh"}, {"activation": "relu"}],[{"solver": "lbfgs"}, {"solver": "sgd"}, {"solver": "adam"}], [{"alpha": 1e-4}, {"alpha": 1e-5}, {"alpha": 1e-3}, {"alpha": 1e-6}], [{"learning_rate": "constant"}, {"learning_rate": "invscaling"}, {"learning_rate": "adaptive"}], [{"max_iter": 200}, {"max_iter": 2000}, {"max_iter": 4000}, {"max_iter": 10000}]]},
{'name':'MultinomialNB', 'params': [[{"alpha": 0}, {"alpha": 1.0}, {"alpha": 100.0}, {"alpha": 1e-12}, {"alpha": 1e-6}, {"alpha": 1e-3}],[{"fit_prior": True}, {"fit_prior": False}]]},
{'name':'NearestCentroid', 'params': [[{"metric": "euclidean"}, {"metric": "manhattan"}, {"metric": "cosine"}, {"metric": "haversine"}, {"metric": "cityblock"}],[{"shrink_threshold": None}, {"shrink_threshold": 1e-4}, {"shrink_threshold": 1e4}]]},
{'name':'NuSVC', 'params': [[{"C":1e-4}, {"C":1e-3}, {"C":1e-2}, {"C":0.1}, {"C":1}],[{"kernel":"linear"}, {"kernel":"rbf"}, {"kernel":"sigmoid"}, {"kernel":"poly"}], [{"degree":1}, {"degree":2}, {"degree":3}, {"degree":4}]]}, # NOTE(review): NuSVC is parameterised by "nu", not "C", so set_params rejects every combination here; pick "nu" values in (0, 1]
{'name':'PassiveAggressiveClassifier', 'params': [[{"C":1e-4}, {"C":1e-3}, {"C":1e-2}, {"C":0.1}, {"C":1}],[{"max_iter": 1000}, {"max_iter": -1}], [{"n_jobs": -1}]]},
{'name':'Perceptron', 'params': [[{"penalty": "l1"}, {"penalty": "l2"}, {"penalty": "elasticnet"}, {"penalty": None}],[{"alpha": 1e-4}, {"alpha": 1e-3}, {"alpha": 1e-2}, {"alpha": 1e-5}], [{"l1_ratio": 0.15}, {"l1_ratio": 0.05}, {"l1_ratio": 0.25}, {"l1_ratio": 0.5}, {"l1_ratio": 0.75}]]},
{'name':'QuadraticDiscriminantAnalysis', 'params': [[{"reg_param": 0.1}, {"reg_param": 0.2}, {"reg_param": 0.3}, {"reg_param": 0.4}, {"reg_param": 0.5}],[{"tol": 1e-4}, {"tol": 1e-3}, {"tol": 1e-6}]]},
{'name':'RadiusNeighborsClassifier', 'params': [[{"radius": 1}, {"radius": 2}, {"radius": 3}, {"radius": 1e-1}, {"radius": 1e-2}],[{"weights": "uniform"}, {"weights": "distance"}], [{"algorithm": "auto"}, {"algorithm": "ball_tree"}, {"algorithm": "kd_tree"}, {"algorithm": "brute"}], [{"p": 1}, {"p": 2}], [{"metric": "euclidean"}, {"metric": "manhattan"}, {"metric": "cosine"}, {"metric": "haversine"}, {"metric": "cityblock"}, {"metric": "minkowski"}], [{"n_jobs": -1}]]},
{'name':'RandomForestClassifier', 'params': [[{"n_estimators": 100}, {"n_estimators": 1e4}, {"n_estimators": 1e6}, {"n_estimators": 10}],[{"criterion": "gini"}, {"criterion": "entropy"}, {"criterion": "log_loss"}], [{"min_samples_split": 2}, {"min_samples_split": 3}, {"min_samples_split": 5}, {"min_samples_split": 1}], [{"max_features": "sqrt"}, {"max_features": "log2"}], [{"bootstrap": True}, {"bootstrap": False}], [{"n_jobs": -1}, {"n_jobs": None}], [{"ccp_alpha": 0}, {"ccp_alpha": 0.1}, {"ccp_alpha": 0.01}]]},
{'name':'RidgeClassifier', 'params': [[{"solver": "svd"}, {"solver": "lsqr"}, {"solver": "cholesky"}, {"solver": "sparse_cg"}, {"solver": "sag"}, {"solver": "saga"}, {"solver": "lbfgs"}], [{"alpha": 1}, {"alpha": 1e-2}, {"alpha": 1e-4}, {"alpha": 1e2}, {"alpha": 1e4}],[{"positive": True}, {"positive": False}]]},
{'name':'SGDClassifier', 'params': [[{"loss": "hinge"}, {"loss": "log_loss"}, {"loss": "modified_huber"}, {"loss": "squared_hinge"}, {"loss": "perceptron"}, {"loss": "squared_error"}, {"loss": "huber"}, {"loss": "epsilon_insensitive"}, {"loss": "squared_epsilon_insensitive"}],[{"penalty": "l1"}, {"penalty": "l2"}, {"penalty": "elasticnet"}, {"penalty": None}], [{"alpha": 1e-4}, {"alpha": 1e-3}, {"alpha": 1e-2}, {"alpha": 1e-1}, {"alpha": 1}, {"alpha": 1e2}, {"alpha": 1e3}, {"alpha": 1e4}], [{"l1_ratio": 0.15}, {"l1_ratio": 0.05}, {"l1_ratio": 0.25}, {"l1_ratio": 0.5}, {"l1_ratio": 0.75}], [{"max_iter": 1e4}, {"max_iter": 1e5}, {"max_iter": 1e6}], [{"n_jobs": None}, {"n_jobs": -1}], [{"epsilon": 1e-1}, {"epsilon": 1}, {"epsilon": 1e-2}], [{"learning_rate": "constant"}, {"learning_rate": "optimal"}, {"learning_rate": "invscaling"}, {"learning_rate": "adaptive"}], [{"eta0": 1e-4}]]},
{'name':'SVC', 'params': [[{"C":0.01}, {"C":0.1}, {"C":1}, {"C":10}],[{"kernel":"linear"}, {"kernel":"rbf"}, {"kernel":"sigmoid"}, {"kernel":"poly"}], [{"degree":1}, {"degree":2}, {"degree":3}, {"degree":4}]]}
]

In [None]:
# Expand every model family's parameter groups into the full grid of combinations:
# the cartesian product that picks exactly one option from each parameter group.
import itertools

# running total of combinations across all families, for stats
total_nr_models = 0
# one {"name": ..., "params": [<kwargs dict>, ...]} entry per model family
params_by_model = []

for model_family in all_possible_model_params:
    params = model_family["params"]

    # every way of choosing one single-key dict out of each group
    combinations = list(itertools.product(*params))

    # flatten each tuple of single-key dicts into one keyword-argument dict
    param_dict = [
        {key: value for option in combination for key, value in option.items()}
        for combination in combinations
    ]

    # count models
    total_nr_models += len(param_dict)

    # keep the combinations next to the model name they belong to
    params_by_model.append({"name": model_family["name"], "params": param_dict})

print(total_nr_models) # that's a lot

In [None]:

# Pair every instantiated classifier with the parameter combinations defined for it.
print(len(params_by_model))

# Look the combinations up by classifier name rather than by list position: with positional
# pairing a single missing or extra classifier shifts every following pair, and a mismatch
# used to fall through and append whatever `clf_params` was left over from the previous iteration.
params_lookup = {entry["name"]: entry["params"] for entry in params_by_model}

model_classes = []

for clf_name, clf in estimators_trimmed:

    # normalise to a plain str so the lookup and the message below behave the same for any name type
    clf_name = str(clf_name)

    if clf_name in params_lookup:
        clf_params = params_lookup[clf_name]
    else:
        # no grid configured for this classifier: report it and hand over an empty list,
        # which the training loop treats as "nothing to train"
        print("Something went wrong! - No params defined for " + clf_name + "!\n")
        clf_params = []

    model_classes.append([clf, clf_name, clf_params])


In [None]:
# Fit every classifier with every parameter combination on the categorical training data and
# record validation metrics plus timings for each attempt.
import time

from sklearn import metrics
from sklearn.base import clone

# let's get some statistics about this process
global_stats = {
    "trained_models": 0,
    "failed_models": 0,
    "total_models": total_nr_models,
    "training_time": 0,
    "total_training_time": 0
}

# one summary dict per classifier that had at least one parameter combination
model_stats = []

# total time start
t_start = time.time()
# longest time spent on all combinations of a single classifier
longest_total_model_train_time = 0

# one tuple per attempt:
# (name, estimator, params, accuracy, f1, log_loss, precision, recall, seconds)
insights = []
for Model, modelname, params_list in model_classes:

    # nothing to train when no parameter combinations were configured
    if params_list == []:
        continue

    # model timer start
    t_start_model = time.time()
    # longest single attempt for this classifier
    longest_param_train_time = 0

    for params in params_list:
        # param timer start
        t_start_param = time.time()

        print("Starting " + str(modelname) + " with params: " + str(params) + "... " + str(global_stats["trained_models"] + global_stats["failed_models"]) + " OUT OF " + str(global_stats["total_models"]))

        # metric values reported when the attempt fails
        model_accuracy = 0
        model_log_loss = 0
        model_precision = 0
        model_recall = 0
        model_f1 = 0

        # `Model` is an already constructed estimator instance; fall back to it if cloning fails
        model = Model

        # unsupported parameter values and fit errors are expected, so keep going on failure
        try:
            # work on a fresh unfitted copy so every entry in `insights` keeps its own estimator
            # instead of all entries pointing at the one shared, repeatedly refitted instance
            model = clone(Model)
            model.set_params(**params)
            model.fit(df_training_categorical, JSat_training)

            JSat_predicted = model.predict(df_validation_categorical)

            # compute model accuracy
            model_accuracy = metrics.accuracy_score(JSat_validation, JSat_predicted)
            # compute model precision
            model_precision = metrics.precision_score(JSat_validation, JSat_predicted, average="micro")
            # compute model recall
            model_recall = metrics.recall_score(JSat_validation, JSat_predicted, average="micro")
            # compute model f1 score; defined as 0 when precision and recall are both 0
            precision_plus_recall = model_precision + model_recall
            if precision_plus_recall:
                model_f1 = 2*(model_precision*model_recall)/precision_plus_recall
        except Exception as e:
            # the elapsed time is measured here as well: it is needed for the message and the stats
            param_time = time.time() - t_start_param
            print(" failed in " + str(param_time) + "s;\n")
            print(e)
            global_stats["failed_models"] += 1
        else:
            param_time = time.time() - t_start_param
            print(" finished in " + str(param_time) + "s; had an f1 of: " + str(model_f1) + "\n")
            global_stats["trained_models"] += 1

        if longest_param_train_time < param_time:
            longest_param_train_time = param_time

        insights.append((modelname, model, params, model_accuracy, model_f1, model_log_loss, model_precision, model_recall, param_time))

    # model timer end
    t_end_model = time.time()
    # get total model train time
    model_time = t_end_model - t_start_model
    # compare longest total model train time
    if longest_total_model_train_time < model_time:
        longest_total_model_train_time = model_time

    # let's get stats by model
    model_stats.append({
        "name": modelname,
        # number of parameters set per combination, read from the first combination
        "tuned_params": len(params_list[0]),
        "variations_tested": len(params_list),
        "total_train_time": model_time,
        "longest_train_time": longest_param_train_time,
        })

# total time end
t_end = time.time()
# compute total time
t_total = t_end - t_start

global_stats["total_training_time"] = t_total

print("\nFinalized all trainings in " + str(t_total) + "s. Phew!")


In [None]:
# Group the training insights by classifier name so each model type can be plotted on its own.
plt_models_by_clf = {}

# Each insight is (name, estimator, params, accuracy, f1, log_loss, precision, recall, seconds);
# the names below follow that order so every value lands under the matching key.
for model_name, model_class, model_params, model_accuracy, model_f1, model_log_loss, model_precision, model_recall, model_time in insights:
    # setdefault creates the per-model list the first time a model name is seen
    plt_models_by_clf.setdefault(model_name, []).append({
        "model_class": model_class,
        "model_params": model_params,
        "model_accuracy": model_accuracy,
        "model_f1": model_f1,
        # no separate F-beta score is recorded in the insights; F1 is F-beta with beta=1,
        # and the key is kept so existing consumers of it keep working
        "model_fbeta": model_f1,
        "model_log_loss": model_log_loss,
        "model_precision": model_precision,
        "model_recall": model_recall,
        "model_time": model_time
    })

In [None]:
# Pick, for every classifier, the parameter combination(s) with the best validation accuracy.
#
# Result: plt_model_best is a list with one dict per classifier, in first-seen order:
#   {"name": <classifier name>, "parameters": [<params dict>, ...], "accuracy": <best accuracy>}
# "parameters" is a list because several combinations can tie for the best accuracy.
# Failed attempts are recorded with accuracy 0 and take part like any other attempt.
best_by_name = {}

# each insight starts with (name, estimator, params, accuracy, ...); the rest is not needed here
for model_name, _model_class, model_params, model_score, *_other_metrics in insights:
    current_best = best_by_name.get(model_name)

    if current_best is None or current_best["accuracy"] < model_score:
        # first attempt for this classifier, or a strictly better one: start a new record
        best_by_name[model_name] = {"name": model_name, "parameters": [model_params], "accuracy": model_score}
    elif current_best["accuracy"] == model_score:
        # tie: remember this combination next to the ones already found
        current_best["parameters"].append(model_params)

plt_model_best = list(best_by_name.values())

                    
