In [75]:
import pandas as pd
import numpy as np
import os

from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='classifier')

def getClassifiers():
    all_clfs_names = []
    all_clfs_clf = []
    for name, ClassifierClass in estimators:
        try:
            clf = ClassifierClass()
            all_clfs_names.append(name)
            all_clfs_clf.append(clf)

            # develop
            # print("{'name':'" + name + "', 'params': [[{}],[{}]]},")
        except Exception as e:
            continue
            # print('Unable to import', name)
            # print(e)
    return all_clfs_names, all_clfs_clf

df = pd.read_csv(r'data/data_processed.csv')
df_continous = pd.read_csv(r'data/data_continous.csv')
df_categorical = pd.read_csv(r'data/data_categorical.csv')

# --------------------------------------------------------------------------------------------------
# copying job satisfaction into new array
JSat = df['JobSatisfaction'].values

# trying only binary classification
# any value of satisfaction greater than two will be "Satisfied" and anything less or equal will be "Unsatisfied"
JSat_threshold = 2

JSat_binary = [0 if val <= JSat_threshold else val for val in JSat]
JSat_binary = [1 if val > JSat_threshold else val for val in JSat_binary]
# print(JSat_binary)

# copying attrition into new column
JAtt = df["Attrition"].values

# Sivas Code -------------------------------------------------------------
# droplist = ['BusinessTravel','DailyRate','EmployeeNumber','HourlyRate','MonthlyRate','NumCompaniesWorked','Over18','StandardHours','TrainingTimesLastYear', 'Attrition']
# for val in droplist:
#     if val in df.columns.values:
#         df.drop(val, axis=1, inplace=True)
#         print(val + " dropped from the dataset successfully.")
# ---------------------------------------------------------------------------

# splitting inputs by row index
df = pd.read_csv(r'data/data_processed.csv')
df_cat = pd.read_csv(r'data/data_categorical.csv')

df_good = pd.concat([df_cat, df["MonthlyIncome"], df["BusinessTravel"], df["StockOptionLevel"], df["DistanceFromHome"]], axis=1)

# print(df_good)
# copy attrition and drop from dataframe
JAtt = df["Attrition"].values

df_good = df_good.drop(["Unnamed: 0"], axis=1)
df_good = df_good.drop(['Attrition'], axis=1)
# print(df_good)
# splitting outputs by number
JSat_training = JSat[:1200]
JSat_validation = JSat[1200:]

# splitting new df
df_train = df_good.iloc[:1200,:]
df_val = df_good.iloc[1200:,:]
# splitting Jatt into new sizes
JAtt_train = JAtt[:1200]
JAtt_val = JAtt[1200:]
# --------------------------------------------------------------------------------------------------

all_clfs_names, all_clfs_clf = getClassifiers()
estimators_trimmed = zip(all_clfs_names, all_clfs_clf)

# remove RidgeClassifierCV because it's included in RidgeClassifier with this script
estimators_trimmed = [(i,j) for i,j in estimators_trimmed if i != "RidgeClassifierCV"]
# remove because it should only be used with ensemble methods and is out of scope rn
estimators_trimmed = [(i,j) for i,j in estimators_trimmed if i != "ExtraTreeClassifier"]

# remove these because they have poor performance on our dataset
for name in ["BernoulliNB" , "CalibratedClassifierCV" , "CategoricalNB" , "ComplementNB" , "DecisionTreeClassifier" , "DummyClassifier" , "GaussianProcessClassifier" , "KNeighborsClassifier" ,  "LabelPropagation" , "LabelSpreading" , "LinearDiscriminantAnalysis" , "LinearSVC" , "LogisticRegression" , "LogisticRegressionCV" , "MLPClassifier" , "MultinomialNB" , "NearestCentroid" , "NuSVC" , "PassiveAggressiveClassifier" , "Perceptron" , "RadiusNeighborsClassifier" , "RidgeClassifier" , "SGDClassifier", "SVC"]:
    estimators_trimmed = [(i,j) for i,j in estimators_trimmed if i != name]

print(str(estimators_trimmed))

[('AdaBoostClassifier', AdaBoostClassifier()), ('BaggingClassifier', BaggingClassifier()), ('ExtraTreesClassifier', ExtraTreesClassifier()), ('GaussianNB', GaussianNB()), ('GradientBoostingClassifier', GradientBoostingClassifier()), ('HistGradientBoostingClassifier', HistGradientBoostingClassifier()), ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()), ('RandomForestClassifier', RandomForestClassifier())]


In [76]:
# define possible parameters for all models, this is the mother of config files :)
all_possible_model_params = [
{'name':'AdaBoostClassifier', 'params': [[{"n_estimators":100}, {"n_estimators":50}, {"n_estimators":200}],[{"random_state":None}],[{"learning_rate":0.1}, {"learning_rate":1}, {"learning_rate":10}, {"learning_rate":100}]]}, # can implement estimator = any decisiion tree classifier with params... default = DecisionTreeClassifier(max_depth=1)
{'name':'BaggingClassifier', 'params': [[{"n_estimators":100}, {"n_estimators":50}, {"n_estimators":200}, {"n_estimators":10}, {"n_estimators":5}],[{"max_samples": 1.0},{"max_samples": 2.0},{"max_samples": 0.1},{"max_samples": 0.5},{"max_samples": 1},{"max_samples": 10}],[{"max_features": 1.0},{"max_features": 2.0},{"max_features": 2.5},{"max_features": 10}],[{"bootstrap": False}, {"bootstrap": True}]]}, # same as above
{'name':'ExtraTreesClassifier', 'params': [[{"n_estimators":100}, {"n_estimators":50}, {"n_estimators":200}],[{"criterion": "gini"}, {"criterion": "entropy"}, {"criterion": "log_loss"}], [{"min_samples_split": 2}, {"min_samples_split": 3}, {"min_samples_split": 5}, {"min_samples_split": 10}], [{"max_features": "sqrt"}, {"max_features": "log2"}, {"max_features": None}], [{"min_samples_leaf": 1}, {"min_samples_leaf": 3}, {"min_samples_leaf": 5}, {"min_samples_leaf": 10}], [{"bootstrap": True}, {"bootstrap": False}], [{"n_jobs": -1}], [{"ccp_alpha": 0}, {"ccp_alpha": 1}, {"ccp_alpha": 3}], [{"class_weight": "balanced"}, {"class_weight": "balanced_subsample"}]]},
{'name':'GaussianNB', 'params': [[{"var_smoothing": 1e-9}, {"var_smoothing": 1e-8}, {"var_smoothing": 1e-10}],[{"priors": None}]]},
{'name':'GradientBoostingClassifier', 'params': [[{"loss": "log_loss"}, {"loss": "exponential"}],[{"n_estimators": 100}, {"n_estimators": 10}, {"n_estimators": 1e4}, {"n_estimators": 1e5}, {"n_estimators": 1e10}, {"n_estimators": 5}], [{"subsample": 1}, {"subsample": 0.9}, {"subsample": 0.5}, {"subsample": 0.3}],[{"min_samples_split": 2}, {"min_samples_split": 4}, {"min_samples_split": 8}], [{"max_depth": None}, {"max_depth": 3}], [{"max_features": "auto"}, {"max_features": None}], [{"ccp_alpha": 0}, {"ccp_alpha": 1}, {"ccp_alpha": 10}, {"ccp_alpha": 10}]]},
{'name':'HistGradientBoostingClassifier', 'params': [[{"max_iter": 100}, {"max_iter": 1000}, {"max_iter": 500}],[{"max_leaf_nodes": 31}, {"max_leaf_nodes": None}], [{"min_samples_leaf": 20}, {"min_samples_leaf": 10}], [{"max_bins": 255}, {"max_bins": 200}, {"max_bins": 150}, {"max_bins": 75}]]},
{'name':'QuadraticDiscriminantAnalysis', 'params': [[{"reg_param": 0.1}, {"reg_param": 0.2}, {"reg_param": 0.3}, {"reg_param": 0.4}, {"reg_param": 0.5}],[{"tol": 1e-4}, {"tol": 1e-3}, {"tol": 1e-6}]]},
{'name':'RandomForestClassifier', 'params': [[{"n_estimators": 100}, {"n_estimators": 200}, {"n_estimators": 400}, {"n_estimators": 1e4}, {"n_estimators": 1e6}, {"n_estimators": 10}],[{"criterion": "gini"}, {"criterion": "entropy"}, {"criterion": "log_loss"}], [{"min_samples_split": 2}, {"min_samples_split": 3}, {"min_samples_split": 5}, {"min_samples_split": 1}], [{"max_features": "sqrt"}, {"max_features": "log2"}], [{"bootstrap": True}, {"bootstrap": False}], [{"n_jobs": -1}], [{"ccp_alpha": 0}, {"ccp_alpha": 0.1}, {"ccp_alpha": 0.01}, {"ccp_alpha": 1e-3}]]},
# {'name':'SVC', 'params': [[{"C":0.01}, {"C":0.1}, {"C":1}, {"C":10}],[{"kernel":"linear"}, {"kernel":"rbf"}, {"kernel":"sigmoid"}, {"kernel":"poly"}], [{"degree":1}, {"degree":2}, {"degree":3}, {"degree":4}]]}
]

In [77]:
# pair all possible parameters of a type with others of a different types... basically combinations of n by k from the parameters
import itertools

# count models for stats
total_nr_models = 0
# store params list by model
params_by_model = []

for model_family in all_possible_model_params:
    params = model_family["params"]

    # initialize an empty param dict for model
    param_dict = []
    # combinations of n by k from unique available params
    combinations = [p for p in itertools.product(*params)]
    # concatenate each tuple of unique params into a dict and save into param_dict
    for combination in combinations:
        param_dict.append(dict(itertools.chain.from_iterable(d.items() for d in combination)))

    # count models
    total_nr_models = total_nr_models + len(param_dict)

    # append param list asssociated with model name
    params_by_model.append({"name": model_family["name"], "params": param_dict})
    
    # debug
    # print(param_dict)
    # break

print(total_nr_models) # f*ck that's a lot

8958


In [78]:

print(len(params_by_model))
model_classes = []
i = 0

for clf_name, clf in estimators_trimmed:
    
    # some errors without, unknown casting bug ?
    clf_name = str(clf_name)

    # make sure we are not passing the wrong parameters to the classifier
    if params_by_model[i]["name"] == clf_name:
        clf_params = params_by_model[i]["params"]
    else:
        print("Something went wrong! - Tried passing params from " + params_by_model[i]["name"] + " to " + clf_name + "!\n")

    model_classes.append([clf, clf_name, clf_params])
    # debug
    # print("Appended " + str(clf_name) + " with params " + str(clf_params) + "to object " + str(clf))
    # break
    
    # increment our index for the parameter lookup
    i += 1


8


In [79]:
import time
from sklearn import metrics

# so we can hopefully clear some output cell from overflowing...
from IPython.display import clear_output

# let's get some statistics about this process
global_stats = {
    "trained_models": 0,
    "failed_models": 0,
    "total_models": total_nr_models,
    "training_time": 0,
    "total_training_time": 0
}

model_stats = []

# let's also time the execution of these, and total execution :)
# total time start
t_start = time.time()
# var for longest model train time
longest_total_model_train_time = 0

insights = []
for Model, modelname, params_list in model_classes:

    # check that params are initialized
    if params_list != []:

        # model timer start
        t_start_model = time.time()
        # longest param train time
        longest_param_train_time = 0

        for params in params_list:
            # param timer start
            t_start_param = time.time()

            clear_output()

            print("Starting " + str(modelname) + " with params: " + str(params) + "... " +
                  str(global_stats["trained_models"] + global_stats["failed_models"]) + " OUT OF " + str(global_stats["total_models"]))

            # define insight vars in case of failure
            model_accuracy = 0
            model_log_loss = 0
            model_precision = 0
            model_recall = 0
            model_f1 = 0

            # in case something goes wrong or unsupported params
            try:
                # some models fail when called with args ? unknown bug
                model = Model
                # some models fail with *kwargs
                model.set_params(**params)
                model.fit(df_train, JAtt_train)

                JAtt_predicted = model.predict(df_val)

                # compute model accuracy
                model_accuracy = metrics.accuracy_score(
                    JAtt_val, JAtt_predicted)
                # compute model log loss
                # model_log_loss = metrics.log_loss(JSat_validation, JSat_predicted, average="micro")
                # compute model precision
                model_precision = metrics.precision_score(
                    JAtt_val, JAtt_predicted, average="micro")
                #  compute model precision
                model_recall = metrics.recall_score(
                    JAtt_val, JAtt_predicted, average="micro")
                # compute model f1 score
                model_f1 = 2*(model_precision*model_recall) / \
                    (model_precision + model_recall)

                # param timer end
                t_end_param = time.time()
                # compute elapsed
                param_time = t_end_param - t_start_param

                print(" finished in " + str(param_time) +
                      "s; had an f1 of: " + str(model_f1) + "\n")
                global_stats["trained_models"] += 1
            except Exception as e:
                print(" failed in " + str(param_time) + "s;\n")
                print(e)
                global_stats["failed_models"] += 1

            if longest_param_train_time < param_time:
                longest_param_train_time = param_time

            insights.append((modelname, model, params, model_accuracy, model_f1,
                            model_log_loss, model_precision, model_recall, param_time))

        # model timer end
        t_end_model = time.time()
        # get total model train time
        model_time = t_end_model - t_start_model
        # compare longest total model train time
        if longest_total_model_train_time < model_time:
            longest_total_model_train_time = model_time

        # let's get stats by model
        model_stats.append({
            "name": modelname,
            "tuned_params": len(params),
            "variations_tested": len(params_list),
            "total_train_time": t_end_model - t_start_model,
            "longest_train_time": longest_param_train_time,
        })

# total time end
t_end = time.time()
# compute total time
t_total = t_end - t_start

global_stats["total_training_time"] = t_total

print("\nFinalized all trainings in " + str(t_total) + "s. Phew!")


Starting RandomForestClassifier with params: {'n_estimators': 10, 'criterion': 'log_loss', 'min_samples_split': 1, 'max_features': 'log2', 'bootstrap': False, 'n_jobs': -1, 'ccp_alpha': 0.001}... 8957 OUT OF 8958
 failed in 0.03798413276672363s;

min_samples_split == 1, must be >= 2.

Finalized all trainings in 2229.2598531246185s. Phew!


In [80]:
# in case I forgot something (and damn it I did... so many times...) and because the computation time is long I don't want to waste all the computation that I already did, so let's save it into a log file
# maybe I should've spent more time on a way to cache the models... not sure I have enough space for that. Hey! Guess I should've used google collab...
from datetime import datetime
import pickle

# datetime object containing current date and time
dt_string = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
print("date and time =", dt_string)

# save current insights into a file
with open("out/logs/insights-" + dt_string + ".pck", 'wb') as f:
    pickle.dump(insights, f)
with open("out/logs/globalstats-" + dt_string + ".pck", 'wb') as f:
    pickle.dump(global_stats, f)
with open("out/logs/modelstats-" + dt_string + ".pck", 'wb') as f:
    pickle.dump(model_stats, f)

f.close()

date and time = 24-01-2023-01-12-03


In [43]:
%%time
import pickle
# ^^^^ prevents this cell from being executed automatically
# load an insights log file
load_file_name = "insights-23-01-2023-20-16-17" + ".pck"
with open(load_file_name, 'rb') as f:
    insights = pickle.load(f)

CPU times: total: 141 ms
Wall time: 116 ms


In [81]:
# visualize plots by different model parameters
plt_models_by_clf = {}

# reformat insights data to easier fit our plot requirements
for model_name, model_class, model_params, model_accuracy, model_f1, model_fbeta, model_log_loss, model_precision, model_time in insights:
    # check if key exists and if not add new dict key with model type (name)
    if not model_name in plt_models_by_clf:
        plt_models_by_clf.update({str(model_name): []})
    
    plt_models_by_clf[model_name].append({
        "model_class": model_class,
        "model_params": model_params,
        "model_accuracy": model_accuracy,
        "model_f1": model_f1,
        "model_fbeta": model_fbeta,
        "model_log_loss": model_log_loss,
        "model_precision": model_precision,
        "model_time": model_time
    })

In [83]:
# pick best accuracy models and compare them to the rest
plt_model_best = []
_count = 0
for model_name, model_class, model_params, model_accuracy, model_f1, model_fbeta, model_log_loss, model_precision, model_time in insights:
    mdl_found = False
    for mdl in plt_model_best:
        if mdl["name"] == model_name:
            mdl_found = True
    
    if not mdl_found:
        plt_model_best.append({"name": model_name, "model_class": model_class, "parameters": model_params, "f1_score": model_f1, "model_time": model_time})
    else:
        for best_name, _, best_parameters, best_f1, best_time in plt_model_best:
            _count = _count + 1
            if best_name == model_name:
                print(best_name + " == " + model_name)
                if best_f1 < model_f1:
                    plt_model_best[_count]["parameters"] = model_params
                    plt_model_best[_count]["f1_score"] = model_f1
                    plt_model_best[_count]["model_time"] = model_time
                    plt_model_best[_count]["model_class"] = model_class
                if best_f1 == model_f1 and best_time == model_time:
                    plt_model_best[_count]["parameters"] = best_parameters + model_params
                if best_f1 == model_f1 and best_time > model_time:
                    plt_model_best[_count]["parameters"] = model_params
                    plt_model_best[_count]["model_time"] = model_time
                    plt_model_best[_count]["model_class"] = model_class
                
# print(str(plt_model_best))


In [84]:
# print all models
for model in plt_model_best:
    print("Model " + model["name"] + " with best f1 score " + str(model["f1_score"]) + " and it took " + str(round(model["model_time"], 2)) + "s to train")


Model AdaBoostClassifier with best f1 score 0.837037037037037 and it took 0.3s to train
Model BaggingClassifier with best f1 score 0.7037037037037037 and it took 0.9s to train
Model ExtraTreesClassifier with best f1 score 0.837037037037037 and it took 0.29s to train
Model GaussianNB with best f1 score 0.8407407407407409 and it took 0.0s to train
Model GradientBoostingClassifier with best f1 score 0.7296296296296296 and it took 0.91s to train
Model HistGradientBoostingClassifier with best f1 score 0.8481481481481481 and it took 0.9s to train
Model QuadraticDiscriminantAnalysis with best f1 score 0.8444444444444444 and it took 0.01s to train
Model RandomForestClassifier with best f1 score 0.8518518518518519 and it took 0.31s to train


In [91]:
# print models with best f1 score
best_f1 = 0
best_models = []

for model in plt_model_best:
    if model["f1_score"] > best_f1:
        best_models = [model]
    else:
        best_models.append(model)
        best_f1 = model["f1_score"]

for model in best_models:
    print("Model " + model["name"] + " with best f1 score " + str(model["f1_score"]) + " and it took " + str(round(model["model_time"], 2)) + "s to train")



Model RandomForestClassifier with best f1 score 0.8518518518518519 and it took 0.31s to train


In [92]:
# print model with best f1 and train time
best_time = 1000
best_model = None

for model in best_models:
    if model["model_time"] < best_time:
        best_model = model
        best_time = model["model_time"]

print(str(best_model))


{'name': 'RandomForestClassifier', 'model_class': RandomForestClassifier(bootstrap=False, ccp_alpha=0.001, criterion='log_loss',
                       max_features='log2', min_samples_split=1,
                       n_estimators=10, n_jobs=-1), 'parameters': {'n_estimators': 100, 'criterion': 'gini', 'min_samples_split': 2, 'max_features': 'sqrt', 'bootstrap': True, 'n_jobs': -1, 'ccp_alpha': 0}, 'f1_score': 0.8518518518518519, 'model_time': 0.3071432113647461}
