In [76]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import time
from imblearn.under_sampling import RandomUnderSampler


In [77]:
class Dataset():
    def __init__(self):

        # whole dataset
        df = pd.read_csv('dataset_2.csv')
        self.y = df['K2Q31A']
        self.X = df.drop(columns='K2Q31A')

        # chi square
        self.chi = pd.read_csv('chi.csv')
        self.chi = self.chi.loc[self.chi['Dr Sheikhy'] == 'Y']['feature name']

        # fisher's score
        self.fisher = pd.read_csv('fisher.csv')
        self.fisher = self.fisher.loc[self.fisher['Dr Sheikhy'] == 'Y']['Feature Name']

        # information gain
        self.inf = pd.read_csv('inf-gain.csv')
        self.inf = self.inf.loc[self.inf['Dr Sheikhy'] == 'Y']['Feature Name']

        # corelation
        self.cor = pd.read_csv('cor.csv')
        self.cor = self.cor.iloc[:,[0, 14]].where(self.cor.iloc[:,14] == 'Y').dropna().iloc[:, 0]

    def return_dataset(self) -> pd.DataFrame:
        return self.X, self.y

    def return_chi(self) -> pd.Series:
        return self.chi

    def return_fisher(self) -> pd.Series:
        return self.fisher

    def return_inf(self) -> pd.Series:
        return self.inf

    def return_cor(self) -> pd.Series:
        return self.cor

    # intersections of 2 sets
    def return_intersection_chi_fisher(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()))

    def return_intersection_chi_inf(self) -> list:
        return list(set(self.chi.tolist()) & set(self.inf.tolist()))

    def return_intersection_chi_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.cor.tolist()))

    def return_intersection_fisher_inf(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.inf.tolist()))

    def return_intersection_fisher_cor(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.cor.tolist()))

    def return_intersection_inf_cor(self) -> list:
        return list(set(self.inf.tolist()) & set(self.cor.tolist()))

    # intersections of 3 sets
    def return_intersection_chi_fisher_inf(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()) & set(self.inf.tolist()))

    def return_intersection_chi_fisher_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()) & set(self.cor.tolist()))

    def return_intersection_chi_inf_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.inf.tolist()) & set(self.cor.tolist()))

    def return_intersection_fisher_inf_cor(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.inf.tolist()) & set(self.cor.tolist()))




In [78]:
class Model():
    def __init__(self, no_iters: int, selection: list, x: pd.DataFrame, y: pd.DataFrame, filename: str):

        self.no_iters = no_iters

        self.x = x.loc[:, x.columns.isin(selection)]
        self.y = y
        
        self.filename = str(self._create_filename(filename) + '.text').replace("'", "")

        # [name, acc, f1_score, recall, precision, time]
        # self.xgboost_result = []
        # self.adaboost_result = []
        # self.gradient_boost_result = []
        # self.random_forest_result = []
        # self.svm_result = []

        # classifiers
        self.xgboost_classifier = XGBClassifier(max_depth=5,
        learning_rate=0.005,
        n_estimators=100,
        objective='binary:logistic',
        random_state=42)

        self.adaboost_classifier = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=100,
        learning_rate=0.05,
        algorithm='SAMME.R',
        random_state=42,
        base_estimator='deprecated')

        self.svm_classifier = SVC()

        self.random_forest_classifier = RandomForestClassifier()

        self.gradient_boost_classifier = GradientBoostingClassifier()

        # undersampling
        self.undersample = RandomUnderSampler(sampling_strategy='majority')

    def _append_items(self, target: list, name, acc, f1_score, recall, precision, time, no_iters):
        target.append(name)
        target.append(acc)
        target.append(f1_score)
        target.append(recall)
        target.append(precision)
        target.append(time)
        target.append(no_iters)
        return target

    def _train(self, classifier, random_state=0):

        X, y = self.undersample.fit_resample(self.x, self.y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

        y_train = -(y_train - 2)
        y_test = -(y_test - 2)

        start_time = time.time()

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)

        end_time = time.time()

        time0 = end_time - start_time
        accuracy = accuracy_score(y_test, y_pred) * 100
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)



        return accuracy, f1, recall, precision, time0
    # [name, acc, f1_score, recall, precision, time]


    def display_result(self, result, print_result=True):
        filename = self.filename
        with open(filename, 'a') as file:
            result_string = (f'algorithm name: {result[0]}\n'
                f'number of iterations: {str(self.no_iters)}\n'
                f'average accuracy: {(result[1]).mean():.4f}' + '\n'
                f'max accuracy: {(result[1]).max():.4f}' + '\n'
                f'std accuracy: {(result[1]).std():.4f}' + '\n'
                f'average f1 score: {(result[2]).mean():.4f}' + '\n'
                f'max f1 score: {(result[2]).max():.4f}' + '\n'
                f'std f1 score: {(result[2]).std():.4f}' + '\n'
                f'average recall: {(result[3]).mean():.4f}' + '\n'
                f'max recall: {(result[3]).max():.4f}' + '\n'
                f'std recall: {(result[3]).std():.4f}' + '\n'
                f'average precision: {(result[4]).mean():.4f}' + '\n'
                f'max precision: {(result[4]).max():.4f}' + '\n'
                f'std precision: {(result[4]).std():.4f}' + '\n'
                f'average time: {result[5].mean():.4f}' + '\n\n\n')
            if print_result:
                print(result_string)
            file.write(result_string)
        file.close()


    def fit_algorithm(self, classifier):

        scores = []
        times = []
        f1_scores = []
        recall_scores = []
        precision_scores = []

        for _ in range(self.no_iters):

            accuracy, f1, recall, precision, time0 = self._train(classifier)

            scores.append(accuracy)
            f1_scores.append(f1)
            recall_scores.append(recall)
            precision_scores.append(precision)
            times.append(time0)

        target = []


        name = classifier
        if  name == self.xgboost_classifier:
            name = XGBClassifier.__name__

        
        target = self._append_items(target=target, name=name, acc=np.array(scores),
                            f1_score=np.array(f1_scores), recall=np.array(recall_scores),
                            precision=np.array(precision_scores), time=np.array(times), no_iters=self.no_iters)
        self.display_result(target)

    def _create_filename(self, filename):
        index = filename.find("return_")
        if index == -1:
            return None
        else:
            return filename[index + len("return_"):]



In [79]:
no_iterarion =150

In [80]:
def data():
    print(5)
data.__name__

'data'

In [81]:
dataset  = Dataset()
X, y = dataset.return_dataset()

model = Model(no_iters=no_iterarion, selection=dataset.return_intersection_chi_inf(), x=X, y=y,
              filename=dataset.return_intersection_chi_inf.__name__)
model_intersection_chi_fisher_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_fisher_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_fisher_inf.__name__)
model_intersection_chi_fisher = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_fisher(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_fisher.__name__)
model_intersection_chi_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_inf.__name__)
model_intersection_fisher_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_fisher_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_fisher_inf.__name__)
model_intersection_chi_cor = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_cor(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_cor.__name__)
model_intersection_fisher_cor = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_fisher_cor(),
                                            x=X, y=y, filename=dataset.return_intersection_fisher_cor.__name__)
model_intersection_inf_cor = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_inf_cor(),
                                            x=X, y=y, filename=dataset.return_intersection_inf_cor.__name__)

In [82]:
classifiers =   [
                model.xgboost_classifier,
                model.adaboost_classifier,
                model.gradient_boost_classifier,
                model.random_forest_classifier,
                model.svm_classifier
                ]

chi_inf:

In [83]:
# model.fit_algorithm(classifier=classifiers[0])



In [84]:
# model.fit_algorithm(classifier=classifiers[1])


In [85]:
# model.fit_algorithm(classifier=classifiers[2])


In [86]:
# model.fit_algorithm(classifier=classifiers[3])


In [87]:
# model.fit_algorithm(classifier=classifiers[4])


chi_fisher_inf:

In [88]:
# model_intersection_chi_fisher_inf.fit_algorithm(classifier=classifiers[0])

chi fisher: 

In [89]:
model_intersection_chi_fisher.fit_algorithm(classifier=classifiers[0])

algorithm name: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.005, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
number of iterations: 150
average accuracy: 85.8906
max accuracy: 87.0533
std accuracy: 0.5472
average f1 score: 0.8528
max f1 score: 0.8637
std f1 score: 0.0050
average recall: 0.8243
max recall: 0.8298
std recall: 0.0029
average preci

In [90]:
model_intersection_chi_fisher.fit_algorithm(classifier=classifiers[1])

algorithm name: AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.05, n_estimators=100, random_state=42)
number of iterations: 150
average accuracy: 85.8016
max accuracy: 87.1119
std accuracy: 0.5286
average f1 score: 0.8516
max f1 score: 0.8630
std f1 score: 0.0048
average recall: 0.8216
max recall: 0.8274
std recall: 0.0026
average precision: 0.8839
max precision: 0.9118
std precision: 0.0102
average time: 1.1029





In [91]:
model_intersection_chi_fisher.fit_algorithm(classifier=classifiers[2])

algorithm name: GradientBoostingClassifier()
number of iterations: 150
average accuracy: 85.6302
max accuracy: 86.8776
std accuracy: 0.5872
average f1 score: 0.8503
max f1 score: 0.8622
std f1 score: 0.0054
average recall: 0.8232
max recall: 0.8310
std recall: 0.0033
average precision: 0.8793
max precision: 0.9039
std precision: 0.0105
average time: 1.1244





In [92]:
model_intersection_chi_fisher.fit_algorithm(classifier=classifiers[3])

algorithm name: RandomForestClassifier()
number of iterations: 150
average accuracy: 85.1814
max accuracy: 87.1705
std accuracy: 0.6034
average f1 score: 0.8481
max f1 score: 0.8664
std f1 score: 0.0054
average recall: 0.8347
max recall: 0.8440
std recall: 0.0040
average precision: 0.8621
max precision: 0.8953
std precision: 0.0104
average time: 1.1198





fisher_inf:

In [93]:
model_intersection_fisher_inf.fit_algorithm(classifier=classifiers[0])

algorithm name: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.005, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
number of iterations: 150
average accuracy: 86.4917
max accuracy: 87.8735
std accuracy: 0.6180
average f1 score: 0.8631
max f1 score: 0.8761
std f1 score: 0.0057
average recall: 0.8590
max recall: 0.8676
std recall: 0.0073
average preci

In [94]:
model_intersection_fisher_inf.fit_algorithm(classifier=classifiers[1])

algorithm name: AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.05, n_estimators=100, random_state=42)
number of iterations: 150
average accuracy: 86.0683
max accuracy: 87.3462
std accuracy: 0.4922
average f1 score: 0.8560
max f1 score: 0.8668
std f1 score: 0.0045
average recall: 0.8351
max recall: 0.8463
std recall: 0.0046
average precision: 0.8780
max precision: 0.9059
std precision: 0.0095
average time: 1.2698





In [95]:
model_intersection_fisher_inf.fit_algorithm(classifier=classifiers[2])

algorithm name: GradientBoostingClassifier()
number of iterations: 150
average accuracy: 86.5593
max accuracy: 87.9320
std accuracy: 0.5941
average f1 score: 0.8625
max f1 score: 0.8758
std f1 score: 0.0055
average recall: 0.8504
max recall: 0.8605
std recall: 0.0042
average precision: 0.8750
max precision: 0.8962
std precision: 0.0100
average time: 1.2213





In [96]:
model_intersection_fisher_inf.fit_algorithm(classifier=classifiers[3])

algorithm name: RandomForestClassifier()
number of iterations: 150
average accuracy: 85.7508
max accuracy: 87.4634
std accuracy: 0.6307
average f1 score: 0.8566
max f1 score: 0.8719
std f1 score: 0.0056
average recall: 0.8588
max recall: 0.8723
std recall: 0.0047
average precision: 0.8546
max precision: 0.8835
std precision: 0.0108
average time: 0.9990





chi_cor:

In [97]:
model_intersection_chi_cor.fit_algorithm(classifier=classifiers[0])

algorithm name: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.005, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
number of iterations: 150
average accuracy: 85.8856
max accuracy: 87.4048
std accuracy: 0.4942
average f1 score: 0.8530
max f1 score: 0.8669
std f1 score: 0.0044
average recall: 0.8261
max recall: 0.8357
std recall: 0.0027
average preci

In [98]:
model_intersection_chi_cor.fit_algorithm(classifier=classifiers[1])

algorithm name: AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.05, n_estimators=100, random_state=42)
number of iterations: 150
average accuracy: 85.9113
max accuracy: 87.3462
std accuracy: 0.5511
average f1 score: 0.8531
max f1 score: 0.8658
std f1 score: 0.0050
average recall: 0.8253
max recall: 0.8286
std recall: 0.0030
average precision: 0.8829
max precision: 0.9123
std precision: 0.0106
average time: 1.3574





In [99]:
model_intersection_chi_cor.fit_algorithm(classifier=classifiers[2])

algorithm name: GradientBoostingClassifier()
number of iterations: 150
average accuracy: 85.9453
max accuracy: 87.1705
std accuracy: 0.4989
average f1 score: 0.8535
max f1 score: 0.8649
std f1 score: 0.0045
average recall: 0.8258
max recall: 0.8322
std recall: 0.0021
average precision: 0.8831
max precision: 0.9056
std precision: 0.0093
average time: 1.2326





In [100]:
model_intersection_chi_cor.fit_algorithm(classifier=classifiers[3])

algorithm name: RandomForestClassifier()
number of iterations: 150
average accuracy: 84.1051
max accuracy: 85.7645
std accuracy: 0.6241
average f1 score: 0.8390
max f1 score: 0.8535
std f1 score: 0.0055
average recall: 0.8352
max recall: 0.8475
std recall: 0.0053
average precision: 0.8428
max precision: 0.8708
std precision: 0.0106
average time: 1.3307





fisher_cor:

In [101]:
model_intersection_fisher_cor.fit_algorithm(classifier=classifiers[0])

algorithm name: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.005, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
number of iterations: 150
average accuracy: 86.5495
max accuracy: 87.6977
std accuracy: 0.5793
average f1 score: 0.8644
max f1 score: 0.8746
std f1 score: 0.0051
average recall: 0.8647
max recall: 0.8676
std recall: 0.0028
average preci

In [102]:
model_intersection_fisher_cor.fit_algorithm(classifier=classifiers[1])

algorithm name: AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.05, n_estimators=100, random_state=42)
number of iterations: 150
average accuracy: 86.4194
max accuracy: 87.6977
std accuracy: 0.5569
average f1 score: 0.8608
max f1 score: 0.8730
std f1 score: 0.0051
average recall: 0.8472
max recall: 0.8582
std recall: 0.0045
average precision: 0.8750
max precision: 0.8972
std precision: 0.0100
average time: 0.9191





In [103]:
model_intersection_fisher_cor.fit_algorithm(classifier=classifiers[2])

algorithm name: GradientBoostingClassifier()
number of iterations: 150
average accuracy: 86.6921
max accuracy: 87.9906
std accuracy: 0.6107
average f1 score: 0.8649
max f1 score: 0.8770
std f1 score: 0.0053
average recall: 0.8589
max recall: 0.8652
std recall: 0.0031
average precision: 0.8709
max precision: 0.8964
std precision: 0.0112
average time: 0.7112





In [104]:
model_intersection_fisher_cor.fit_algorithm(classifier=classifiers[3])

algorithm name: RandomForestClassifier()
number of iterations: 150
average accuracy: 85.1318
max accuracy: 86.9947
std accuracy: 0.6303
average f1 score: 0.8479
max f1 score: 0.8642
std f1 score: 0.0060
average recall: 0.8360
max recall: 0.8534
std recall: 0.0066
average precision: 0.8602
max precision: 0.8959
std precision: 0.0102
average time: 0.5503





inf_cor:

In [105]:
model_intersection_inf_cor.fit_algorithm(classifier=classifiers[0])

algorithm name: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.005, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
number of iterations: 150
average accuracy: 86.5507
max accuracy: 87.9320
std accuracy: 0.5427
average f1 score: 0.8645
max f1 score: 0.8768
std f1 score: 0.0048
average recall: 0.8654
max recall: 0.8676
std recall: 0.0017
average preci

In [106]:
model_intersection_inf_cor.fit_algorithm(classifier=classifiers[1])

algorithm name: AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.05, n_estimators=100, random_state=42)
number of iterations: 150
average accuracy: 86.4663
max accuracy: 88.0492
std accuracy: 0.6143
average f1 score: 0.8616
max f1 score: 0.8765
std f1 score: 0.0056
average recall: 0.8501
max recall: 0.8617
std recall: 0.0059
average precision: 0.8736
max precision: 0.9027
std precision: 0.0113
average time: 0.4131





In [107]:
model_intersection_inf_cor.fit_algorithm(classifier=classifiers[2])

algorithm name: GradientBoostingClassifier()
number of iterations: 150
average accuracy: 86.6085
max accuracy: 87.8735
std accuracy: 0.5598
average f1 score: 0.8645
max f1 score: 0.8757
std f1 score: 0.0049
average recall: 0.8615
max recall: 0.8676
std recall: 0.0031
average precision: 0.8675
max precision: 0.8918
std precision: 0.0100
average time: 0.3179





In [108]:
model_intersection_inf_cor.fit_algorithm(classifier=classifiers[3])

algorithm name: RandomForestClassifier()
number of iterations: 150
average accuracy: 85.2306
max accuracy: 86.9361
std accuracy: 0.6461
average f1 score: 0.8492
max f1 score: 0.8654
std f1 score: 0.0061
average recall: 0.8391
max recall: 0.8570
std recall: 0.0067
average precision: 0.8597
max precision: 0.8866
std precision: 0.0107
average time: 0.3730



