In [60]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import time
from imblearn.under_sampling import RandomUnderSampler


In [61]:
class Dataset():
    def __init__(self):

        # whole dataset
        df = pd.read_csv('dataset_2.csv')
        self.y = df['K2Q31A']
        self.X = df.drop(columns='K2Q31A')

        # chi square
        self.chi = pd.read_csv('chi.csv')
        self.chi = self.chi.loc[self.chi['Dr Sheikhy'] == 'Y']['feature name']

        # fisher's score
        self.fisher = pd.read_csv('fisher.csv')
        self.fisher = self.fisher.loc[self.fisher['Dr Sheikhy'] == 'Y']['Feature Name']

        # information gain
        self.inf = pd.read_csv('inf-gain.csv')
        self.inf = self.inf.loc[self.inf['Dr Sheikhy'] == 'Y']['Feature Name']

        # corelation
        self.cor = pd.read_csv('cor.csv')
        self.cor = self.cor.iloc[:,[0, 14]].where(self.cor.iloc[:,14] == 'Y').dropna().iloc[:, 0]

    def return_dataset(self) -> pd.DataFrame:
        return self.X, self.y

    def return_chi(self) -> pd.Series:
        return self.chi

    def return_fisher(self) -> pd.Series:
        return self.fisher

    def return_inf(self) -> pd.Series:
        return self.inf

    def return_cor(self) -> pd.Series:
        return self.cor

    # intersections of 2 sets
    def return_intersection_chi_fisher(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()))

    def return_intersection_chi_inf(self) -> list:
        return list(set(self.chi.tolist()) & set(self.inf.tolist()))

    def return_intersection_chi_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.cor.tolist()))

    def return_intersection_fisher_inf(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.inf.tolist()))

    def return_intersection_fisher_cor(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.cor.tolist()))

    def return_intersection_inf_cor(self) -> list:
        return list(set(self.inf.tolist()) & set(self.cor.tolist()))

    # intersections of 3 sets
    def return_intersection_chi_fisher_inf(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()) & set(self.inf.tolist()))

    def return_intersection_chi_fisher_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()) & set(self.cor.tolist()))

    def return_intersection_chi_inf_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.inf.tolist()) & set(self.cor.tolist()))

    def return_intersection_fisher_inf_cor(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.inf.tolist()) & set(self.cor.tolist()))




In [62]:
class Model():
    def __init__(self, no_iters: int, selection: list, x: pd.DataFrame, y: pd.DataFrame, filename: str):

        self.no_iters = no_iters

        self.x = x.loc[:, x.columns.isin(selection)]
        self.y = y
        
        self.filename = str(self._create_filename(filename) + '.text').replace("'", "")

        # [name, acc, f1_score, recall, precision, time]
        # self.xgboost_result = []
        # self.adaboost_result = []
        # self.gradient_boost_result = []
        # self.random_forest_result = []
        # self.svm_result = []

        # classifiers
        self.xgboost_classifier = XGBClassifier(max_depth=5,
        learning_rate=0.005,
        n_estimators=100,
        objective='binary:logistic',
        random_state=42)

        self.adaboost_classifier = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=100,
        learning_rate=0.05,
        algorithm='SAMME.R',
        random_state=42,
        base_estimator='deprecated')

        self.svm_classifier = SVC()

        self.random_forest_classifier = RandomForestClassifier()

        self.gradient_boost_classifier = GradientBoostingClassifier()

        # undersampling
        self.undersample = RandomUnderSampler(sampling_strategy='majority')

    def _append_items(self, target: list, name, acc, f1_score, recall, precision, time, no_iters):
        target.append(name)
        target.append(acc)
        target.append(f1_score)
        target.append(recall)
        target.append(precision)
        target.append(time)
        target.append(no_iters)
        return target

    def _train(self, classifier, random_state=0):

        X, y = self.undersample.fit_resample(self.x, self.y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

        y_train = -(y_train - 2)
        y_test = -(y_test - 2)

        start_time = time.time()

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)

        end_time = time.time()

        time0 = end_time - start_time
        accuracy = accuracy_score(y_test, y_pred) * 100
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)



        return accuracy, f1, recall, precision, time0
    # [name, acc, f1_score, recall, precision, time]


    def display_result(self, result, print_result=True):
        filename = self.filename
        with open(filename, 'a') as file:
            result_string = (f'algorithm name: {result[0]}\n'
                f'number of iterations: {str(self.no_iters)}\n'
                f'average accuracy: {(result[1]).mean():.4f}' + '\n'
                f'max accuracy: {(result[1]).max():.4f}' + '\n'
                f'std accuracy: {(result[1]).std():.4f}' + '\n'
                f'average f1 score: {(result[2]).mean():.4f}' + '\n'
                f'max f1 score: {(result[2]).max():.4f}' + '\n'
                f'std f1 score: {(result[2]).std():.4f}' + '\n'
                f'average recall: {(result[3]).mean():.4f}' + '\n'
                f'max recall: {(result[3]).max():.4f}' + '\n'
                f'std recall: {(result[3]).std():.4f}' + '\n'
                f'average precision: {(result[4]).mean():.4f}' + '\n'
                f'max precision: {(result[4]).max():.4f}' + '\n'
                f'std precision: {(result[4]).std():.4f}' + '\n'
                f'average time: {result[5].mean():.4f}' + '\n\n\n')
            if print_result:
                print(result_string)
            file.write(result_string)
        file.close()


    def fit_algorithm(self, classifier):

        scores = []
        times = []
        f1_scores = []
        recall_scores = []
        precision_scores = []

        for _ in range(self.no_iters):

            accuracy, f1, recall, precision, time0 = self._train(classifier)

            scores.append(accuracy)
            f1_scores.append(f1)
            recall_scores.append(recall)
            precision_scores.append(precision)
            times.append(time0)

        target = []


        name = classifier
        if  name == self.xgboost_classifier:
            name = XGBClassifier.__name__

        
        target = self._append_items(target=target, name=name, acc=np.array(scores),
                            f1_score=np.array(f1_scores), recall=np.array(recall_scores),
                            precision=np.array(precision_scores), time=np.array(times), no_iters=self.no_iters)
        self.display_result(target)

    def _create_filename(self, filename):
        index = filename.find("return_")
        if index == -1:
            return None
        else:
            return filename[index + len("return_"):]



In [63]:
no_iterarion =150

In [64]:
def data():
    print(5)
data.__name__

'data'

In [65]:
dataset  = Dataset()
X, y = dataset.return_dataset()

model = Model(no_iters=no_iterarion, selection=dataset.return_intersection_chi_inf(), x=X, y=y,
              filename=dataset.return_intersection_chi_inf.__name__)
model_intersection_chi_fisher_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_fisher_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_fisher_inf.__name__)
model_intersection_chi_fisher = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_fisher(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_fisher.__name__)
model_intersection_chi_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_inf.__name__)
model_intersection_fisher_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_fisher_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_fisher_inf.__name__)
model_intersection_chi_fisher_inf = Model(no_iters=no_iterarion,
                                           selection=dataset.return_intersection_chi_fisher_inf(),
                                            x=X, y=y, filename=dataset.return_intersection_chi_fisher_inf.__name__)

In [66]:
classifiers =   [
                model.xgboost_classifier,
                model.adaboost_classifier,
                model.gradient_boost_classifier,
                model.random_forest_classifier,
                model.svm_classifier
                ]

In [67]:
# model.fit_algorithm(classifier=classifiers[0])



In [68]:
# model.fit_algorithm(classifier=classifiers[1])


In [69]:
# model.fit_algorithm(classifier=classifiers[2])


In [70]:
# model.fit_algorithm(classifier=classifiers[3])


In [71]:
# model.fit_algorithm(classifier=classifiers[4])


In [72]:
model_intersection_chi_fisher_inf.fit_algorithm(classifier=classifiers[0])

algorithm name: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.005, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
number of iterations: 150
average accuracy: 85.8629
max accuracy: 87.5806
std accuracy: 0.5646
average f1 score: 0.8529
max f1 score: 0.8686
std f1 score: 0.0051
average recall: 0.8267
max recall: 0.8310
std recall: 0.0020
average preci

In [73]:
model_intersection_chi_fisher_inf.fit_algorithm(classifier=classifiers[1])

algorithm name: AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.05, n_estimators=100, random_state=42)
number of iterations: 150
average accuracy: 85.8715
max accuracy: 86.9947
std accuracy: 0.5231
average f1 score: 0.8522
max f1 score: 0.8619
std f1 score: 0.0047
average recall: 0.8220
max recall: 0.8274
std recall: 0.0026
average precision: 0.8849
max precision: 0.9094
std precision: 0.0100
average time: 0.8881





In [74]:
model_intersection_chi_fisher_inf.fit_algorithm(classifier=classifiers[2])

algorithm name: GradientBoostingClassifier()
number of iterations: 150
average accuracy: 85.6329
max accuracy: 86.8776
std accuracy: 0.5258
average f1 score: 0.8505
max f1 score: 0.8617
std f1 score: 0.0047
average recall: 0.8243
max recall: 0.8310
std recall: 0.0027
average precision: 0.8784
max precision: 0.9018
std precision: 0.0098
average time: 0.8880





In [75]:
model_intersection_chi_fisher_inf.fit_algorithm(classifier=classifiers[3])

algorithm name: RandomForestClassifier()
number of iterations: 150
average accuracy: 84.3753
max accuracy: 86.1746
std accuracy: 0.6516
average f1 score: 0.8407
max f1 score: 0.8566
std f1 score: 0.0059
average recall: 0.8320
max recall: 0.8440
std recall: 0.0046
average precision: 0.8498
max precision: 0.8812
std precision: 0.0109
average time: 0.9075



