In [346]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import time
import sklearn
from imblearn.under_sampling import RandomUnderSampler


In [347]:
class Dataset():
    def __init__(self):
        
        # whole dataset
        df = pd.read_csv('dataset_2.csv')
        self.y = df['K2Q31A']
        self.X = df.drop(columns='K2Q31A')

        # chi square
        self.chi = pd.read_csv('chi.csv')
        self.chi = self.chi.loc[self.chi['Dr Sheikhy'] == 'Y']['feature name']
        
        # fisher's score
        self.fisher = pd.read_csv('fisher.csv')
        self.fisher = self.fisher.loc[self.fisher['Dr Sheikhy'] == 'Y']['Feature Name']

        # information gain
        self.inf = pd.read_csv('inf-gain.csv')
        self.inf = self.inf.loc[self.inf['Dr Sheikhy'] == 'Y']['Feature Name']

        # corelation
        self.cor = pd.read_csv('cor.csv')
        self.cor = self.cor.iloc[:,[0, 14]].where(self.cor.iloc[:,14] == 'Y').dropna().iloc[:, 0]

    def return_dataset(self) -> pd.DataFrame:
        return self.X, self.y

    def return_chi(self) -> pd.Series:
        return self.chi
    
    def return_fisher(self) -> pd.Series:
        return self.fisher
    
    def return_inf(self) -> pd.Series:
        return self.inf
    
    def return_cor(self) -> pd.Series:
        return self.cor
    
    # intersections of 2 sets
    def return_intersection_chi_fisher(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()))

    def return_intersection_chi_inf(self) -> list:
        return list(set(self.chi.tolist()) & set(self.inf.tolist()))

    def return_intersection_chi_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.cor.tolist()))
    
    def return_intersection_fisher_inf(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.inf.tolist()))

    def return_intersection_fisher_cor(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.cor.tolist()))

    def return_intersection_inf_cor(self) -> list:
        return list(set(self.inf.tolist()) & set(self.cor.tolist()))

    # intersections of 3 sets
    def return_intersection_chi_fisher_inf(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()) & set(self.inf.tolist()))
    
    def return_intersection_chi_fisher_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.fisher.tolist()) & set(self.cor.tolist()))
    
    def return_intersection_chi_inf_cor(self) -> list:
        return list(set(self.chi.tolist()) & set(self.inf.tolist()) & set(self.cor.tolist()))
    
    def return_intersection_fisher_inf_cor(self) -> list:
        return list(set(self.fisher.tolist()) & set(self.inf.tolist()) & set(self.cor.tolist()))

    
    

In [348]:
class Model():
    def __init__(self, no_iters: int, selection: list, x: pd.DataFrame, y: pd.DataFrame):

        self.no_iters = no_iters

        self.x = x.loc[:, x.columns.isin(selection)]
        self.y = y

        # [name, acc, f1_score, recall, precision, time]
        self.xgboost_result = []
        self.adaboost_result = []
        self.gradient_boost_result = []
        self.random_forest_result = []
        self.svm_result = []

        # classifiers
        self.xgboost_classifier = XGBClassifier(max_depth=5,
        learning_rate=0.01,
        n_estimators=100,
        objective='binary:logistic',
        random_state=42)

        self.adaboost_classifier = AdaBoostClassifier()

        self.svm_classifier = SVC()

        self.random_forest_classifier = RandomForestClassifier()

        self.gradient_boost_classifier = GradientBoostingClassifier()

        # undersampling
        self.undersample = RandomUnderSampler(sampling_strategy='majority')
    
    def _append_items(self, target: list, name, acc, f1_score, recall, precision, time, no_iters):
        target.append(name)
        target.append(acc)
        target.append(f1_score)
        target.append(recall)
        target.append(precision)
        target.append(time)
        target.append(no_iters)
        return target
        
    def _train(self, classifier, random_state=0):

        X, y = self.undersample.fit_resample(self.x, self.y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
        
        y_train = -(y_train - 2)
        y_test = -(y_test - 2)

        start_time = time.time()

        classifier.fit(X_train, y_train)
        
        y_pred = classifier.predict(X_test)
        
        end_time = time.time()
        
        time0 = end_time - start_time
        accuracy = accuracy_score(y_test, y_pred) * 100
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)



        return accuracy, f1, recall, precision, time0
    # [name, acc, f1_score, recall, precision, time]

    
    def display_result(self, result):
        print(f'algorithm name: {result[0]}\n'
            f'number of iterations: {str(self.no_iters)}\n'
            f'average accuracy: {(result[1]).mean():.4f}' + '\n'
            f'max accuracy: {(result[1]).max():.4f}' + '\n'
            f'std accuracy: {(result[1]).std():.4f}' + '\n'
            f'average f1 score: {(result[2]).mean():.4f}' + '\n'
            f'max f1 score: {(result[2]).max():.4f}' + '\n'
            f'std f1 score: {(result[2]).std():.4f}' + '\n'
            f'average recall: {(result[3]).mean():.4f}' + '\n'
            f'max recall: {(result[3]).max():.4f}' + '\n'
            f'std recall: {(result[3]).std():.4f}' + '\n'
            f'average time: {result[4].mean():.4f}')
    
    
    def fit_algorithm(self, classifier):
        
        scores = []
        times = []
        f1_scores = []
        recall_scores = []
        precision_scores = []

        for _ in range(self.no_iters):

            accuracy, f1, recall, precision, time0 = self._train(classifier)
            
            scores.append(accuracy)
            f1_scores.append(f1)
            recall_scores.append(recall)            
            precision_scores.append(precision)
            times.append(time0)
   
        target = []


        name = classifier
        if  name == self.xgboost_classifier:
            name = XGBClassifier.__name__

            
        target = self._append_items(target=target, name=name, acc=np.array(scores),
                            f1_score=np.array(f1_scores), recall=np.array(recall_scores),
                            precision=np.array(precision_scores), time=np.array(times), no_iters=self.no_iters)
        self.display_result(target)



In [349]:
no_iterarion = 5

In [350]:
dataset  = Dataset()
X, y = dataset.return_dataset()

model = Model(no_iters=no_iterarion, selection=dataset.return_intersection_chi_inf(), x=X, y=y)

In [351]:
classifiers =   [
                model.xgboost_classifier,
                model.adaboost_classifier,
                model.gradient_boost_classifier,
                model.random_forest_classifier,
                model.svm_classifier
                ]

In [352]:
model.fit_algorithm(classifier=classifiers[0])



algorithm name: XGBClassifier
number of iterations: 5
average accuracy: 86.4206
max accuracy: 86.5847
std accuracy: 0.1939
average f1 score: 0.8608
max f1 score: 0.8620
std f1 score: 0.0016
average recall: 0.8470
max recall: 0.8522
std recall: 0.0029
average time: 0.8750


In [353]:
model.fit_algorithm(classifier=classifiers[1])


algorithm name: AdaBoostClassifier()
number of iterations: 5
average accuracy: 85.5185
max accuracy: 86.2917
std accuracy: 0.3946
average f1 score: 0.8478
max f1 score: 0.8545
std f1 score: 0.0035
average recall: 0.8135
max recall: 0.8168
std recall: 0.0023
average time: 0.8851


In [354]:
model.fit_algorithm(classifier=RandomForestClassifier())


algorithm name: RandomForestClassifier()
number of iterations: 5
average accuracy: 85.2958
max accuracy: 85.7645
std accuracy: 0.6439
average f1 score: 0.8512
max f1 score: 0.8556
std f1 score: 0.0057
average recall: 0.8485
max recall: 0.8534
std recall: 0.0049
average time: 0.8541


In [355]:
model.fit_algorithm(classifier=classifiers[3])


algorithm name: RandomForestClassifier()
number of iterations: 5
average accuracy: 85.2255
max accuracy: 85.6473
std accuracy: 0.4865
average f1 score: 0.8506
max f1 score: 0.8546
std f1 score: 0.0042
average recall: 0.8485
max recall: 0.8511
std recall: 0.0024
average time: 0.8528


In [356]:
model.fit_algorithm(classifier=classifiers[4])


algorithm name: SVC()
number of iterations: 5
average accuracy: 85.0029
max accuracy: 85.7059
std accuracy: 0.6981
average f1 score: 0.8465
max f1 score: 0.8527
std f1 score: 0.0059
average recall: 0.8338
max recall: 0.8357
std recall: 0.0014
average time: 0.8596
