In [54]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import time
import sklearn
from imblearn.under_sampling import RandomUnderSampler


In [55]:
class Dataset():
    """Load the modelling table plus the feature subsets kept by each selection method.

    All files are read once in ``__init__`` from the current working directory.
    The ``return_*`` accessors expose the full data, the per-method feature-name
    Series, and the pairwise / three-way intersections of those feature sets.
    """

    # Column of ``dataset_2.csv`` used as the prediction target.
    TARGET_COLUMN = 'K2Q31A'
    # Column of the selection sheets whose value marks a feature as kept.
    APPROVAL_COLUMN = 'Dr Sheikhy'
    APPROVED_FLAG = 'Y'
    # ``cor.csv`` is addressed by position rather than by header:
    # column 0 holds the feature name and column 14 holds the keep flag.
    COR_NAME_POSITION = 0
    COR_FLAG_POSITION = 14

    def __init__(self):
        """Read the dataset and the four feature-selection sheets from disk."""

        # whole dataset
        df = pd.read_csv('dataset_2.csv')
        self.y = df[self.TARGET_COLUMN]
        self.X = df.drop(columns=self.TARGET_COLUMN)

        # chi square (note: this sheet spells the header in lower case)
        self.chi = self._approved_features('chi.csv', 'feature name')

        # fisher's score
        self.fisher = self._approved_features('fisher.csv', 'Feature Name')

        # information gain
        self.inf = self._approved_features('inf-gain.csv', 'Feature Name')

        # correlation: blank out rows whose flag is not 'Y', drop them, and
        # keep the first of the two selected columns (the feature name).
        cor = pd.read_csv('cor.csv')
        flagged = cor.iloc[:, [self.COR_NAME_POSITION, self.COR_FLAG_POSITION]].where(
            cor.iloc[:, self.COR_FLAG_POSITION] == self.APPROVED_FLAG
        )
        self.cor = flagged.dropna().iloc[:, 0]

    def _approved_features(self, path: str, name_column: str) -> pd.Series:
        """Return ``name_column`` of ``path`` for the rows flagged as approved."""
        sheet = pd.read_csv(path)
        return sheet.loc[sheet[self.APPROVAL_COLUMN] == self.APPROVED_FLAG, name_column]

    def _intersect(self, *feature_sets) -> list:
        """Return the features common to every given Series, as a sorted list.

        Sorting (by string form, so mixed value types cannot break the sort)
        makes the result independent of set iteration order, which for strings
        can change between interpreter runs.
        """
        common = set(feature_sets[0].tolist())
        for features in feature_sets[1:]:
            common &= set(features.tolist())
        return sorted(common, key=str)

    def return_dataset(self) -> tuple:
        """Return ``(X, y)``: the feature table and the target column."""
        return self.X, self.y

    def return_chi(self) -> pd.Series:
        """Return the feature names kept by the chi-square sheet."""
        return self.chi

    def return_fisher(self) -> pd.Series:
        """Return the feature names kept by the Fisher-score sheet."""
        return self.fisher

    def return_inf(self) -> pd.Series:
        """Return the feature names kept by the information-gain sheet."""
        return self.inf

    def return_cor(self) -> pd.Series:
        """Return the feature names kept by the correlation sheet."""
        return self.cor

    # intersections of 2 sets
    def return_intersection_chi_fisher(self) -> list:
        """Features kept by both chi-square and Fisher score."""
        return self._intersect(self.chi, self.fisher)

    def return_intersection_chi_inf(self) -> list:
        """Features kept by both chi-square and information gain."""
        return self._intersect(self.chi, self.inf)

    def return_intersection_chi_cor(self) -> list:
        """Features kept by both chi-square and correlation."""
        return self._intersect(self.chi, self.cor)

    def return_intersection_fisher_inf(self) -> list:
        """Features kept by both Fisher score and information gain."""
        return self._intersect(self.fisher, self.inf)

    def return_intersection_fisher_cor(self) -> list:
        """Features kept by both Fisher score and correlation."""
        return self._intersect(self.fisher, self.cor)

    def return_intersection_inf_cor(self) -> list:
        """Features kept by both information gain and correlation."""
        return self._intersect(self.inf, self.cor)

    # intersections of 3 sets
    def return_intersection_chi_fisher_inf(self) -> list:
        """Features kept by chi-square, Fisher score and information gain."""
        return self._intersect(self.chi, self.fisher, self.inf)

    def return_intersection_chi_fisher_cor(self) -> list:
        """Features kept by chi-square, Fisher score and correlation."""
        return self._intersect(self.chi, self.fisher, self.cor)

    def return_intersection_chi_inf_cor(self) -> list:
        """Features kept by chi-square, information gain and correlation."""
        return self._intersect(self.chi, self.inf, self.cor)

    def return_intersection_fisher_inf_cor(self) -> list:
        """Features kept by Fisher score, information gain and correlation."""
        return self._intersect(self.fisher, self.inf, self.cor)

    
    

In [56]:
class Model():
    """Train a fixed set of classifiers on a feature subset and record averaged metrics.

    Each ``*_result`` list is filled by the matching ``fit_*`` method with
    ``[name, acc, f1_score, recall, precision, time]``.
    """

    def __init__(self, no_iters: int, selection: list, x: pd.DataFrame, y: pd.Series):
        """Store the selected feature columns and build the classifiers.

        Args:
            no_iters: number of resample/split/train repetitions averaged per fit.
            selection: feature names to keep; columns of ``x`` not listed are dropped.
            x: feature table.
            y: target labels, row-aligned with ``x``.
        """

        self.no_iters = no_iters

        self.x = x.loc[:, x.columns.isin(selection)]
        self.y = y

        # [name, acc, f1_score, recall, precision, time]
        self.xgboost_result = []
        self.adaboost_result = []
        self.gradient_boost_result = []
        self.random_forest_result = []
        self.svm_result = []

        # classifiers
        self.xgboost_classifier = XGBClassifier(max_depth=5,
        learning_rate=0.01,
        n_estimators=100,
        objective='binary:logistic',
        random_state=42)

        self.adaboost_classifier = AdaBoostClassifier()

        self.svm_classifier = SVC()

        self.random_forest_classifier = RandomForestClassifier()

        self.gradient_boost_classifier = GradientBoostingClassifier()

        # undersampling
        # NOTE: no random_state is given, so the resampled rows can differ
        # from one call to the next.
        self.undersample = RandomUnderSampler(sampling_strategy='majority')

    def _append_items(self, target: list, name, acc, f1_score, recall, precision, time):
        """Append one result row, in the documented order, to ``target``."""
        target.extend([name, acc, f1_score, recall, precision, time])

    def _train(self, classifier, random_state=0):
        """Run one undersample / 80-20 split / fit / predict / score cycle.

        Args:
            classifier: estimator exposing ``fit`` and ``predict``.
            random_state: seed forwarded to ``train_test_split``.

        Returns:
            ``(accuracy, f1, recall, precision, seconds)``. Accuracy is scaled
            to a percentage; the other scores are left as returned by
            scikit-learn. ``seconds`` covers fitting plus prediction.
        """

        X, y = self.undersample.fit_resample(self.x, self.y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

        # Recode labels as 2 - y, i.e. 1 -> 1 and 2 -> 0.
        # Presumably the raw target is coded 1/2 - TODO confirm against the data dictionary.
        y_train = 2 - y_train
        y_test = 2 - y_test

        # perf_counter is a monotonic clock intended for measuring durations.
        start_time = time.perf_counter()

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)

        elapsed = time.perf_counter() - start_time

        accuracy = accuracy_score(y_test, y_pred) * 100
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)

        return accuracy, f1, recall, precision, elapsed

    def _result_list_for(self, classifier) -> list:
        """Return the result list that belongs to one of this model's classifiers.

        Raises:
            ValueError: if ``classifier`` is not one of the model's own estimators.
        """
        pairs = (
            (self.xgboost_classifier, self.xgboost_result),
            (self.adaboost_classifier, self.adaboost_result),
            (self.gradient_boost_classifier, self.gradient_boost_result),
            (self.random_forest_classifier, self.random_forest_result),
            (self.svm_classifier, self.svm_result),
        )
        for known, result in pairs:
            if known is classifier:
                return result
        raise ValueError(
            "classifier is not one of this model's estimators; pass target= explicitly"
        )

    def fit_algorithm(self, classifier, target=None):
        """Train ``classifier`` ``no_iters`` times and store the mean metrics.

        Args:
            classifier: estimator to train.
            target: list that receives the result row. When omitted, the
                result list paired with ``classifier`` on this model is used.

        Raises:
            ValueError: if ``no_iters`` is below 1, or if ``target`` is omitted
                and ``classifier`` is not one of the model's own estimators.
        """
        if self.no_iters < 1:
            raise ValueError("no_iters must be at least 1")
        if target is None:
            target = self._result_list_for(classifier)

        # Iteration i splits with seed i, so repetitions see different splits
        # while a single-iteration run still uses seed 0.
        runs = [self._train(classifier, random_state=seed) for seed in range(self.no_iters)]
        accuracy, f1, recall, precision, elapsed = (
            float(np.mean(column)) for column in zip(*runs)
        )

        # Replace any earlier row so re-running a cell keeps the six-item layout.
        target.clear()
        self._append_items(target=target, name=type(classifier).__name__, acc=accuracy,
                           f1_score=f1, recall=recall, precision=precision, time=elapsed)

    def fit_xgboost(self):
        """Train the XGBoost classifier and fill ``xgboost_result``."""
        self.fit_algorithm(self.xgboost_classifier)

    def fit_adaboost(self):
        """Train the AdaBoost classifier and fill ``adaboost_result``."""
        self.fit_algorithm(self.adaboost_classifier)

    def fit_gradient_boost(self):
        """Train the gradient-boosting classifier and fill ``gradient_boost_result``."""
        self.fit_algorithm(self.gradient_boost_classifier)

    def fit_random_forest(self):
        """Train the random-forest classifier and fill ``random_forest_result``."""
        self.fit_algorithm(self.random_forest_classifier)

    def fit_svm(self):
        """Train the SVM classifier and fill ``svm_result``."""
        self.fit_algorithm(self.svm_classifier)

In [57]:
# Load the data once, then build a single-iteration model restricted to the
# features kept by both the chi-square and the information-gain selections.
dataset = Dataset()
X, y = dataset.return_dataset()

model = Model(
    no_iters=1,
    selection=dataset.return_intersection_chi_inf(),
    x=X,
    y=y,
)

In [58]:
# Train and score the XGBoost classifier through Model.fit_algorithm,
# then display the stored result row.
model.fit_algorithm(model.xgboost_classifier)
model.xgboost_result

# [name, acc, f1_score, recall, precision, time]


['XGBClassifier',
 86.5260691271236,
 0.8614457831325303,
 0.8451536643026005,
 0.8783783783783784,
 0.7129411697387695]