In [1]:
# Good Code for Hierarchical Classification
import numpy as np
import shutil
import os
import glob
import pandas as pd
import matplotlib.pylab as plt
import pickle
#plt.switch_backend('agg')
% matplotlib inline

from sklearn.model_selection import StratifiedKFold
import FATS

plt.rc('text', usetex=True)
plt.rc('font',**{'family':'serif','serif':['Palatino']})
figSize  = (12, 8)
fontSize = 20



In [2]:
import itertools
from scipy import interp
from itertools import cycle, islice

# Some preprocessing utilities
from sklearn.utils import shuffle
from sklearn.manifold.t_sne import TSNE
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

# The different classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_curve, auc


In [3]:
def stars_label(data, label):
    '''Set variable names to specific class label'''
    stars = data[data.True_class_labels == label]
    return stars

# First Layer Hierarchical Level

In [4]:
def first_layer():
    '''
    We define first layer of the hierarchical tree. The first layer consists of Eclipsing Binaries, Rotational,
    and Pulsating 
    '''
    
    # First Layer
    eclipsing_binary_train       = pd.concat([contact_Bi_train, semi_det_Bi_train], axis=0)
    eclipsing_binary_train_class = np.full(len(eclipsing_binary_train), eclipsing_label, dtype=int)

    rotational_train       = rot_train
    rotational_train_class = np.full(len(rotational_train),rotational_label, dtype=int)

    pulsating_train       = pd.concat([RRab_train, RRc_train, RRd_train, blazhko_train, LPV_train, delta_scuti_train, ACEP_train, cep_ii_train] ,axis=0)
    pulsating_train_class = np.full(len(pulsating_train), pulsating_label, dtype=int)


    print("eclipsing_binary_train has {}".format(eclipsing_binary_train.shape))
    print("pulsating_train has {}".format(pulsating_train.shape))
    print("rotational_train has {}".format(rotational_train.shape))

    eclipsing_binary_test       = pd.concat([contact_Bi_test, semi_det_Bi_test], axis=0)
    eclipsing_binary_test_class = np.full(len(eclipsing_binary_test), eclipsing_label, dtype=int)

    rotational_test       = rot_test
    rotational_test_class = np.full(len(rotational_test), rotational_label, dtype=int)

    pulsating_test       = pd.concat([RRab_test, RRc_test, RRd_test, blazhko_test, LPV_test, delta_scuti_test, ACEP_test, cep_ii_test] ,axis=0)
    pulsating_test_class = np.full(len(pulsating_test), pulsating_label, dtype=int)


    print("eclipsing_binary_test has {}".format(eclipsing_binary_test.shape))
    print("pulsating_test has {}".format(pulsating_test.shape))
    print("rotational_test has {}".format(rotational_test.shape))
    
    first_layer_train       = pd.concat([eclipsing_binary_train, rotational_train, pulsating_train], axis=0)
    first_layer_train_class = np.concatenate((eclipsing_binary_train_class, rotational_train_class, pulsating_train_class), axis=0)
    training_data_FL        = pd.DataFrame(first_layer_train)
    training_data_FL['New_label'] = first_layer_train_class
#     print(training_data_FL.shape)

    first_layer_test       = pd.concat([eclipsing_binary_test, rotational_test, pulsating_test], axis=0)
    first_layer_test_class = np.concatenate((eclipsing_binary_test_class, rotational_test_class, pulsating_test_class), axis=0)
    testing_data_FL        = pd.DataFrame(first_layer_test)
    testing_data_FL['New_label'] = first_layer_test_class
    
    y_FL_training, y_FL_training_counts = np.unique(first_layer_train_class, return_counts=True)

    
    return training_data_FL, testing_data_FL, y_FL_training_counts


# Second Layer Hierarchical level for first Branch: Eclipsing Binaries (Ecl & EA)

In [5]:
def second_layer_EB():
    
    # Second Layer Eclipsing Binary    
    ecl_train = contact_Bi_train
    ecl_train_class = np.full(len(ecl_train), true_class_5, dtype=int)

    EA_train       = semi_det_Bi_train
    EA_train_class = np.full(len(EA_train),true_class_6, dtype=int)
 
    print("ecl train has {}".format(ecl_train.shape))
    print("EA_train has {}".format(EA_train.shape))

    ecl_test       = contact_Bi_test
    ecl_test_class = np.full(len(ecl_test), true_class_5, dtype=int)

    EA_test       = semi_det_Bi_test
    EA_test_class = np.full(len(EA_test), true_class_6, dtype=int)

    print("ecl_test has {}".format(ecl_test.shape))
    print("EA_test has {}".format(EA_test.shape))

    
    second_layer_EB_train       = pd.concat([ecl_train, EA_train], axis=0)
    second_layer_EB_train_class = np.concatenate((ecl_train_class,EA_train_class), axis=0)
    training_data_SL_EB         = pd.DataFrame(second_layer_EB_train)
    training_data_SL_EB['New_label'] = second_layer_EB_train_class
#     print(training_data_FL.shape)

    second_layer_EB_test       = pd.concat([ecl_test, EA_test], axis=0)
    second_layer_EB_test_class = np.concatenate((ecl_test_class, EA_test_class), axis=0)
    testing_data_SL_EB         = pd.DataFrame(second_layer_EB_test)
    testing_data_SL_EB['New_label'] = second_layer_EB_test_class
    
    y_SL_EB_training, y_SL_EB_training_counts = np.unique(second_layer_EB_train_class, return_counts=True)

    
    return training_data_SL_EB, testing_data_SL_EB, y_SL_EB_training_counts


# Second Layer Hierarchical level for 2nd Branch: RLCD
### RR Lyrae, LPV, Cepheid and $\delta$-Scuti

In [6]:
# Layer 2 RR Lyrae, LPV, Cepheid, Delta-Scuti
def second_layer_RLCD():
    
    # First Layer
    RR_Lyrae_train       = pd.concat([RRab_train,RRc_train,RRd_train,blazhko_train], axis=0)
    RR_Lyrae_train_class = np.full(len(RR_Lyrae_train), RR_Lyrae_label, dtype=int)

    LPV_train_class = np.full(len(LPV_train),LPV_label, dtype=int)

    cepheids_train       = pd.concat([ACEP_train,cep_ii_train] ,axis=0)
    cepheids_train_class = np.full(len(cepheids_train), cepheids_label, dtype=int)
    
    ds_train       = delta_scuti_train
    ds_train_class = np.full(len(ds_train), delta_scuti_label, dtype=int)


    print("RR Lyrae train has {}".format(RR_Lyrae_train.shape))
    print("LPV train has {}".format(LPV_train.shape))
    print("Cepheids train has {}".format(cepheids_train.shape))
    print("Delta Scuti train has {}".format(ds_train.shape))

    RR_Lyrae_test       = pd.concat([RRab_test,RRc_test,RRd_test,blazhko_test], axis=0)
    RR_Lyrae_test_class = np.full(len(RR_Lyrae_test), RR_Lyrae_label, dtype=int)

    LPV_test_class = np.full(len(LPV_test), LPV_label, dtype=int)

    cepheids_test       = pd.concat([ACEP_test, cep_ii_test] ,axis=0)
    cepheids_test_class = np.full(len(cepheids_test), cepheids_label, dtype=int)
    
    ds_test       = delta_scuti_test
    ds_test_class = np.full(len(ds_test), delta_scuti_label, dtype=int)


    print("RR_Lyrae_test has {}".format(RR_Lyrae_test.shape))
    print("LPV_test has {}".format(LPV_test.shape))
    print("cepheids_test has {}".format(cepheids_test.shape))
    print("Delta Scuti test has {}".format(ds_test.shape))
    
    second_layer_RLCD_train       = pd.concat([RR_Lyrae_train,LPV_train,cepheids_train,ds_train], axis=0)
    second_layer_RLCD_train_class = np.concatenate((RR_Lyrae_train_class,LPV_train_class,cepheids_train_class,ds_train_class), axis=0)
    training_data_SL_RLCD         = pd.DataFrame(second_layer_RLCD_train)
    training_data_SL_RLCD['New_label'] = second_layer_RLCD_train_class
#     print(training_data_FL.shape)

    second_layer_RLCD_test       = pd.concat([RR_Lyrae_test,LPV_test,cepheids_test,ds_test], axis=0)
    second_layer_RLCD_test_class = np.concatenate((RR_Lyrae_test_class,LPV_test_class,cepheids_test_class,ds_test_class), axis=0)
    testing_data_SL_RLCD         = pd.DataFrame(second_layer_RLCD_test)
    testing_data_SL_RLCD['New_label'] = second_layer_RLCD_test_class
    
    y_SL_RLCD_training, y_SL_RLCD_training_counts = np.unique(second_layer_RLCD_train_class, return_counts=True)

    print(y_SL_RLCD_training)
    print(y_SL_RLCD_training_counts)
    return training_data_SL_RLCD, testing_data_SL_RLCD, y_SL_RLCD_training_counts


# Third Layer Hierarchical level for first Branch: RRLyrae
### RRab, RRc, RRd, and Blazhko

In [7]:
# Layer 3 RR Lyrae classes
def third_layer_RRLyrae():
    
    # Third Layer
    RRab_train_class    = np.full(len(RRab_train), true_class_1, dtype=int)
    RRc_train_class     = np.full(len(RRc_train), true_class_2, dtype=int)
    RRd_train_class     = np.full(len(RRd_train), true_class_3, dtype=int)
    blazhko_train_class = np.full(len(blazhko_train), true_class_4, dtype=int)

    print("RRab train has {}".format(RRab_train.shape))
    print("RRc train has {}".format(RRc_train.shape))
    print("RRd train has {}".format(RRd_train.shape))
    print("Blazhko train has {}".format(blazhko_train.shape))
    
    RRab_test_class    = np.full(len(RRab_test), true_class_1, dtype=int)
    RRc_test_class     = np.full(len(RRc_test), true_class_2, dtype=int)
    RRd_test_class     = np.full(len(RRd_test), true_class_3, dtype=int)
    blazhko_test_class = np.full(len(blazhko_test), true_class_4, dtype=int)

    print("RRab test has {}".format(RRab_test.shape))
    print("RRc test has {}".format(RRc_test.shape))
    print("RRd test has {}".format(RRd_test.shape))
    print("Blazhko test has {}".format(blazhko_test.shape))

    
    third_layer_RRLyrae_train       = pd.concat([RRab_train,RRc_train,RRd_train,blazhko_train], axis=0)
    third_layer_RRLyrae_train_class = np.concatenate((RRab_train_class,RRc_train_class,RRd_train_class,blazhko_train_class), axis=0)
    training_data_TL_RRLyrae        = pd.DataFrame(third_layer_RRLyrae_train)
    training_data_TL_RRLyrae['New_label'] = third_layer_RRLyrae_train_class
#     print(training_data_FL.shape)

    third_layer_RRLyrae_test       = pd.concat([RRab_test,RRc_test,RRd_test,blazhko_test], axis=0)
    third_layer_RRLyrae_test_class = np.concatenate((RRab_test_class,RRc_test_class,RRd_test_class,blazhko_test_class), axis=0)
    testing_data_TL_RRLyrae         = pd.DataFrame(third_layer_RRLyrae_test)
    testing_data_TL_RRLyrae['New_label'] = third_layer_RRLyrae_test_class
    
    y_TL_RRLyrae_training, y_TL_RRLyrae_training_counts = np.unique(third_layer_RRLyrae_train_class, return_counts=True)

    
    return training_data_TL_RRLyrae, testing_data_TL_RRLyrae, y_TL_RRLyrae_training_counts


# Third Layer Hierarchical level for 2nd Branch: Cepheids
### ACEP and Cep-II

In [8]:
# Layer 3 RR Lyrae classes
def third_layer_Cepheids():
    
    # Third Layer
    ACEP_train_class   = np.full(len(ACEP_train), true_class_10, dtype=int)
    cep_ii_train_class = np.full(len(cep_ii_train), true_class_12, dtype=int)

    print("ACEP train has {}".format(ACEP_train.shape))
    print("Cep-II train has {}".format(cep_ii_train.shape))


    ACEP_test_class   = np.full(len(ACEP_test), true_class_10, dtype=int)
    cep_ii_test_class = np.full(len(cep_ii_test), true_class_12, dtype=int)

    print("ACEP test has {}".format(ACEP_test.shape))
    print("Cep-II test has {}".format(cep_ii_test.shape))
    
    third_layer_cep_train       = pd.concat([ACEP_train,cep_ii_train], axis=0)
    third_layer_cep_train_class = np.concatenate((ACEP_train_class,cep_ii_train_class), axis=0)
    training_data_TL_cep        = pd.DataFrame(third_layer_cep_train)
    training_data_TL_cep['New_label'] = third_layer_cep_train_class
#     print(training_data_FL.shape)

    third_layer_cep_test       = pd.concat([ACEP_test,cep_ii_test], axis=0)
    third_layer_cep_test_class = np.concatenate((ACEP_test_class,cep_ii_test_class), axis=0)
    testing_data_TL_cep        = pd.DataFrame(third_layer_cep_test)
    testing_data_TL_cep['New_label'] = third_layer_cep_test_class
    
    y_TL_cep_training, y_TL_cep_training_counts = np.unique(third_layer_cep_train_class, return_counts=True)

    
    return training_data_TL_cep, testing_data_TL_cep, y_TL_cep_training_counts


In [9]:
def normalisation(x_train,x_test):
    scaler                = StandardScaler().fit(x_train.iloc[:,0:nFeatures])
    X_train_normalisation = pd.DataFrame(scaler.transform(x_train.iloc[:,0:nFeatures]))
    y_train_label         = x_train.New_label
    filename_train        = x_train.File_Name

    X_test_normalisation = pd.DataFrame(scaler.transform(x_test.iloc[:,0:nFeatures]))
    y_test_label         = x_test.New_label
    filename_test        = x_test.File_Name
    
    # A check to see whether the mean of x_train and X_test are ~ 0 with std 1.0
#     print(X_train_normalisation.mean(axis=0))
#     print(X_train_normalisation.std(axis=0))
#     print(X_test_normalisation.mean(axis=0))
#     print(X_test_normalisation.std(axis=0))
    
    return X_train_normalisation, y_train_label, filename_train, X_test_normalisation,\
           y_test_label, filename_test


In [10]:
def gridsearch(X_train,y_train,classifer, param_grid, n_iter, cv, filename='./results'):
    grid  = RandomizedSearchCV(classifer, param_grid, n_iter = n_iter, cv = cv, scoring = "accuracy", n_jobs = -1,random_state=1)
    grid.fit(X_train,y_train)
    opt_parameters = grid.best_params_
    print(grid.best_params_)
    
    params_file = open(filename, 'w')
    params_file.write(str(grid.best_params_))
    params_file.close()
    return opt_parameters
    

In [11]:
def model_save(classifier_optimize, X_train, y_train, filename_model, save_model=False):
    fit_model      = classifier_optimize.fit(X_train, y_train)
    
    if save_model:
        pickle.dump(fit_model, open(filename_model, 'wb'))
        
    return fit_model

def model_fit(fit_model, filename_model, X_train, y_train, X_test, y_test, classifier_model='Random Forest Classifier',classes=["Type 1" , "Type 2"], filename ='./results/',load_model=False):
    if load_model:
        fit_model      = pickle.load(open(filename_model, 'rb'))
    
    else:
        fit_model = fit_model
        
    ypred          = fit_model.predict(X_test)
    probability    = fit_model.predict_proba(X_test)
    accuracy       = accuracy_score(y_test, ypred)
    MCC            = matthews_corrcoef(y_test, ypred)
    conf_mat       = confusion_matrix(y_test, ypred)
    
    misclassified     = np.where(y_test != ypred)[0]
 
    name_file = open(filename + ".txt", 'w')
    name_file.write('='*80+'\n')
    name_file.write('******* Testing Phase '+ str(classifier_model) +' for ' + str(classes) + ' *******\n')
    name_file.write('='*80+'\n')
    name_file.write("Accuracy: "                    + "%f" % float(accuracy) + '\n')
    name_file.write("Mathews Correlation Coef: "    + "%f" % float(MCC)      + '\n')
    name_file.write('='*80+'\n')
    name_file.write('='*80+'\n')
    name_file.write('Classification Report\n')
    name_file.write('='*80+'\n')
    name_file.write(classification_report(y_test, ypred, target_names = classes)+'\n')
    name_file.write('='*80+'\n')
    name_file.close()
        
    return ypred, accuracy, MCC, conf_mat

In [12]:
def plot_confusion_matrix(cm, classes_types,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """


    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    

    print(cm)
    plt.figure(figsize=(9,8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=16)
    cb=plt.colorbar(fraction=0.046, pad=0.04)
    cb.ax.tick_params(labelsize=16)
    tick_marks = np.arange(len(classes_types))
    plt.xticks(tick_marks, classes_types, rotation=45)
    plt.yticks(tick_marks, classes_types)
    plt.tick_params(axis='x', labelsize=16)
    plt.tick_params(axis='y', labelsize=16)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, "{:0.2f}".format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if (cm[i, j] < 0.01) or (cm[i,j] >= 0.75)  else "black",fontsize=16)

    
    plt.ylabel('True label',fontsize = 16)
    plt.xlabel('Predicted label', fontsize = 16)
    plt.tight_layout()

In [13]:
def plot(conf_mat, classes_types, classifier_model, plot_title, X_test, y_test, nClasses,cmap=plt.cm.Reds):

    plt.figure(figsize=(8,6))
    
    plot_confusion_matrix(conf_mat, classes_types, normalize=True, title='Confusion matrix for ' + str(classifier_model) )
    plt.savefig(plot_title +'_CM.pdf')
    plt.close()


# Random Forest Classifier

In [14]:
def analysis_rf(X_train, y_train, types,save_model=False):
    n_estimators      = np.arange(50,1000,100)
    max_features      = ['auto', 'sqrt', 'log2']
    min_samples_split = np.arange(1,20,1)
    #max_depth         = np.arange(1,10,2)
    param_grid        = dict(n_estimators=n_estimators, max_features=max_features, \
                              min_samples_split=min_samples_split)#,max_depth=max_depth

        
    opt_parameters_rf = gridsearch(X_train,y_train,RandomForestClassifier(), param_grid, n_iter = 2, cv = 5, filename= results_dir + types+'_RF_hyparameters.txt')
    fit_model = model_save(RandomForestClassifier(**opt_parameters_rf), X_train=X_train, y_train=y_train, \
                           filename_model= results_dir + types+'_RF_model.sav', save_model=save_model)
    return opt_parameters_rf, fit_model

def final_prediction(fitModel,X_train, y_train, X_test, y_test, classes, types, nClasses,load_model=False):

    ypred, accuracy, MCC, conf_mat = model_fit(fitModel,filename_model= results_dir + types +'_RF_model.sav', X_train=X_train, y_train=y_train, X_test = X_test, y_test=y_test,\
                                                 classifier_model='Random Forest Classifier',classes=classes, filename =results_dir + types+'_RF',load_model=load_model)



    plotting = plot(conf_mat, classes_types=classes, classifier_model='Random Forest Classifier',\
                                  plot_title= plots_dir + types + '_RF', X_test=X_test, y_test=y_test, nClasses=nClasses,cmap=plt.cm.Reds)

    return ypred, accuracy, MCC, conf_mat



# XGBoost Classifier

In [15]:
def analysis_XGB(X_train, y_train, types,save_model=False,multi=True):
    eta       = [0.01]    
    if multi:
        objective = ['multi:softmax']
    else: 
        objective = ['binary:logistic']

    max_depth = np.arange(1,12,2)
    subsample = [0.5]
    param_grid  = dict(eta=eta,objective=objective,max_depth=max_depth,subsample=subsample)
        
    opt_parameters_XGB = gridsearch( X_train, y_train,XGBClassifier(), param_grid, n_iter = 5, cv = 5, filename= results_dir + types+ '_XGB_hyparameters.txt')
    fit_model = model_save(XGBClassifier(**opt_parameters_XGB), X_train=X_train, y_train=y_train, \
                           filename_model= results_dir + types + '_XGB_model.sav', save_model=save_model)
    return opt_parameters_XGB, fit_model

def final_prediction_XGB(fitModel,X_train, y_train, X_test, y_test, classes, types,nClasses,load_model=False):
    ypred, accuracy, MCC, conf_mat  = model_fit(fitModel,filename_model= results_dir + types +'_XGB_model.sav', X_train=X_train, y_train=y_train, X_test = X_test, y_test=y_test,\
                                                 classifier_model='XGBoost Classifier',classes=classes, filename =results_dir + types +'_XGB', load_model=load_model)

    plotting = plot(conf_mat, classes_types=classes, classifier_model='XGBoost Classifier',\
                                  plot_title= plots_dir + types +'_XGB', X_test=X_test, y_test=y_test, nClasses=nClasses,cmap=plt.cm.Blues)
    return ypred, accuracy, MCC, conf_mat


In [16]:
def smote_augmentation(training,testing):
    X_train_normalisation, y_train_np, filename_train, X_test_normalisation,\
    y_test_np, filename_test = normalisation(x_train=training,x_test=testing) 
    
    y_label_bf = np.unique(y_train_np)
    if augmentation:
        for i in range(len(y_label_bf)):
            print("Before OverSampling, counts of label {}: {}".format(y_label_bf[i],(y_train_np[y_train_np==y_label_bf[i]]).shape))



        # sm = SMOTE(random_state=2, ratio = 1.0,kind='svm')
        sm = SMOTE(ratio = 'all') # Using Kind: Regular
        X_train_aug, y_train_aug = sm.fit_sample(X_train_normalisation, y_train_np.ravel())
        data_1 = pd.DataFrame(X_train_aug)
        data_1['True_class_labels'] = y_train_aug
        X_train_norm = data_1.iloc[:,0:nFeatures]
        y_train_norm = data_1.iloc[:,nFeatures]

        y_label_af = np.unique(y_train_norm)
        print('-'*70)
        for j in range(len(y_label_af)):
                print("After OverSampling, counts of label {}: {}".format(y_label_af[j],y_train_norm.loc[y_train_norm==y_label_af[j]].shape))


        X_train = X_train_norm   
        y_train = y_train_norm
        y_test  = y_test_np
        X_test  = X_test_normalisation
        
        return X_train, y_train, X_test, y_test

In [17]:
nFeatures = 7

# The directory to save the files
plots_dir                 = './hierarchical-results_SMOTE/plots/'
results_dir               = './hierarchical-results_SMOTE/results/'
misclassify_dir           = r'./hierarchical-results_SMOTE/results/Misclassification_'


eclipsing_label = 20;rotational_label = 21;pulsating_label = 22;RR_Lyrae_label = 23; LPV_label = 24;\
delta_scuti_label = 25; cepheids_label   = 26

true_class_1=1;true_class_2=2;true_class_3=3;true_class_4=4;true_class_5=5;true_class_6=6;true_class_7=7;\
true_class_8=8;true_class_9=9;true_class_10=10;true_class_11=11;true_class_12=12;true_class_13=13
n_splits         = 5
data_preparation = True
multi_class      = True
augmentation     = True
save_model       = False
load_model       = False
feature_directory         ='./data/'
feature_data     = 'features_alldata.csv'
features         = pd.read_csv(feature_directory+feature_data)
print('The data consists of {} samples'.format(features.shape))


acc_rf_FL = [];mcc_rf_FL = [];acc_xgb_FL = [];mcc_xgb_FL = [];\
acc_rf_SL_EB = [];mcc_rf_SL_EB = [];acc_xgb_SL_EB = [];mcc_xgb_SL_EB = [];\
acc_rf_SL_RLCD = [];mcc_rf_SL_RLCD = [];acc_xgb_SL_RLCD = [];mcc_xgb_SL_RLCD = [];\
acc_rf_TL_RL = [];mcc_rf_TL_RL = [];acc_xgb_TL_RL = [];mcc_xgb_TL_RL = [];\
acc_rf_TL_Cep = [];mcc_rf_TL_Cep = [];acc_xgb_TL_Cep = [];mcc_xgb_TL_Cep = []

The data consists of (23143, 9) samples


In [18]:
X   = features.iloc[:,0:nFeatures]
y   = features.True_class_labels
skf       = StratifiedKFold(n_splits=n_splits, shuffle=True) 
split_num = 1
for train_index, test_index in skf.split(X, y):
    if data_preparation: 
        X_training      = features.iloc[train_index]
        X_testing       = features.iloc[test_index]

        
        RRab_train        = stars_label(X_training, true_class_1)
        RRc_train         = stars_label(X_training, true_class_2) 
        RRd_train         = stars_label(X_training, true_class_3)
        blazhko_train     = stars_label(X_training, true_class_4)
        contact_Bi_train  = stars_label(X_training, true_class_5)
        semi_det_Bi_train = stars_label(X_training, true_class_6)
        rot_train         = stars_label(X_training, true_class_7)
        LPV_train         = stars_label(X_training, true_class_8)
        delta_scuti_train = stars_label(X_training, true_class_9)
        ACEP_train        = stars_label(X_training, true_class_10)
        cep_ii_train      = stars_label(X_training, true_class_12)

        RRab_test        = stars_label(X_testing, true_class_1)
        RRc_test         = stars_label(X_testing, true_class_2) 
        RRd_test         = stars_label(X_testing, true_class_3)
        blazhko_test     = stars_label(X_testing, true_class_4)
        contact_Bi_test  = stars_label(X_testing, true_class_5)
        semi_det_Bi_test = stars_label(X_testing, true_class_6)
        rot_test         = stars_label(X_testing, true_class_7)
        LPV_test         = stars_label(X_testing, true_class_8)
        delta_scuti_test = stars_label(X_testing, true_class_9)
        ACEP_test        = stars_label(X_testing, true_class_10)
        cep_ii_test      = stars_label(X_testing, true_class_12)
        
        '-----------------------------------------------------------------------------'
                                        # FIRST LAYER
        '-----------------------------------------------------------------------------'
        training_data_FL, testing_data_FL, y_FL_training_counts = first_layer()
        
        X_train_FL, y_train_FL, X_test_FL, y_test_FL = smote_augmentation(training_data_FL, testing_data_FL)
        
        # Random Forest Classifier    
        if multi_class:
            classes_types_FL = ['Eclipsing','Rotational','Pulsating']
            types_FL         ='Type_FL_'+str(split_num)
            nClasses_FL      = len(classes_types_FL)

        else:
            classes_types_FL = ['Eclipsing','Rotational']
            types_FL         ='Type_Binary_Split_'+str(split_num)
            nClasses_FL      = 2

        opt_rf_FL, fit_model_rf_FL = analysis_rf(X_train_FL, y_train_FL, types_FL, save_model) # This part can be commented if you don't want to train the algorithm
        print(opt_rf_FL)   
        ypred_rf_FL, accuracy_rf_FL, MCC_rf_FL, conf_mat_rf_FL = final_prediction(fit_model_rf_FL,X_train_FL, y_train_FL, X_test_FL, y_test_FL, classes_types_FL, types_FL, nClasses_FL,load_model)

        # XGBoost Classifier   
        opt_xgb_FL, fit_model_xgb_FL = analysis_XGB(X_train_FL, y_train_FL, types_FL, save_model,multi=True) # This part can be commented when no training
        print(opt_xgb_FL)    
        ypred_xgb_FL, accuracy_xgb_FL, MCC_xgb_FL, conf_mat_xgb_FL = final_prediction_XGB(fit_model_xgb_FL, X_train_FL, y_train_FL, X_test_FL, y_test_FL, classes_types_FL, types_FL, nClasses_FL, load_model) 

        acc_rf_FL.append(accuracy_rf_FL)
        mcc_rf_FL.append(MCC_rf_FL)
        acc_xgb_FL.append(accuracy_xgb_FL)
        mcc_xgb_FL.append(MCC_xgb_FL)

        '-------------------------------------------------------------------------------'
                                # SECOND LAYER ECLIPSING BINARY
        '-------------------------------------------------------------------------------'
        training_data_SL_EB, testing_data_SL_EB, y_SL_EB_training_counts = second_layer_EB()
        X_train_SL_EB, y_train_SL_EB, X_test_SL_EB, y_test_SL_EB,= smote_augmentation(training_data_SL_EB, testing_data_SL_EB)
                
        # Random Forest Classifier    

        classes_types_SL_EB = ['Ecl','EA']
        types_SL_EB         ='Type_SL_Ecl_EA_'+str(split_num)
        nClasses_SL_EB      = 2

        opt_rf_SL_EB, fit_model_rf_SL_EB = analysis_rf(X_train_SL_EB, y_train_SL_EB, types_SL_EB, save_model) # This part can be commented if you don't want to train the algorithm
        print(opt_rf_FL)   
        ypred_rf_SL_EB, accuracy_rf_SL_EB, MCC_rf_SL_EB, conf_mat_rf_SL_EB = final_prediction(fit_model_rf_SL_EB,X_train_SL_EB, y_train_SL_EB, X_test_SL_EB, y_test_SL_EB, classes_types_SL_EB, types_SL_EB, nClasses_SL_EB,load_model)

        # XGBoost Classifier   
        opt_xgb_SL_EB, fit_model_xgb_SL_EB = analysis_XGB(X_train_SL_EB, y_train_SL_EB, types_SL_EB, save_model,multi=False) # This part can be commented when no training
        print(opt_xgb_SL_EB)    
        ypred_xgb_SL_EB, accuracy_xgb_SL_EB, MCC_xgb_SL_EB, conf_mat_xgb_SL_EB = final_prediction_XGB(fit_model_xgb_SL_EB, X_train_SL_EB, y_train_SL_EB, X_test_SL_EB, y_test_SL_EB, classes_types_SL_EB, types_SL_EB, nClasses_SL_EB, load_model) 

        acc_rf_SL_EB.append(accuracy_rf_SL_EB)
        mcc_rf_SL_EB.append(MCC_rf_SL_EB)
        acc_xgb_SL_EB.append(accuracy_xgb_SL_EB)
        mcc_xgb_SL_EB.append(MCC_xgb_SL_EB)

        '-----------------------------------------------------------------------------'
                            # SECOND LAYER RR LYRAE PULSATING LPV CEPHEIDS
        '-----------------------------------------------------------------------------'
        training_data_SL_RLCD, testing_data_SL_RLCD, y_SL_RLCD_training_counts = second_layer_RLCD()
        X_train_SL_RLCD, y_train_SL_RLCD, X_test_SL_RLCD, y_test_SL_RLCD = smote_augmentation(training_data_SL_RLCD,testing_data_SL_RLCD)


        # Random Forest Classifier    
        classes_types_SL_RLCD = ['RR Lyrae','LPV', 'Cepheids', '$\delta$-Scuti']
        types_SL_RLCD         ='Type_SL_RLCD_'+str(split_num)
        nClasses_SL_RLCD      = len(classes_types_SL_RLCD)

        opt_rf_SL_RLCD, fit_model_rf_SL_RLCD = analysis_rf(X_train_SL_RLCD, y_train_SL_RLCD, types_SL_RLCD, save_model) # This part can be commented if you don't want to train the algorithm
        print(opt_rf_SL_RLCD)   
        ypred_rf_SL_RLCD, accuracy_rf_SL_RLCD, MCC_rf_SL_RLCD, conf_mat_rf_SL_RLCD = final_prediction(fit_model_rf_SL_RLCD,X_train_SL_RLCD, y_train_SL_RLCD, X_test_SL_RLCD, y_test_SL_RLCD, classes_types_SL_RLCD, types_SL_RLCD, nClasses_SL_RLCD,load_model)

        # XGBoost Classifier   
        opt_xgb_SL_RLCD, fit_model_xgb_SL_RLCD = analysis_XGB(X_train_SL_RLCD, y_train_SL_RLCD, types_SL_RLCD, save_model,multi=True) # This part can be commented when no training
        print(opt_xgb_SL_RLCD)    
        ypred_xgb_SL_RLCD, accuracy_xgb_SL_RLCD, MCC_xgb_SL_RLCD, conf_mat_xgb_SL_RLCD = final_prediction_XGB(fit_model_xgb_SL_RLCD, X_train_SL_RLCD, y_train_SL_RLCD, X_test_SL_RLCD, y_test_SL_RLCD, classes_types_SL_RLCD, types_SL_RLCD, nClasses_SL_RLCD, load_model) 

        acc_rf_SL_RLCD.append(accuracy_rf_SL_RLCD)
        mcc_rf_SL_RLCD.append(MCC_rf_SL_RLCD)
        acc_xgb_SL_RLCD.append(accuracy_xgb_SL_RLCD)
        mcc_xgb_SL_RLCD.append(MCC_xgb_SL_RLCD)

        '-----------------------------------------------------------------------------'
                        # THIRD LAYER RR LYRAE: RRab, RRc, RRd, Blazhko
        '-----------------------------------------------------------------------------'
        training_data_TL_RRLyrae, testing_data_TL_RRLyrae, y_TL_RRLyrae_training_counts = third_layer_RRLyrae()
        
        X_train_TL_RL, y_train_TL_RL,X_test_TL_RL, y_test_TL_RL = smote_augmentation(training_data_TL_RRLyrae,testing_data_TL_RRLyrae)

        
        # Random Forest Classifier    
        classes_types_TL_RL = ['RRab', 'RRc', 'RRd', "Blazhko"]
        types_TL_RL         ='Type_TL_RRLyrae_'+str(split_num)
        nClasses_TL_RL      = len(classes_types_TL_RL)

        opt_rf_TL_RL, fit_model_rf_TL_RL = analysis_rf(X_train_TL_RL, y_train_TL_RL, types_TL_RL, save_model) # This part can be commented if you don't want to train the algorithm
        print(opt_rf_TL_RL)   
        ypred_rf_TL_RL, accuracy_rf_TL_RL, MCC_rf_TL_RL, conf_mat_rf_TL_RL = final_prediction(fit_model_rf_TL_RL,X_train_TL_RL, y_train_TL_RL, X_test_TL_RL, y_test_TL_RL, classes_types_TL_RL, types_TL_RL, nClasses_TL_RL,load_model)

        # XGBoost Classifier   
        opt_xgb_TL_RL, fit_model_xgb_TL_RL = analysis_XGB(X_train_TL_RL, y_train_TL_RL, types_TL_RL, save_model,multi=True) # This part can be commented when no training
        print(opt_xgb_TL_RL)    
        ypred_xgb_TL_RL, accuracy_xgb_TL_RL, MCC_xgb_TL_RL, conf_mat_xgb_TL_RL = final_prediction_XGB(fit_model_xgb_TL_RL, X_train_TL_RL, y_train_TL_RL, X_test_TL_RL, y_test_TL_RL, classes_types_TL_RL, types_TL_RL, nClasses_TL_RL, load_model) 

        acc_rf_TL_RL.append(accuracy_rf_TL_RL)
        mcc_rf_TL_RL.append(MCC_rf_TL_RL)
        acc_xgb_TL_RL.append(accuracy_xgb_TL_RL)
        mcc_xgb_TL_RL.append(MCC_xgb_TL_RL)


        '-----------------------------------------------------------------------------'
                                # THIRD LAYER Cepheids: ACEP and Cep-II
        '-----------------------------------------------------------------------------'
        training_data_TL_cep, testing_data_TL_cep, y_TL_cep_training_counts = third_layer_Cepheids()
        
        X_train_TL_Cep, y_train_TL_Cep, X_test_TL_Cep, y_test_TL_Cep = smote_augmentation(training_data_TL_cep,testing_data_TL_cep)

        # Random Forest Classifier    

        classes_types_TL_Cep = ['ACEP','CEP-II']
        types_TL_Cep         ='Type_TL_Cepheids_'+str(split_num)
        nClasses_TL_Cep      = 2

        opt_rf_TL_Cep, fit_model_rf_TL_Cep = analysis_rf(X_train_TL_Cep, y_train_TL_Cep, types_TL_Cep, save_model) # This part can be commented if you don't want to train the algorithm
        print(opt_rf_TL_Cep)   
        ypred_rf_TL_Cep, accuracy_rf_TL_Cep, MCC_rf_TL_Cep, conf_mat_rf_TL_Cep = final_prediction(fit_model_rf_TL_Cep,X_train_TL_Cep, y_train_TL_Cep, X_test_TL_Cep, y_test_TL_Cep, classes_types_TL_Cep, types_TL_Cep, nClasses_TL_Cep,load_model)

        # XGBoost Classifier   
        opt_xgb_TL_Cep, fit_model_xgb_TL_Cep = analysis_XGB(X_train_TL_Cep, y_train_TL_Cep, types_TL_Cep, save_model,multi=False) # This part can be commented when no training
        print(opt_xgb_TL_Cep)    
        ypred_xgb_TL_Cep, accuracy_xgb_TL_Cep, MCC_xgb_TL_Cep, conf_mat_xgb_TL_Cep = final_prediction_XGB(fit_model_xgb_TL_Cep, X_train_TL_Cep, y_train_TL_Cep, X_test_TL_Cep, y_test_TL_Cep, classes_types_TL_Cep, types_TL_Cep, nClasses_TL_Cep, load_model) 

        acc_rf_TL_Cep.append(accuracy_rf_TL_Cep)
        mcc_rf_TL_Cep.append(MCC_rf_TL_Cep)
        acc_xgb_TL_Cep.append(accuracy_xgb_TL_Cep)
        mcc_xgb_TL_Cep.append(MCC_xgb_TL_Cep)
   
        
    print('-'*50)
    print('Feature Extraction for Split {} is finished'.format(split_num))  
    split_num += 1
    
metrics = open("./hierarchical-results_SMOTE/metrics.txt", 'w')
metrics.write('='*80+'\n')
metrics.write('***Testing Phase Random Forest for ' + str(classes_types_FL) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_rf_FL)*100,np.std(acc_rf_FL)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_rf_FL)*100,np.std(mcc_rf_FL)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase Random Forest for ' + str(classes_types_SL_EB) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_rf_SL_EB)*100,np.std(acc_rf_SL_EB)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_rf_SL_EB)*100,np.std(mcc_rf_SL_EB)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase Random Forest for ' + str(classes_types_SL_RLCD) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_rf_SL_RLCD)*100,np.std(acc_rf_SL_RLCD)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_rf_SL_RLCD)*100,np.std(mcc_rf_SL_RLCD)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase Random Forest for ' + str(classes_types_TL_RL) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_rf_TL_RL)*100,np.std(acc_rf_TL_RL)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_rf_TL_RL)*100,np.std(mcc_rf_TL_RL)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase Random Forest for ' + str(classes_types_TL_Cep) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_rf_TL_Cep)*100,np.std(acc_rf_TL_Cep)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_rf_TL_Cep)*100,np.std(mcc_rf_TL_Cep)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase XGBoost for ' + str(classes_types_FL) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_xgb_FL)*100,np.std(acc_xgb_FL)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_xgb_FL)*100,np.std(mcc_xgb_FL)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase XGBoost for ' + str(classes_types_SL_EB) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_xgb_SL_EB)*100,np.std(acc_xgb_SL_EB)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_xgb_SL_EB)*100,np.std(mcc_xgb_SL_EB)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase XGBoost for ' + str(classes_types_SL_RLCD) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_xgb_SL_RLCD)*100,np.std(acc_xgb_SL_RLCD)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_xgb_SL_RLCD)*100,np.std(mcc_xgb_SL_RLCD)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase XGBoost for ' + str(classes_types_TL_RL) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_xgb_TL_RL)*100,np.std(acc_xgb_TL_RL)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_xgb_TL_RL)*100,np.std(mcc_xgb_TL_RL)) + '\n')

metrics.write('='*80+'\n')
metrics.write('***Testing Phase XGBoost for ' + str(classes_types_TL_Cep) + ' ***\n')
metrics.write('='*80+'\n')
metrics.write("Accuracy: ({} ± {}) %".format(np.mean(acc_xgb_TL_Cep)*100,np.std(acc_xgb_TL_Cep)) + '\n')
metrics.write("MCC: ({} ± {})".format(np.mean(mcc_xgb_TL_Cep)*100,np.std(mcc_xgb_TL_Cep)) + '\n')
metrics.close()


eclipsing_binary_train has (7214, 9)
pulsating_train has (8387, 9)
rotational_train has (2908, 9)
eclipsing_binary_test has (1804, 9)
pulsating_test has (2102, 9)
rotational_test has (728, 9)
Before OverSampling, counts of label 20: (7214,)
Before OverSampling, counts of label 21: (2908,)
Before OverSampling, counts of label 22: (8387,)
----------------------------------------------------------------------
After OverSampling, counts of label 20: (8387,)
After OverSampling, counts of label 21: (8387,)
After OverSampling, counts of label 22: (8387,)
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
Normalized confusion matrix
[[ 0.77217295  0.14412417  0.08370288]
 [ 0.16346154  0.67307692  0.16346154]
 [ 0.07992388  0.06089439  0.85918173]]
{'objective': 'multi:softmax', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 9}
{'objective': 'multi:softmax', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 9}
Norm



{'n_estimators': 550, 'min_samples_split': 5, 'max_features': 'sqrt'}
{'n_estimators': 550, 'min_samples_split': 5, 'max_features': 'sqrt'}
Normalized confusion matrix
[[ 0.92369942  0.00231214  0.00578035  0.06820809]
 [ 0.00133156  0.89613848  0.1011984   0.00133156]
 [ 0.03960396  0.46534653  0.4950495   0.        ]
 [ 0.65714286  0.          0.          0.34285714]]
{'objective': 'multi:softmax', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 9}
{'objective': 'multi:softmax', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 9}
Normalized confusion matrix
[[ 0.91676301  0.00231214  0.00578035  0.07514451]
 [ 0.00133156  0.89347537  0.10386152  0.00133156]
 [ 0.02970297  0.40594059  0.56435644  0.        ]
 [ 0.42857143  0.          0.          0.57142857]]
ACEP train has (122, 9)
Cep-II train has (122, 9)
ACEP test has (31, 9)
Cep-II test has (31, 9)
Before OverSampling, counts of label 10: (122,)
Before OverSampling, counts of label 12: (122,)
----------------------------------------------



{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 1}
{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 1}
Normalized confusion matrix
[[ 0.83870968  0.16129032]
 [ 0.19354839  0.80645161]]
--------------------------------------------------
Feature Extraction for Split 1 is finished
eclipsing_binary_train has (7214, 9)
pulsating_train has (8389, 9)
rotational_train has (2909, 9)
eclipsing_binary_test has (1804, 9)
pulsating_test has (2100, 9)
rotational_test has (727, 9)
Before OverSampling, counts of label 20: (7214,)
Before OverSampling, counts of label 21: (2909,)
Before OverSampling, counts of label 22: (8389,)
----------------------------------------------------------------------
After OverSampling, counts of label 20: (8389,)
After OverSampling, counts of label 21: (8389,)
After OverSampling, counts of label 22: (8389,)
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
{'n_estimators': 750, 'min_samples_spli



{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
Normalized confusion matrix
[[ 0.96774194  0.03225806]
 [ 0.16129032  0.83870968]]




{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 5}
{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 5}
Normalized confusion matrix
[[ 0.93548387  0.06451613]
 [ 0.16129032  0.83870968]]
--------------------------------------------------
Feature Extraction for Split 2 is finished
eclipsing_binary_train has (7214, 9)
pulsating_train has (8392, 9)
rotational_train has (2909, 9)
eclipsing_binary_test has (1804, 9)
pulsating_test has (2097, 9)
rotational_test has (727, 9)
Before OverSampling, counts of label 20: (7214,)
Before OverSampling, counts of label 21: (2909,)
Before OverSampling, counts of label 22: (8392,)
----------------------------------------------------------------------
After OverSampling, counts of label 20: (8392,)
After OverSampling, counts of label 21: (8392,)
After OverSampling, counts of label 22: (8392,)
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
{'n_estimators': 750, 'min_samples_spli



{'n_estimators': 550, 'min_samples_split': 5, 'max_features': 'sqrt'}
{'n_estimators': 550, 'min_samples_split': 5, 'max_features': 'sqrt'}
Normalized confusion matrix
[[ 0.96774194  0.03225806]
 [ 0.09677419  0.90322581]]




{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 1}
{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 1}
Normalized confusion matrix
[[ 0.96774194  0.03225806]
 [ 0.16129032  0.83870968]]
--------------------------------------------------
Feature Extraction for Split 3 is finished
eclipsing_binary_train has (7214, 9)
pulsating_train has (8394, 9)
rotational_train has (2909, 9)
eclipsing_binary_test has (1804, 9)
pulsating_test has (2095, 9)
rotational_test has (727, 9)
Before OverSampling, counts of label 20: (7214,)
Before OverSampling, counts of label 21: (2909,)
Before OverSampling, counts of label 22: (8394,)
----------------------------------------------------------------------
After OverSampling, counts of label 20: (8394,)
After OverSampling, counts of label 21: (8394,)
After OverSampling, counts of label 22: (8394,)
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
{'n_estimators': 750, 'min_samples_spli



{'n_estimators': 550, 'min_samples_split': 5, 'max_features': 'sqrt'}
{'n_estimators': 550, 'min_samples_split': 5, 'max_features': 'sqrt'}
Normalized confusion matrix
[[ 0.93333333  0.06666667]
 [ 0.23333333  0.76666667]]
{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 5}
{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 5}
Normalized confusion matrix
[[ 0.9  0.1]
 [ 0.2  0.8]]
--------------------------------------------------
Feature Extraction for Split 4 is finished
eclipsing_binary_train has (7216, 9)
pulsating_train has (8394, 9)
rotational_train has (2909, 9)
eclipsing_binary_test has (1802, 9)
pulsating_test has (2095, 9)
rotational_test has (727, 9)
Before OverSampling, counts of label 20: (7216,)
Before OverSampling, counts of label 21: (2909,)
Before OverSampling, counts of label 22: (8394,)
----------------------------------------------------------------------
After OverSampling, counts of label 20: (8394,)
After Over



{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
{'n_estimators': 750, 'min_samples_split': 4, 'max_features': 'auto'}
Normalized confusion matrix
[[ 0.96666667  0.03333333]
 [ 0.23333333  0.76666667]]




{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 9}
{'objective': 'binary:logistic', 'subsample': 0.5, 'eta': 0.01, 'max_depth': 9}
Normalized confusion matrix
[[ 0.96666667  0.03333333]
 [ 0.2         0.8       ]]
--------------------------------------------------
Feature Extraction for Split 5 is finished


<matplotlib.figure.Figure at 0x105717590>

<matplotlib.figure.Figure at 0x137b67350>

<matplotlib.figure.Figure at 0x1370a1d50>

<matplotlib.figure.Figure at 0x138215d50>

<matplotlib.figure.Figure at 0x114ca0210>

<matplotlib.figure.Figure at 0x13757c890>

<matplotlib.figure.Figure at 0x13716b250>

<matplotlib.figure.Figure at 0x13a46ed90>

<matplotlib.figure.Figure at 0x13a47b590>

<matplotlib.figure.Figure at 0x13ae4bd10>

<matplotlib.figure.Figure at 0x115084490>

<matplotlib.figure.Figure at 0x1056ca9d0>

<matplotlib.figure.Figure at 0x1365e0c90>

<matplotlib.figure.Figure at 0x114cbb4d0>

<matplotlib.figure.Figure at 0x1155e84d0>

<matplotlib.figure.Figure at 0x1057175d0>

<matplotlib.figure.Figure at 0x137a70c50>

<matplotlib.figure.Figure at 0x137237b10>

<matplotlib.figure.Figure at 0x13ab52f10>

<matplotlib.figure.Figure at 0x13ab631d0>

<matplotlib.figure.Figure at 0x13c532910>

<matplotlib.figure.Figure at 0x13742cf90>

<matplotlib.figure.Figure at 0x138889c90>

<matplotlib.figure.Figure at 0x13741c050>

<matplotlib.figure.Figure at 0x137837410>

<matplotlib.figure.Figure at 0x13929a310>

<matplotlib.figure.Figure at 0x13a695dd0>

<matplotlib.figure.Figure at 0x137520610>

<matplotlib.figure.Figure at 0x117d0a650>

<matplotlib.figure.Figure at 0x13c72ff90>

<matplotlib.figure.Figure at 0x115c6e510>

<matplotlib.figure.Figure at 0x1056ca990>

<matplotlib.figure.Figure at 0x13b9e4f90>

<matplotlib.figure.Figure at 0x11ec95fd0>

<matplotlib.figure.Figure at 0x114953ed0>

<matplotlib.figure.Figure at 0x114958590>

<matplotlib.figure.Figure at 0x11837ab90>

<matplotlib.figure.Figure at 0x1151fdd10>

<matplotlib.figure.Figure at 0x13edf1310>

<matplotlib.figure.Figure at 0x11add03d0>

<matplotlib.figure.Figure at 0x1181f6290>

<matplotlib.figure.Figure at 0x117089950>

<matplotlib.figure.Figure at 0x13a1a1210>

<matplotlib.figure.Figure at 0x120b792d0>

<matplotlib.figure.Figure at 0x13a19ef50>

<matplotlib.figure.Figure at 0x117cb52d0>

<matplotlib.figure.Figure at 0x13ddf3850>

<matplotlib.figure.Figure at 0x1220d06d0>

<matplotlib.figure.Figure at 0x13edf10d0>

<matplotlib.figure.Figure at 0x1183d9090>