In [1]:
import math
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import confusion_matrix

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
result_exp1 = pd.read_csv('./results/Exp1/RF_Exp1_24features_wsALL_cwALL_bootstrap10.csv')
result_exp1_100 = pd.read_csv('./results/Exp1/RF_Exp1_24features_wsALL_cwALL_bootstrap100.csv')
result_exp1_1000 = pd.read_csv('./results/Exp1/RF_Exp1_24features_wsALL_cwALL_bootstrap1000.csv')

result_exp2 = pd.read_csv('./results/Exp2/RF_Exp2_24features_wsALL_cwALL_bootstrap10.csv')
result_exp2_100 = pd.read_csv('./results/Exp2/RF_Exp2_24features_wsALL_cwALL_bootstrap100.csv')
result_exp2_1000 = pd.read_csv('./results/Exp2/RF_Exp2_24features_wsALL_cwALL_bootstrap1000.csv')

result_exp3 = pd.read_csv('./results/Exp3/RF_Exp3_24features_wsALL_cwALL_bootstrap10.csv')
result_exp3_100 = pd.read_csv('./results/Exp3/RF_Exp3_24features_wsALL_cwALL_bootstrap100.csv')
result_exp3_1000 = pd.read_csv('./results/Exp3/RF_Exp3_24features_wsALL_cwALL_bootstrap1000.csv')

exp1_columns = np.load('./info/Exp1_all_columns.npy',allow_pickle=True)
exp2_columns = np.load('./info/Exp2_all_columns.npy',allow_pickle=True)
exp3_columns = np.load('./info/Exp3_all_columns.npy',allow_pickle=True)

In [3]:
def feature_runs(exp_result, column_list, n_top=5):
    # Separate class weight from Experiment column
    cw_list=[]
    for i in range(len(exp_result)):
        cw_list.append(int(exp_result['Experiment'][i].split('_cw')[-1].split('_bin')[0]))
    exp_result['class_weights'] = cw_list

    df = pd.DataFrame(column_list, columns =['Features'])

    # Get sub dataframe for each run
    for cw in exp_result['class_weights'].unique():
        # Separate dataframe for each class weight experiment
        sub_df = exp_result.loc[exp_result['class_weights'] == cw]
        sub_df = sub_df.reset_index(drop=True)
        
        run_df = pd.DataFrame(column_list, columns =['Features'])
        for i in range(len(sub_df)):
            # Extract Top most important features
            top_list = sub_df['Feature_importance'][i].split(',')[:n_top]
            for j in range(len(top_list)):
                top_list[j] = top_list[j].split('\'')[1]
            
            # Match extracted top features into pre-defined dataframe
            # Count the number of time each feature appear in the top list
            feature_count = np.zeros(len(run_df), dtype=int)
            for top_feature in top_list:
                idx = df[df['Features'] == top_feature].index
                feature_count[idx] += 1
                        
            # Generate dataframe for each bootstrap run
#             run_df = pd.concat([run_df,pd.Series(feature_count)],axis=0)
            run_df['run'+str(i)]=pd.Series(feature_count)
        
        # Summation of all the runs for each feature appear in the top list
        # Store the result into dataframe
        df['cw'+str(cw)] = run_df.sum(axis = 1, numeric_only=True)
        
    return df

In [4]:
def feature_top(exp_result, column_list, n_tf, n_rank=5):
    # n_rank: Number of top rank selected for each feature
    # n_tf: Number of top features
    df = feature_runs(exp_result,column_list, n_top=n_rank)
    df = df.loc[~(df[df.columns[1:]]==0).all(axis=1)]
    
    df = df.loc[df.sum(axis=1,numeric_only=True).sort_values(ascending=False).head(n_tf).index]
    df = df.reset_index(drop=True)
    
    return df

In [5]:
exp1_columns.shape, exp2_columns.shape, exp3_columns.shape

((1656,), (648,), (2304,))

In [6]:
log2_exp1 = math.ceil(np.log2(exp1_columns.shape[0]))
log2_exp2 = math.ceil(np.log2(exp2_columns.shape[0]))
log2_exp3 = math.ceil(np.log2(exp3_columns.shape[0]))

sqrt_exp1 = math.ceil(np.sqrt(exp1_columns.shape[0]))
sqrt_exp2 = math.ceil(np.sqrt(exp2_columns.shape[0]))
sqrt_exp3 = math.ceil(np.sqrt(exp3_columns.shape[0]))

In [7]:
log2_exp1, log2_exp2, log2_exp3

(11, 10, 12)

In [8]:
sqrt_exp1, sqrt_exp2, sqrt_exp3

(41, 26, 48)

In [9]:
log2_top_exp1 = feature_top(result_exp1,exp1_columns,log2_exp1)
log2_top_exp1_100 = feature_top(result_exp1_100,exp1_columns,log2_exp1)
log2_top_exp1_1000 = feature_top(result_exp1_1000,exp1_columns,log2_exp1)

log2_top_exp2 = feature_top(result_exp2,exp2_columns,log2_exp2)
log2_top_exp2_100 = feature_top(result_exp2_100,exp2_columns,log2_exp2)
log2_top_exp2_1000 = feature_top(result_exp2_1000,exp2_columns,log2_exp2)

log2_top_exp3 = feature_top(result_exp3,exp3_columns,log2_exp3)
log2_top_exp3_100 = feature_top(result_exp3_100,exp3_columns,log2_exp3)
log2_top_exp3_1000 = feature_top(result_exp3_1000,exp3_columns,log2_exp3)

In [10]:
sqrt_top_exp1 = feature_top(result_exp1,exp1_columns,sqrt_exp1)
sqrt_top_exp1_100 = feature_top(result_exp1_100,exp1_columns,sqrt_exp1)
sqrt_top_exp1_1000 = feature_top(result_exp1_1000,exp1_columns,sqrt_exp1)

sqrt_top_exp2 = feature_top(result_exp2,exp2_columns,sqrt_exp2)
sqrt_top_exp2_100 = feature_top(result_exp2_100,exp2_columns,sqrt_exp2)
sqrt_top_exp2_1000 = feature_top(result_exp2_1000,exp2_columns,sqrt_exp2)

sqrt_top_exp3 = feature_top(result_exp3,exp3_columns,sqrt_exp3)
sqrt_top_exp3_100 = feature_top(result_exp3_100,exp3_columns,sqrt_exp3)
sqrt_top_exp3_1000 = feature_top(result_exp3_1000,exp3_columns,sqrt_exp3)

In [11]:
len(log2_top_exp1), len(log2_top_exp1_100), len(log2_top_exp1_1000)

(11, 11, 11)

In [12]:
len(sqrt_top_exp1), len(sqrt_top_exp1_100), len(sqrt_top_exp1_1000)

(39, 41, 41)

In [17]:
log2_top_exp3

Unnamed: 0,Features,cw5,cw6,cw7,cw8,cw9,cw10,cw20,cw30
0,ABSNJZH_std_mean_ws8_ss4,6,8,6,6,6,6,8,3
1,TOTUSJH_mean_slice[40:54],4,7,6,6,4,7,6,8
2,TOTUSJH_mean_slice[32:46],5,5,6,7,5,6,5,4
3,ABSNJZH_std_max_ws8_ss4,2,4,4,3,4,5,4,7
4,ABSNJZH_std_max_ws15_ss8,5,6,3,4,4,5,2,3
5,ABSNJZH_std_min_ws15_ss8,5,3,4,3,4,3,3,3
6,R_VALUE_mean_mean_ws8_ss4,3,3,3,3,4,4,1,3
7,TOTUSJH_mean_max_ws8_ss4,3,2,1,3,2,2,3,4
8,TOTUSJH_mean_slice[16:30],3,2,3,3,3,1,0,0
9,SAVNCPP_std_max_ws30_ss15,2,2,2,2,2,2,1,1


In [18]:
log2_top_exp3_100

Unnamed: 0,Features,cw5,cw6,cw7,cw8,cw9,cw10,cw20,cw30
0,ABSNJZH_std_mean_ws8_ss4,75,69,69,73,73,69,51,32
1,TOTUSJH_mean_slice[40:54],50,59,64,63,66,65,59,59
2,TOTUSJH_mean_slice[32:46],40,42,42,45,46,46,50,51
3,SAVNCPP_std_mean_ws8_ss4,36,34,32,37,41,41,27,17
4,ABSNJZH_std_max_ws8_ss4,33,34,31,30,35,32,37,32
5,ABSNJZH_std_max_ws15_ss8,39,37,34,30,28,25,22,18
6,R_VALUE_mean_mean_ws8_ss4,24,27,20,25,24,25,22,26
7,TOTUSJH_mean_mean_ws15_ss8,19,19,30,28,26,24,21,12
8,TOTUSJH_mean_slice[16:30],10,19,19,16,22,25,25,19
9,TOTUSJH_mean_max_ws8_ss4,13,14,15,14,17,20,18,18


In [19]:
log2_top_exp3_1000

Unnamed: 0,Features,cw5,cw6,cw7,cw8,cw9,cw10,cw20,cw30
0,ABSNJZH_std_mean_ws8_ss4,752,753,745,746,719,693,470,342
1,TOTUSJH_mean_slice[40:54],524,558,580,598,618,633,674,660
2,TOTUSJH_mean_slice[32:46],373,378,413,409,438,463,503,458
3,ABSNJZH_std_max_ws8_ss4,356,358,358,338,334,340,339,307
4,SAVNCPP_std_mean_ws8_ss4,373,379,363,373,372,396,267,177
5,ABSNJZH_std_max_ws15_ss8,403,379,354,341,308,293,228,172
6,R_VALUE_mean_mean_ws8_ss4,203,208,208,221,208,196,186,183
7,TOTUSJH_mean_mean_ws15_ss8,164,190,213,230,235,218,185,154
8,TOTUSJH_mean_max_ws8_ss4,118,150,170,166,186,191,260,246
9,TOTUSJH_mean_slice[16:30],132,145,162,176,206,222,232,208


In [16]:
log2_top_exp3.to_csv('./log2_TOP_exp3_bs10.csv')
log2_top_exp3_100.to_csv('./log2_TOP_exp3_bs100.csv')
log2_top_exp3_1000.to_csv('./log2_TOP_exp3_bs1000.csv')

In [15]:
import xgboost as xgb
from xgboost import XGBClassifier
import Measurements as measurements
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

In [16]:
exp1_p1 = pd.read_csv('./data/exp1/exp1_p1_wsALL_24features.csv')
exp1_p2 = pd.read_csv('./data/exp1/exp1_p2_wsALL_24features.csv')
exp1_p3 = pd.read_csv('./data/exp1/exp1_p3_wsALL_24features.csv')
exp1_p4 = pd.read_csv('./data/exp1/exp1_p4_wsALL_24features.csv')
exp1_p5 = pd.read_csv('./data/exp1/exp1_p5_wsALL_24features.csv')

In [17]:
exp2_p1 = pd.read_csv('./data/exp2/exp2_p1_wsALL_24features.csv')
exp2_p2 = pd.read_csv('./data/exp2/exp2_p2_wsALL_24features.csv')
exp2_p3 = pd.read_csv('./data/exp2/exp2_p3_wsALL_24features.csv')
exp2_p4 = pd.read_csv('./data/exp2/exp2_p4_wsALL_24features.csv')
exp2_p5 = pd.read_csv('./data/exp2/exp2_p5_wsALL_24features.csv')

In [18]:
exp3_p1 = pd.read_csv('./data/exp3/exp3_p1_wsALL_24features.csv')
exp3_p2 = pd.read_csv('./data/exp3/exp3_p2_wsALL_24features.csv')
exp3_p3 = pd.read_csv('./data/exp3/exp3_p3_wsALL_24features.csv')
exp3_p4 = pd.read_csv('./data/exp3/exp3_p4_wsALL_24features.csv')
exp3_p5 = pd.read_csv('./data/exp3/exp3_p5_wsALL_24features.csv')

In [19]:
def extract_data(df_list,columns):
    return rename_columns(pd.concat(df_list, ignore_index=True)[columns])

def extract_label(df_list):
    df = pd.concat(df_list, ignore_index=True)
    df['LABEL'] = np.where(df['LABEL'] == 'CBN', 0, 1)
    return df['LABEL']

def rename_columns(df):
    df.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in df.columns.values]
    return df

def evaluation(x_test, y_test, y_pred, clf):
    scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tn, fp, fn, tp = scores

    results_DF = pd.DataFrame(columns = ['Confusion_Matrix(tn, fp, fn, tp)', 'Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN'], index = [0])

    #Confusion Matrix
    results_DF['Confusion_Matrix(tn, fp, fn, tp)'] = [scores]
    
    #Accuracy
    scoreTest = clf.score(x_test, y_test)
    #print("Train Accuracy: " + str(round(scoreTrain, 4)))
    #print("Test Accuracy: " + str(round(scoreTest, 4)))
    results_DF['Accur'] = scoreTest

    # TSS
    tss = measurements.TSS(scores)
    results_DF['TSS'] = tss

    # HSS2 Definition 2
    hss2 = measurements.HSS2(scores)
    results_DF['HSS'] = hss2

    # GSS
    gss = measurements.GSS(scores)
    results_DF['GSS'] = gss

    # TPR
    tpr = measurements.TPR(scores)
    results_DF['TPR'] = tpr
    
    # TNR
    tnr = measurements.TNR(scores)
    results_DF['TNR'] = tnr

    # Precision Negative
    negPrecision = measurements.precisionNeg(scores)
    results_DF['CBNPr'] = negPrecision
    
    # Precision Positive
    posPrecision = measurements.precisionPos(scores)
    results_DF['XMPr'] = posPrecision

    # FAR
    far = measurements.FAR(scores)
    results_DF['FAR'] = far

    # POFD
    pofd = measurements.POFD(scores)
    results_DF['POFD'] = pofd

    # F1(XM)
    f1XM = measurements.F1Pos(scores)
    results_DF['f1XM'] = f1XM

    # F1(CBN)
    f1CBN = measurements.F1Neg(scores)
    results_DF['f1CBN'] = f1CBN

    # Return the result measurement dataframe
    return results_DF

# Cross-validation

In [24]:
train_set_exp1_TEp5 = [exp1_p1,exp1_p2,exp1_p3,exp1_p4]
train_set_exp1_TEp4 = [exp1_p1,exp1_p2,exp1_p3,exp1_p5]
train_set_exp1_TEp3 = [exp1_p1,exp1_p2,exp1_p4,exp1_p5]
train_set_exp1_TEp2 = [exp1_p1,exp1_p3,exp1_p4,exp1_p5]
train_set_exp1_TEp1 = [exp1_p2,exp1_p3,exp1_p4,exp1_p5]

train_set_exp2_TEp5 = [exp2_p1,exp2_p2,exp2_p3,exp2_p4]
train_set_exp2_TEp4 = [exp2_p1,exp2_p2,exp2_p3,exp2_p5]
train_set_exp2_TEp3 = [exp2_p1,exp2_p2,exp2_p4,exp2_p5]
train_set_exp2_TEp2 = [exp2_p1,exp2_p3,exp2_p4,exp2_p5]
train_set_exp2_TEp1 = [exp2_p2,exp2_p3,exp2_p4,exp2_p5]

train_set_exp3_TEp5 = [exp3_p1,exp3_p2,exp3_p3,exp3_p4]
train_set_exp3_TEp4 = [exp3_p1,exp3_p2,exp3_p3,exp3_p5]
train_set_exp3_TEp3 = [exp3_p1,exp3_p2,exp3_p4,exp3_p5]
train_set_exp3_TEp2 = [exp3_p1,exp3_p3,exp3_p4,exp3_p5]
train_set_exp3_TEp1 = [exp3_p2,exp3_p3,exp3_p4,exp3_p5]

train_set_exp1_list = [train_set_exp1_TEp1, train_set_exp1_TEp2, train_set_exp1_TEp3, train_set_exp1_TEp4, train_set_exp1_TEp5]
train_set_exp2_list = [train_set_exp2_TEp1, train_set_exp2_TEp2, train_set_exp2_TEp3, train_set_exp2_TEp4, train_set_exp2_TEp5]
train_set_exp3_list = [train_set_exp3_TEp1, train_set_exp3_TEp2, train_set_exp3_TEp3, train_set_exp3_TEp4, train_set_exp3_TEp5]

In [25]:
test_set_exp1_TEp5 = [exp1_p5]
test_set_exp1_TEp4 = [exp1_p4]
test_set_exp1_TEp3 = [exp1_p3]
test_set_exp1_TEp2 = [exp1_p2]
test_set_exp1_TEp1 = [exp1_p1]

test_set_exp2_TEp5 = [exp2_p5]
test_set_exp2_TEp4 = [exp2_p4]
test_set_exp2_TEp3 = [exp2_p3]
test_set_exp2_TEp2 = [exp2_p2]
test_set_exp2_TEp1 = [exp2_p1]

test_set_exp3_TEp5 = [exp3_p5]
test_set_exp3_TEp4 = [exp3_p4]
test_set_exp3_TEp3 = [exp3_p3]
test_set_exp3_TEp2 = [exp3_p2]
test_set_exp3_TEp1 = [exp3_p1]

test_set_exp1_list = [test_set_exp1_TEp1, test_set_exp1_TEp2, test_set_exp1_TEp3, test_set_exp1_TEp4, test_set_exp1_TEp5]
test_set_exp2_list = [test_set_exp2_TEp1, test_set_exp2_TEp2, test_set_exp2_TEp3, test_set_exp2_TEp4, test_set_exp2_TEp5]
test_set_exp3_list = [test_set_exp3_TEp1, test_set_exp3_TEp2, test_set_exp3_TEp3, test_set_exp3_TEp4, test_set_exp3_TEp5]

In [26]:
log_exp1_X_train_100_list = []
log_exp2_X_train_100_list = []
log_exp3_X_train_100_list = []
sqrt_exp1_X_train_100_list = []
sqrt_exp2_X_train_100_list = []
sqrt_exp3_X_train_100_list = []

log_exp1_X_test_100_list = []
log_exp2_X_test_100_list = []
log_exp3_X_test_100_list = []
sqrt_exp1_X_test_100_list = []
sqrt_exp2_X_test_100_list = []
sqrt_exp3_X_test_100_list = []

exp1_y_train_100_list = []
exp2_y_train_100_list = []
exp3_y_train_100_list = []

exp1_y_test_100_list = []
exp2_y_test_100_list = []
exp3_y_test_100_list = []

for i in range(5):
    log_exp1_X_train_100_list.append(extract_data(train_set_exp1_list[i],log2_top_exp1_100['Features'].to_list()))
    log_exp2_X_train_100_list.append(extract_data(train_set_exp2_list[i],log2_top_exp2_100['Features'].to_list()))
    log_exp3_X_train_100_list.append(extract_data(train_set_exp3_list[i],log2_top_exp3_100['Features'].to_list()))
    sqrt_exp1_X_train_100_list.append(extract_data(train_set_exp1_list[i],sqrt_top_exp1_100['Features'].to_list()))
    sqrt_exp2_X_train_100_list.append(extract_data(train_set_exp2_list[i],sqrt_top_exp2_100['Features'].to_list()))
    sqrt_exp3_X_train_100_list.append(extract_data(train_set_exp3_list[i],sqrt_top_exp3_100['Features'].to_list()))

    log_exp1_X_test_100_list.append(extract_data(test_set_exp1_list[i],log2_top_exp1_100['Features'].to_list()))
    log_exp2_X_test_100_list.append(extract_data(test_set_exp2_list[i],log2_top_exp2_100['Features'].to_list()))
    log_exp3_X_test_100_list.append(extract_data(test_set_exp3_list[i],log2_top_exp3_100['Features'].to_list()))
    sqrt_exp1_X_test_100_list.append(extract_data(test_set_exp1_list[i],sqrt_top_exp1_100['Features'].to_list()))
    sqrt_exp2_X_test_100_list.append(extract_data(test_set_exp2_list[i],sqrt_top_exp2_100['Features'].to_list()))
    sqrt_exp3_X_test_100_list.append(extract_data(test_set_exp3_list[i],sqrt_top_exp3_100['Features'].to_list()))
 
    exp1_y_train_100_list.append(extract_label(train_set_exp1_list[i]))
    exp2_y_train_100_list.append(extract_label(train_set_exp2_list[i]))
    exp3_y_train_100_list.append(extract_label(train_set_exp3_list[i]))
    
    exp1_y_test_100_list.append(extract_label(test_set_exp1_list[i]))
    exp2_y_test_100_list.append(extract_label(test_set_exp2_list[i]))
    exp3_y_test_100_list.append(extract_label(test_set_exp3_list[i]))

In [49]:
log_exp1_X_train_1000_list = []
log_exp2_X_train_1000_list = []
log_exp3_X_train_1000_list = []
sqrt_exp1_X_train_1000_list = []
sqrt_exp2_X_train_1000_list = []
sqrt_exp3_X_train_1000_list = []

log_exp1_X_test_1000_list = []
log_exp2_X_test_1000_list = []
log_exp3_X_test_1000_list = []
sqrt_exp1_X_test_1000_list = []
sqrt_exp2_X_test_1000_list = []
sqrt_exp3_X_test_1000_list = []

exp1_y_train_1000_list = []
exp2_y_train_1000_list = []
exp3_y_train_1000_list = []

exp1_y_test_1000_list = []
exp2_y_test_1000_list = []
exp3_y_test_1000_list = []

for i in range(5):
    log_exp1_X_train_1000_list.append(extract_data(train_set_exp1_list[i],log2_top_exp1_1000['Features'].to_list()))
    log_exp2_X_train_1000_list.append(extract_data(train_set_exp2_list[i],log2_top_exp2_1000['Features'].to_list()))
    log_exp3_X_train_1000_list.append(extract_data(train_set_exp3_list[i],log2_top_exp3_1000['Features'].to_list()))
    sqrt_exp1_X_train_1000_list.append(extract_data(train_set_exp1_list[i],sqrt_top_exp1_1000['Features'].to_list()))
    sqrt_exp2_X_train_1000_list.append(extract_data(train_set_exp2_list[i],sqrt_top_exp2_1000['Features'].to_list()))
    sqrt_exp3_X_train_1000_list.append(extract_data(train_set_exp3_list[i],sqrt_top_exp3_1000['Features'].to_list()))

    log_exp1_X_test_1000_list.append(extract_data(test_set_exp1_list[i],log2_top_exp1_1000['Features'].to_list()))
    log_exp2_X_test_1000_list.append(extract_data(test_set_exp2_list[i],log2_top_exp2_1000['Features'].to_list()))
    log_exp3_X_test_1000_list.append(extract_data(test_set_exp3_list[i],log2_top_exp3_1000['Features'].to_list()))
    sqrt_exp1_X_test_1000_list.append(extract_data(test_set_exp1_list[i],sqrt_top_exp1_1000['Features'].to_list()))
    sqrt_exp2_X_test_1000_list.append(extract_data(test_set_exp2_list[i],sqrt_top_exp2_1000['Features'].to_list()))
    sqrt_exp3_X_test_1000_list.append(extract_data(test_set_exp3_list[i],sqrt_top_exp3_1000['Features'].to_list()))
 
    exp1_y_train_1000_list.append(extract_label(train_set_exp1_list[i]))
    exp2_y_train_1000_list.append(extract_label(train_set_exp2_list[i]))
    exp3_y_train_1000_list.append(extract_label(train_set_exp3_list[i]))
    
    exp1_y_test_1000_list.append(extract_label(test_set_exp1_list[i]))
    exp2_y_test_1000_list.append(extract_label(test_set_exp2_list[i]))
    exp3_y_test_1000_list.append(extract_label(test_set_exp3_list[i]))

## Experiments
### Log2 Top Rank Features

In [50]:
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [51]:
exp_result = pd.DataFrame(columns=['Experiment','Confusion_Matrix(tn, fp, fn, tp)','Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN'])

for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(log_exp1_X_train_100_list[i], exp1_y_train_100_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(log_exp1_X_test_100_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(log_exp1_X_test_100_list[i], exp1_y_test_100_list[i], exp_y_pred, bst)
    result['Experiment'] = ['log2_exp1_boostrap100_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 43.59 s
	Predicting time: 0.02 s
For TESTING set: p2
	Training time: 42.72 s
	Predicting time: 0.03 s
For TESTING set: p3
	Training time: 8.7 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 6.0 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 6.28 s
	Predicting time: 0.01 s


In [52]:
exp_result = pd.DataFrame(columns=['Experiment','Confusion_Matrix(tn, fp, fn, tp)','Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN'])

for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(log_exp1_X_train_1000_list[i], exp1_y_train_1000_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(log_exp1_X_test_1000_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(log_exp1_X_test_1000_list[i], exp1_y_test_1000_list[i], exp_y_pred, bst)
    result['Experiment'] = ['log2_exp1_boostrap1000_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 5.91 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 5.1 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 6.36 s
	Predicting time: 0.0 s
For TESTING set: p4
	Training time: 6.18 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 5.6 s
	Predicting time: 0.01 s


In [53]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(log_exp2_X_train_100_list[i], exp2_y_train_100_list[i])
    print("Training time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(log_exp2_X_test_100_list[i])
    print("Predicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(log_exp2_X_test_100_list[i], exp2_y_test_100_list[i], exp_y_pred, bst)
    result['Experiment'] = ['log2_exp2_boostrap100_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
Training time: 5.51 s
Predicting time: 0.01 s
For TESTING set: p2
Training time: 5.25 s
Predicting time: 0.01 s
For TESTING set: p3
Training time: 6.27 s
Predicting time: 0.01 s
For TESTING set: p4
Training time: 6.26 s
Predicting time: 0.01 s
For TESTING set: p5
Training time: 6.02 s
Predicting time: 0.01 s


In [54]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(log_exp2_X_train_1000_list[i], exp2_y_train_1000_list[i])
    print("Training time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(log_exp2_X_test_1000_list[i])
    print("Predicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(log_exp2_X_test_1000_list[i], exp2_y_test_1000_list[i], exp_y_pred, bst)
    result['Experiment'] = ['log2_exp2_boostrap1000_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
Training time: 5.58 s
Predicting time: 0.01 s
For TESTING set: p2
Training time: 4.51 s
Predicting time: 0.01 s
For TESTING set: p3
Training time: 6.15 s
Predicting time: 0.01 s
For TESTING set: p4
Training time: 6.59 s
Predicting time: 0.01 s
For TESTING set: p5
Training time: 5.72 s
Predicting time: 0.01 s


In [55]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(log_exp3_X_train_100_list[i], exp3_y_train_100_list[i])
    print("Training time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(log_exp3_X_test_100_list[i])
    print("Predicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(log_exp3_X_test_100_list[i], exp3_y_test_100_list[i], exp_y_pred, bst)
    result['Experiment'] = ['log2_exp3_boostrap100_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
Training time: 6.01 s
Predicting time: 0.01 s
For TESTING set: p2
Training time: 4.23 s
Predicting time: 0.01 s
For TESTING set: p3
Training time: 6.04 s
Predicting time: 0.01 s
For TESTING set: p4
Training time: 6.34 s
Predicting time: 0.01 s
For TESTING set: p5
Training time: 5.71 s
Predicting time: 0.01 s


In [56]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(log_exp3_X_train_1000_list[i], exp3_y_train_1000_list[i])
    print("Training time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(log_exp3_X_test_1000_list[i])
    print("Predicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(log_exp3_X_test_1000_list[i], exp3_y_test_1000_list[i], exp_y_pred, bst)
    result['Experiment'] = ['log2_exp3_boostrap1000_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
Training time: 5.72 s
Predicting time: 0.01 s
For TESTING set: p2
Training time: 4.79 s
Predicting time: 0.01 s
For TESTING set: p3
Training time: 6.62 s
Predicting time: 0.01 s
For TESTING set: p4
Training time: 6.17 s
Predicting time: 0.01 s
For TESTING set: p5
Training time: 6.01 s
Predicting time: 0.01 s


### Squared-Root Top Rank Features

In [57]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(sqrt_exp1_X_train_100_list[i], exp1_y_train_100_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(sqrt_exp1_X_test_100_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(sqrt_exp1_X_test_100_list[i], exp1_y_test_100_list[i], exp_y_pred, bst)
    result['Experiment'] = ['sqrt_exp1_boostrap100_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 7.61 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 6.0 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 8.31 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 7.68 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 7.11 s
	Predicting time: 0.01 s


In [58]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(sqrt_exp1_X_train_1000_list[i], exp1_y_train_1000_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(sqrt_exp1_X_test_1000_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(sqrt_exp1_X_test_1000_list[i], exp1_y_test_1000_list[i], exp_y_pred, bst)
    result['Experiment'] = ['sqrt_exp1_boostrap1000_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 7.96 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 5.67 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 7.82 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 7.08 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 6.6 s
	Predicting time: 0.01 s


In [59]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(sqrt_exp2_X_train_100_list[i], exp2_y_train_100_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(sqrt_exp2_X_test_100_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(sqrt_exp2_X_test_100_list[i], exp2_y_test_100_list[i], exp_y_pred, bst)
    result['Experiment'] = ['sqrt_exp2_boostrap100_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 6.73 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 5.54 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 7.1 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 7.74 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 6.32 s
	Predicting time: 0.01 s


In [60]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(sqrt_exp2_X_train_1000_list[i], exp2_y_train_1000_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(sqrt_exp2_X_test_1000_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(sqrt_exp2_X_test_1000_list[i], exp2_y_test_1000_list[i], exp_y_pred, bst)
    result['Experiment'] = ['sqrt_exp2_boostrap1000_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 7.02 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 5.81 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 7.87 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 7.17 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 7.02 s
	Predicting time: 0.01 s


In [61]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(sqrt_exp3_X_train_100_list[i], exp3_y_train_100_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(sqrt_exp3_X_test_100_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(sqrt_exp3_X_test_100_list[i], exp3_y_test_100_list[i], exp_y_pred, bst)
    result['Experiment'] = ['sqrt_exp3_boostrap100_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 7.22 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 5.9 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 8.17 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 7.94 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 6.79 s
	Predicting time: 0.01 s


In [62]:
for i in range(5):
    print('For TESTING set: p' + str(i+1))
    bst = XGBClassifier()
    t0=time.time()
    bst.fit(sqrt_exp3_X_train_1000_list[i], exp3_y_train_1000_list[i])
    print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds
    t1=time.time()
    exp_y_pred = bst.predict(sqrt_exp3_X_test_1000_list[i])
    print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

    result = evaluation(sqrt_exp3_X_test_1000_list[i], exp3_y_test_1000_list[i], exp_y_pred, bst)
    result['Experiment'] = ['sqrt_exp3_boostrap1000_TEp'+str(i+1)]
    exp_result = exp_result.append(result)

For TESTING set: p1
	Training time: 7.45 s
	Predicting time: 0.01 s
For TESTING set: p2
	Training time: 5.81 s
	Predicting time: 0.01 s
For TESTING set: p3
	Training time: 8.08 s
	Predicting time: 0.01 s
For TESTING set: p4
	Training time: 7.92 s
	Predicting time: 0.01 s
For TESTING set: p5
	Training time: 8.15 s
	Predicting time: 0.01 s


In [63]:
exp_result.to_csv('./XGBoost_result.csv')

In [64]:
import math
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [65]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
       'scale_pos_weight': hp.choice ('scale_pos_weight', [1,3,4,5,6,7,8,9,10,20,30]),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': hp.choice('n_estimators', [10,50,100]),
        'seed': 0
    }

# def score(space):
#     clf=xgb.XGBClassifier(
#                     n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
#                     reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), scale_pos_weight=space['scale_pos_weight'],
#                     colsample_bytree=int(space['colsample_bytree']))
    
#     evaluation = [( X_train, y_train), ( X_test, y_test)]
    
#     clf.fit(X_train, y_train,
#             eval_set=evaluation, eval_metric="auc",
#             early_stopping_rounds=10,verbose=False)
    

#     y_pred = clf.predict(X_test)
#     scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
#     return scores

def objective_weighted(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), scale_pos_weight=space['scale_pos_weight'],
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_test)
    scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tss = measurements.TSS(scores)
    hss = measurements.HSS2(scores)
    weighted = (tss+hss)/2
#     weighted_sqrt = math.sqrt(tss*hss)
    print ("SCORE: " + str(weighted))
    print ("TSS: " + str(tss) + "  HSS: " + str(hss))
    return {'loss': -weighted, 'status': STATUS_OK }

def objective_weighted_sqrt(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), scale_pos_weight=space['scale_pos_weight'],
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_test)
    scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tss = measurements.TSS(scores)
    hss = measurements.HSS2(scores)
    weighted_sqrt = math.sqrt(tss*hss)
    print ("SCORE: " + str(weighted_sqrt))
    print ("TSS: " + str(tss) + "  HSS: " + str(hss))
    return {'loss': -weighted_sqrt, 'status': STATUS_OK }

In [66]:
def objective_weighted_TSS(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), scale_pos_weight=space['scale_pos_weight'],
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_test)
    scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tss = measurements.TSS(scores)
    hss = measurements.HSS2(scores)
    tpr = measurements.TPR(scores)
    tnr = measurements.TNR(scores)
    weighted = 0.5*tpr+1.5*tnr-1
    print ("SCORE: " + str(weighted)) 
    print ("TSS: " + str(tss) + "  HSS: " + str(hss))
    return {'loss': -weighted, 'status': STATUS_OK }

def objective_weighted_TSS2(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), scale_pos_weight=space['scale_pos_weight'],
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_test)
    scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tss = measurements.TSS(scores)
    hss = measurements.HSS2(scores)
    tpr = measurements.TPR(scores)
    tnr = measurements.TNR(scores)
    weighted = 0.75*tpr+1.25*tnr-1
    print ("SCORE: " + str(weighted)) 
    print ("TSS: " + str(tss) + "  HSS: " + str(hss))
    return {'loss': -weighted, 'status': STATUS_OK }

def objective_weighted_TSS3(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']), scale_pos_weight=space['scale_pos_weight'],
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_test)
    scores = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tss = measurements.TSS(scores)
    hss = measurements.HSS2(scores)
    tpr = measurements.TPR(scores)
    tnr = measurements.TNR(scores)
    weighted = 0.2*tpr+1.8*tnr-1
    print ("SCORE: " + str(weighted)) 
    print ("TSS: " + str(tss) + "  HSS: " + str(hss))
    return {'loss': -weighted, 'status': STATUS_OK }

In [49]:
X_train, y_train, X_test, y_test = log2_exp1_X_train_100, exp1_y_train, log2_exp1_X_test_100, exp1_y_test

In [50]:
trials = Trials()

best_hyperparams_weighted = fmin(fn = objective_weighted,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE: 0.5653108117457333                                                       
TSS: 0.8550124835135136  HSS: 0.275609139977953                                 
SCORE: 0.5435077760631534                                                       
TSS: 0.6262693972912688  HSS: 0.46074615483503784                               
SCORE: 0.5549464619003598                                                       
TSS: 0.6743517173126843  HSS: 0.43554120648803524                               
SCORE: 0.5747085671543557                                                       
TSS: 0.8428488477079947  HSS: 0.3065682866007166                                
SCORE: 0.5737961164339149                                                       
TSS: 0.7391797631090611  HSS: 0.40841246975876866                               
SCORE: 0.5803301493941795                                                       
TSS: 0.760376458660698  HSS: 0.40028384012766105                                
SCORE: 0.5586590005962786   

In [51]:
best_hyperparams_weighted

{'colsample_bytree': 0.8004702930719109,
 'gamma': 3.0192091142400033,
 'max_depth': 13.0,
 'min_child_weight': 0.0,
 'n_estimators': 0,
 'reg_alpha': 174.0,
 'reg_lambda': 0.41038366048233216,
 'scale_pos_weight': 8}

In [52]:
trials = Trials()

best_hyperparams_weighted_sqrt = fmin(fn = objective_weighted_sqrt,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)

SCORE: 0.5493712792731409                                                       
TSS: 0.7788375164264655  HSS: 0.3875118957738381                                
SCORE: 0.544245457171039                                                        
TSS: 0.6770267167253358  HSS: 0.4375058034400737                                
SCORE: 0.28901742217537224                                                      
TSS: 0.23938297711433026  HSS: 0.3489432345099567                               
SCORE: 0.5463407072311396                                                       
TSS: 0.7062669827316113  HSS: 0.4226279518594604                                
SCORE: 0.5090546648906471                                                       
TSS: 0.8438071089232213  HSS: 0.3071041344716949                                
SCORE: 0.5397699599563365                                                       
TSS: 0.6362494685996635  HSS: 0.45792039765865383                               
SCORE: 0.4844125367122027   

In [53]:
best_hyperparams_weighted_sqrt

{'colsample_bytree': 0.6889576990047206,
 'gamma': 8.97409994332332,
 'max_depth': 3.0,
 'min_child_weight': 7.0,
 'n_estimators': 1,
 'reg_alpha': 169.0,
 'reg_lambda': 0.19059037570928167,
 'scale_pos_weight': 8}

In [54]:
trials = Trials()

best_hyperparams_weighted_TSS = fmin(fn = objective_weighted_TSS,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)

SCORE: 0.5704836973493519                                                       
TSS: 0.14484320881945353  HSS: 0.2321273040006232                               
SCORE: 0.8248592121367291                                                       
TSS: 0.8506214490067465  HSS: 0.27583368758911003                               
SCORE: 0.8161309905473764                                                       
TSS: 0.7108970552147029  HSS: 0.41744039163100133                               
SCORE: 0.8372896327760009                                                       
TSS: 0.7713547480309249  HSS: 0.4002338621718736                                
SCORE: 0.824668714012043                                                        
TSS: 0.8516389423885723  HSS: 0.2747816054942129                                
SCORE: 0.6202380827847556                                                       
TSS: 0.24611008094091  HSS: 0.3554908347146381                                  
SCORE: 0.8337073728613442   

In [55]:
trials = Trials()

best_hyperparams_weighted_TSS2 = fmin(fn = objective_weighted_TSS2,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)

SCORE: 0.7952204666170835                                                       
TSS: 0.7585798067700618  HSS: 0.3991618601805224                                
SCORE: 0.8109155674103727                                                       
TSS: 0.7828097071471823  HSS: 0.38778974751693834                               
SCORE: 0.840727940862495                                                        
TSS: 0.8427111080548029  HSS: 0.3124736756757795                                
SCORE: 0.7965080202651522                                                       
TSS: 0.7602965449674867  HSS: 0.3998887872520426                                
SCORE: 0.7386977472055503                                                       
TSS: 0.6735333050605775  HSS: 0.4353662401085121                                
SCORE: 0.6685085129519346                                                       
TSS: 0.5686265528508181  HSS: 0.502671906602565                                 
SCORE: 0.7424160919082927   

In [56]:
trials = Trials()

best_hyperparams_weighted_TSS3 = fmin(fn = objective_weighted_TSS3,
                            space = space,
                            algo = tpe.suggest,
                            max_evals = 100,
                            trials = trials)

SCORE: 0.8096373341683685                                                       
TSS: 0.852278251934263  HSS: 0.27618323111144616                                
SCORE: 0.8861296477915441                                                       
TSS: 0.5537153265031706  HSS: 0.4977912876096702                                
SCORE: 0.8085944604719606                                                       
TSS: 0.8516988776584808  HSS: 0.27491249228011444                               
SCORE: 0.8826502741845674                                                       
TSS: 0.5037136716380644  HSS: 0.5035026056932356                                
SCORE: 0.8830779976509491                                                       
TSS: 0.6763881102829545  HSS: 0.43851861400169995                               
SCORE: 0.8880737181526355                                                       
TSS: 0.5647142974000097  HSS: 0.5036931548049948                                
SCORE: 0.878942638945647    

In [141]:
trials = Trials()

best_hyperparams_TSS = fmin(fn = objective_TSS,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                          
0.20736430406234227                                                             
SCORE:                                                                          
0.8503817079271125                                                              
SCORE:                                                                          
0.6254117312958657                                                              
SCORE:                                                                          
0.8490459659790602                                                              
SCORE:                                                                          
0.5004207473096438                                                              
SCORE:                                                                          
0.7064853391813237                                                              
SCORE:                      

In [142]:
best_hyperparams_TSS

{'colsample_bytree': 0.7685127786617614,
 'gamma': 5.16359660218271,
 'max_depth': 8.0,
 'min_child_weight': 7.0,
 'reg_alpha': 71.0,
 'reg_lambda': 0.7090421892884624,
 'scale_pos_weight': 10}

In [144]:
trials = Trials()

best_hyperparams_HSS = fmin(fn = objective_HSS,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                          
0.2789379114530684                                                              
SCORE:                                                                          
0.4624164355482593                                                              
SCORE:                                                                          
0.46344147099184196                                                             
SCORE:                                                                          
0.38958685367172924                                                             
SCORE:                                                                          
0.39672610603865527                                                             
SCORE:                                                                          
0.45916212749656876                                                             
SCORE:                      

In [145]:
best_hyperparams_HSS

{'colsample_bytree': 0.7087703583689308,
 'gamma': 4.249971558871701,
 'max_depth': 9.0,
 'min_child_weight': 4.0,
 'reg_alpha': 92.0,
 'reg_lambda': 0.15001174871067502,
 'scale_pos_weight': 1}

# Exp1 weight tunining

In [63]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(log2_exp1_X_train_100, exp1_y_train)
exp1_y_pred = bst.predict(log2_exp1_X_test_100)
log2_exp1_xgbst_result_100 = evaluation(log2_exp1_X_test_100, exp1_y_test, exp1_y_pred, bst)
log2_exp1_xgbst_result_100

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[47559, 2495, 363, 802]",0.9442,0.638566,0.337201,0.202791,0.688412,0.950154,0.992425,0.243251,0.756749,0.049846,0.35948,0.97083


In [64]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(log2_exp1_X_train_1000, exp1_y_train)
exp1_y_pred = bst.predict(log2_exp1_X_test_1000)
log2_exp1_xgbst_result_1000 = evaluation(log2_exp1_X_test_1000, exp1_y_test, exp1_y_pred, bst)
log2_exp1_xgbst_result_1000

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[47529, 2525, 403, 762]",0.942834,0.603632,0.319461,0.190094,0.654077,0.949554,0.991592,0.231822,0.768178,0.050446,0.342318,0.970118


In [65]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(sqrt_exp1_X_train_100, exp1_y_train)
exp1_y_pred = bst.predict(sqrt_exp1_X_test_100)
sqrt_exp1_xgbst_result_100 = evaluation(sqrt_exp1_X_test_100, exp1_y_test, exp1_y_pred, bst)
sqrt_exp1_xgbst_result_100

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[48746, 1308, 674, 491]",0.961303,0.395327,0.312322,0.18506,0.421459,0.973868,0.986362,0.272929,0.727071,0.026132,0.331309,0.980075


In [66]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(sqrt_exp1_X_train_1000, exp1_y_train)
exp1_y_pred = bst.predict(sqrt_exp1_X_test_1000)
sqrt_exp1_xgbst_result_1000 = evaluation(sqrt_exp1_X_test_1000, exp1_y_test, exp1_y_pred, bst)
sqrt_exp1_xgbst_result_1000

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[48721, 1333, 613, 552]",0.962006,0.447189,0.34351,0.207372,0.47382,0.973369,0.987574,0.292838,0.707162,0.026631,0.361967,0.98042


# Exp2 weight tuning

In [76]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(log2_exp2_X_train_100, exp2_y_train)
exp2_y_pred = bst.predict(log2_exp2_X_test_100)
log2_exp2_xgbst_result_100 = evaluation(log2_exp2_X_test_100, exp2_y_test, exp2_y_pred, bst)
log2_exp2_xgbst_result_100

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[46999, 3055, 346, 819]",0.933599,0.64197,0.300604,0.176889,0.703004,0.938966,0.992692,0.211409,0.788591,0.061034,0.325064,0.965082


In [77]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(log2_exp2_X_train_1000, exp2_y_train)
exp2_y_pred = bst.predict(log2_exp2_X_test_1000)
log2_exp2_xgbst_result_1000 = evaluation(log2_exp2_X_test_1000, exp2_y_test, exp2_y_pred, bst)
log2_exp2_xgbst_result_1000

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[46999, 3055, 346, 819]",0.933599,0.64197,0.300604,0.176889,0.703004,0.938966,0.992692,0.211409,0.788591,0.061034,0.325064,0.965082


In [78]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(sqrt_exp2_X_train_100, exp2_y_train)
exp2_y_pred = bst.predict(sqrt_exp2_X_test_100)
sqrt_exp2_xgbst_result_100 = evaluation(sqrt_exp2_X_test_100, exp2_y_test, exp2_y_pred, bst)
sqrt_exp2_xgbst_result_100

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[48671, 1383, 646, 519]",0.960386,0.417863,0.319236,0.189935,0.445494,0.97237,0.986901,0.272871,0.727129,0.02763,0.338441,0.979582


In [79]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(sqrt_exp2_X_train_1000, exp2_y_train)
exp2_y_pred = bst.predict(sqrt_exp2_X_test_1000)
sqrt_exp2_xgbst_result_1000 = evaluation(sqrt_exp2_X_test_1000, exp2_y_test, exp2_y_pred, bst)
sqrt_exp2_xgbst_result_1000

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[48789, 1265, 716, 449]",0.961323,0.360135,0.29276,0.171481,0.385408,0.974727,0.985537,0.26196,0.73804,0.025273,0.311914,0.980102


# Exp3 weight tuning

In [80]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(log2_exp3_X_train_100, exp3_y_train)
exp3_y_pred = bst.predict(log2_exp3_X_test_100)
log2_exp3_xgbst_result_100 = evaluation(log2_exp3_X_test_100, exp3_y_test, exp3_y_pred, bst)
log2_exp3_xgbst_result_100

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[47627, 2427, 429, 736]",0.944239,0.583272,0.317418,0.188649,0.63176,0.951512,0.991073,0.23269,0.76731,0.048488,0.340111,0.97089


In [81]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(log2_exp3_X_train_1000, exp3_y_train)
exp3_y_pred = bst.predict(log2_exp3_X_test_1000)
log2_exp3_xgbst_result_1000 = evaluation(log2_exp3_X_test_1000, exp3_y_test, exp3_y_pred, bst)
log2_exp3_xgbst_result_1000

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[47613, 2441, 326, 839]",0.945977,0.671404,0.355881,0.216457,0.720172,0.951233,0.9932,0.255793,0.744207,0.048767,0.377503,0.971763


In [82]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(sqrt_exp3_X_train_100, exp3_y_train)
exp3_y_pred = bst.predict(sqrt_exp3_X_test_100)
sqrt_exp3_xgbst_result_100 = evaluation(sqrt_exp3_X_test_100, exp3_y_test, exp3_y_pred, bst)
sqrt_exp3_xgbst_result_100

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[48876, 1178, 779, 386]",0.961792,0.307796,0.263691,0.151869,0.33133,0.976465,0.984312,0.246803,0.753197,0.023535,0.282888,0.980373


In [83]:
bst = XGBClassifier(scale_pos_weight=61.6)
bst.fit(sqrt_exp3_X_train_1000, exp3_y_train)
exp3_y_pred = bst.predict(sqrt_exp3_X_test_1000)
sqrt_exp3_xgbst_result_1000 = evaluation(sqrt_exp3_X_test_1000, exp3_y_test, exp3_y_pred, bst)
sqrt_exp3_xgbst_result_1000

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[48700, 1354, 659, 506]",0.960698,0.407284,0.315396,0.187223,0.434335,0.972949,0.986649,0.272043,0.727957,0.027051,0.334545,0.979751


## Result using all features

In [None]:
# Check for class_weight: sum(negative instances) / sum(positive instances)

In [71]:
from collections import Counter
print(Counter(exp1_p1['LABEL']).values())
print(Counter(exp1_p2['LABEL']).values())
print(Counter(exp1_p3['LABEL']).values())

dict_values([72238, 1255])
dict_values([86283, 1401])
dict_values([41058, 1424])


In [134]:
bst = XGBClassifier()
bst.fit(exp1_X_train, exp1_y_train)
exp1_y_pred = bst.predict(exp1_X_test)
exp1_xgbst_result = evaluation(exp1_X_test, exp1_y_test, exp1_y_pred, bst)

In [135]:
exp1_xgbst_result

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[49852, 202, 835, 330]",0.979754,0.279226,0.380081,0.23463,0.283262,0.995964,0.983526,0.620301,0.379699,0.004036,0.388922,0.989706


In [136]:
bst = XGBClassifier()
bst.fit(exp2_X_train, exp2_y_train)
exp2_y_pred = bst.predict(exp2_X_test)
exp2_xgbst_result = evaluation(exp2_X_test, exp2_y_test, exp2_y_pred, bst)

In [137]:
exp2_xgbst_result

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[49916, 138, 885, 280]",0.980027,0.237586,0.345902,0.209118,0.240343,0.997243,0.982579,0.669856,0.330144,0.002757,0.353759,0.989857


In [138]:
bst = XGBClassifier()
bst.fit(exp3_X_train, exp3_y_train)
exp3_y_pred = bst.predict(exp3_X_test)
exp3_xgbst_result = evaluation(exp3_X_test, exp3_y_test, exp3_y_pred, bst)

In [139]:
exp3_xgbst_result

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN
0,"[49919, 135, 864, 301]",0.980496,0.255672,0.368188,0.225631,0.258369,0.997303,0.982986,0.690367,0.309633,0.002697,0.376015,0.990093
