In [1]:
import glob
import random
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import Measurements as measurements
from sklearn.metrics import confusion_matrix

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def comb_ws(df1,df2,df3):
    """Join three per-window feature tables into one wide table keyed on FILE.

    Each input must carry a ``FILE`` column (the join key) and a ``LABEL``
    column (dropped before joining). The merges are pandas' default inner
    joins, so only files present in all three inputs survive.

    Feature columns that appear in all three inputs come out suffixed
    ``_ws8_ss4`` (from df1), ``_ws15_ss8`` (from df2) and ``_ws30_ss15``
    (from df3).

    LABEL is rebuilt from the file path: 'CBN' when the parent directory of
    the file is 'NF', otherwise 'XM'. The result has LABEL and FILE as its
    last two columns, in that order. The inputs are not modified.
    """
    left, middle, right = (frame.drop(columns=['LABEL']) for frame in (df1, df2, df3))

    # The first join leaves df1's names untouched (suffix None) so that the
    # second join can still detect the clash with df3 and tag both sides.
    first_two = pd.merge(left, middle, on='FILE', suffixes=[None, '_ws15_ss8'])
    out_df = pd.merge(first_two, right, on='FILE', suffixes=['_ws8_ss4', '_ws30_ss15'])

    labels = []
    for path in out_df['FILE']:
        parent_dir = path.split('/')[-2]
        labels.append('CBN' if parent_dir == 'NF' else 'XM')
    out_df['LABEL'] = labels

    # Re-inserting FILE after popping it moves the column to the end.
    out_df['FILE'] = out_df.pop('FILE')

    return out_df

def rand_subsample(df, percent=.05, random_state=None):
    """Draw the same fraction of rows from each class and stack the two draws.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``LABEL`` column with values 'CBN' and/or 'XM'.
        Rows with any other label are ignored.
    percent : float, default 0.05
        Fraction of each class to keep (passed to ``DataFrame.sample(frac=...)``).
    random_state : optional
        Forwarded to both ``DataFrame.sample`` calls. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global RNG.

    Returns
    -------
    pandas.DataFrame
        The sampled 'CBN' rows followed by the sampled 'XM' rows, with a
        fresh 0..n-1 index.
    """
    xm_rows = df[df['LABEL'] == 'XM'].sample(frac=percent, random_state=random_state)
    cbn_rows = df[df['LABEL'] == 'CBN'].sample(frac=percent, random_state=random_state)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([cbn_rows, xm_rows], ignore_index=True)

def train_RF(train_lst, testDF, class_weights={'CBN': 1, 'XM': 43}, n_est=100, max_dep=1):
    """Fit a random forest on the stacked training frames and score it on ``testDF``.

    Parameters
    ----------
    train_lst : list of pandas.DataFrame
        Training frames; they are stacked row-wise. Must be non-empty
        (``pd.concat`` raises ``ValueError`` on an empty list).
    testDF : pandas.DataFrame
        Evaluation frame.
    class_weights : dict
        Passed to ``RandomForestClassifier(class_weight=...)``. The dict is
        only read here, never modified.
    n_est : int
        Number of trees (``n_estimators``).
    max_dep : int
        Maximum tree depth (``max_depth``).

    Both the training frames and ``testDF`` need a ``LABEL`` column; every
    column other than ``LABEL`` and ``FILE`` is used as a feature.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) holding the test accuracy, the scores computed
        by the ``measurements`` module from the confusion matrix, the raveled
        confusion matrix itself (tn, fp, fn, tp with 'CBN' as the negative and
        'XM' as the positive class), and the feature names / importances
        sorted by descending importance.
    """
    # Stack all training frames in one call. DataFrame.append was removed in
    # pandas 2.0, and appending inside a loop re-copied the data every pass.
    trainDF = pd.concat(list(train_lst), ignore_index=True)

    non_feature_cols = ['LABEL', 'FILE']
    x_train = trainDF.loc[:, ~trainDF.columns.isin(non_feature_cols)]
    y_train = trainDF['LABEL']
    x_test = testDF.loc[:, ~testDF.columns.isin(non_feature_cols)]
    y_test = testDF['LABEL']

    # random_state is pinned so the forest itself is deterministic for a given input.
    clf = RandomForestClassifier(max_depth=max_dep, random_state=0, class_weight=class_weights, n_estimators=n_est)
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)

    # Flattened 2x2 confusion matrix in (tn, fp, fn, tp) order.
    scores = confusion_matrix(y_test, y_pred, labels=["CBN", "XM"]).ravel()

    results_DF = pd.DataFrame(columns = ['Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN', 'Confusion_Matrix(tn, fp, fn, tp)', 'Feature_importance'], index = [0])

    # Accuracy on the evaluation frame
    results_DF['Accur'] = clf.score(x_test, y_test)

    # Result column -> scorer from the measurements module. Each scorer takes
    # the raveled confusion matrix. 'HSS' uses the second HSS definition.
    metric_table = [
        ('TSS', measurements.TSS),
        ('HSS', measurements.HSS2),
        ('GSS', measurements.GSS),
        ('TPR', measurements.TPR),
        ('TNR', measurements.TNR),
        ('CBNPr', measurements.precisionNeg),
        ('XMPr', measurements.precisionPos),
        ('FAR', measurements.FAR),
        ('POFD', measurements.POFD),
        ('f1XM', measurements.F1Pos),
        ('f1CBN', measurements.F1Neg),
    ]
    for column, scorer in metric_table:
        results_DF[column] = scorer(scores)

    # Wrapped in a list so the whole array lands in the single row's cell.
    results_DF['Confusion_Matrix(tn, fp, fn, tp)'] = [scores]

    # Feature importances, most important first.
    importances = clf.feature_importances_
    descending = np.argsort(importances)[::-1]
    feature_names = x_train.columns.to_list()
    ranked_names = [feature_names[idx] for idx in descending]
    ranked_scores = [importances[idx] for idx in descending]
    results_DF['Feature_importance'] = [ranked_names]
    results_DF['Feature_importance_score'] = [ranked_scores]

    # Return the result measurement dataframe
    return results_DF

In [3]:
# Load the exp3 feature tables: five partitions for each of three window
# settings (ws8_ss4, ws15_ss8, ws30_ss15 -- presumably window size / step
# size; confirm against the feature-extraction code).
(
    partition1_ws8_ss4,
    partition2_ws8_ss4,
    partition3_ws8_ss4,
    partition4_ws8_ss4,
    partition5_ws8_ss4,
) = map(pd.read_csv, (
    './data/exp3/exp3_p1_ws8_ss4_24features.csv',
    './data/exp3/exp3_p2_ws8_ss4_24features.csv',
    './data/exp3/exp3_p3_ws8_ss4_24features.csv',
    './data/exp3/exp3_p4_ws8_ss4_24features.csv',
    './data/exp3/exp3_p5_ws8_ss4_24features.csv',
))

(
    partition1_ws15_ss8,
    partition2_ws15_ss8,
    partition3_ws15_ss8,
    partition4_ws15_ss8,
    partition5_ws15_ss8,
) = map(pd.read_csv, (
    './data/exp3/exp3_p1_ws15_ss8_24features.csv',
    './data/exp3/exp3_p2_ws15_ss8_24features.csv',
    './data/exp3/exp3_p3_ws15_ss8_24features.csv',
    './data/exp3/exp3_p4_ws15_ss8_24features.csv',
    './data/exp3/exp3_p5_ws15_ss8_24features.csv',
))

(
    partition1_ws30_ss15,
    partition2_ws30_ss15,
    partition3_ws30_ss15,
    partition4_ws30_ss15,
    partition5_ws30_ss15,
) = map(pd.read_csv, (
    './data/exp3/exp3_p1_ws30_ss15_24features.csv',
    './data/exp3/exp3_p2_ws30_ss15_24features.csv',
    './data/exp3/exp3_p3_ws30_ss15_24features.csv',
    './data/exp3/exp3_p4_ws30_ss15_24features.csv',
    './data/exp3/exp3_p5_ws30_ss15_24features.csv',
))

In [4]:
def bootstrap(run=10, seed=None):
    """Repeated hold-one-partition-out evaluation for each window setting.

    For each of ``run`` rounds, one of partitions 1, 2, 3, 5 is picked at
    random as the validation set (the same pick is used for all three window
    settings). Every partition is then subsampled with ``rand_subsample``:
    the picked one becomes that round's validation frame, the other three
    become its training frames. The sampled rounds are built once and reused
    for every class weight in the sweep.

    For each XM class weight in [5, 6, 7, 8, 9, 10, 20, 30] and each round, a
    forest is trained per window setting via ``train_RF``. After each class
    weight finishes, all results collected so far are written to
    './results/Exp3/RF_Exp3_24features_cwALL_bootstrap<run>.csv', so the file
    is a checkpoint that is overwritten eight times.

    Parameters
    ----------
    run : int, default 10
        Number of bootstrap rounds.
    seed : int or None, default None
        When given, seeds both Python's ``random`` module and NumPy's global
        RNG (which ``DataFrame.sample`` draws from when it is not handed a
        ``random_state``) so the rounds can be reproduced. ``None`` leaves
        both generators untouched.

    Returns
    -------
    None
        Results are only written to disk; progress is printed per training run.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Window tag -> candidate partitions. Partition 4 is deliberately absent:
    # it is never sampled or evaluated in this function.
    window_partitions = {
        'ws8_ss4': (partition1_ws8_ss4, partition2_ws8_ss4, partition3_ws8_ss4, partition5_ws8_ss4),
        'ws15_ss8': (partition1_ws15_ss8, partition2_ws15_ss8, partition3_ws15_ss8, partition5_ws15_ss8),
        'ws30_ss15': (partition1_ws30_ss15, partition2_ws30_ss15, partition3_ws30_ss15, partition5_ws30_ss15),
    }

    train_bins = {tag: [] for tag in window_partitions}
    valid_bins = {tag: [] for tag in window_partitions}

    for _ in range(run):
        # One draw per round so all window settings hold out the same partition.
        valid_index = random.choice([0,1,2,3])
        for tag, candidates in window_partitions.items():
            pool = list(candidates)
            valid_bins[tag].append(rand_subsample(pool.pop(valid_index)))
            train_bins[tag].append([rand_subsample(part) for part in pool])

    result_columns = ['Experiment','Confusion_Matrix(tn, fp, fn, tp)', 'Feature_importance', 'Feature_importance_score','Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN']
    cw_list = [5,6,7,8,9,10,20,30]
    out_path = './results/Exp3/RF_Exp3_24features_cwALL_bootstrap'+str(run)+'.csv'

    # Collect the one-row result frames and concatenate per checkpoint.
    # DataFrame.append was removed in pandas 2.0 and grew the table quadratically.
    collected = []

    for cw in cw_list:
        for i in range(run):
            print('CW: '+str(cw)+' Run: '+str(i))
            for tag in window_partitions:
                result = train_RF(train_bins[tag][i], valid_bins[tag][i], class_weights={'CBN': 1, 'XM': cw})
                # NOTE(review): the label says "TRp1235_TEp4", but the held-out
                # frame is a random one of partitions 1/2/3/5 and partition 4 is
                # not used here. String kept as-is for downstream compatibility.
                result['Experiment'] = ['exp3_TRp1235_TEp4_'+tag+'_cw'+str(cw)+'_bin'+str(i)]
                collected.append(result)

        if collected:
            resultDF = pd.concat(collected, ignore_index=True)
            # Canonical columns first, then anything extra train_RF returned.
            extra_columns = [col for col in resultDF.columns if col not in result_columns]
            resultDF = resultDF.reindex(columns=result_columns + extra_columns)
        else:
            # run == 0: still emit a header-only file, as before.
            resultDF = pd.DataFrame(columns=result_columns)
        resultDF.to_csv(out_path, index=False)

In [13]:
# Per-window bootstrap with 10 rounds. Prints one progress line per
# (class weight, round) and writes the results CSV under ./results/Exp3/.
bootstrap(run=10)

CW: 5 Run: 0
CW: 5 Run: 1
CW: 5 Run: 2
CW: 5 Run: 3
CW: 5 Run: 4
CW: 5 Run: 5
CW: 5 Run: 6
CW: 5 Run: 7
CW: 5 Run: 8
CW: 5 Run: 9
CW: 6 Run: 0
CW: 6 Run: 1
CW: 6 Run: 2
CW: 6 Run: 3
CW: 6 Run: 4
CW: 6 Run: 5
CW: 6 Run: 6
CW: 6 Run: 7
CW: 6 Run: 8
CW: 6 Run: 9
CW: 7 Run: 0
CW: 7 Run: 1
CW: 7 Run: 2
CW: 7 Run: 3
CW: 7 Run: 4
CW: 7 Run: 5
CW: 7 Run: 6
CW: 7 Run: 7
CW: 7 Run: 8
CW: 7 Run: 9
CW: 8 Run: 0
CW: 8 Run: 1
CW: 8 Run: 2
CW: 8 Run: 3
CW: 8 Run: 4
CW: 8 Run: 5
CW: 8 Run: 6
CW: 8 Run: 7
CW: 8 Run: 8
CW: 8 Run: 9
CW: 9 Run: 0
CW: 9 Run: 1
CW: 9 Run: 2
CW: 9 Run: 3
CW: 9 Run: 4
CW: 9 Run: 5
CW: 9 Run: 6
CW: 9 Run: 7
CW: 9 Run: 8
CW: 9 Run: 9
CW: 20 Run: 3
CW: 20 Run: 4
CW: 20 Run: 5
CW: 20 Run: 6
CW: 20 Run: 7
CW: 20 Run: 8
CW: 20 Run: 9
CW: 30 Run: 0
CW: 30 Run: 1
CW: 30 Run: 2
CW: 30 Run: 3
CW: 30 Run: 4
CW: 30 Run: 5
CW: 30 Run: 6
CW: 30 Run: 7
CW: 30 Run: 8
CW: 30 Run: 9


In [None]:
# Per-window bootstrap with 100 rounds. `run` is part of the output file
# name, so this does not overwrite the 10-round results.
bootstrap(run=100)

CW: 5 Run: 0
CW: 5 Run: 1
CW: 5 Run: 2
CW: 5 Run: 3
CW: 5 Run: 4
CW: 5 Run: 5
CW: 5 Run: 6
CW: 5 Run: 7
CW: 5 Run: 8
CW: 5 Run: 9
CW: 5 Run: 10
CW: 5 Run: 11
CW: 5 Run: 12
CW: 5 Run: 13
CW: 5 Run: 14
CW: 5 Run: 15
CW: 5 Run: 16
CW: 5 Run: 17
CW: 5 Run: 18
CW: 5 Run: 19
CW: 5 Run: 20
CW: 5 Run: 21
CW: 5 Run: 22
CW: 5 Run: 23
CW: 5 Run: 24
CW: 5 Run: 25
CW: 5 Run: 26
CW: 5 Run: 27
CW: 5 Run: 28
CW: 5 Run: 29
CW: 5 Run: 30
CW: 5 Run: 31
CW: 5 Run: 32
CW: 5 Run: 33
CW: 5 Run: 34
CW: 5 Run: 35
CW: 5 Run: 36
CW: 5 Run: 37
CW: 5 Run: 38
CW: 5 Run: 39
CW: 5 Run: 40
CW: 5 Run: 41
CW: 5 Run: 42
CW: 5 Run: 43
CW: 5 Run: 44
CW: 5 Run: 45
CW: 5 Run: 46
CW: 5 Run: 47
CW: 5 Run: 48
CW: 5 Run: 49
CW: 5 Run: 50
CW: 5 Run: 51
CW: 5 Run: 52
CW: 5 Run: 53
CW: 5 Run: 54
CW: 5 Run: 55
CW: 5 Run: 56
CW: 5 Run: 57
CW: 5 Run: 58
CW: 5 Run: 59
CW: 5 Run: 60
CW: 5 Run: 61
CW: 5 Run: 62
CW: 5 Run: 63
CW: 5 Run: 64
CW: 5 Run: 65
CW: 5 Run: 66
CW: 5 Run: 67
CW: 5 Run: 68
CW: 5 Run: 69
CW: 5 Run: 70
CW: 5 Run: 71
CW

In [None]:
# Per-window bootstrap with 1000 rounds (8 class weights x 1000 rounds x 3
# window settings = 24000 forests).
bootstrap(run=1000)

In [5]:
# Build one wide table per partition by joining its three window settings
# on FILE (see comb_ws).
partition1, partition2, partition3, partition4, partition5 = (
    comb_ws(*window_frames)
    for window_frames in (
        (partition1_ws8_ss4, partition1_ws15_ss8, partition1_ws30_ss15),
        (partition2_ws8_ss4, partition2_ws15_ss8, partition2_ws30_ss15),
        (partition3_ws8_ss4, partition3_ws15_ss8, partition3_ws30_ss15),
        (partition4_ws8_ss4, partition4_ws15_ss8, partition4_ws30_ss15),
        (partition5_ws8_ss4, partition5_ws15_ss8, partition5_ws30_ss15),
    )
)

In [6]:
def bootstrap_comb(run=10, seed=None):
    """Repeated hold-one-partition-out evaluation on the combined-window tables.

    For each of ``run`` rounds, one of ``partition1``, ``partition2``,
    ``partition3``, ``partition5`` is picked at random, subsampled with
    ``rand_subsample`` and used as the validation frame; the other three are
    subsampled and used as training frames. The sampled rounds are built once
    and reused for every class weight.

    For each XM class weight in [5, 6, 7, 8, 9, 10, 20, 30] and each round, a
    forest is trained via ``train_RF``. After each class weight finishes, all
    results collected so far are written to
    './results/Exp3/RF_Exp3_24features_wsALL_cwALL_bootstrap<run>.csv'
    (overwritten at every checkpoint) and a progress line is printed.

    Parameters
    ----------
    run : int, default 10
        Number of bootstrap rounds.
    seed : int or None, default None
        When given, seeds both Python's ``random`` module and NumPy's global
        RNG (which ``DataFrame.sample`` draws from when it is not handed a
        ``random_state``) so the rounds can be reproduced. ``None`` leaves
        both generators untouched.

    Returns
    -------
    None
        Results are only written to disk.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    train_bins = []
    valid_bins = []

    for _ in range(run):
        # Partition 4 is not part of the pool; it is never used in this function.
        pool = [partition1, partition2, partition3, partition5]
        valid_index = random.choice([0,1,2,3])

        valid_bins.append(rand_subsample(pool.pop(valid_index)))
        train_bins.append([rand_subsample(part) for part in pool])

    result_columns = ['Experiment','Confusion_Matrix(tn, fp, fn, tp)', 'Feature_importance', 'Feature_importance_score','Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN']
    cw_list = [5,6,7,8,9,10,20,30]
    out_path = './results/Exp3/RF_Exp3_24features_wsALL_cwALL_bootstrap'+str(run)+'.csv'

    # Collect the one-row result frames and concatenate per checkpoint.
    # DataFrame.append was removed in pandas 2.0 and grew the table quadratically.
    collected = []

    for cw in cw_list:
        for i in range(run):
            result = train_RF(train_bins[i], valid_bins[i], class_weights={'CBN': 1, 'XM': cw})
            # NOTE(review): the label says "TRp1235_TEp4", but the held-out
            # frame is a random one of partitions 1/2/3/5 and partition 4 is
            # not used here. String kept as-is for downstream compatibility.
            result['Experiment'] = ['exp3_TRp1235_TEp4_wsALL_cw'+str(cw)+'_bin'+str(i)]
            collected.append(result)
        print('Finish for CW: '+str(cw))

        if collected:
            resultDF = pd.concat(collected, ignore_index=True)
            # Canonical columns first, then anything extra train_RF returned.
            extra_columns = [col for col in resultDF.columns if col not in result_columns]
            resultDF = resultDF.reindex(columns=result_columns + extra_columns)
        else:
            # run == 0: still emit a header-only file, as before.
            resultDF = pd.DataFrame(columns=result_columns)
        resultDF.to_csv(out_path, index=False)

In [17]:
# Combined-window bootstrap with 10 rounds. Prints one line per finished
# class weight and writes the results CSV under ./results/Exp3/.
bootstrap_comb(run=10)

Finish for CW: 5
Finish for CW: 6
Finish for CW: 7
Finish for CW: 8
Finish for CW: 9
Finish for CW: 10
Finish for CW: 20
Finish for CW: 30


In [18]:
# Combined-window bootstrap with 100 rounds. `run` is part of the output
# file name, so each run count gets its own CSV.
bootstrap_comb(run=100)

Finish for CW: 5
Finish for CW: 6
Finish for CW: 7
Finish for CW: 8
Finish for CW: 9
Finish for CW: 10
Finish for CW: 20
Finish for CW: 30


In [19]:
# Combined-window bootstrap with 1000 rounds (8 class weights x 1000 rounds
# = 8000 forests).
bootstrap_comb(run=1000)

Finish for CW: 5
Finish for CW: 6
Finish for CW: 7
Finish for CW: 8
Finish for CW: 9
Finish for CW: 10
Finish for CW: 20
Finish for CW: 30


In [314]:
# Persist the combined feature column names: everything except the last two
# columns, which comb_ws places as LABEL and FILE.
# NOTE(review): this saves a pandas Index of strings; if NumPy stores it as an
# object array, readers will need np.load(..., allow_pickle=True) -- confirm.
np.save('./info/Exp3_all_columns.npy', partition1.columns[:-2])

In [7]:
# Write each combined-window partition table to its own CSV, without the index.
for _combined, _csv_path in (
    (partition1, './data/exp3/exp3_p1_wsALL_24features.csv'),
    (partition2, './data/exp3/exp3_p2_wsALL_24features.csv'),
    (partition3, './data/exp3/exp3_p3_wsALL_24features.csv'),
    (partition4, './data/exp3/exp3_p4_wsALL_24features.csv'),
    (partition5, './data/exp3/exp3_p5_wsALL_24features.csv'),
):
    _combined.to_csv(_csv_path,index=False)