In [1]:
import numpy as np
import pandas as pd

import glob
import math
import time
import pickle
import joblib
import warnings
from sktime.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
import Measurements as measurements
from sklearn.metrics import make_scorer

from sktime.classification.interval_based import DrCIF

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

import Feature_rank_lib as lib

In [2]:
# This method extract the necessary features from the destination folder
def data_reader(loadPath, partitions, flare_label):
    cols = ['USFLUX','TOTUSJZ','TOTUSJH','ABSNJZH','SAVNCPP','TOTPOT','TOTBSQ','TOTFZ','MEANPOT','EPSZ','MEANSHR','SHRGT45','MEANGAM','MEANGBT','MEANGBZ','MEANGBH','MEANJZH','TOTFY','MEANJZD','MEANALP','TOTFX','EPSY','EPSX','R_VALUE']
    # Read files from the define path
    all_files = glob.glob(str(loadPath) + partitions + "/" + flare_label + "/*.csv")

    li = []
    for filename in all_files:
        # Read the file and extract necessary features
        df = pd.read_csv(filename, index_col=None, header=0, sep='\t')
        df.interpolate(method='linear', axis=0, limit_direction='both', inplace = True)
#         # Extract label info
#         INFO = filename.split('/')[-1]
        # Define Label value based on the file name
    
        if flare_label == 'NF':
            LABEL = 'CBN'
        else:
            LABEL = 'XM'
            
        col_list = []
        for col in cols:
            if not df[col].isnull().values.all(axis=0):
                col_list.append(df[col])
        if len(col_list) == 24:
            li.append(col_list + [LABEL])

    # Create and return the dataframe build on the extracted features
    partition_frame = pd.DataFrame(li, columns= cols + ['LABEL'])
    return partition_frame

def evaluation(x_test, y_test, y_pred, clf):
    scores = confusion_matrix(y_test, y_pred, labels=['CBN', 'XM']).ravel()
    tn, fp, fn, tp = scores

    results_DF = pd.DataFrame(columns = ['Confusion_Matrix(tn, fp, fn, tp)', 'Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN'], index = [0])

    #Confusion Matrix
    results_DF['Confusion_Matrix(tn, fp, fn, tp)'] = [scores]
    
    #Accuracy
    scoreTest = clf.score(x_test, y_test)
    #print("Train Accuracy: " + str(round(scoreTrain, 4)))
    #print("Test Accuracy: " + str(round(scoreTest, 4)))
    results_DF['Accur'] = scoreTest

    # TSS
    tss = measurements.TSS(scores)
    results_DF['TSS'] = tss

    # HSS2 Definition 2
    hss2 = measurements.HSS2(scores)
    results_DF['HSS'] = hss2

    # GSS
    gss = measurements.GSS(scores)
    results_DF['GSS'] = gss

    # TPR
    tpr = measurements.TPR(scores)
    results_DF['TPR'] = tpr
    
    # TNR
    tnr = measurements.TNR(scores)
    results_DF['TNR'] = tnr

    # Precision Negative
    negPrecision = measurements.precisionNeg(scores)
    results_DF['CBNPr'] = negPrecision
    
    # Precision Positive
    posPrecision = measurements.precisionPos(scores)
    results_DF['XMPr'] = posPrecision

    # FAR
    far = measurements.FAR(scores)
    results_DF['FAR'] = far

    # POFD
    pofd = measurements.POFD(scores)
    results_DF['POFD'] = pofd

    # F1(XM)
    f1XM = measurements.F1Pos(scores)
    results_DF['f1XM'] = f1XM

    # F1(CBN)
    f1CBN = measurements.F1Neg(scores)
    results_DF['f1CBN'] = f1CBN

    # Return the result measurement dataframe
    return results_DF

In [3]:
datapath = '/data/SHARPS/BERKAY/v0.7/new-data-folds/instances_O12L0P24/'

In [4]:
partition1_FL = data_reader(datapath,'partition1','FL')
partition1_NF = data_reader(datapath,'partition1','NF')

partition2_FL = data_reader(datapath,'partition2','FL')
partition2_NF = data_reader(datapath,'partition2','NF')

partition3_FL = data_reader(datapath,'partition3','FL')
partition3_NF = data_reader(datapath,'partition3','NF')

partition4_FL = data_reader(datapath,'partition4','FL')
partition4_NF = data_reader(datapath,'partition4','NF')

partition5_FL = data_reader(datapath,'partition5','FL')
partition5_NF = data_reader(datapath,'partition5','NF')

In [5]:
partition1 = pd.concat([partition1_FL, partition1_NF], ignore_index=True)
partition2 = pd.concat([partition2_FL, partition2_NF], ignore_index=True)
partition3 = pd.concat([partition3_FL, partition3_NF], ignore_index=True)
partition4 = pd.concat([partition4_FL, partition4_NF], ignore_index=True)
partition5 = pd.concat([partition5_FL, partition5_NF], ignore_index=True)

In [6]:
X_train_TEp5 = pd.concat([partition1, partition2, partition3, partition4], ignore_index=True)
X_train_TEp4 = pd.concat([partition1, partition2, partition3, partition5], ignore_index=True)
X_train_TEp3 = pd.concat([partition1, partition2, partition4, partition5], ignore_index=True)
X_train_TEp2 = pd.concat([partition1, partition3, partition4, partition5], ignore_index=True)
X_train_TEp1 = pd.concat([partition2, partition3, partition4, partition5], ignore_index=True)

y_train_TEp5 = X_train_TEp5['LABEL']
X_train_TEp5 = X_train_TEp5.loc[:, X_train_TEp5.columns != 'LABEL']
y_train_TEp4 = X_train_TEp4['LABEL']
X_train_TEp4 = X_train_TEp4.loc[:, X_train_TEp4.columns != 'LABEL']
y_train_TEp3 = X_train_TEp3['LABEL']
X_train_TEp3 = X_train_TEp3.loc[:, X_train_TEp3.columns != 'LABEL']
y_train_TEp2 = X_train_TEp2['LABEL']
X_train_TEp2 = X_train_TEp2.loc[:, X_train_TEp2.columns != 'LABEL']
y_train_TEp1 = X_train_TEp1['LABEL']
X_train_TEp1 = X_train_TEp1.loc[:, X_train_TEp1.columns != 'LABEL']

In [7]:
X_test_TEp5 = partition5.loc[:, partition5.columns != 'LABEL']
y_test_TEp5 = partition5['LABEL']
X_test_TEp4 = partition4.loc[:, partition4.columns != 'LABEL']
y_test_TEp4 = partition4['LABEL']
X_test_TEp3 = partition3.loc[:, partition3.columns != 'LABEL']
y_test_TEp3 = partition3['LABEL']
X_test_TEp2 = partition2.loc[:, partition2.columns != 'LABEL']
y_test_TEp2 = partition2['LABEL']
X_test_TEp1 = partition1.loc[:, partition1.columns != 'LABEL']
y_test_TEp1 = partition1['LABEL']

In [8]:
from sklearn.tree import DecisionTreeClassifier

In [35]:
result_DF = pd.DataFrame(columns = ['Confusion_Matrix(tn, fp, fn, tp)', 'Accur', 'TSS', 'HSS', 'GSS', 'TPR', 'TNR', 'CBNPr', 'XMPr', 'FAR', 'POFD', 'f1XM', 'f1CBN', 'Experiments'])

In [13]:
from sktime.classification.feature_based import SummaryClassifier
from sktime.classification.deep_learning import InceptionTimeClassifier

In [36]:
clf = SummaryClassifier(n_jobs=10)
# clf = InceptionTimeClassifier(n_epochs=100,kernel_size=30,metrics=make_scorer(measurements.geometric_mean, greater_is_better=True))

print('Testing p5')
t0=time.time()
clf.fit(X_train_TEp5, y_train_TEp5)
print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds

t1=time.time()
y_pred_TEp5 = clf.predict(X_test_TEp5)
print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

t2=time.time()
result = evaluation(X_test_TEp5, y_test_TEp5, y_pred_TEp5, clf)
result['Experiments'] = 'TEp5'
result_DF = pd.concat([result_DF,result])
print("\tEvaluation time:", round(time.time()-t2, 2), "s") # the time would be round to 3 decimal in seconds

Testing p5
	Training time: 6434.16 s
	Predicting time: 1714.84 s
	Evaluation time: 1751.3 s


In [37]:
clf = SummaryClassifier(n_jobs=10)
# clf = InceptionTimeClassifier(n_epochs=100,kernel_size=30,metrics=make_scorer(measurements.geometric_mean, greater_is_better=True))

print('Testing p4')
t0=time.time()
clf.fit(X_train_TEp4, y_train_TEp4)
print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds

t1=time.time()
y_pred_TEp4 = clf.predict(X_test_TEp4)
print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

t2=time.time()
result = evaluation(X_test_TEp4, y_test_TEp4, y_pred_TEp4, clf)
result['Experiments'] = 'TEp4'
result_DF = pd.concat([result_DF,result])
print("\tEvaluation time:", round(time.time()-t2, 2), "s") # the time would be round to 3 decimal in seconds

Testing p4
	Training time: 6999.78 s
	Predicting time: 1169.0 s
	Evaluation time: 1208.92 s


In [38]:
clf = SummaryClassifier(n_jobs=10)
# clf = InceptionTimeClassifier(n_epochs=100,kernel_size=30,metrics=make_scorer(measurements.geometric_mean, greater_is_better=True))

print('Testing p3')
t0=time.time()
clf.fit(X_train_TEp3, y_train_TEp3)
print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds

t1=time.time()
y_pred_TEp3 = clf.predict(X_test_TEp3)
print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

t2=time.time()
result = evaluation(X_test_TEp3, y_test_TEp3, y_pred_TEp3, clf)
result['Experiments'] = 'TEp3'
result_DF = pd.concat([result_DF,result])
print("\tEvaluation time:", round(time.time()-t2, 2), "s") # the time would be round to 3 decimal in seconds

Testing p3
	Training time: 7181.91 s
	Predicting time: 968.59 s
	Evaluation time: 964.96 s


In [39]:
clf = SummaryClassifier(n_jobs=10)
# clf = InceptionTimeClassifier(n_epochs=100,kernel_size=30,metrics=make_scorer(measurements.geometric_mean, greater_is_better=True))

print('Testing p2')
t0=time.time()
clf.fit(X_train_TEp2, y_train_TEp2)
print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds

t1=time.time()
y_pred_TEp2 = clf.predict(X_test_TEp2)
print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

t2=time.time()
result = evaluation(X_test_TEp2, y_test_TEp2, y_pred_TEp2, clf)
result['Experiments'] = 'TEp2'
result_DF = pd.concat([result_DF,result])
print("\tEvaluation time:", round(time.time()-t2, 2), "s") # the time would be round to 3 decimal in seconds

Testing p2
	Training time: 6041.75 s
	Predicting time: 1994.17 s
	Evaluation time: 1999.39 s


In [40]:
clf = SummaryClassifier(n_jobs=10)
# clf = InceptionTimeClassifier(n_epochs=100,kernel_size=30,metrics=make_scorer(measurements.geometric_mean, greater_is_better=True))

print('Testing p1')
t0=time.time()
clf.fit(X_train_TEp1, y_train_TEp1)
print("\tTraining time:", round(time.time()-t0, 2), "s") # the time would be round to 3 decimal in seconds

t1=time.time()
y_pred_TEp1 = clf.predict(X_test_TEp1)
print("\tPredicting time:", round(time.time()-t1, 2), "s") # the time would be round to 3 decimal in seconds

t2=time.time()
result = evaluation(X_test_TEp1, y_test_TEp1, y_pred_TEp1, clf)
result['Experiments'] = 'TEp1'
result_DF = pd.concat([result_DF,result])
print("\tEvaluation time:", round(time.time()-t2, 2), "s") # the time would be round to 3 decimal in seconds

Testing p1
	Training time: 6411.09 s
	Predicting time: 1675.3 s
	Evaluation time: 1672.67 s


In [41]:
result_DF

Unnamed: 0,"Confusion_Matrix(tn, fp, fn, tp)",Accur,TSS,HSS,GSS,TPR,TNR,CBNPr,XMPr,FAR,POFD,f1XM,f1CBN,Experiments
0,"[74189, 113, 840, 150]",0.987343,0.149994,0.235204,0.133275,0.151515,0.998479,0.988804,0.570342,0.429658,0.001521,0.239425,0.993618,TEp5
0,"[49963, 91, 992, 173]",0.978856,0.14668,0.235704,0.133597,0.148498,0.998182,0.980532,0.655303,0.344697,0.001818,0.242127,0.989278,TEp4
0,"[40789, 269, 1115, 309]",0.967421,0.210443,0.295047,0.173053,0.216994,0.993448,0.973392,0.534602,0.465398,0.006552,0.308691,0.983318,TEp3
0,"[85863, 420, 925, 476]",0.984661,0.33489,0.407063,0.255542,0.339757,0.995132,0.989342,0.53125,0.46875,0.004868,0.414454,0.992229,TEp2
0,"[72144, 94, 1076, 179]",0.98408,0.141328,0.229592,0.129683,0.142629,0.998699,0.985305,0.655678,0.344322,0.001301,0.234293,0.991956,TEp1
