In [1]:
import numpy as np
import pandas as pd
from scipy.io import wavfile
import tensorflow as tf
import glob, os
import random
import shutil

random.seed(42)

2022-11-21 22:02:00.997132: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-21 22:02:01.301773: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Optional TF setting
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [101]:
data_folder = "dataset/training_data"
txt_files = glob.glob(data_folder + "/*.txt")

data = []
for file in txt_files:
    temp = {}
    with open(file, "r") as f:
        for line in f:
            if line[0].isdigit():
                temp['id'] = line.split(" ")[0].strip()
            if line.startswith("#Age:"):
                temp['age'] = line.split(" ")[-1].strip()
            if line.startswith("#Sex:"):
                temp['sex'] = line.split(" ")[-1].strip()
            if line.startswith("#Height:"):
                temp['height'] = float(line.split(" ")[-1].strip())
            if line.startswith("#Weight:"):
                temp['weight'] = float(line.split(" ")[-1].strip())
            if line.startswith("#Pregnancy"):
                temp['preg'] = line.split(" ")[-1].strip() == "True"
    data.append(temp)

In [102]:
df = pd.DataFrame.from_dict(data)
df = df.replace("nan", np.nan)
df

Unnamed: 0,id,age,sex,height,weight,preg
0,84743,Child,Male,107.0,18.600,False
1,84978,Child,Male,102.0,18.400,False
2,50751,Child,Female,116.0,19.100,False
3,84784,Neonate,Female,47.0,2.816,False
4,68888,Child,Male,,,False
...,...,...,...,...,...,...
937,68204,Adolescent,Male,165.0,48.000,False
938,49808,Child,Male,121.0,24.200,False
939,68436,Child,Female,133.0,24.900,False
940,50231,Adolescent,Male,143.0,40.400,False


In [104]:
df.loc[df['id'] == '50164']

Unnamed: 0,id,age,sex,height,weight,preg
634,50164,Adolescent,Female,,,False


In [5]:
df['weight'].describe()

count    837.000000
mean      23.632756
std       15.453337
min        2.300000
25%       12.500000
50%       20.400000
75%       31.200000
max      110.800000
Name: weight, dtype: float64

In [105]:
# Special type: Female, Adult null values --> Female, Adolescent mean values
adole_height = df[(df["sex"] == "Female") & (df["age"] == "Adolescent")]['height'].mean()
adole_weight = df[(df["sex"] == "Female") & (df["age"] == "Adolescent")]['weight'].mean()
df.loc[(df["sex"] == "Female") & (df["age"] == "Adolescent") & (df["height"].isna()), "height"] = adole_height.round(1)
df.loc[(df["sex"] == "Female") & (df["age"] == "Adolescent") & (df["weight"].isna()), "weight"] = adole_weight.round(1)

df.loc[df["preg"] == True, "age"] = "Adult"
df.loc[(df["sex"] == "Female") & (df["age"] == "Adult") & (df["height"].isna()), "height"] = adole_height.round(1)
df.loc[(df["sex"] == "Female") & (df["age"] == "Adult") & (df["weight"].isna()), "weight"] = adole_weight.round(1)

# Male, Infant null values --> mean values
infant_height = df[(df["sex"] == "Male") & (df["age"] == "Infant")]['height'].mean()
infant_weight = df[(df["sex"] == "Male") & (df["age"] == "Infant")]['weight'].mean()
df.loc[(df["sex"] == "Male") & (df["height"].isna()), "height"] = infant_height.round(1)
df.loc[(df["sex"] == "Male") & (df["weight"].isna()), "weight"] = infant_weight.round(1)

# General Type: just mean values
typeset = [
    ("Male", "Infant"), ("Female", "Infant"),
    ("Male", "Child"), ("Female", "Child"), 
    ("Male", "Adolescent"),
]

for t in typeset:
    t_height = df[(df["sex"] == t[0]) & (df["age"] == t[1])]['height'].mean()
    t_weight = df[(df["sex"] == t[0]) & (df["age"] == t[1])]['weight'].mean()
    df.loc[(df["sex"] == t[0]) & (df["age"] == t[1]) & (df["height"].isna()), "height"] = t_height.round(1)
    df.loc[(df["sex"] == t[0]) & (df["age"] == t[1]) & (df["weight"].isna()), "weight"] = t_weight.round(1)

# No age Type: Manual Setup
df.loc[(df["id"] == "50819"), "age"] = "Child"
df.loc[(df["id"] == "85113"), "age"] = "Child"
df.loc[(df["id"] == "50734"), "age"] = "Adult"
df.loc[(df["id"] == "85219"), "age"] = "Child"

# Error on base data: Manual Setup
df.loc[(df["id"] == "50797"), "age"] = "Adolescent"

In [106]:
#['Infant', 'Child', 'Adolescent', 'Adult', 'Neonate']
#['Male', 'Female']

age_classes = ['Neonate', 'Infant', 'Child', 'Adolescent', 'Adult']
sex_classes = ['Male', 'Female']

df['age'] = df['age'].apply(lambda x: age_classes.index(x))
df = pd.get_dummies(df, columns = ['age'])

#df['height'] = df['height'] / df['height'].max()
#df['weight'] = df['weight'] / df['weight'].max()
df['height'] = df['height'] / 200
df['weight'] = df['weight'] / 150
df['sex'] = df['sex'].apply(lambda x: sex_classes.index(x))
df['preg'] = df['preg'].apply(lambda x: 1 if x == True else 0)

demo_df = df
demo_df

Unnamed: 0,id,sex,height,weight,preg,age_0,age_1,age_2,age_3,age_4
0,84743,0,0.5350,0.124000,0,0,0,1,0,0
1,84978,0,0.5100,0.122667,0,0,0,1,0,0
2,50751,1,0.5800,0.127333,0,0,0,1,0,0
3,84784,1,0.2350,0.018773,0,1,0,0,0,0
4,68888,0,0.3165,0.052667,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
937,68204,0,0.8250,0.320000,0,0,0,0,1,0
938,49808,0,0.6050,0.161333,0,0,0,1,0,0
939,68436,1,0.6650,0.166000,0,0,0,1,0,0
940,50231,0,0.7150,0.269333,0,0,0,0,1,0


In [108]:
import os, glob
from scipy.io import wavfile

class Preprocessor():
    def __init__(self, data_folder, output_folder, classes = [], trimming=True):
        self.data_folder = data_folder
        self.output_folder = output_folder
        self.list_ids = []
        self.max_length = 0
        self.classes = classes
        self.labels = {}
        self.trimming = trimming
        
        shutil.rmtree(self.output_folder, ignore_errors=False)
        os.makedirs(self.output_folder)
        
    def __process_tsv(self, tsv_file, murmur):        
        root, ext = os.path.splitext(tsv_file)
        pid = root.split("/")[-1]
        frequency, recording = wavfile.read(root + '.wav')
        
        if self.trimming:
            tsv_data = []
            with open(tsv_file, "r") as f:
                for line in f:
                    tsv_data.append(line.strip().split('\t'))

            tsv_starts = None
            tsv_ends = None

            for i in range(len(tsv_data)):
                if not tsv_data[i][2] == 0:
                    tsv_start = float(tsv_data[i][0])
                    break
            for i in range(1, len(tsv_data)+1):
                if not tsv_data[-i][2] == 0:
                    tsv_ends = float(tsv_data[-i][1])
                    break

            frame_starts = int(frequency * tsv_start)
            frame_ends = int(frequency * tsv_ends)
            output_record = recording[frame_starts:frame_ends]
            output_length = len(output_record)
        else:
            output_length = len(recording)

        if self.max_length < output_length:
            self.max_length = output_length
            
        if murmur not in self.classes:
            self.classes.append(murmur)
            
        self.list_ids.append(pid)
        self.labels[pid.split("_")[0]] = self.classes.index(murmur)
        
        output_npy = self.output_folder + "/" + pid + ".npy"
        np.save(output_npy, output_record)
        
    def __process_txt(self, txt_file):
        root, ext = os.path.splitext(txt_file)
        dirs = "/".join(root.split("/")[:-1])
        pid = root.split("/")[-1]
        
        tsv_files = []
        pos = ['AV', 'MV', 'TV', 'PV', 'Phc']
        murmur = None
        
        with open(txt_file, "r") as f:
            for line in f:
                splitted = line.strip().split(" ")
                if splitted[0] in pos:
                    for i in splitted[1:]:
                        if i.endswith(".tsv"):
                            tsv_files.append(dirs + "/" + i)
                elif splitted[0].startswith("#Murmur:"):
                    murmur = splitted[1]
        
        for file in tsv_files:
            self.__process_tsv(file, murmur)
                    
    def process(self):
        for f in glob.glob(self.data_folder + "/*.txt"):
            self.__process_txt(f)

In [109]:
data_folder = "dataset/training_data"
output_folder = "trimmed_npy"
os.makedirs(output_folder, exist_ok=True)
classes = ['Present', 'Unknown', 'Absent',]

In [110]:
pp = Preprocessor(data_folder, output_folder, classes)
pp.process()

In [111]:
class DataSplitter():
    def __init__(self, data_folder, output_folder, list_ids, labels, window_size, classes=[]):
        self.data_folder = data_folder
        self.output_folder = output_folder
        self.window_size = window_size
        self.classes = classes
        self.list_ids = list_ids
        self.labels = labels
        self.output_list_ids = []
        self.output_labels = {}
        
        shutil.rmtree(self.output_folder, ignore_errors=False)
        os.makedirs(self.output_folder)
        
    def __process_npy(self, npy_file):
        root, ext = os.path.splitext(npy_file)
        pid = root.split("/")[-1]
        data = np.load(npy_file)
        
        for i in range(data.shape[0]//(self.window_size//2)):
            splitted = data[i*(self.window_size//2):(i+2)*(self.window_size//2)]
            output_npy = self.output_folder + "/" + pid + "_" + str(i) + ".npy"
            np.save(output_npy, splitted)
            self.output_list_ids.append(pid + "_" + str(i))
     
    def process(self):
        for f in glob.glob(self.data_folder + "/*.npy"):
            self.__process_npy(f)

In [112]:
data_folder = "trimmed_npy"
output_folder = "splitted_npy"
os.makedirs(output_folder, exist_ok=True)
window_size = 4000

In [113]:
ds = DataSplitter(data_folder, output_folder, pp.list_ids, pp.labels, window_size, classes)
ds.process()

In [114]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_folder, list_IDs, demo_df, labels, batch_size, dim, n_classes, shuffle=True):
        self.data_folder = data_folder
        self.dim = dim
        self.batch_size = batch_size
        self.demo_df = demo_df
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        X, y = self.__data_generation(list_IDs_temp)
        
        return X, y

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        X = np.empty((len(list_IDs_temp), *self.dim))
        d = np.empty((len(list_IDs_temp),9))
        y = np.empty(len(list_IDs_temp), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            #frequency, recording = sp.io.wavfile.read(self.data_folder + "/" + ID + '.wav')
            X[i,] = np.zeros(dim)
            signal = np.load(data_folder + "/" + ID + ".npy").reshape(-1, 1)
            signal = signal / np.max(abs(signal))
            #offset = random.randrange(0, signal.shape[0]-dim[0])
            #X[i,:signal.shape[0],:signal.shape[1]] = signal[offset:offset+dim[0],:]
            X[i,:signal.shape[0],:signal.shape[1]] = signal
            y[i] = self.labels[ID.split("_")[0]]

            d[i,:] = np.array(demo_df.loc[demo_df['id'] == ID.split("_")[0]].values[0][1:]).astype(float).reshape(-1)
            
        return (X, d), y


In [115]:
data_folder = "splitted_npy"
list_ids = ds.output_list_ids
labels = ds.labels
batch_size = 32
max_length = ds.window_size
n_channels = 1
dim = (max_length, n_channels)
n_classes = len(pp.classes)

In [116]:
unique_ids = list(set([id.split("_")[0] for id in list_ids]))
random.shuffle(unique_ids)

In [117]:
sp1 = 0.6
sp2 = 0.8
train_unique = unique_ids[:int(len(unique_ids)*sp1)]
val_unique = unique_ids[int(len(unique_ids)*sp1):int(len(unique_ids)*sp2)]
test_unique = unique_ids[int(len(unique_ids)*sp2):]

In [118]:
train_ids = [ id for id in list_ids if id.split("_")[0] in train_unique ]
val_ids = [ id for id in list_ids if id.split("_")[0] in val_unique ]
test_ids = [ id for id in list_ids if id.split("_")[0] in test_unique ]

In [119]:
training_generator = DataGenerator(data_folder, train_ids, demo_df, labels, batch_size, dim, n_classes)
validation_generator = DataGenerator(data_folder, val_ids, demo_df, labels, batch_size, dim, n_classes)
test_generator = DataGenerator(data_folder, test_ids, demo_df, labels, 1, dim, n_classes, shuffle=False)

In [120]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Reshape, Conv1D, MaxPooling1D, LSTM, Bidirectional, Dense, Activation, Dropout, GaussianNoise, BatchNormalization, Flatten, AveragePooling1D, GlobalAveragePooling1D, Multiply
from tensorflow.keras.optimizers import Adam

In [121]:
from tensorflow.keras.backend import expand_dims

In [122]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Activation, Dense, BatchNormalization, Concatenate
from tensorflow.keras.layers import Add, Reshape, Multiply
import tensorflow.keras.backend as K

def conv1d_bn(x, filters, kernel_size, padding='same', strides=1, activation='relu'):
    x = Conv1D(filters, kernel_size, kernel_initializer='he_normal', padding=padding, strides=strides)(x)
    x = BatchNormalization()(x)
    if activation:
        x = Activation(activation)(x)
    
    return x

def SE_block(input_tensor, reduction_ratio=16):
    ch_input = K.int_shape(input_tensor)[-1]
    ch_reduced = ch_input//reduction_ratio
    
    # Squeeze
    x = GlobalAveragePooling1D()(input_tensor) # Eqn.2
    
    # Excitation
    x = Dense(ch_reduced, kernel_initializer='he_normal', activation='relu', use_bias=False)(x)
    x = Dense(ch_input, kernel_initializer='he_normal', activation='sigmoid', use_bias=False)(x)
    
    x = Reshape( (1, ch_input) )(x)
    x = Multiply()([input_tensor, x]) # Eqn.4
    
    return x
   
def SE_residual_block(input_tensor, filter_sizes, strides=1, reduction_ratio=16):
    filter_1, filter_2, filter_3 = filter_sizes
    
    x = conv1d_bn(input_tensor, filter_1, 1, strides=strides)
    x = conv1d_bn(x, filter_2, 3)
    x = conv1d_bn(x, filter_3, 1, activation=None)
    
    x = SE_block(x, reduction_ratio)
    
    projected_input = conv1d_bn(input_tensor, filter_3, 1, strides=strides, activation=None) if K.int_shape(input_tensor)[-1] != filter_3 else input_tensor
    shortcut = Add()([projected_input, x])
    shortcut = Activation(activation='relu')(shortcut)
    
    return shortcut
 

def stage_block(input_tensor, filter_sizes, blocks, reduction_ratio=16, stage=''):
    strides = 2 if stage != '2' else 1
    
    x = SE_residual_block(input_tensor, filter_sizes, strides, reduction_ratio)

    for i in range(blocks-1):
        x = SE_residual_block(x, filter_sizes, reduction_ratio=reduction_ratio)
    
    return x
    

def SE_ResNet50(model_input, demo_input, classes=3):
    stage_1 = conv1d_bn(model_input, 64, 7, strides=2, padding='same')
    stage_1 = MaxPooling1D(3, strides=2, padding='same')(stage_1)
    
    stage_2 = stage_block(stage_1, [64, 64, 256], 3, reduction_ratio=16, stage='2')
    stage_3 = stage_block(stage_2, [128, 128, 512], 4, reduction_ratio=16, stage='3')
    stage_4 = stage_block(stage_3, [256, 256, 1024], 6, reduction_ratio=16, stage='4')
    stage_5 = stage_block(stage_4, [512, 512, 2048], 3, reduction_ratio=16, stage='5')

    gap = GlobalAveragePooling1D()(stage_5)
    
    gap_demo = Concatenate()([gap, demo_input])
    model_output = Dense(classes, activation='softmax', kernel_initializer='he_normal')(gap_demo)
    
    model = Model(inputs=(model_input, demo_input), outputs=model_output, name='SE-ResNet50')
        
    return model

In [123]:
from tensorflow.keras import Model, Input
model_input = Input( shape=(4000,1) )
demo_input = Input( shape=(9,) )
model = SE_ResNet50(model_input, demo_input, 3)

In [124]:
from tensorflow.keras import backend as K
from datetime import datetime
now = datetime.now()

batch_size = 32

stopping = tf.keras.callbacks.EarlyStopping(patience=8)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.1,
    patience=2,
    min_lr=0.001 * 0.001)

MAX_EPOCHS = 300

optimizer = Adam(learning_rate=0.001, clipnorm=1.)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'],)

In [125]:
model.fit(
    training_generator,
    validation_data=validation_generator,
    epochs=MAX_EPOCHS,
    callbacks=[reduce_lr, stopping, ],
    #workers=3,
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300


<keras.callbacks.History at 0x7ff5e00b50c0>

In [126]:
from sklearn import metrics
probs = model.predict(test_generator)
pred = np.argmax(probs, axis=1)
labels = [test_generator.labels[i.split("_")[0]] for i in test_generator.list_IDs]
print(metrics.confusion_matrix([a for a in labels], [p for p in pred]))
print("F1:",metrics.f1_score([a for a in labels], [p for p in pred], average="macro"))
print("Weighted F1:",metrics.f1_score([a for a in labels], [p for p in pred], average='weighted'))
print(metrics.classification_report([a for a in labels], [p for p in pred]))

[[ 2444    94  2775]
 [   79    64  1109]
 [  946   152 21213]]
F1: 0.5111505016961123
Weighted F1: 0.7974152472615093
              precision    recall  f1-score   support

           0       0.70      0.46      0.56      5313
           1       0.21      0.05      0.08      1252
           2       0.85      0.95      0.89     22311

    accuracy                           0.82     28876
   macro avg       0.59      0.49      0.51     28876
weighted avg       0.79      0.82      0.80     28876



In [127]:
test_result = {}
for i in range(len(probs)):
    if not test_generator.list_IDs[i].split("_")[0] in test_result:
        test_result[test_generator.list_IDs[i].split("_")[0]] = []
    test_result[test_generator.list_IDs[i].split("_")[0]].append(pred[i])

answer = []
for i in test_result.keys():
    answer.append(test_result[i])
labels = [test_generator.labels[i.split("_")[0]] for i in test_result.keys()]

for i in zip(labels, answer):
    if not i[0] == np.bincount(i[1]).argmax():
        print(i)

(1, [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
(0, [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [128]:
for r in np.arange(0.02, 0.32, 0.02):
    test_result = {}
    for i in range(len(probs)):
        if not test_generator.list_IDs[i].split("_")[0] in test_result:
            test_result[test_generator.list_IDs[i].split("_")[0]] = []
        test_result[test_generator.list_IDs[i].split("_")[0]].append(pred[i])

    answer = []
    for i in test_result.keys():
        if (np.array(test_result[i]) == 1).sum() / len(test_result[i]) > r:
            answer.append(1)
        elif (np.array(test_result[i]) == 0).sum() / len(test_result[i]) > r:
            answer.append(0)
        else:
            answer.append(2)
    labels = [test_generator.labels[i.split("_")[0]] for i in test_result.keys()]

    cf_values = metrics.confusion_matrix([a for a in labels], [p for p in answer]).ravel()
    pc_metric = (5*cf_values[0]+3*cf_values[4]+cf_values[8]) /\
    (5*(cf_values[0]+cf_values[1]+cf_values[2])+3*(cf_values[3]+cf_values[4]+cf_values[5])+(cf_values[6]+cf_values[7]+cf_values[8]))
    print(r, pc_metric)

0.02 0.559228650137741
0.04 0.6859504132231405
0.06 0.721763085399449
0.08 0.743801652892562
0.1 0.7713498622589532
0.12000000000000001 0.7520661157024794
0.13999999999999999 0.7603305785123967
0.16 0.7658402203856749
0.18 0.7493112947658402
0.19999999999999998 0.7493112947658402
0.22 0.7382920110192838
0.24 0.7520661157024794
0.26 0.7548209366391184
0.28 0.7575757575757576
0.30000000000000004 0.7575757575757576


In [129]:
test_result = {}
for i in range(len(probs)):
    if not test_generator.list_IDs[i].split("_")[0] in test_result:
        test_result[test_generator.list_IDs[i].split("_")[0]] = []
    test_result[test_generator.list_IDs[i].split("_")[0]].append(pred[i])

answer = []
for i in test_result.keys():
    if (np.array(test_result[i]) == 1).sum() / len(test_result[i]) > 0.15:
        answer.append(1)
    elif (np.array(test_result[i]) == 0).sum() / len(test_result[i]) > 0.15:
        answer.append(0)
    else:
        answer.append(2)
labels = [test_generator.labels[i.split("_")[0]] for i in test_result.keys()]

print(metrics.confusion_matrix([a for a in labels], [p for p in answer]))
print("F1:",metrics.f1_score([a for a in labels], [p for p in answer], average="macro"))
print("Weighted F1:",metrics.f1_score([a for a in labels], [p for p in answer], average='weighted'))
print(metrics.classification_report([a for a in labels], [p for p in answer]))

[[ 27   2   9]
 [  0   3   8]
 [  5   2 133]]
F1: 0.6740010946907499
Weighted F1: 0.8539404522984162
              precision    recall  f1-score   support

           0       0.84      0.71      0.77        38
           1       0.43      0.27      0.33        11
           2       0.89      0.95      0.92       140

    accuracy                           0.86       189
   macro avg       0.72      0.64      0.67       189
weighted avg       0.85      0.86      0.85       189



In [130]:
cf_values = metrics.confusion_matrix([a for a in labels], [p for p in answer]).ravel()
pc_metric = (5*cf_values[0]+3*cf_values[4]+cf_values[8]) /\
(5*(cf_values[0]+cf_values[1]+cf_values[2])+3*(cf_values[3]+cf_values[4]+cf_values[5])+(cf_values[6]+cf_values[7]+cf_values[8]))
print(pc_metric)

0.7630853994490359
