In [None]:
import collections
import heapq
import os
import sys
from datetime import datetime
from math import ceil, floor, log

import numpy as np
import pandas as pd
import pydicom
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
# from imgaug import augmenters as iaa
from scipy import ndimage
import cv2
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import efficientnet.tfkeras as efn 
from sklearn.model_selection import ShuffleSplit

# Quiet TensorFlow's log output for the rest of the notebook.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is assigned here, after `import tensorflow`
# has already executed earlier in this cell; presumably it has to be set before
# that import to affect the native-library logs -- confirm and move if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# `deprecation` lives under the private `tensorflow.python` namespace; the flag
# below is a private attribute.  NOTE(review): may not exist in every TF version.
from tensorflow.python.util import deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
# Raise the threshold of the TF v1-compat Python logger to ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Import necessary libraries for data preprocessing and modeling
# numpy - for numerical operations on arrays
# pandas - for data manipulation and analysis
# pydicom - for reading DICOM files
# os - for interacting with the operating system
# matplotlib - for data visualization
# collections - for specialized data structures
# tqdm - for progress bars during loops
# datetime - for working with dates and times
# scipy - for scientific computing and image processing
# math - for mathematical operations
# cv2 - for computer vision tasks
# tensorflow - for building and training machine learning models
# sys - for system-specific parameters and functions
# heapq - for heap-based data structures
# efficientnet - a library of efficient neural network architectures
# scikit-learn - for machine learning algorithms and model evaluation

# Define functions for data preprocessing and modeling here (not shown)

W0513 21:53:47.153772 140382116820800 core.py:46] TF Parameter Server distributed training not available.


In [None]:
# Load the training clinical features and index them by a composite ID.
tbl_raw = pd.read_csv('# The path of the training clinical features')

# ID = "<hashed_patient_ir_id>_<Hash_Key>".  Built column-wise instead of with a
# row-by-row `apply`; both parts are cast to str so a non-string key cannot
# raise on concatenation.  No in-place mutation, so re-running the cell is safe.
tbl = (
    tbl_raw
    .assign(ID=tbl_raw['hashed_patient_ir_id'].astype(str) + '_' + tbl_raw['Hash_Key'].astype(str))
    .drop(columns=['hashed_patient_ir_id', 'Hash_Key'])
    .drop_duplicates(subset=['ID'], keep='first')   # first record wins for a repeated ID
    .set_index('ID')
)
tbl.shape

(1489, 99)

In [None]:
# Load the validation clinical features and index them by a composite ID.
tbl_val_raw = pd.read_csv('# The path of the validation clinical features')

# ID = "<hashed_patient_ir_id>_<Hash_Key>".  Built column-wise instead of with a
# row-by-row `apply`; both parts are cast to str so a non-string key cannot
# raise on concatenation.  The former mid-cell `head(5)` call was dropped: only
# the last expression of a cell is displayed, so it showed nothing.
tbl_val = (
    tbl_val_raw
    .assign(ID=tbl_val_raw['hashed_patient_ir_id'].astype(str) + '_' + tbl_val_raw['Hash_Key'].astype(str))
    .drop(columns=['hashed_patient_ir_id', 'Hash_Key'])
    .drop_duplicates(subset=['ID'], keep='first')   # first record wins for a repeated ID
    .set_index('ID')
)
tbl_val.shape

(396, 99)

In [None]:
# Load the testing clinical features and index them by a composite ID.
tbl_test_raw = pd.read_csv('# The path of the testing clinical features')

# ID = "<hashed_patient_ir_id>_<Hash_Key>".  Built column-wise instead of with a
# row-by-row `apply`; both parts are cast to str so a non-string key cannot
# raise on concatenation.  The former mid-cell `head(5)` call was dropped: only
# the last expression of a cell is displayed, so it showed nothing.
tbl_test = (
    tbl_test_raw
    .assign(ID=tbl_test_raw['hashed_patient_ir_id'].astype(str) + '_' + tbl_test_raw['Hash_Key'].astype(str))
    .drop(columns=['hashed_patient_ir_id', 'Hash_Key'])
    .drop_duplicates(subset=['ID'], keep='first')   # first record wins for a repeated ID
    .set_index('ID')
)
tbl_test.shape

(413, 99)

In [1]:
def _read_tbl(df, ID, desired_size):    # here the input is "int"
    try:
        row = df.loc[ID,:]
    except:
        row = np.zeros(desired_size)
    return row

In [None]:
# Define the size of the input images
input_size = (224, 224)

# Define the paths to the directories containing the training, validation, and testing CXR features
train_images_dir = '# The path of the training CXR features'
val_images_dir = '# The path of the validation CXR features'
test_images_dir = '# The path of the testing CXR features'

# Define the training, validation, and testing clinical features DataFrames
train_fs_dir = tbl
valid_fs_dir = tbl_val
test_fs_dir = tbl_test

# Width of one clinical-feature vector.  Taken from the training table instead
# of a hard-coded 99 so it cannot drift from the data that is actually loaded.
length = train_fs_dir.shape[1]

# The generator copies whole table rows into arrays of this width, so the
# validation and testing tables must have the same number of columns.
for _split_name, _split_tbl in (('validation', valid_fs_dir), ('testing', test_fs_dir)):
    if _split_tbl.shape[1] != length:
        raise ValueError(
            'The {} feature table has {} columns, expected {}'.format(
                _split_name, _split_tbl.shape[1], length))

class DataGenerator(Sequence):
    """
    Keras ``Sequence`` serving batches of clinical-feature vectors and labels.

    Every item is a pair ``(X_fs, y)``.  ``X_fs`` has one row of ``fs_size``
    features per sample, looked up in ``fs_dir`` through ``_read_tbl`` (which
    yields zeros for an unknown ID).  ``y`` holds, per sample, the first value
    of that sample's row in ``labels``.
    """
    def __init__(self, list_IDs, labels, batch_size=16, fs_dir=train_fs_dir, fs_size=length, augment=False, shuffle=True):
        """
        Store the configuration and build the sample order for the first epoch.

        list_IDs   -- sample IDs, indexable by position; each ID must be a str
        labels     -- DataFrame indexed by those IDs; its first column is the label
        batch_size -- maximum number of samples per batch (must be >= 1)
        fs_dir     -- DataFrame of features, one row per feature-table ID
        fs_size    -- number of features per sample
        augment    -- stored for interface compatibility; not used by this class
        shuffle    -- reshuffle the sample order at the end of every epoch

        Raises ValueError when ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.fs_dir = fs_dir
        self.fs_size = fs_size
        self.augment = augment
        self.on_epoch_end()

    def __len__(self):
        """
        Denotes the number of batches per epoch.

        Rounded up so the trailing samples that do not fill a whole batch are
        still served (as one shorter batch) instead of being silently dropped.
        """
        return int(np.ceil(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """
        Generate batch number ``index``; the last batch may be shorter.
        """
        # Positions (into list_IDs) belonging to this batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Translate positions into sample IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Build the arrays
        X_fs, y = self.__data_generation(list_IDs_temp)

        return X_fs, y

    def on_epoch_end(self):
        """
        Rebuild the sample order; shuffled when ``self.shuffle`` is true.
        """
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """
        Build the feature matrix and label vector for the given sample IDs.
        """
        # Size the arrays by the actual number of IDs, so a short final batch
        # never exposes uninitialised rows from np.empty.
        n_samples = len(list_IDs_temp)
        X_fs = np.empty((n_samples, self.fs_size))
        y = np.empty(n_samples, dtype=np.float32)

        for i, ID in enumerate(list_IDs_temp):
            # The part of the sample ID before the first '-' is the key into
            # the feature table.
            ID_split = ID.split('-')
            X_fs[i] = _read_tbl(self.fs_dir, ID_split[0], self.fs_size)
            y[i] = self.labels.loc[ID].values[0]

        return X_fs, y

In [2]:
# Label CSVs for the three splits (placeholder paths).
train_xr = '# The path of the risk level labels'
val_xr = '# The path of the risk level labels'
test_xr = '# The path of the risk level labels'

# Each label table is indexed by its ID column.
df = pd.read_csv(train_xr).set_index(['ID'])
df_val = pd.read_csv(val_xr).set_index(['ID'])
df_test = pd.read_csv(test_xr).set_index(['ID'])

# Cell output: the index of the test label table.
df_test.index

In [None]:
# Batch generators for the three splits.  Only the training generator
# reshuffles between epochs; the test generator serves one sample per batch.
train_seq = DataGenerator(
    list_IDs=df.index, labels=df, batch_size=32,
    fs_dir=train_fs_dir, fs_size=length, shuffle=True,
)
val_seq = DataGenerator(
    list_IDs=df_val.index, labels=df_val, batch_size=32,
    fs_dir=valid_fs_dir, fs_size=length, shuffle=False,
)

test_seq = DataGenerator(
    list_IDs=df_test.index, labels=df_test, batch_size=1,
    fs_dir=test_fs_dir, fs_size=length, shuffle=False,
)

In [3]:
# Random-forest baseline with 600 trees, fitted on the training batch generator.
# NOTE(review): `tfdf` is not imported anywhere in the visible notebook;
# presumably `import tensorflow_decision_forests as tfdf` belongs in the import
# cell -- confirm, otherwise this cell fails on a fresh kernel.
model_1 = tfdf.keras.RandomForestModel(num_trees = 600)
model_1.fit(x=train_seq)

In [4]:
# Attach an accuracy metric, score the forest on the test generator and list
# every returned metric with four decimals.
model_1.compile(metrics=["accuracy"])
evaluation = model_1.evaluate(test_seq, return_dict=True)
print()

for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [5]:
# Training logs of the random forest: one entry per recorded number of trees.
logs = model_1.make_inspector().training_logs()
tree_counts = [entry.num_trees for entry in logs]

# Two side-by-side panels: accuracy on the left, loss on the right.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(tree_counts, [entry.evaluation.accuracy for entry in logs])
ax_accuracy.set_xlabel("Number of trees")
ax_accuracy.set_ylabel("Accuracy (out-of-bag)")

ax_loss.plot(tree_counts, [entry.evaluation.loss for entry in logs])
ax_loss.set_xlabel("Number of trees")
ax_loss.set_ylabel("Logloss (out-of-bag)")

plt.show()

In [None]:
from tensorflow.keras.initializers import GlorotNormal

# Fully connected classifier over the clinical-feature vector:
#   input(length) -> Dense(128) -> ReLU -> Dropout(0.2) -> Dense(64) -> ReLU -> softmax(3)
initializer = GlorotNormal()
fs_in = keras.layers.Input(shape=(length,))  # read the features in 
feature = keras.layers.Dense(128, kernel_initializer=initializer)(fs_in)
# Both ReLUs are now Activation layers.  The first one used to be a raw
# `keras.activations.relu` call on the symbolic tensor, which bypasses the
# layer graph and was inconsistent with the second activation below.
feature = keras.layers.Activation('relu')(feature)
feature = keras.layers.Dropout(0.2)(feature)
fusion = keras.layers.Dense(64, kernel_initializer=initializer)(feature)
fusion = keras.layers.Activation('relu')(fusion)
predictions = keras.layers.Dense(3, activation="softmax", name='fusion_last')(fusion)

model = keras.models.Model(inputs=fs_in, outputs=predictions)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Inverse-frequency weight per risk level (total count / level count) for the
# levels 1, 2 and 3.  It depends only on the training labels, so it is computed
# and printed once instead of on every run.  `Series.value_counts` replaces the
# deprecated top-level `pd.value_counts`.
classes = df.Level.value_counts()
print(classes)
n_labelled = classes[1] + classes[2] + classes[3]
class_weight = [n_labelled / classes[level] for level in (1, 2, 3)]
print(class_weight)
# `class_weight` is informational only: the fit call below keeps it commented out.

# Five consecutive training runs on the same train/validation generators.
# NOTE(review): `model` is created once outside this loop and never rebuilt, so
# each run continues from the weights the previous run ended with -- confirm
# this is intended rather than five independent runs.
for i in range(5):

    Fold = i

    # A new SGD optimizer instance is created for every run.
    # NOTE(review): the generators yield one scalar label per sample while the
    # loss is categorical cross-entropy on a 3-way softmax -- confirm the label
    # encoding matches what this loss expects.
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.SGD(lr=0.0002, momentum=0.9, nesterov=True),
                  metrics=['acc',
                           keras.metrics.AUC(),
                           keras.metrics.Precision(name='precision'),
                           keras.metrics.Recall(name='recall')])

    # NOTE(review): the literal has no `{}` field, so `.format(Fold)` returns it
    # unchanged and every run writes to the same file -- presumably a redacted
    # placeholder; confirm the real path contains a fold placeholder.
    weightpath = 'Path to save the model.hdf5'.format(Fold)

    # All three callbacks watch validation accuracy (higher is better).
    es = EarlyStopping(monitor='val_acc',
                       verbose=1,
                       patience=8,
                       min_delta=0.000001,
                       mode='max')
    mc = ModelCheckpoint(weightpath,
                         monitor='val_acc',
                         verbose=1,
                         save_best_only=True,
                         mode='max')
    rlr = ReduceLROnPlateau(monitor='val_acc',
                            mode='max',
                            factor=0.1,
                            patience=3)

    # `hist` is overwritten on every run; after the loop it holds the last run only.
    hist = model.fit_generator(train_seq, epochs=100, verbose=1, max_queue_size=1,
                               workers=1, validation_data=val_seq,
                               callbacks=[es, mc, rlr], use_multiprocessing=False)  #,  class_weight = class_weight)

In [None]:
# Accuracy per epoch for the training and validation sets, taken from `hist`.
fig, ax = plt.subplots()
ax.plot(hist.history['acc'])
ax.plot(hist.history['val_acc'])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Val'], loc='upper left')
fig.savefig('insert_path')
plt.show()

## Test

In [6]:
# Load saved weights, predict on the test generator and print a confusion
# matrix plus accuracy / recall / precision / F1 / ROC-AUC.
# `j` appears only in the printout below: both inner passes repeat the same
# load / predict / score steps.
for i in range(5):
    for j in range(2):
        # NOTE(review): the literal has no `{}` field, so `.format(i)` returns it
        # unchanged and every iteration loads the same file -- presumably a
        # redacted placeholder; confirm the real path contains a fold placeholder.
        model.load_weights('The path of the best model.hdf5'.format(i))

        Y_pred = model.predict_generator(test_seq)
        yy_pred = Y_pred  # same array object, kept for the ROC-AUC call below

        # Ground truth is read from the `Label` column of the generator's label table.
        # NOTE(review): the training cell reads `df.Level` -- confirm which
        # column name the label files actually use.
        y_true = test_seq.labels.Label.values

        from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, f1_score, roc_auc_score, precision_score
        # Round the predicted scores to the nearest integer.
        # NOTE(review): `model` ends in a 3-unit softmax, so Y_pred presumably has
        # three columns, while the metric calls below use sklearn's defaults and
        # `target_names` lists two classes -- this looks written for a binary
        # output; confirm (argmax / multi-class averaging may be needed).
        y_pred = np.round(Y_pred)
        print('Confusion Matrix')
        cm = confusion_matrix(y_true, y_pred)
        target_names = ['COVID-Neg', 'COVID-Pos']  # assigned but not used in this cell
        print(cm)

        print('i = ', i, 'j = ', j+1)
        print(accuracy_score(y_true, y_pred))
        print(recall_score(y_true, y_pred))
        print(precision_score(y_true, y_pred))
        print(f1_score(y_true, y_pred))
        print(roc_auc_score(y_true, yy_pred))
        print('**********')

In [7]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the predictions against the ground truth.
# `y_true` and `Y_pred` are not defined in this cell: they are the values left
# behind by the last iteration of the test loop.
fpr, tpr, thresholds = roc_curve(y_true, Y_pred)
# Stored under its own name so the imported `auc` function is not overwritten
# by a float.
roc_auc = auc(fpr, tpr)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(roc_auc))
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.title('ROC curve')
plt.legend(loc='lower right')
plt.show()