#Set up CNN environment

In [70]:
import tensorflow as tf
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

2.4.0
Num GPUs Available:  1


In [7]:
!unzip "/content/drive/MyDrive/IML_CXR.zip" -d "/content"

Archive:  /content/drive/MyDrive/IML_CXR.zip
replace /content/IML_CXR/1.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [71]:
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau
from random import randint
from scipy.ndimage.interpolation import rotate

# Load required data for CNN

In [193]:
# Open the image form working directory
Outcomes = pd.read_csv("cxr_label_train.csv")
Outcomes = Outcomes.sort_values('PATIENT ID')
X = np.array([np.array(Image.open('./IML_CXR/' + str(fname) + '.jpg'), dtype='float32') for fname in Outcomes['PATIENT ID']])
Y = Outcomes['hospital_outcome']
Y = to_categorical(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

rotate_X_train = np.zeros(X_train.shape)

for i in range(len(X_train)):
        rotate_X_train[i] = rotate(X_train[i], angle=randint(0, 90), reshape=False)

X_train = np.concatenate((X_train, rotate_X_train), axis=0)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
seed = randint(0, 10000)
np.random.seed(seed)  
np.random.shuffle(X_train)

Y_train = np.concatenate((Y_train, Y_train), axis=0)
np.random.seed(seed)
np.random.shuffle(Y_train)

X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

# Build CNN model

In [194]:
model = Sequential()
# convolutional layer
model.add(Conv2D(16, kernel_size=(3,3), strides=(1,1), padding='valid', activation='relu', input_shape=(320,320, 1)))
model.add(MaxPool2D(pool_size=(3, 3)))
model.add(Conv2D(32, kernel_size=(3,3), padding='valid', activation='relu'))
model.add(MaxPool2D(pool_size=(3, 3)))
model.add(Conv2D(64, kernel_size=(3,3), padding='valid', activation='relu'))
model.add(MaxPool2D(pool_size=(3, 3)))
model.add(Conv2D(128, kernel_size=(3,3), padding='valid', activation='relu'))
model.add(MaxPool2D(pool_size=(3, 3)))

# flatten output of conv
model.add(Flatten())
# hidden layer
model.add(Dense(128, activation='relu'))
# output layer
model.add(Dense(2, activation='softmax'))

# compiling the sequential model
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_80 (Conv2D)           (None, 318, 318, 16)      160       
_________________________________________________________________
max_pooling2d_80 (MaxPooling (None, 106, 106, 16)      0         
_________________________________________________________________
conv2d_81 (Conv2D)           (None, 104, 104, 32)      4640      
_________________________________________________________________
max_pooling2d_81 (MaxPooling (None, 34, 34, 32)        0         
_________________________________________________________________
conv2d_82 (Conv2D)           (None, 32, 32, 64)        18496     
_________________________________________________________________
max_pooling2d_82 (MaxPooling (None, 10, 10, 64)        0         
_________________________________________________________________
conv2d_83 (Conv2D)           (None, 8, 8, 128)       

# Training

In [195]:
# training the model for 20 epochs
model.fit(X_train, Y_train, batch_size=16, epochs=20, validation_data=(X_train, Y_train))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f278b0804a8>

# Check model performance

In [196]:
Y_pred = model.predict(X_test)
#print(Y_pred)
for i in range(len(Y_pred)):
    if Y_pred[i][1] > 0.3:
        Y_pred[i][1] = 1
    else:
        Y_pred[i][1] = 0
TN, FN, TP, FP = 0, 0, 0, 0
for i in range(len(Y_test)):
        if Y_test[i][1] == 0 and Y_pred[i][1] == 0:
                TN += 1
        if Y_test[i][1] == 1 and Y_pred[i][1] == 0:
                FN += 1
        if Y_test[i][1] == 0 and Y_pred[i][1] == 1:
                FP += 1
        if Y_test[i][1] == 1 and Y_pred[i][1] == 1:
                TP += 1
print("TN:", TN, ", FN:", FN, ", TP:", TP, ", FP:", FP)
precision, recall = (TP/(FP+TP)), (TP/(FN+TP))
print("precision:", precision, ", recall:", recall)
print('F1:',  2 * ((precision*recall)/(precision+recall)))

X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
Y_pred = model.predict(X)
CXR1 = Y_pred[:, 0]
CXR2 = Y_pred[:, 1]

TN: 334 , FN: 40 , TP: 13 , FP: 31
precision: 0.29545454545454547 , recall: 0.24528301886792453
F1: 0.26804123711340205


In [None]:
!unzip "/content/drive/MyDrive/IML_CXR_TEST.zip" -d "/content"

# Generate test data's CXR outputs

In [197]:
Data_list=[x.split('.')[0] for x in os.listdir('IML_CXR_TEST')]
for i in range(len(Data_list)): 
    Data_list[i] = int(Data_list[i]) 
Data_list.sort()
Test_data = np.array([np.array(Image.open('./IML_CXR_TEST/' + str(fname) + '.jpg'), dtype='float32') for fname in Data_list])
Test_data = Test_data.reshape(Test_data.shape[0], Test_data.shape[1], Test_data.shape[2], 1)

In [198]:
Y_pred = model.predict(Test_data)
# for i in range(len(Y_pred)):
#     if Y_pred[i][1] > 0.3:
#         Y_pred[i][1] = 1
#     else:
#         Y_pred[i][1] = 0
Test_CXR1 = Y_pred[:, 0]
Test_CXR2 = Y_pred[:, 1]

# Treat CXR outputs as one kind of attributes

In [205]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

Attributes = pd.read_csv("hm_hospitales_covid_structured_30d_train.csv", na_values=0, na_filter=True)
Outcomes = pd.read_csv("cxr_label_train.csv")
Data = Attributes

Data.loc[Data['sex'] == 'FEMALE', 'sex'] = 0
Data.loc[Data['sex'] == 'MALE', 'sex'] = 1
Data.loc[Data['ed_diagnosis'] == 'sx_breathing_difficulty', 'ed_diagnosis'] = 1
Data.loc[Data['ed_diagnosis'] == 'sx_others', 'ed_diagnosis'] = 2
Data.loc[Data['ed_diagnosis'] == 'sx_flu', 'ed_diagnosis'] = 3
Data.loc[Data['ed_diagnosis'] == 'sx_fever', 'ed_diagnosis'] = 4
Data.loc[Data['ed_diagnosis'] == 'sx_cough', 'ed_diagnosis'] = 5

Data = Data.sort_values('PATIENT ID')
Outcomes = Outcomes.sort_values('PATIENT ID')
Data = Data.fillna(Data.mode().iloc[0])
Data = Data[Data['PATIENT ID'].isin(Outcomes['PATIENT ID'])]
print(Data)
Data = Data.drop(labels=['PATIENT ID', 'admission_datetime'], axis='columns')
Data['CXR1'] = CXR1
Data['CXR2'] = CXR2
X = Data
#X = Data.loc[Data['PATIENT ID'] == Outcomes_CXR['PATIENT ID']]
#X = Data

#X = X.to_numpy()
Y = Outcomes.drop(labels='PATIENT ID', axis='columns')
Y = Y.to_numpy()
Y = Y.ravel()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)


      PATIENT ID  age  sex  ... lab_lymphocyte  lab_rdw  lab_hemoglobin
1029           1   85    1  ...           0.58     14.6            13.5
1261          47   55    1  ...           1.42     13.9            14.3
226           48   70    0  ...           0.92     13.6            13.6
167           49   85    1  ...           0.50     15.6             7.1
895           50   39    1  ...           0.99     12.6            16.0
...          ...  ...  ...  ...            ...      ...             ...
1183        2561   93    0  ...           0.88     13.2            13.9
98          2562   64    1  ...           0.88     13.2            13.9
1264        2563   58    1  ...           0.39     11.7            10.9
1120        2565   92    0  ...           1.37     13.2            12.4
284         2571   88    0  ...           0.43     10.2            10.2

[1393 rows x 48 columns]


# Random Forest training

In [206]:
clf = RandomForestClassifier(min_samples_leaf= 5, n_estimators= 300)
Y_pred_prob = clf.fit(X_train, Y_train).predict_log_proba(X_test)
Y_pred = clf.predict(X_test)
print(Y_pred)
for i in range(0, Y_pred.shape[0]):
        if Y_pred_prob[i][1] > -1.1:
                Y_pred[i] = 1
        else:
                Y_pred[i] = 0
        #print(Y_pred_prob[i][0], Y_pred_prob[i][1], Y_pred[i], Y_test[i])
TN, FN, TP, FP = 0, 0, 0, 0
for i in range(len(Y_test)):
        if Y_test[i] == 0 and Y_pred[i] == 0:
                TN += 1
        if Y_test[i] == 1 and Y_pred[i] == 0:
                FN += 1
        if Y_test[i] == 0 and Y_pred[i] == 1:
                FP += 1
        if Y_test[i] == 1 and Y_pred[i] == 1:
                TP += 1

print("TN:", TN, ", FN:", FN, ", TP:", TP, ", FP:", FP)
precision, recall = (TP/(FP+TP)), (TP/(FN+TP))
print("precision:", precision, ", recall:", recall)
print('F1:',  2 * ((precision*recall)/(precision+recall)))




[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]
TN: 360 , FN: 15 , TP: 34 , FP: 9
precision: 0.7906976744186046 , recall: 0.6938775510204082
F1: 0.7391304347826086


In [207]:
Attributes = pd.read_csv("fixed_test.csv", na_values=0, na_filter=True)

Data = Attributes
Data = Data.sort_values('PATIENT ID')
Data.loc[Data['sex'] == 'FEMALE', 'sex'] = 0
Data.loc[Data['sex'] == 'MALE', 'sex'] = 1
Data.loc[Data['ed_diagnosis'] == 'sx_breathing_difficulty', 'ed_diagnosis'] = 1
Data.loc[Data['ed_diagnosis'] == 'sx_others', 'ed_diagnosis'] = 2
Data.loc[Data['ed_diagnosis'] == 'sx_flu', 'ed_diagnosis'] = 3
Data.loc[Data['ed_diagnosis'] == 'sx_fever', 'ed_diagnosis'] = 4
Data.loc[Data['ed_diagnosis'] == 'sx_cough', 'ed_diagnosis'] = 5
Data = Data.fillna(Data.mode().iloc[0])
Data = Data[Data['PATIENT ID'].isin(Data_list)]
Data = Data.drop(labels=['PATIENT ID', 'admission_datetime'], axis='columns')

Data['CXR1'] = Test_CXR1
Data['CXR2'] = Test_CXR2
X = Data

Y_pred_prob = clf.predict_log_proba(X)
Y_pred = clf.predict(X)
for i in range(0, Y_pred.shape[0]):
        if Y_pred_prob[i][1] > -1.1:
                Y_pred[i] = 1
        else:
                Y_pred[i] = 0
Outcomes = pd.DataFrame(Data_list, columns = ['PATIENT ID']) 
Outcomes['hospital_outcome'] = Y_pred.astype(int)

Outcomes.to_csv('Bonus_107062338.csv', index=False) #output prediction

  return np.log(proba)
