<a href="https://colab.research.google.com/github/alicewoo0925/pneumonia-classifier/blob/main/pneumonia_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [None]:
# set up the environment
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import cv2


# import tensorflow and keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras.backend as K
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

# import skimage
from skimage.color import rgb2gray
from skimage import io

In [3]:
# using Google Colaboratory
from google.colab import files
from google.colab import drive

# link to Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load image data

In [None]:
train_paths = glob.glob('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/chest_xray_small_dataset/chest_xray/train/**/*.jpeg')
test_paths = glob.glob('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/chest_xray_small_dataset/chest_xray/test/**/*.jpeg')

In [None]:
train_images = []
test_images = []
# read image files
for image_path in train_paths:
  img = io.imread(image_path)
  img_gray = rgb2gray(img)*256
  train_images.append(img_gray)

for image_path in test_paths:
  img = io.imread(image_path)
  img_gray = rgb2gray(img)*256
  test_images.append(img_gray)

  
  # This is added back by InteractiveShellApp.init_path()


In [None]:
print(len(train_images))
print(len(test_images))

1327
624


# Extract features using CNN

In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

In [None]:
# load a resnet model without the prediction layers at the end
resnet50_tl_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

In [None]:
'''
this function extracts image features using ResNet50
'''
def getCNNFeatures(image_paths):
  all_image_features = []
  for img_path in image_paths:
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    image_features = resnet50_tl_model.predict(x)
    all_image_features.append(image_features)

  # convert to np feature matrix
  features = np.concatenate(all_image_features, axis=0)
  return features

In [None]:
train_features_cnn = getCNNFeatures(train_paths)
test_features_cnn = getCNNFeatures(test_paths)

In [None]:
print(train_features_cnn.shape)
print(test_features_cnn.shape)

(1327, 2048)
(624, 2048)


In [None]:
# save this feature set for future use
np.savetxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/train_features_cnn.csv',train_features_cnn,delimiter=',')
np.savetxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/test_features_cnn.csv',test_features_cnn,delimiter=',')

# Extract features using first order statistics and GLCM

In [None]:
from scipy import stats
from skimage.feature import greycomatrix, greycoprops

In [None]:
'''
a function to obtain first order statistics
'''
def getFirstOrderStats(img):

  # obtain the values
  entropy = stats.entropy(img,axis=None)
  img_stats = stats.describe(img,axis=None)

  first_order_stats = np.array([img_stats.mean,
                                img_stats.minmax[0],
                                img_stats.minmax[1],
                                img_stats.variance,
                                img_stats.skewness,
                                img_stats.kurtosis,
                                entropy]) 

  return first_order_stats

In [None]:
'''
a function to obtain texture features
'''
def getGLCM(img):

  # create a binned image
  FACTOR = np.max(img) / (256-1) 
  binned = img // FACTOR 

  # ensure the image is of an integer type
  integer_img = binned.astype(int)

  # compute the matrix
  glcm = greycomatrix(integer_img,
                      distances=[1],
                      angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
                      levels=256,
                      normed=True,
                      symmetric=True)
  # define the features
  features = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']

  # using for loop to compute the features
  for feature in features:
    output = (greycoprops(glcm, feature).flatten())
    

  texture_features = np.array([])
  texture_features = np.insert(texture_features, [0], greycoprops(glcm, "contrast").flatten()) 
  texture_features = np.insert(texture_features, [-1], greycoprops(glcm, "dissimilarity").flatten())
  texture_features = np.insert(texture_features, [-1], greycoprops(glcm, "homogeneity").flatten())
  texture_features = np.insert(texture_features, [-1], greycoprops(glcm, "energy").flatten())
  texture_features = np.insert(texture_features, [-1], greycoprops(glcm, "correlation").flatten())
  texture_features = np.insert(texture_features, [-1], greycoprops(glcm, "ASM").flatten())

  return texture_features

In [None]:
# extract the features of train set
all_features = []

for img in train_images: 
  first_order_stats = getFirstOrderStats(img) 
  texture_features = getGLCM(img) 
 
  # merge features into one vector
  image_features = np.array([])
  image_features = np.insert(image_features, [0], first_order_stats.flatten())  
  image_features = np.insert(image_features, [-1], texture_features.flatten())    
 
  all_features.append(image_features) # put into the list

# transform the list into a 2D np array
train_features = np.vstack(all_features)

In [None]:
# extract the features of test set
all_features = []

for image in test_images: 
  first_order_stats = getFirstOrderStats(image) 
  texture_features = getGLCM(image) 
 
  # merge features into one vector
  image_features = np.array([])
  image_features = np.insert(image_features, [0], first_order_stats.flatten())  
  image_features = np.insert(image_features, [-1], texture_features.flatten())    
 
  all_features.append(image_features) # put into the list

# transform the list into a 2D np array
test_features = np.vstack(all_features)

In [None]:
# save this feature set for future use
np.savetxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/train_features.csv',train_features,delimiter=',')
np.savetxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/test_features.csv',test_features,delimiter=',')

# Train Random Forest

In [None]:
from sklearn import ensemble

## Using CNN features

In [None]:
# load the saved features
train_features_cnn = np.loadtxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/train_features_cnn.csv',delimiter=',')
test_features_cnn = np.loadtxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/test_features_cnn.csv',delimiter=',')

In [None]:
train_features_cnn.shape

(1327, 2048)

In [None]:
train_features_cnn

array([[1.18478370e+00, 1.24790990e+00, 1.95681766e-01, ...,
        1.18292507e-03, 5.47102511e-01, 1.79751769e-01],
       [2.92473578e+00, 1.59352064e+00, 4.98986840e-02, ...,
        8.36384743e-02, 2.36153200e-01, 1.07637778e-01],
       [3.15620708e+00, 1.90217900e+00, 1.87583063e-02, ...,
        2.29124613e-02, 1.54628366e-01, 0.00000000e+00],
       ...,
       [6.56956434e-01, 1.66037905e+00, 0.00000000e+00, ...,
        1.24154976e-02, 3.83768864e-02, 0.00000000e+00],
       [1.99334013e+00, 1.37760431e-01, 5.24333417e-02, ...,
        1.29686400e-01, 5.39776146e-01, 5.32395614e-04],
       [1.24437249e+00, 2.48014760e+00, 1.69038922e-02, ...,
        6.51951656e-02, 6.40175119e-02, 0.00000000e+00]])

In [None]:
train_paths[0].split('/')[-1].split('-')[0]

'NORMAL'

In [None]:
y_train = []
y_test = []

# extract the target from the file name
# train set
for classname in train_paths:
  y_train.append(classname.split('/')[-1].split('-')[0])
# test set
for classname in test_paths:
  y_test.append(classname.split('/')[-1].split('-')[0])

In [None]:
y_test

['NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',
 'NORMAL',

In [None]:
learner = ensemble.RandomForestClassifier()
model = learner.fit(train_features_cnn,y_train)

## Using first order statistics and GLCM

In [None]:
# load the saved features
train_features = np.loadtxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/train_features.csv',delimiter=',')
test_features = np.loadtxt('/content/gdrive/MyDrive/bmet5933/BMET5933 Assignment2/test_features.csv',delimiter=',')

In [None]:
learner2 = ensemble.RandomForestClassifier()
model2 = learner2.fit(train_features,y_train)

# Evaluation

In [None]:
from sklearn.metrics import classification_report

## Using CNN features

In [None]:
prediction = model.predict(test_features_cnn)

In [None]:
report = classification_report(y_test,prediction)
print(report)

              precision    recall  f1-score   support

    BACTERIA       0.72      0.97      0.82       242
      NORMAL       0.93      0.72      0.81       234
       VIRUS       0.75      0.60      0.67       148

    accuracy                           0.79       624
   macro avg       0.80      0.76      0.77       624
weighted avg       0.81      0.79      0.78       624



## Using first order statistics and GLCM

In [None]:
prediction2 = model2.predict(test_features)

In [None]:
report2 = classification_report(y_test,prediction2)
print(report2)

              precision    recall  f1-score   support

    BACTERIA       0.58      0.98      0.73       242
      NORMAL       0.97      0.38      0.55       234
       VIRUS       0.62      0.51      0.56       148

    accuracy                           0.64       624
   macro avg       0.72      0.62      0.61       624
weighted avg       0.73      0.64      0.62       624

