#### Imports

In [2]:
import os 
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


#### Fonctions utilitaires pour parcourir les images

In [3]:
common_path = "../chest_Xray/"
images_files = os.listdir(common_path)
subfolders = ["train","val","test"]
categories = ["NORMAL","PNEUMONIA"]

# Permet de parcourir les images, et pour chaque image, on applique une fonction de callback
# On peut optionnellement appeler une fonction de callback pour chaque dossier
def browse_imgs(img_callback, path_folder_callback = None, limit_size = None):
    for subfolder in subfolders:
        for category in categories:
            # pour avoir tous les chemins des 6 dossiers
            folder_path = os.path.join(common_path, subfolder, category)
            # liste de toutes les images
            images_files = os.listdir(folder_path)
            if path_folder_callback is not None:
                path_folder_callback(folder_path, images_files)
            array_limit = limit_size if limit_size is not None else len(images_files)
            #récupération de toutes les (ou des 'limit_size' premières) images du dossier.
            for file_name in images_files[:array_limit]:
                if not file_name.endswith(".jpeg"):
                    continue
                image_path = os.path.join(folder_path,file_name)
                img = cv2.imread(image_path,cv2.IMREAD_GRAYSCALE)
                img_callback(img, category)
                
                
def display_imgs(imgs, titles = [], plot_size = (1,1), figsize = (10,8)):
    fig = plt.figure(figsize=figsize)
    index = 0
    for image, title in zip(imgs, titles):
        index += 1
        ax = fig.add_subplot(plot_size[0], plot_size[1], index) 
        ax.imshow(image, cmap="gray")
        ax.axis("off")
        if titles is not None:
            ax.set_title(title)

    plt.tight_layout()
    plt.show()

In [4]:
def img_is_in_ratio(img, min_ratio = 1, max_ratio = 1.5):
    height, width = img.shape
    ratio = width / height
    if min_ratio <= ratio <= max_ratio:
        return True
    else:
        return False

In [5]:
grey_scale_limit = 10

def img_has_atleast_black_pixels(img, threshold = 5):
    height, width = img.shape
    percent = (np.sum(img <= grey_scale_limit)*100)/(width*height)
    return percent >= threshold

In [6]:
all_images = []
all_categories = []

all_original_images_index = []

max_ratio_threshold = 1.6
all_images_index_ratio = []

min_black_pixels_threshold = 5
all_images_indexes_black_pixels = []

all_images_index_ratio_and_black_pixels = []

datasetX = []
datasetY = []

image_size = (200, 200)

def load_datasets(img, category):
    new_img = cv2.resize(img, image_size)
    all_images.append(new_img)
    all_categories.append(category)
    index = len(all_images)-1
    all_images.append(new_img)
    all_categories.append(category)
    
    all_original_images_index.append(index)

    if img_is_in_ratio(img, max_ratio=max_ratio_threshold):
        all_images_index_ratio.append(index)

    if img_has_atleast_black_pixels(img, threshold=min_black_pixels_threshold):
        all_images_indexes_black_pixels.append(index)

    if img_has_atleast_black_pixels(img, threshold=min_black_pixels_threshold) and img_is_in_ratio(img, max_ratio=max_ratio_threshold):
        all_images_index_ratio_and_black_pixels.append(index)

    
browse_imgs(load_datasets)

def use_all_dataset():
    global datasetX, datasetY
    datasetX = np.array(all_images)
    datasetY = np.array(all_categories)
    
def use_all_original_images_dataset():
    global datasetX, datasetY
    datasetX = np.array([all_images[i] for i in all_original_images_index])
    datasetY = np.array([all_categories[i] for i in all_original_images_index])
 
def use_ratio_dataset():
    global datasetX, datasetY
    datasetX = np.array([all_images[i] for i in all_images_index_ratio])
    datasetY = np.array([all_categories[i] for i in all_images_index_ratio])

def use_black_pixel_dataset():
    global datasetX, datasetY
    datasetX = np.array([all_images[i] for i in all_images_indexes_black_pixels])
    datasetY = np.array([all_categories[i] for i in all_images_indexes_black_pixels])

def use_ratio_black_pixel_dataset():
    global datasetX, datasetY
    datasetX = np.array([all_images[i] for i in all_images_index_ratio_and_black_pixels])
    datasetY = np.array([all_categories[i] for i in all_images_index_ratio_and_black_pixels])

use_all_dataset()
print("Dataset all shape : ", datasetX.shape)
use_black_pixel_dataset()
print("Dataset bp shape : ", datasetX.shape)
use_ratio_dataset()
print("Dataset ration shape : ", datasetX.shape)
use_ratio_black_pixel_dataset()
print("Dataset bp+ratio shape : ", datasetX.shape)


Dataset all shape :  (5856, 200, 200)
Dataset bp shape :  (3178, 200, 200)
Dataset ration shape :  (4431, 200, 200)
Dataset bp+ratio shape :  (2502, 200, 200)


#### CNN - Convolution Neural Network

In [8]:
# Imports
from tensorflow.keras import layers
import tensorflow as tf
import visualkeras as vk
import pandas as pd

ModuleNotFoundError: No module named 'visualkeras'

Convolution = mettre en évidence les caractéristiques de l'image.

Poolling = réduire l'image.

In [9]:
# This line sets the number of output classes for your model
num_classes = 1
# This line sets the size of the kernel to be used in the convolutional layers. The kernel is
# a small matrix that is used for blurring, sharpening, embossing, edge detection, and more
core_size = 4

# he Sequential model is a linear stack of layers that you can add to in order
model = tf.keras.Sequential([
    layers.Input(shape=(200, 200, 1)),
    layers.Conv2D(256, core_size, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(128, core_size, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, core_size, activation='relu'),
    layers.MaxPooling2D(),

    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='sigmoid')
])

model.summary()

In [10]:
model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['recall'])

In [None]:
use_all_dataset()

trainx, testx, trainy, testy = train_test_split(datasetX, datasetY, test_size=0.2, random_state=1)
trainx = trainx / 255
print("trainx normalized")
testx = testx / 255
print("testx normalized")

# For datasetY replace normal with 0 and pneumonia with 1 in order to have numeric values
trainy = np.array([0 if y == "NORMAL" else 1 for y in trainy])
testy = np.array([0 if y == "NORMAL" else 1 for y in testy])

# déterminer le type de cette variable
print(trainx)
print(trainx.shape)
print(trainy)
# print(trainy.shape)

model.fit(trainx,
          trainy,
          validation_data=(testx, testy),
          epochs=2)

trainx normalized
testx normalized
[[[0.08627451 0.08235294 0.08627451 ... 0.0745098  0.02352941 0.54117647]
  [0.08235294 0.08235294 0.08627451 ... 0.4627451  0.56078431 0.58823529]
  [0.07843137 0.08235294 0.08627451 ... 0.45490196 0.47843137 0.58431373]
  ...
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]]

 [[0.12156863 0.12156863 0.11764706 ... 0.12156863 0.11764706 0.11764706]
  [0.11372549 0.11372549 0.09803922 ... 0.12156863 0.11764706 0.11764706]
  [0.10588235 0.09411765 0.10588235 ... 0.1254902  0.12156863 0.11764706]
  ...
  [0.12941176 0.09803922 0.19607843 ... 0.17647059 0.17647059 0.17647059]
  [0.11764706 0.12156863 0.19607843 ... 0.17647059 0.17647059 0.17647059]
  [0.11372549 0.16078431 0.20784314 ... 0.17647059 0.17647059 0.17647059]]

 [[0.04313725 0.04313725 0.03529412 ... 0.08627451 0.08235294

  output, from_logits = _get_logits(


[1m100/147[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m1:49[0m 2s/step - loss: 0.5841 - recall: 0.9468

In [None]:
print(model.evaluate(testx, testy))

In [None]:
# Predict the probabilities
y_pred = model.predict(testx)

# Convert probabilities to class labels
y_pred_label = np.round(y_pred).astype(int)

# Create the confusion matrix
cm = confusion_matrix(testy, y_pred_label)

print(cm)
recall = recall_score(testy, y_pred_label)
precision = precision_score(testy, y_pred_label)
f1score = f1_score(testy, y_pred_label)
print("recall : ", recall)
print("precision : ", precision)
print("f1score : ", f1score)