# **Seminário de Introdução a Imagens Médicas - Inteligência artificial aplica a imagens médicas**

### *Breast Cancer Detection using Machine Learning Techniques*

**Alunos:** _Caio Fernandes Lott Primola_     - 20193001742<br>
_Henrique Rodrigues Lima_         - 20193009473<br>
_João Pedro de Almeida Campos_         - 20203003792<br>
_Victor Cunha Freitas Lara_         - 20193015695<br>

Este trabalho consiste na simulação, analise e comparação entre uma rede neural densa e uma rede neural convolucional básica.

As técnicas utilizadas foram:<br>
    - Redes neurais convolucionais (CNN);<br>
    - Rede Neural Densa (NN);<br>
    

Para instalar as bibliotecas necessárias, utilize a célula abaixo

In [61]:
import sys
!{sys.executable} -m pip install -r requirements.txt

Collecting scikeras (from -r requirements.txt (line 6))
  Using cached scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting scikit-learn (from -r requirements.txt (line 4))
  Downloading scikit_learn-1.5.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Using cached scikeras-0.13.0-py3-none-any.whl (26 kB)
Downloading scikit_learn-1.5.1-cp311-cp311-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.0 MB 991.0 kB/s eta 0:00:12
    --------------------------------------- 0.3/11.0 MB 2.3 MB/s eta 0:00:05
   - -------------------------------------- 0.5/11.0 MB 3.0 MB/s eta 0:00:04
   -- ------------------------------------- 0.7/11.0 MB 3.5 MB/s eta 0:00:03
   --- ------------------------------------ 0.9/11.0 MB 3.6 MB/s eta 0:00:03
   ---- ----------------------------------- 1.2/11.0 MB 3.9 MB/s eta 0:00:03
   ---- ---------

  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Inicialização e carregamento dos dados

In [1]:
import os
import cv2
import re
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.image import resize
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  precision_score, confusion_matrix,  recall_score, f1_score, roc_auc_score
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, RFE
import matplotlib.pyplot as plt




In [75]:
dataset_path = "histology_slides"  
regex_label = r"[A-Z]+_([A-Z])_[A-Z]+-\d{2}-[A-Z\d]+-(\d+)-\d+\.png"
magnification = "200X" #diretorio da magnificação desejada

In [77]:
def load_data(dataset_path, magnification, img_size):
    X = []
    y = []
    num_files = 0

    # Percorrer o diretório de imagens
    for root, dirs, files in os.walk(dataset_path):
        if os.path.basename(root) == magnification: 
            for file in files:
                if file.endswith('.png'):
                    img_path = os.path.join(root, file)
                    img = cv2.imread(img_path)
                    
                    if img is None:
                        print(f"Erro ao carregar imagem: {file}")
                        continue
                    
                    img = cv2.resize(img, img_size)
                    X.append(img)

                    match_obj = re.search(regex_label, file)
                    if match_obj:
                        label = match_obj.group(1)
                        y.append(True if label == "M" else False)
                    else:
                        print(f"Erro ao extrair rótulo da imagem: {file}")
                    
                    num_files += 1

    print(f"Total de imagens processadas: {num_files}")
    X = np.array(X)
    y = np.array(y)

    return X, y


In [78]:
img_size = (700, 460)
X, y = load_data(dataset_path, magnification, img_size)


Total de imagens processadas: 941


In [79]:
count = Counter(y)
print(count)

Counter({True: 589, False: 352})


## Preprocessemamento dos dados

### Normalização dos dados

In [82]:
X = X / 255.0

### Feature Selection

In [83]:
X_flat = X.reshape(X.shape[0], -1)
selector = SelectKBest(f_classif, k=500) 
X_new = selector.fit_transform(X_flat, y)

### Recursive Feature Elimination

In [86]:
lr = LogisticRegression(max_iter=1000, random_state=42)
rfe = RFE(estimator=lr, n_features_to_select=100, step=50)
X_rfe = rfe.fit_transform(X_new, y)

### Separação entre dados de treino e teste

In [87]:
X_rfe, y = shuffle(X_rfe, y)

# Dividindo os dados em conjunto de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.2, random_state=42)

print(f"Treinamento: {X_train.shape}, Teste: {X_test.shape}")


Treinamento: (752, 100), Teste: (189, 100)


## Modelos
### Redes Neural Densa

In [88]:
def build_dense_nn(input_shape):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dropout(0.3),  # Dropout para evitar overfitting
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Saída binária (maligno ou benigno)
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

### Modelo Básico de CNN

In [95]:


def build_basic_cnn(input_shape):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


### Avaliações 

In [90]:


def avaliacoes(model, X_train, y_train, X_test, y_test):
    # Configurar callback para ajuste da taxa de aprendizado
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.00001)

    # Treinamento do modelo
    history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), batch_size=32, callbacks=[lr_scheduler])

    # Avaliação do modelo
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    # Matriz de Confusão
    cm = confusion_matrix(y_test, y_pred)
    print("Matriz de Confusão:")
    print(cm)

    # Métricas de Desempenho
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, model.predict(X_test))

    print(f"Precisão: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC: {roc_auc:.4f}")

    # Repartição dos dados para validação cruzada
    X, y = shuffle(X_train, y_train, random_state=42)
    
    # Utilização de KerasClassifier para integração com scikit-learn
    # model_sk = KerasClassifier(build_fn=lambda: model, epochs=20, batch_size=32, verbose=0)
    # scores = cross_val_score(model_sk, X, y, cv=5, scoring='accuracy')
    # print(f"Precisão média da validação cruzada: {scores.mean():.4f}")

## Testes dos modelos

### Teste Rede Neural Densa

In [91]:
def test_dense_nn():
    model = build_dense_nn((X_train.shape[1],))
    avaliacoes(model, X_train, y_train, X_test, y_test)

test_dense_nn()



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.6327 - loss: 0.6868 - val_accuracy: 0.6349 - val_loss: 0.6656 - learning_rate: 0.0010
Epoch 2/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6139 - loss: 0.6696 - val_accuracy: 0.6349 - val_loss: 0.6572 - learning_rate: 0.0010
Epoch 3/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6259 - loss: 0.6634 - val_accuracy: 0.6349 - val_loss: 0.6565 - learning_rate: 0.0010
Epoch 4/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6038 - loss: 0.6702 - val_accuracy: 0.6349 - val_loss: 0.6545 - learning_rate: 0.0010
Epoch 5/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6357 - loss: 0.6541 - val_accuracy: 0.6349 - val_loss: 0.6529 - learning_rate: 0.0010
Epoch 6/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/

In [93]:
def load_data_cnn(dataset_path, magnification, img_size, regex_label):
    X = []
    y = []
    num_files = 0

    # Percorrer o diretório de imagens
    for root, dirs, files in os.walk(dataset_path):
        if os.path.basename(root) == magnification:
            for file in files:
                if file.endswith('.png'):
                    img_path = os.path.join(root, file)
                    img = cv2.imread(img_path)

                    if img is None:
                        print(f"Erro ao carregar imagem: {file}")
                        continue

                    # Redimensionar a imagem
                    img = cv2.resize(img, img_size)
                    X.append(img)

                    # Extrair rótulo usando expressão regular
                    match_obj = re.search(regex_label, file)
                    if match_obj:
                        label = match_obj.group(1)
                        y.append(True if label == "M" else False)
                    else:
                        print(f"Erro ao extrair rótulo da imagem: {file}")

                    num_files += 1

    print(f"Total de imagens processadas: {num_files}")
    X = np.array(X)
    y = np.array(y)
    return X, y

# Exemplo de uso
dataset_path = 'histology_slides'
magnification = '200X'
img_size = (700, 460)
regex_label = r'_(M|B)_'

X_cnn, y_cnn = load_data_cnn(dataset_path, magnification, img_size, regex_label)

# Divida os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_cnn, y_cnn, test_size=0.2, random_state=42)

# Verifique as dimensões de X_train e X_test
print(f"Dimensões de X_train: {X_train.shape}")
print(f"Dimensões de X_test: {X_test.shape}")


Total de imagens processadas: 941
Dimensões de X_train: (752, 460, 700, 3)
Dimensões de X_test: (189, 460, 700, 3)


### Teste Rede Neural Convolucional Básica

In [96]:
def test_basic_cnn():
    model = build_basic_cnn((img_size[0], img_size[1], 3))
    avaliacoes(model, X_train, y_train, X_test, y_test)

test_basic_cnn()

Epoch 1/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2s/step - accuracy: 0.5423 - loss: 3445.4185 - val_accuracy: 0.7725 - val_loss: 4.2169 - learning_rate: 0.0010
Epoch 2/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 2s/step - accuracy: 0.7842 - loss: 4.9511 - val_accuracy: 0.7302 - val_loss: 0.5652 - learning_rate: 0.0010
Epoch 3/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2s/step - accuracy: 0.8244 - loss: 0.5565 - val_accuracy: 0.7460 - val_loss: 0.8236 - learning_rate: 0.0010
Epoch 4/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 2s/step - accuracy: 0.8417 - loss: 0.5311 - val_accuracy: 0.7090 - val_loss: 0.6529 - learning_rate: 0.0010
Epoch 5/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 2s/step - accuracy: 0.8636 - loss: 0.4110 - val_accuracy: 0.7778 - val_loss: 0.6063 - learning_rate: 0.0010
Epoch 6/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 2