In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import keras_tuner as kt

from tkinter import filedialog
from scipy import stats
from sklearn import preprocessing, model_selection, pipeline, compose, linear_model, metrics, ensemble, neighbors, cluster

In [3]:
# Ask the user, through folder-picker dialogs, for the two input directories:
# one holding the per-image ROI feature tables and one holding the per-image
# MICRO feature tables.
# NOTE(review): these dialogs require manual input, so "Restart & Run All" is
# not fully reproducible — consider configurable path constants instead.
path_roiFts = filedialog.askdirectory(title='Diretório dos arquivos de ROIs')
path_microFts = filedialog.askdirectory(title='Diretório dos arquivos de MICROS')

Juntar as tabelas de cada imagem em uma única tabela:

In [4]:
# Read every per-image ROI feature table and stack them into a single frame.
# Files are visited in sorted order so the row order (and therefore any later
# seeded sampling) does not depend on the arbitrary order of os.listdir().
roi_frames = []
for file_name in sorted(os.listdir(path_roiFts)):
    file_path = os.path.join(path_roiFts, file_name)
    # The context manager closes the handle; the original leaked one per file.
    with open(file_path, "r") as file_open:
        df_file = pd.read_csv(file_open)

    # The first 8 characters of the file name are the numeric image identifier.
    df_file['im_name'] = int(file_name[0:8])

    roi_frames.append(df_file)

df_roiFts = pd.concat(roi_frames)

In [None]:
# Read every per-image MICRO feature table and stack them into a single frame.
# Files are visited in sorted order so the row order does not depend on the
# arbitrary order of os.listdir().
micro_frames = []
for file_name in sorted(os.listdir(path_microFts)):
    file_path = os.path.join(path_microFts, file_name)
    # The context manager closes the handle; the original leaked one per file.
    with open(file_path, "r") as file_open:
        df_file = pd.read_csv(file_open)

    # The first 8 characters of the file name are the numeric image identifier.
    df_file['im_name'] = int(file_name[0:8])

    micro_frames.append(df_file)

df_microFts = pd.concat(micro_frames)

Organização dos dataframes:

In [None]:
# Drop the leftover CSV index column; errors='ignore' keeps the cell re-runnable
# (the original raised KeyError on a second execution).
df_roiFts = df_roiFts.drop(labels=['Unnamed: 0'], axis=1, errors='ignore')
# Encode booleans as 1/0 so they can be used as numeric values later on.
df_roiFts = df_roiFts.replace({True: 1, False: 0})
# Unique ROI identifier "<im_name>_<roi_index>", built column-wise instead of
# with a slow row-by-row apply.
df_roiFts['key'] = df_roiFts['im_name'].astype(str) + '_' + df_roiFts['roi_index'].astype(str)

df_roiFts.head(n=5)

In [None]:
# df_microFts = df_microFts.drop(labels=['Unnamed: 0'], axis=1)
# NOTE(review): the drop above is left disabled on purpose — a later cell skips
# the first column positionally (micro_train.iloc[:, 1:]), so enabling it would
# make that slice discard a real column.
df_microFts['roi_index'] = pd.to_numeric(df_microFts['roi_index'], downcast='integer')
df_microFts['im_name'] = pd.to_numeric(df_microFts['im_name'], downcast='integer')
# Build "<im_name>_<roi_index>" column-wise. A row-wise apply(axis=1) upcasts an
# all-numeric row to float, which would yield keys such as "20230101.0_3.0" that
# never match the ROI keys.
df_microFts['key'] = df_microFts['im_name'].astype(str) + '_' + df_microFts['roi_index'].astype(str)

df_microFts.head(n=5)

## Resultado do processamento de segmentação

Resultados por imagem -> Número de ROIs detectados corretamente por imagem conforme ground-truth

In [None]:
# One row per image, one column per segmentation outcome (missing outcomes -> 0).
table_1 = (
    df_roiFts.groupby(by='im_name')['roi_result']
    .value_counts()
    .unstack()
    .fillna(value=0)
)

# Per-image detection ratios derived from the outcome counts.
tp_count = table_1['True_positive']
fn_count = table_1['False_negative']
fp_count = table_1['False_positive']
table_1['Sensibilidade'] = tp_count / (tp_count + fn_count)
table_1['Precisao'] = tp_count / (tp_count + fp_count)
table_1['Acuracia'] = tp_count / (tp_count + fn_count + fp_count)

# Display: integer counts, percentage ratios, green/red bands for good/poor values.
ratio_cols = ['Sensibilidade', 'Precisao', 'Acuracia']
(
    table_1.style
    .format(precision=0, thousands='.', decimal=',', na_rep=0)
    .format(formatter='{:.1%}', thousands='.', decimal=',', subset=ratio_cols)
    .relabel_index(['FN', 'FP', 'TP', 'Sensib.', 'Precisão', 'Acurácia'], axis=1)
    .highlight_between(subset=ratio_cols, color='green', left=0.75, right=1)
    .highlight_between(subset='Sensibilidade', color='red', left=0, right=0.40)
    .highlight_between(subset=['Precisao', 'Acuracia'], color='red', left=0, right=0.1)
    .highlight_between(subset=['Precisao', 'Acuracia'], color='green', left=0.20, right=1)
)

Média e Desvio Padrão dos resultados de todas as imagens:

In [None]:
# Mean and standard deviation, across images, of every column of table_1
# (rows = metrics, columns = mean/std).
table_2 = pd.DataFrame([table_1.mean(), table_1.std()], index=['mean', 'std']).T

# Counts with two decimals, ratios as percentages. The column relabel is applied
# once (the original repeated the same relabel_index call twice).
(
    table_2.style
    .format(precision=2, thousands='.', decimal=',')
    .format(formatter='{:.2%}', thousands='.', decimal=',',
            subset=pd.IndexSlice[['Sensibilidade', 'Precisao', 'Acuracia'], :])
    .relabel_index(labels=['Mean', 'STD'], axis=1)
    .relabel_index(labels=['FN', 'FP', 'TP', 'Sensib.', 'Precisão', 'Acurácia'], axis=0)
)

Divisão do total de imagens em 2 partes para compor o set de TESTE e de TREINAMENTO:

In [None]:
# Split at image level (65 % train / 35 % test) so ROIs of one image never end
# up in both sets. The fixed random_state makes the split reproducible.
img_train, img_test = model_selection.train_test_split(
    df_roiFts['im_name'].unique(),
    test_size=0.35,
    train_size=0.65,
    random_state=985,
)

Criar o dataframe de treinamento:

In [None]:
# ROIs belonging to the training images. The explicit copy makes roi_train an
# independent frame, so the columns added to it later do not trigger
# SettingWithCopyWarning or touch df_roiFts.
roi_train = df_roiFts.loc[df_roiFts['im_name'].isin(img_train)].copy()

# MICRO rows whose key matches one of the training ROIs.
micro_train = df_microFts.loc[df_microFts['key'].isin(roi_train['key'])].copy()

Combinação de atributos:

In [None]:
##### Mean and standard deviation of each texture attribute over the four angles
# Fix: the original stored the entropy mean under 't_IMC2_mean' (later
# overwritten), so 't_entropy_mean' was never created.
texture_attrs = [
    'ASM', 'contrast', 'correlation', 'sumSqrVariance', 'idm', 'sumAverage',
    'sumVariance', 'sumEntropy', 'entropy', 'diffVariance', 'diffEntropy',
    'IMC1', 'IMC2',
]
texture_angles = (0, 90, 180, 270)

texture_stats = {}
for attr in texture_attrs:
    per_angle = roi_train[[f't_{attr}_{angle}' for angle in texture_angles]]
    texture_stats[f't_{attr}_mean'] = per_angle.mean(axis=1)
    texture_stats[f't_{attr}_std'] = per_angle.std(axis=1)

# assign() returns a new frame, so this works whether or not roi_train is a
# slice of df_roiFts, and re-running the cell simply recomputes the columns.
roi_train = roi_train.assign(**texture_stats)

In [None]:
##### Per-ROI mean of the matching rows in micro_train
# The first column is skipped positionally, rows are averaged per key and every
# resulting column gets the 'obj_' prefix.
# NOTE(review): re-running this cell merges the obj_* columns a second time.
mean_values = micro_train.iloc[:, 1:].groupby('key').mean().add_prefix('obj_')

roi_train = roi_train.merge(mean_values, on='key', how='left')

Atributos não numéricos:

In [None]:
# Identifier/label columns that must not be treated as model attributes.
non_att = [
    'roi_index',
    'roi_result',
    'roi_target',
    'im_name',
    'key',
]

## Análise do dataframe de treinamento
### Histogramas

In [None]:
# One histogram per attribute: all ROIs plus one outline per segmentation outcome.
# Columns are selected by name (everything not in non_att), like the df_train
# histograms; the positional slice iloc[:, 3:-2] became stale once derived and
# obj_* columns were appended after 'im_name' and 'key'.
# NOTE(review): assumes every remaining column is numeric — confirm.
outcome_colors = {'False_negative': 'purple', 'False_positive': 'red', 'True_positive': 'green'}

for col in roi_train.columns[~roi_train.columns.isin(non_att)]:
    fig, ax = plt.subplots()
    # NaNs (e.g. ROIs without matching MICRO rows) are dropped before binning.
    ax.hist(roi_train[col].dropna(), bins=100, histtype='step', color='blue', label='All')
    for outcome, color in outcome_colors.items():
        values = roi_train.loc[roi_train['roi_result'] == outcome, col].dropna()
        ax.hist(values, bins=100, histtype='step', color=color, label=outcome)
    ax.grid(True)
    ax.set_title(col)
    ax.legend()
    plt.show()

%matplotlib inline

### Boxplots

In [None]:
# One figure per attribute with a notched boxplot for each segmentation outcome.
# Columns are selected by name (everything not in non_att); the positional slice
# iloc[:, 3:-2] became stale once derived and obj_* columns were appended.
# NOTE(review): assumes every remaining column is numeric — confirm.
outcome_titles = {
    'False_negative': 'Falso negativo',
    'False_positive': 'Falso positivo',
    'True_positive': 'Verdadeiro Positivo',
}

for col in roi_train.columns[~roi_train.columns.isin(non_att)]:
    fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(20, 10))
    for panel, (outcome, title) in zip(axes, outcome_titles.items()):
        # NaNs are dropped so that they do not blank the whole box.
        values = roi_train.loc[roi_train['roi_result'] == outcome, col].dropna()
        panel.boxplot(values, notch=True)
        panel.set_title(title)
        panel.grid(True)
    plt.suptitle(col)
    plt.show()

%matplotlib inline

## Balancear dataframe de treinamento -> Igualar o número de alvos True e False

In [None]:
# Balance the training set: same number of True_positive and False_positive ROIs.
tp_rows = roi_train.loc[roi_train['roi_result'] == 'True_positive', :]
fp_rows = roi_train.loc[roi_train['roi_result'] == 'False_positive', :]

# Use the size of the smaller class; the original raised ValueError whenever
# there were fewer false positives than true positives.
n_per_class = min(len(tp_rows), len(fp_rows))
if len(tp_rows) > n_per_class:
    tp_rows = tp_rows.sample(n_per_class, random_state=985)

df_train = pd.concat([
    tp_rows,
    fp_rows.sample(n_per_class, random_state=985),
])
df_train[['roi_result', 'roi_target']].value_counts()

## Pré-processamento do dataframe

### Histogramas do dataframe de treinamento

In [None]:
# Histogram of every attribute column (all columns not listed in non_att) of the
# balanced training frame.
attribute_cols = df_train.columns[~df_train.columns.isin(non_att)]

for col in attribute_cols:
    fig, ax = plt.subplots()
    ax.hist(df_train[col], bins=100, density=False, histtype='step', log=False, color='blue')
    ax.grid(True)
    ax.set_title(col)
    plt.show()

%matplotlib inline

### Boxplots do dataframe de treinamento

In [None]:
# Horizontal boxplot of every attribute column of the balanced training frame.
# Columns are selected by name (everything not in non_att), like the histogram
# cell above, instead of the stale positional slice iloc[:, 3:-2].
for col in df_train.columns[~df_train.columns.isin(non_att)]:
    fig, ax = plt.subplots()
    # NaNs are dropped so that they do not blank the box.
    ax.boxplot(df_train[col].dropna(), vert=False)
    ax.grid(True)
    ax.set_title(col)
    plt.show()

%matplotlib inline

### Pipeline para pré-processamento dos dados

In [None]:
####### Pre-processing pipelines (one per kind of column, as needed)
base_pipeline = pipeline.make_pipeline(preprocessing.MinMaxScaler())

###### Route each pipeline to its columns; non-numeric columns pass through
numeric_selector = compose.make_column_selector(dtype_include=np.number)
p_processing = compose.ColumnTransformer(
    [('base', base_pipeline, numeric_selector)],
    remainder='passthrough',
)

##### Scale the attribute columns of a copy, leaving df_train untouched
df_train_scaled = df_train.copy()
is_attribute = ~df_train_scaled.columns.isin(non_att)
df_train_scaled.loc[:, is_attribute] = p_processing.fit_transform(df_train_scaled.loc[:, is_attribute])
col_labels = p_processing.get_feature_names_out()  ###### names of the transformed variables

### Correlação entre variáveis

In [None]:
# Pairwise correlation of the numeric columns from the third column onwards.
# numeric_only=True skips string columns such as 'key'; without it pandas >= 2.0
# raises instead of silently dropping them.
# NOTE(review): identifier columns such as 'im_name' are numeric and therefore
# still take part in the matrix — confirm this is intended.
corr_matrix = df_train_scaled.iloc[:, 2:].corr(numeric_only=True)
corr_matrix['roi_target'].sort_values(ascending=False)  ### correlation of each variable with the expected result

Selecionando variáveis pelo valor da correlação com o resultado esperado

In [None]:
# Keep the variables whose correlation with the target is at least 0.2 in
# absolute value.
corr_threshold = 0.2

# Remove the target itself by label. The original relied on `!= 1.0`, which
# lets 'roi_target' leak into the features if its self-correlation is not
# exactly 1.0 in floating point.
target_corr = corr_matrix['roi_target'].drop(labels='roi_target')

fts_pos = target_corr >= corr_threshold
fts_neg = target_corr <= -corr_threshold

# Masks share target_corr's index, so no implicit reindexing takes place; the
# result keeps corr_matrix's row order.
slc_fts = target_corr[fts_pos | fts_neg].index.to_numpy()
slc_fts

## KNeighborsClassifier

In [None]:
# Baseline k-nearest-neighbours classifier configuration.
knn_params = {
    'n_neighbors': 15,
    'weights': 'distance',     # one of ['uniform', 'distance']
    'algorithm': 'kd_tree',    # one of ['auto', 'ball_tree', 'kd_tree', 'brute']
    'p': 1,                    # Minkowski power: 1 = Manhattan, 2 = Euclidean
    'metric': 'minkowski',     # distance metric used for the neighbour search
}
model_knn = neighbors.KNeighborsClassifier(**knn_params)

Gridsearch

In [None]:
# Search space for GridSearchCV, given as a list of grids so that only valid
# combinations are fitted:
#  * 'eucliedean' was a typo for 'euclidean' (every fit using it failed);
#  * 'cosine' is only supported by the brute-force search, not by the trees;
#  * leaf_size only affects ball_tree/kd_tree, so it is not varied for brute.
_knn_common_grid = dict(
    n_neighbors=[2, 5, 7, 10, 15, 20, 25, 30],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

param_grid = [
    dict(
        _knn_common_grid,
        algorithm=['ball_tree', 'kd_tree'],
        leaf_size=[5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
        metric=['euclidean', 'manhattan', 'minkowski'],
    ),
    dict(
        _knn_common_grid,
        algorithm=['brute'],
        metric=['euclidean', 'cosine', 'manhattan', 'minkowski'],
    ),
]

In [None]:
# Exhaustive 10-fold cross-validated search over param_grid, ranked by accuracy.
# NOTE(review): this fits on df_train (unscaled) although df_train_scaled exists;
# KNN distances are scale-sensitive — confirm this is intended.
knn_features = df_train.loc[:, slc_fts]
knn_target = df_train['roi_target']

grid_search = model_selection.GridSearchCV(
    model_knn,
    param_grid,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(knn_features, knn_target)

In [None]:
# Parameter combination selected by the grid search above.
grid_search.best_params_

In [None]:
####### Top 5 parameter combinations by mean cross-validated score
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
cv_res.head()

In [None]:
# Sampling distributions for RandomizedSearchCV: integer ranges are drawn
# uniformly from [low, high), lists are sampled uniformly.
param_dstb = dict(
    n_neighbors=stats.randint(low=1, high=100),
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=stats.randint(low=1, high=100),
    p=[1, 2],
    # 'eucliedean' was a typo; every candidate that drew it failed to fit.
    metric=['euclidean', 'manhattan', 'minkowski']
    )

In [None]:
# Randomised 10-fold cross-validated search: 1000 candidates drawn from
# param_dstb, ranked by accuracy. The fixed random_state makes the sampled
# candidates (and therefore best_params_) reproducible across runs.
# NOTE(review): this fits on df_train (unscaled) although df_train_scaled exists;
# KNN distances are scale-sensitive — confirm this is intended.
rnd_search = model_selection.RandomizedSearchCV(
    model_knn,
    param_distributions=param_dstb,
    n_iter=1000,
    cv=10,
    scoring='accuracy',
    random_state=985,
)
rnd_search.fit(df_train.loc[:, slc_fts], df_train['roi_target'])

In [None]:
# Parameter combination selected by the randomised search above.
rnd_search.best_params_

In [None]:
####### Top 5 parameter combinations by mean cross-validated score
cv_res = pd.DataFrame(rnd_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
cv_res.head()

In [None]:
# Fit the baseline KNN (with the hyper-parameters set when model_knn was created,
# not the best_params_ of the searches) on the selected features.
# NOTE(review): df_train is the unscaled frame while df_train_scaled exists;
# KNN distances are scale-sensitive — confirm this is intended.
model_knn.fit(df_train.loc[:, slc_fts], df_train['roi_target'])

In [None]:
# 10-fold cross-validation of the baseline KNN on the selected features:
# per-fold accuracy and out-of-fold class predictions.
cv_features = df_train.loc[:, slc_fts]
cv_target = df_train['roi_target']

cv_scores = model_selection.cross_val_score(
    model_knn, cv_features, cv_target, cv=10, scoring='accuracy'
)

y_train_pred = model_selection.cross_val_predict(
    model_knn, cv_features, cv_target, cv=10, method='predict'
)

### Matriz de confusão

In [None]:

# Row-normalised confusion matrix of the out-of-fold predictions.
y_true_train = df_train['roi_target']
metrics.ConfusionMatrixDisplay.from_predictions(y_true_train, y_train_pred, normalize='true', values_format='.2%')
plt.show()

# Summary scores: the first three as percentages, F1 as a raw value.
percent_scores = (
    ('\nRecall score: ', metrics.recall_score),
    ('Precision score: ', metrics.precision_score),
    ('Accuracy: ', metrics.accuracy_score),
)
for score_label, score_fn in percent_scores:
    print(score_label, '{:.2%}'.format(score_fn(y_true_train, y_train_pred)))
print('F1-score: ', metrics.f1_score(y_true_train, y_train_pred))


### Precisão vs Recall

In [None]:
# Precision/recall curves need continuous scores: with hard 0/1 predictions
# (as originally used) there is only a single threshold and the curves collapse
# to one point. Use out-of-fold positive-class probabilities instead.
# NOTE(review): column 1 of predict_proba is taken as the positive class, which
# assumes roi_target is encoded as 0/1 — confirm.
y_train_scores = model_selection.cross_val_predict(
    model_knn, df_train.loc[:, slc_fts], df_train['roi_target'], cv=10, method='predict_proba'
)[:, 1]

precisions, recalls, thresholds = metrics.precision_recall_curve(df_train['roi_target'], y_train_scores)

# Precision and recall as functions of the decision threshold
# (the last precision/recall point has no associated threshold).
fig, ax = plt.subplots()
ax.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
ax.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
ax.set_xlabel('Threshold')
ax.legend(loc='center right')
ax.grid(visible=True)
plt.show()

# Precision against recall.
fig, ax = plt.subplots()
ax.plot(recalls, precisions, linewidth=2, label="Precision/Recall curve")
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.grid(visible=True)
plt.show()


### Curva ROC

In [None]:
# ROC analysis needs continuous scores: with hard 0/1 predictions (as originally
# used) the curve has a single operating point and the AUC is not meaningful.
# Use out-of-fold positive-class probabilities instead.
# NOTE(review): column 1 of predict_proba is taken as the positive class, which
# assumes roi_target is encoded as 0/1 — confirm.
y_train_scores = model_selection.cross_val_predict(
    model_knn, df_train.loc[:, slc_fts], df_train['roi_target'], cv=10, method='predict_proba'
)[:, 1]

fpr, tpr, roc_thresh = metrics.roc_curve(df_train['roi_target'], y_train_scores)

print('ROC Area Under Curve: ', metrics.roc_auc_score(df_train['roi_target'], y_train_scores))

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linewidth=2, label="ROC curve")
# Diagonal = chance level (it was mislabelled "ROC curve" in the original).
ax.plot([0, 1], [0, 1], 'k:', label="Random classifier")
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc='lower right')
ax.grid(visible=True)
plt.show()

## Neural Networks

#### Montando a rede

Explorando hiperparâmetros

In [None]:
def build_model(hp, n_features=21):
    """Build and compile a dense binary classifier for a Keras Tuner trial.

    Args:
        hp: Keras Tuner hyper-parameter container; the search space
            (number of hidden layers, neurons per layer, learning rate and
            optimizer) is registered on it.
        n_features: Width of the input layer. Defaults to 21, the value that
            was previously hard-coded; pass ``len(slc_fts)`` (for example via
            ``functools.partial``) when the number of selected features differs.

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output.
    """
    ######### Hyper-parameter search space
    n_hidden = hp.Int('n_hidden', min_value=0, max_value=4, default=2)
    n_neurons = hp.Int('n_neurons', min_value=16, max_value=100)
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
    optimizer_name = hp.Choice('optimizer', values=['sgd', 'adam'])
    if optimizer_name == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    ####### Model architecture
    model = tf.keras.Sequential()
    # No fixed batch dimension: the batch size is chosen at fit time, and a
    # static batch size of 32 does not match the final partial batch.
    model.add(tf.keras.layers.Input(shape=[n_features]))
    for _ in range(n_hidden):  ###### hidden layers
        model.add(tf.keras.layers.Dense(n_neurons, activation='sigmoid'))

    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    ###### Compile
    # NOTE(review): mean squared error is kept as in the original;
    # binary cross-entropy is the usual loss for a sigmoid binary output.
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model

In [None]:
# Random-search tuner over the space declared in build_model: 15 trials ranked
# by validation accuracy; any previous results of the project are overwritten.
tuner_options = dict(
    objective='val_accuracy',
    max_trials=15,
    overwrite=True,
    seed=654,  #### random seed
    project_name='roi_classification',
)
random_search_tuner = kt.RandomSearch(build_model, **tuner_options)

Callbacks

In [None]:
##### Checkpoint: keep only the model with the best validation accuracy
cb_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='checkpoint_file.keras',
    monitor='val_accuracy',  ### alternative: 'val_loss'
    mode='auto',
    save_best_only=True,
    verbose=False,
)

#### Early stopping: stop after 100 epochs without val_loss improvement and
#### roll back to the best weights seen
cb_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=100,
    start_from_epoch=0,
    restore_best_weights=True,
    verbose=False,
)

#### TensorBoard logging (batches 100 to 200 are profiled)
cb_tensorboard = tf.keras.callbacks.TensorBoard(
    log_dir='logs',
    profile_batch=(100, 200),
)

Fitting

In [None]:
# Network inputs: the selected (scaled) features and the binary target.
X_train, y_train = df_train_scaled.loc[:, slc_fts], df_train_scaled['roi_target']

In [None]:
# Run the hyper-parameter search: each trial trains for up to 5000 epochs
# (early stopping usually ends it sooner) with 25 % of the data held out
# for validation.
search_fit_options = dict(
    epochs=5000,
    validation_split=0.25,
    batch_size=32,
    shuffle=True,
    callbacks=[cb_checkpoint, cb_early_stopping, cb_tensorboard],
    verbose=False,
)
random_search_tuner.search(X_train, y_train, **search_fit_options)

In [None]:
# Retrieve the single best trial recorded by the tuner's oracle and print its
# summary.
best_trial = random_search_tuner.oracle.get_best_trials(num_trials=1)[0]
best_trial.summary()

Avaliação

#### Teste e Avaliação