In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

file = "dataR2.csv"
df = pd.read_csv(file)
X = df.drop(columns=["Classification"])
y = df["Classification"]



In [2]:
import numpy as np
from scipy.stats import kruskal

#--------------------------feature selection --------------------------
#Kruskal-Wallis
kruskal_selected_features = []
p_values = {}
classes = np.unique(y)
 
for feature in X.columns:
    groups = []
    for have_c in classes:
        groups.append(X.loc[y == have_c, feature]) #faz uma lista dependendo do valor do classification

    stat, p = kruskal(*groups) #stat fica com o valor H e p fica com o p-value
    p_values[feature] = p
    
    # Selecionar features com diferença estatisticamente significativa  (5% significance level)
    if p < 0.05:
        kruskal_selected_features.append(feature)
        print(f"Feature: {feature}, Kruskal-Wallis H-statistic: {stat}, p-value: {p}")

print("kruskal_selected_features:", kruskal_selected_features)
# ROC-AUC
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
from plotly.subplots import make_subplots
import math


# --- prepara dados (y_train e X_train já definidos) ---
classes = np.unique(y)
pos_label = classes[-1]   # define a classe positiva como a de maior valor (ex: 1 ou 2)


fnames = np.array(X.columns)
roc_auc = np.zeros(fnames.shape)


# --- calcular número de linhas e colunas para a grelha ---
n_features = len(fnames)
cols = 2  # alterado para 2 colunas
rows = math.ceil(n_features / cols)


# --- criar grelha de subplots ---
figR = make_subplots(
    rows=rows, 
    cols=cols,
    subplot_titles=[f for f in fnames],
    horizontal_spacing=0.1,
    vertical_spacing=0.1
)


# --- calcular curvas ROC e AUC para cada feature ---
i = 0
for idx, f in enumerate(fnames):
    x_feat = X[f].to_numpy().astype(float)
    y_true = y.to_numpy()


    # curva ROC + AUC
    fpr, tpr, _ = roc_curve(y_true, x_feat, pos_label=pos_label)
    a = auc(fpr, tpr)
    if a < 0.5:
        a = 1 - a  # inverter direção se necessário
    roc_auc[i] = a


    # calcular posição na grelha
    row = (idx // cols) + 1
    col = (idx % cols) + 1


    # adicionar trace ao subplot específico com cor fixa
    figR.add_scatter(
        x=fpr, 
        y=tpr, 
        mode='lines+markers',
        line=dict(color='#636EFA'),  # cor azul padrão do plotly
        marker=dict(color='#636EFA'),
        showlegend=False,
        row=row, 
        col=col
    )


    # adicionar anotação com AUC
    figR.add_annotation(
        x=0.5, y=0.5,
        text=f"AUC: {a:.3f}",
        showarrow=False,
        yshift=10,
        row=row, col=col
    )


    i += 1


# --- configurar layout ---
figR.update_xaxes(title_text="1 - Specificidade (FPR)", range=[-0.01, 1.01])
figR.update_yaxes(title_text="Sensibilidade (TPR)", range=[-0.01, 1.01])


figR.update_layout(
    autosize=False,
    height=700 * rows,
    width=1400,  # ajustado para 2 colunas
    showlegend=False
)


figR.show()



# --- selecionar features com AUC acima do limiar (ex: 0.60) ---
roc_auc_selected_features = fnames[roc_auc > 0.60]
print("roc_auc_selected_features:", roc_auc_selected_features)


# Matriz de Correlação entre todas as features (treino)
import plotly.express as px
X_roc = X[kruskal_selected_features]
eatures = X_roc.columns.tolist()

X_roc = X_roc.to_numpy().T
corrMat = np.corrcoef(X_roc)
fig = px.imshow(
    corrMat,
    text_auto=True,
    labels=dict(x="Features", y="Features", color="Correlação"),
    x=eatures,
    y=eatures,
    width=1000,
    height=1000,
    color_continuous_scale=px.colors.sequential.gray
)
fig.update_layout(title="Matriz de Correlação entre TODAS as Features (Treino)")
fig.show()
X = X[kruskal_selected_features]

# (80%) de treino e validaçao / (20%) de teste
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 60% de treino / 20% de validação 
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

print(f"Tamanho treino:      {len(X_train)}")
print(f"Tamanho validação:   {len(X_val)}")
print(f"Tamanho teste:       {len(X_test)}")

X_train.to_csv("X_train.csv", index=False)


Feature: Glucose, Kruskal-Wallis H-statistic: 23.91957997825514, p-value: 1.0044494847229907e-06
Feature: Insulin, Kruskal-Wallis H-statistic: 4.931023337417101, p-value: 0.026378691997257887
Feature: HOMA, Kruskal-Wallis H-statistic: 8.887111686390483, p-value: 0.0028719068810340023
Feature: Resistin, Kruskal-Wallis H-statistic: 9.699241863905343, p-value: 0.0018434405032271757
kruskal_selected_features: ['Glucose', 'Insulin', 'HOMA', 'Resistin']


roc_auc_selected_features: ['Glucose' 'Insulin' 'HOMA' 'Resistin']


Tamanho treino:      69
Tamanho validação:   23
Tamanho teste:       24


In [3]:
#Removemos a insulina pq tem alta correlaçao com a homa mas tem um p-value mais alto 
#kruskal_selected_features.remove('Insulin')


In [4]:
# --------------------dimensionality reduction  -----

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Usar as FEATURES SELECIONADAS pelo Kruskal-Wallis (após remover Insulin)
X_train_array = X_train.to_numpy()  # features selecionadas
X_val_array = X_val.to_numpy()      # features selecionadas
X_test_array = X_test.to_numpy()    # features selecionadas
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_array)  # fit ONLY on training

# Normalizar validação e teste com os parâmetros do treino
X_val_std = scaler.transform(X_val_array)  # transform usando parâmetros do treino
X_test_std = scaler.transform(X_test_array)  # transform usando parâmetros do treino

# PCAcom dados de treino
pca = PCA()
pca.fit(X_train_std)

print("PCA eigenvalues/Explained variance")
print(pca.explained_variance_)
print("Sum of eigenvalues="+str(np.sum(pca.explained_variance_)))
#PCA eigenvectors/Principal components
print("PCA eigenvectors/Principal components")
W=pca.components_.T
print(W)


fig = px.scatter(x= np.arange(1, len(pca.explained_variance_) + 1), y=pca.explained_variance_,labels=dict(x="PC",y="Explained Variance"))
fig.add_hline(y=1,line_width=3, line_dash="dash", line_color="red")
fig.update_traces(marker_size=10)
fig.show()

print("Variance (%) retained accourding to Kaiser: "+str(pca.explained_variance_[0]**2/(np.sum(pca.explained_variance_**2))*100))

print("Variance (%) retained accourding to Scree: "+str(np.sum(pca.explained_variance_[0:3]**2)/(np.sum(pca.explained_variance_**2))*100))

#PCA com n_components=1 no treino
pca = PCA(n_components=2)
X_train_do_reduction_pca = pca.fit_transform(X_train_std)  # fit_transform ONLY on training

# PCA validação e teste
X_val_do_reduction_pca = pca.transform(X_val_std)  # transform usando o PCA do treino
X_test_do_reduction_pca = pca.transform(X_test_std)  # transform usando o PCA do treino



PCA eigenvalues/Explained variance
[2.66317327 0.94641169 0.41561352 0.03362505]
Sum of eigenvalues=4.0588235294117645
PCA eigenvectors/Principal components
[[ 0.52276621  0.12785984  0.80500912 -0.24965512]
 [ 0.55347396 -0.29792031 -0.4975053  -0.59782818]
 [ 0.59265706 -0.24047764 -0.11075854  0.76069748]
 [ 0.26294423  0.91491303 -0.30361381  0.04016387]]


Variance (%) retained accourding to Kaiser: 86.89593862394149
Variance (%) retained accourding to Scree: 99.98614755080267


In [5]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
import plotly.express as px
import pandas as pd

# --- IMPORTANTE: LDA é aprendido APENAS no treino, E APENAS SOBRE AS FEATURES SELECIONADAS ---
# Usar scaler_pca que já foi fit nos dados de treino (features selecionadas)
X_train_lda = scaler.transform(X_train.to_numpy())
y_train_lda = y_train.to_numpy()

# Também normalizar validação e teste
X_val_lda = scaler.transform(X_val.to_numpy())
X_test_lda = scaler.transform(X_test.to_numpy())

# Ajustar LDA APENAS com dados de treino
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_lda, y_train_lda)  # fit ONLY on training

# Transformar treino, validação e teste usando o LDA aprendido
X_train_lda_transformed = lda.transform(X_train_lda)
X_val_lda_transformed = lda.transform(X_val_lda)
X_test_lda_transformed = lda.transform(X_test_lda)



#Plot transformed data (treino)
fig = px.scatter(x=X_train_lda_transformed[:,0], y=np.zeros_like(X_train_lda_transformed[:,0]), color=y_train.astype(str), labels=dict(x="LDA1", y="", color="Class"))
fig.update_traces(marker_size=10)

In [14]:

# MDC Euclidean a seguir o LDA
X = X_test_lda_transformed

y = y_test.to_numpy()

ix_1 = np.where(y == 1)[0]
ix_2 = np.where(y == 2)[0]

mu1 = np.array([np.mean(X[ix_1], axis=0)]).T
mu2 = np.array([np.mean(X[ix_2], axis=0)]).T

# Classificação usando MDC
yrtp = np.ones(len(y))
dx = ((mu1-mu2).T @ (X.T - 0.5*(mu1+mu2))).flatten()
yrtp[dx < 0] = 2

TPTr=np.shape(np.where((y[ix_1]==yrtp[ix_1]))[0])[0]
TNTr=np.shape(np.where((y[ix_2]==yrtp[ix_2]))[0])[0]
FPTr=np.shape(np.where((y[ix_2]!=yrtp[ix_2]))[0])[0]
FNTr=np.shape(np.where((y[ix_1]!=yrtp[ix_1]))[0])[0]


SSTr=TPTr/(TPTr+FNTr)
SPTr=TNTr/(TNTr+FPTr)
PRTr=TPTr/(TPTr+FPTr)
F1ScoreTr=2*(PRTr*SSTr)/(PRTr+SSTr)
ACTr=(TNTr+TPTr)/(TPTr+TNTr+FPTr+FNTr)

print("\n--- Métricas de Performance (MDC a seguir o LDA) ---")

print("Sensitivity(%)="+str(SSTr*100))
print("Specificity(%)="+str(SPTr*100))
print("Precision(%)="+str(PRTr*100))
print("F1Score(%)="+str(F1ScoreTr*100))
print("Accuracy(%)="+str(ACTr*100))




--- Métricas de Performance (MDC a seguir o LDA) ---
Sensitivity(%)=72.72727272727273
Specificity(%)=46.15384615384615
Precision(%)=53.333333333333336
F1Score(%)=61.53846153846153
Accuracy(%)=58.333333333333336


In [7]:
# Médias no espaço LDA
mu1 = np.array([np.mean(X[ix_1], axis=0)]).T
mu2 = np.array([np.mean(X[ix_2], axis=0)]).T

# ------ Mahalanobis: covariância pooled e sua inversa ------

C1 = np.cov(X[ix_1], rowvar=False, bias=False)
C2 = np.cov(X[ix_2], rowvar=False, bias=False)
C1 = np.atleast_2d(C1)   # garante matriz mesmo em 1D
C2 = np.atleast_2d(C2)
C  = (C1+C2)/2

# inversa estável
Ci = np.linalg.inv(C)

# ------ Classificação MDC (Mahalanobis) ------
yrtp = np.ones(len(y), dtype=int)
dx = ((mu1 - mu2).T @ Ci @ (X.T - 0.5*(mu1 + mu2))).flatten()
yrtp[dx < 0] = 2

# ------ Métricas ------
TPTr = np.shape(np.where((y[ix_1] == yrtp[ix_1]))[0])[0]
TNTr = np.shape(np.where((y[ix_2] == yrtp[ix_2]))[0])[0]
FPTr = np.shape(np.where((y[ix_2] != yrtp[ix_2]))[0])[0]
FNTr = np.shape(np.where((y[ix_1] != yrtp[ix_1]))[0])[0]

SSTr = TPTr/(TPTr+FNTr) if (TPTr+FNTr) else 0.0
SPTr = TNTr/(TNTr+FPTr) if (TNTr+FPTr) else 0.0
PRTr = TPTr/(TPTr+FPTr) if (TPTr+FPTr) else 0.0
F1ScoreTr = 2*(PRTr*SSTr)/(PRTr+SSTr) if (PRTr+SSTr) else 0.0
ACTr = (TNTr+TPTr)/(TPTr+TNTr+FPTr+FNTr)

print("\n--- Métricas de Performance (MDC após LDA - Mahalanobis) ---")
print("Sensitivity(%)=" + str(SSTr*100))
print("Specificity(%)=" + str(SPTr*100))
print("Precision(%)="   + str(PRTr*100))
print("F1Score(%)="    + str(F1ScoreTr*100))
print("Accuracy(%)="   + str(ACTr*100))



--- Métricas de Performance (MDC após LDA - Mahalanobis) ---
Sensitivity(%)=72.72727272727273
Specificity(%)=46.15384615384615
Precision(%)=53.333333333333336
F1Score(%)=61.53846153846153
Accuracy(%)=58.333333333333336


In [None]:



#------ Fisher ------
S1=(X[ix_1,:].T-mu1)@(X[ix_1,:].T-mu1).T
S2=(X[ix_2,:].T-mu2)@(X[ix_2,:].T-mu2).T

Sw=S1+S2
SwInv=np.linalg.inv(Sw)

w=SwInv@(mu1-mu2)

b=-0.5*(w.T@mu1+w.T@mu2)

yrtp=np.ones(np.shape(y))

dx=(w.T@X.T + b).flatten()
yrtp[dx<0]=2


TPTr=np.shape(np.where((y[ix_1]==yrtp[ix_1]))[0])[0]
TNTr=np.shape(np.where((y[ix_2]==yrtp[ix_2]))[0])[0]
FPTr=np.shape(np.where((y[ix_2]!=yrtp[ix_2]))[0])[0]
FNTr=np.shape(np.where((y[ix_1]!=yrtp[ix_1]))[0])[0]


SSTr=TPTr/(TPTr+FNTr)
SPTr=TNTr/(TNTr+FPTr)
PRTr=TPTr/(TPTr+FPTr)
F1ScoreTr=2*(PRTr*SSTr)/(PRTr+SSTr)
ACTr=(TNTr+TPTr)/(TPTr+TNTr+FPTr+FNTr)

print("Sensitivity(%)="+str(SSTr*100))
print("Specificity(%)="+str(SPTr*100))
print("Precision(%)="+str(PRTr*100))
print("F1Score(%)="+str(F1ScoreTr*100))
print("Accuracy(%)="+str(ACTr*100))




SyntaxError: positional argument follows keyword argument (3770137548.py, line 38)

In [None]:
X=X_test_do_reduction_pca

y = y_test.to_numpy()

ix_1 = np.where(y == 1)[0]
ix_2 = np.where(y == 2)[0]

mu1 = np.array([np.mean(X[ix_1], axis=0)]).T
mu2 = np.array([np.mean(X[ix_2], axis=0)]).T

yrtp = np.ones(np.shape(y))
dx = ((mu1 - mu2).T @ (X.T - 0.5*(mu1+mu2))).flatten()
yrtp[dx < 0] = 2

TPTr=np.shape(np.where((y[ix_1]==yrtp[ix_1]))[0])[0]
TNTr=np.shape(np.where((y[ix_2]==yrtp[ix_2]))[0])[0]
FPTr=np.shape(np.where((y[ix_2]!=yrtp[ix_2]))[0])[0]
FNTr=np.shape(np.where((y[ix_1]!=yrtp[ix_1]))[0])[0]
SSTr=TPTr/(TPTr+FNTr)
SPTr=TNTr/(TNTr+FPTr)
PRTr=TPTr/(TPTr+FPTr)
F1ScoreTr=2*(PRTr*SSTr)/(PRTr+SSTr)
ACTr=(TNTr+TPTr)/(TPTr+TNTr+FPTr+FNTr)

print("\n--- Métricas de Performance (MDC  a seguir o PCA) ---")
print("Sensitivity(%)="+str(SSTr*100))
print("Specificity(%)="+str(SPTr*100))
print("Precision(%)="+str(PRTr*100))
print("F1Score(%)="+str(F1ScoreTr*100))
print("Accuracy(%)="+str(ACTr*100))





--- Métricas de Performance (MDC a seguir o PCA) ---
Sensitivity(%)=100.0
Specificity(%)=30.76923076923077
Precision(%)=55.00000000000001
F1Score(%)=70.96774193548387
Accuracy(%)=62.5


In [None]:
#MDC Mahalanobis a seguir PCA
C1 = np.cov(X[ix_1].T)


C2 = np.cov(X[ix_2].T)



C  = (C1+C2)/2

# inversa estável
Ci = np.linalg.inv(C)

yrtp = np.ones(len(y), dtype=int)
dx = ((mu1 - mu2).T @ Ci @ (X.T - 0.5*(mu1 + mu2))).flatten()
yrtp[dx < 0] = 2


TPTr=np.shape(np.where((y[ix_1]==yrtp[ix_1]))[0])[0]
TNTr=np.shape(np.where((y[ix_2]==yrtp[ix_2]))[0])[0]
FPTr=np.shape(np.where((y[ix_2]!=yrtp[ix_2]))[0])[0]
FNTr=np.shape(np.where((y[ix_1]!=yrtp[ix_1]))[0])[0]
SSTr=TPTr/(TPTr+FNTr)
SPTr=TNTr/(TNTr+FPTr)
PRTr=TPTr/(TPTr+FPTr)
F1ScoreTr=2*(PRTr*SSTr)/(PRTr+SSTr)
ACTr=(TNTr+TPTr)/(TPTr+TNTr+FPTr+FNTr)

print("\n--- Métricas de Performance (MDC  a seguir o PCA) ---")
print("Sensitivity(%)="+str(SSTr*100))
print("Specificity(%)="+str(SPTr*100))
print("Precision(%)="+str(PRTr*100))
print("F1Score(%)="+str(F1ScoreTr*100))
print("Accuracy(%)="+str(ACTr*100))





--- Métricas de Performance (MDC a seguir o PCA) ---
Sensitivity(%)=72.72727272727273
Specificity(%)=38.46153846153847
Precision(%)=50.0
F1Score(%)=59.25925925925925
Accuracy(%)=54.166666666666664


In [None]:
#------ Fisher ------


S1=(X[ix_1,:].T-mu1)@(X[ix_1,:].T-mu1).T
S2=(X[ix_2,:].T-mu2)@(X[ix_2,:].T-mu2).T

Sw=S1+S2
SwInv=np.linalg.inv(Sw)

w=SwInv@(mu1-mu2)

b=-0.5*(w.T@mu1+w.T@mu2)

yrtp=np.ones(np.shape(y))

dx=(w.T@X.T + b).flatten()
yrtp[dx<0]=2


TPTr=np.shape(np.where((y[ix_1]==yrtp[ix_1]))[0])[0]
TNTr=np.shape(np.where((y[ix_2]==yrtp[ix_2]))[0])[0]
FPTr=np.shape(np.where((y[ix_2]!=yrtp[ix_2]))[0])[0]
FNTr=np.shape(np.where((y[ix_1]!=yrtp[ix_1]))[0])[0]


SSTr=TPTr/(TPTr+FNTr)
SPTr=TNTr/(TNTr+FPTr)
PRTr=TPTr/(TPTr+FPTr)
F1ScoreTr=2*(PRTr*SSTr)/(PRTr+SSTr)
ACTr=(TNTr+TPTr)/(TPTr+TNTr+FPTr+FNTr)

print("Sensitivity(%)="+str(SSTr*100))
print("Specificity(%)="+str(SPTr*100))
print("Precision(%)="+str(PRTr*100))
print("F1Score(%)="+str(F1ScoreTr*100))
print("Accuracy(%)="+str(ACTr*100))


Sensitivity(%)=72.72727272727273
Specificity(%)=38.46153846153847
Precision(%)=50.0
F1Score(%)=59.25925925925925
Accuracy(%)=54.166666666666664
