# 04. Model Training

In [2]:
# ==============================================================================
# 04. TREINAMENTO DE MACHINE LEARNING E AVALIA√á√ÉO DE DESEMPENHO (LOOCV)
# ARQUITETURA OFICIAL: Random Forest + SelectKBest (k=12)
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

# Suprimindo avisos para manter o log de sa√≠da limpo
warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è INICIANDO MOTOR DE MACHINE LEARNING E VALIDA√á√ÉO (LOOCV)")
print("="*80)

# ------------------------------------------------------------------------------
# 1. CARREGAMENTO DOS DADOS
# ------------------------------------------------------------------------------
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
try:
    df = pd.read_csv(caminho_csv)
except FileNotFoundError:
    raise FileNotFoundError(f"‚ùå Erro: Arquivo {caminho_csv} n√£o encontrado.")

SEED = 97
cv_strategy = LeaveOneOut()

# Mapeamento robusto das condi√ß√µes
condicoes_busca = {
    'Face Feliz': ['Face Feliz', 'FF', 'F'],
    'Face Neutra': ['Face Neutra', 'FN', 'N'],
    'Face Raiva': ['Face Raiva', 'FR', 'R']
}

# ------------------------------------------------------------------------------
# 2. TREINAMENTO E EXTRA√á√ÉO DE M√âTRICAS GLOBAIS
# ------------------------------------------------------------------------------
for nome_condicao, lista_triggers in condicoes_busca.items():
    print(f"\n" + "-"*80)
    print(f"üß† AVALIANDO CONDI√á√ÉO: {nome_condicao.upper()}")
    print("-"*80)
    
    df_f = df[df['Condicao'].isin(lista_triggers)].copy()
    if df_f.empty: 
        print(f"‚ö†Ô∏è Sem dados para {nome_condicao}.")
        continue
        
    y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
    X = df_f.drop(columns=['Condicao', 'Tipo', 'Grupo', 'ID'], errors='ignore')
    X = X.dropna(axis=1, how='all')

    # A ARQUITETURA VENCEDORA BLINDADA
    pipe_oficial = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(f_classif, k=12)),
        ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))
    ])
    
    # Executando a predi√ß√£o para todos os 42 sujeitos (um por vez no LOOCV)
    y_pred = cross_val_predict(pipe_oficial, X, y, cv=cv_strategy)
    
    # Calculando as m√©tricas oficiais
    acc = accuracy_score(y, y_pred)
    cm = confusion_matrix(y, y_pred)
    
    # Impress√£o do Laudo Cl√≠nico Textual
    destaque = " ‚≠ê [BIOMARCADOR PRIM√ÅRIO DETECTADO]" if acc >= 0.78 else ""
    print(f"üéØ ACUR√ÅCIA GERAL: {acc:.2%} {destaque}\n")
    
    print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
    print(f"                   Predito Controle (0) | Predito TEA (1)")
    print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
    print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")
    
    print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
    # target_names: 0 √© Controle, 1 √© TEA
    report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
    print(report)

print("="*80)
print("‚úÖ AVALIA√á√ÉO CONCLU√çDA. PRONTO PARA AN√ÅLISE VISUAL (NOTEBOOK 5).")
print("="*80)

‚öôÔ∏è INICIANDO MOTOR DE MACHINE LEARNING E VALIDA√á√ÉO (LOOCV)

--------------------------------------------------------------------------------
üß† AVALIANDO CONDI√á√ÉO: FACE FELIZ
--------------------------------------------------------------------------------
üéØ ACUR√ÅCIA GERAL: 79.07%  ‚≠ê [BIOMARCADOR PRIM√ÅRIO DETECTADO]

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        21           |        04
Real TEA (1)      |        05           |        13

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.808     0.840     0.824        25
         TEA      0.765     0.722     0.743        18

    accuracy                          0.791        43
   macro avg      0.786     0.781     0.783        43
weighted avg      0.790     0.791     0.790        43


--------------------------------------------------------------------------------
üß† AVALIANDO

In [3]:
# ==============================================================================
# 04. TREINAMENTO DE MACHINE LEARNING E AVALIA√á√ÉO DE DESEMPENHO (LOOCV)
# ARQUITETURA OFICIAL: Random Forest + SelectKBest (k=12)
# BLINDAGEM ATIVA: Corre√ß√£o de Duplicatas (Garantia de N=42)
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

# Suprimindo avisos para manter o log de sa√≠da limpo
warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è INICIANDO MOTOR DE MACHINE LEARNING E VALIDA√á√ÉO (LOOCV)")
print("="*80)

# ------------------------------------------------------------------------------
# 1. CARREGAMENTO DOS DADOS
# ------------------------------------------------------------------------------
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
try:
    df = pd.read_csv(caminho_csv)
except FileNotFoundError:
    raise FileNotFoundError(f"‚ùå Erro: Arquivo {caminho_csv} n√£o encontrado.")

SEED = 97
cv_strategy = LeaveOneOut()

# Mapeamento robusto das condi√ß√µes
condicoes_busca = {
    'Face Feliz': ['Face Feliz', 'FF', 'F'],
    'Face Neutra': ['Face Neutra', 'FN', 'N'],
    'Face Raiva': ['Face Raiva', 'FR', 'R']
}

# ------------------------------------------------------------------------------
# 2. TREINAMENTO E EXTRA√á√ÉO DE M√âTRICAS GLOBAIS
# ------------------------------------------------------------------------------
for nome_condicao, lista_triggers in condicoes_busca.items():
    print(f"\n" + "-"*80)
    print(f"üß† AVALIANDO CONDI√á√ÉO: {nome_condicao.upper()}")
    print("-"*80)
    
    df_f = df[df['Condicao'].isin(lista_triggers)].copy()
    
    # --- A BARREIRA DE SEGURAN√áA (GARANTINDO N=42) ---
    # Funde pacientes que tiveram marca√ß√µes duplas (ex: 'FF' e 'F') pela m√©dia
    df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()
    
    if df_f.empty: 
        print(f"‚ö†Ô∏è Sem dados para {nome_condicao}.")
        continue
        
    n_pacientes = df_f.shape[0]
    print(f"üë• Pacientes validados para esta tarefa: {n_pacientes}")
        
    y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
    X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

    # A ARQUITETURA VENCEDORA BLINDADA
    pipe_oficial = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(f_classif, k=12)),
        ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))
    ])
    
    try:
        # Executando a predi√ß√£o real (LOOCV)
        y_pred = cross_val_predict(pipe_oficial, X, y, cv=cv_strategy)
        
        # Calculando as m√©tricas oficiais
        acc = accuracy_score(y, y_pred)
        cm = confusion_matrix(y, y_pred)
        
        # Impress√£o do Laudo Cl√≠nico Textual
        destaque = " ‚≠ê [BIOMARCADOR PRIM√ÅRIO DETECTADO]" if acc >= 0.78 else ""
        print(f"üéØ ACUR√ÅCIA GERAL: {acc:.2%} {destaque}\n")
        
        print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
        print(f"                   Predito Controle (0) | Predito TEA (1)")
        print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
        print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")
        
        print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
        report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
        print(report)
        
    except Exception as e:
        print(f"‚ùå Ocorreu um erro ao processar a {nome_condicao}: {e}")

print("="*80)
print("‚úÖ AVALIA√á√ÉO CONCLU√çDA. PRONTO PARA AN√ÅLISE VISUAL (NOTEBOOK 5).")
print("="*80)

‚öôÔ∏è INICIANDO MOTOR DE MACHINE LEARNING E VALIDA√á√ÉO (LOOCV)

--------------------------------------------------------------------------------
üß† AVALIANDO CONDI√á√ÉO: FACE FELIZ
--------------------------------------------------------------------------------
üë• Pacientes validados para esta tarefa: 42
üéØ ACUR√ÅCIA GERAL: 73.81% 

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        18           |        06
Real TEA (1)      |        05           |        13

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.783     0.750     0.766        24
         TEA      0.684     0.722     0.703        18

    accuracy                          0.738        42
   macro avg      0.733     0.736     0.734        42
weighted avg      0.740     0.738     0.739        42


--------------------------------------------------------------------------------
üß† A

In [4]:
# ==============================================================================
# 04. TREINAMENTO E VALIDA√á√ÉO DE MACHINE LEARNING (LOOCV | N=42)
# BASE: REDES FUNCIONAIS E COMPLEXIDADE (74 Features)
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

# Suprimindo avisos para manter o log de sa√≠da limpo
warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è MOTOR DE MACHINE LEARNING: TESTANDO REDES FUNCIONAIS (N=42)")
print("="*80)

# ------------------------------------------------------------------------------
# 1. CARREGAMENTO DOS DADOS
# ------------------------------------------------------------------------------
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
try:
    df = pd.read_csv(caminho_csv)
except FileNotFoundError:
    raise FileNotFoundError(f"‚ùå Erro: Arquivo {caminho_csv} n√£o encontrado.")

SEED = 97
cv_strategy = LeaveOneOut()

# Mapeamento robusto das condi√ß√µes
condicoes_busca = {
    'Face Feliz': ['Face Feliz', 'FF', 'F'],
    'Face Neutra': ['Face Neutra', 'FN', 'N'],
    'Face Raiva': ['Face Raiva', 'FR', 'R']
}

# Expandimos a busca levemente para comportar as 74 vari√°veis
configuracoes = [
    ('Random Forest (k=10)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=10)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    ('Random Forest (k=15)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=15)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    ('SVM RBF (k=15)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=15)), ('clf', SVC(kernel='rbf', C=10.0, random_state=SEED))]))
]

# ------------------------------------------------------------------------------
# 2. TREINAMENTO, SELE√á√ÉO E AVALIA√á√ÉO (LOOCV BLINDADO)
# ------------------------------------------------------------------------------
for nome_condicao, lista_triggers in condicoes_busca.items():
    print(f"\n" + "-"*80)
    print(f"üß† AVALIANDO CONDI√á√ÉO: {nome_condicao.upper()}")
    print("-"*80)
    
    df_f = df[df['Condicao'].isin(lista_triggers)].copy()
    
    # --- A BARREIRA DE SEGURAN√áA (GARANTINDO N=42) ---
    df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()
    
    if df_f.empty: 
        print(f"‚ö†Ô∏è Sem dados para {nome_condicao}.")
        continue
        
    n_pacientes = df_f.shape[0]
    print(f"üë• Pacientes validados: {n_pacientes}")
        
    y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
    X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

    melhor_acc = 0
    melhor_pipe = None
    melhor_nome = ""
    
    # Descobre quem lida melhor com a Conectividade e Entropia
    for nome_modelo, pipeline in configuracoes:
        acc_temp = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
        if acc_temp > melhor_acc:
            melhor_acc = acc_temp
            melhor_pipe = pipeline
            melhor_nome = nome_modelo
            
    try:
        # Extrai a resposta real do melhor modelo para cada paciente
        y_pred = cross_val_predict(melhor_pipe, X, y, cv=cv_strategy)
        
        acc = accuracy_score(y, y_pred)
        cm = confusion_matrix(y, y_pred)
        
        destaque = " ‚≠ê [ALVO ATINGIDO: >78%]" if acc >= 0.78 else ""
        print(f"üèÜ ARQUITETURA VENCEDORA: {melhor_nome}")
        print(f"üéØ ACUR√ÅCIA GERAL: {acc:.2%} {destaque}\n")
        
        print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
        print(f"                   Predito Controle (0) | Predito TEA (1)")
        print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
        print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")
        
        print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
        report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
        print(report)
        
    except Exception as e:
        print(f"‚ùå Ocorreu um erro ao processar a {nome_condicao}: {e}")

print("="*80)
print("‚úÖ MOTOR DE BUSCA E VALIDA√á√ÉO FINALIZADO.")
print("="*80)

‚öôÔ∏è MOTOR DE MACHINE LEARNING: TESTANDO REDES FUNCIONAIS (N=42)

--------------------------------------------------------------------------------
üß† AVALIANDO CONDI√á√ÉO: FACE FELIZ
--------------------------------------------------------------------------------
üë• Pacientes validados: 42
üèÜ ARQUITETURA VENCEDORA: Random Forest (k=15)
üéØ ACUR√ÅCIA GERAL: 76.19% 

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        19           |        05
Real TEA (1)      |        05           |        13

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.792     0.792     0.792        24
         TEA      0.722     0.722     0.722        18

    accuracy                          0.762        42
   macro avg      0.757     0.757     0.757        42
weighted avg      0.762     0.762     0.762        42


-----------------------------------------------------

In [13]:
# ==============================================================================
# 04. OTIMIZA√á√ÉO DE HIPERPAR√ÇMETROS (FASE 1 - EXPLORAT√ìRIA)
# META: > 78% NA FACE FELIZ | BLINDAGEM: LOOCV + PIPELINE + N=42 C√ìPIA √öNICA
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

print("="*80)
print("üöÄ FASE 1: VARREDURA AVAN√áADA DE ARQUITETURAS PARA O PROJETO NEO")
print("="*80)

# 1. CARREGAMENTO E BLINDAGEM DE DADOS (Garantindo N=42)
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
df = pd.read_csv(caminho_csv)

SEED = 97
cv_strategy = LeaveOneOut()

# Focando na Face Feliz, que j√° provou ser o biomarcador prim√°rio
df_f = df[df['Condicao'].isin(['Face Feliz', 'FF', 'F'])].copy()
df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()

print(f"üë• Pacientes validados para o teste: {df_f.shape[0]}")

y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

# 2. DEFINI√á√ÉO DO ESPA√áO DE BUSCA (Totalmente dentro de Pipelines)
arquiteturas = [
    # Aumentando o campo de vis√£o das √°rvores
    ('Random Forest (k=20)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=20)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    ('Random Forest (k=25)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=25)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    
    # XGBoost (O campe√£o mundial de dados tabulares)
    ('XGBoost (k=15)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=15)), ('clf', XGBClassifier(eval_metric='logloss', max_depth=2, random_state=SEED))])),
    ('XGBoost (k=20)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('sel', SelectKBest(k=20)), ('clf', XGBClassifier(eval_metric='logloss', max_depth=2, random_state=SEED))])),
    
    # A arma secreta para 74 vari√°veis: Compress√£o PCA + SVM
    ('SVM RBF (PCA 10)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('pca', PCA(n_components=10, random_state=SEED)), ('clf', SVC(kernel='rbf', C=10.0, random_state=SEED))])),
    ('SVM RBF (PCA 15)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('pca', PCA(n_components=15, random_state=SEED)), ('clf', SVC(kernel='rbf', C=10.0, random_state=SEED))])),
    ('SVM RBF (PCA 20)', Pipeline([('i', SimpleImputer()), ('s', StandardScaler()), ('pca', PCA(n_components=20, random_state=SEED)), ('clf', SVC(kernel='rbf', C=10.0, random_state=SEED))]))
]

# 3. COMPETI√á√ÉO JUSTA (LOOCV)
melhor_acc = 0
melhor_pipe = None
melhor_nome = ""

print("\n‚è≥ Avaliando 7 arquiteturas complexas (Isso pode levar alguns segundos)...")
for nome_modelo, pipeline in arquiteturas:
    acc_temp = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
    if acc_temp > melhor_acc:
        melhor_acc = acc_temp
        melhor_pipe = pipeline
        melhor_nome = nome_modelo

# 4. LAUDO DA ARQUITETURA VENCEDORA
print("\n" + "="*80)
print(f"üèÜ ARQUITETURA √ìTIMA ENCONTRADA: {melhor_nome}")
print("="*80)

# Gerando a matriz real de cruzamento para o vencedor
y_pred = cross_val_predict(melhor_pipe, X, y, cv=cv_strategy)
acc_final = accuracy_score(y, y_pred)
cm = confusion_matrix(y, y_pred)

destaque = " üöÄ [META ULTRAPASSADA: >78%]" if acc_final > 0.78 else ""
print(f"üéØ ACUR√ÅCIA GERAL: {acc_final:.2%} {destaque}\n")

print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
print(f"                   Predito Controle (0) | Predito TEA (1)")
print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")

print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
print(report)

üöÄ FASE 1: VARREDURA AVAN√áADA DE ARQUITETURAS PARA O PROJETO NEO
üë• Pacientes validados para o teste: 42

‚è≥ Avaliando 7 arquiteturas complexas (Isso pode levar alguns segundos)...

üèÜ ARQUITETURA √ìTIMA ENCONTRADA: SVM RBF (PCA 10)
üéØ ACUR√ÅCIA GERAL: 66.67% 

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        17           |        07
Real TEA (1)      |        07           |        11

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.708     0.708     0.708        24
         TEA      0.611     0.611     0.611        18

    accuracy                          0.667        42
   macro avg      0.660     0.660     0.660        42
weighted avg      0.667     0.667     0.667        42



In [6]:
# ==============================================================================
# 04. FASE 1: VARREDURA AVAN√áADA COM ROBUST SCALER E FEATURE SELECTION
# META: > 78% NA FACE FELIZ | BLINDAGEM: LOOCV + PIPELINE
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.model_selection import LeaveOneOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

print("="*80)
print("üöÄ FASE 1: VARREDURA DE ARQUITETURAS ROBUSTAS (N=42)")
print("="*80)

# 1. CARREGAMENTO E BLINDAGEM DE DADOS
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
df = pd.read_csv(caminho_csv)

SEED = 97
cv_strategy = LeaveOneOut()

# Foco na Face Feliz
df_f = df[df['Condicao'].isin(['Face Feliz', 'FF', 'F'])].copy()
df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()

print(f"üë• Pacientes validados: {df_f.shape[0]}")

y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

# 2. ESPA√áO DE BUSCA COM ROBUST SCALER E NOVAS SELE√á√ïES
# O RobustScaler ignora os outliers biol√≥gicos extremos
arquiteturas = [
    # Estrat√©gia 1: ExtraTrees (√ìtimo para amostras pequenas e ruidosas)
    ('Extra Trees (k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', ExtraTreesClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    
    # Estrat√©gia 2: RF com Sele√ß√£o Baseada em √Årvore (SelectFromModel)
    ('RF + Tree Selection', Pipeline([
        ('i', SimpleImputer()), 
        ('s', RobustScaler()), 
        ('sel', SelectFromModel(ExtraTreesClassifier(n_estimators=50, random_state=SEED))), 
        ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))
    ])),
    
    # Estrat√©gia 3: O Campe√£o Anterior + RobustScaler
    ('Random Forest (Robust k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    
    # Estrat√©gia 4: SVM com Puni√ß√£o de Classes (Balanced) e RobustScaler
    ('SVM RBF (Robust k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', SVC(kernel='rbf', C=5.0, class_weight='balanced', random_state=SEED))]))
]

# 3. COMPETI√á√ÉO (LOOCV)
melhor_acc = 0
melhor_pipe = None
melhor_nome = ""

print("\n‚è≥ Avaliando arquiteturas com imunidade a outliers...")
for nome_modelo, pipeline in arquiteturas:
    acc_temp = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
    if acc_temp > melhor_acc:
        melhor_acc = acc_temp
        melhor_pipe = pipeline
        melhor_nome = nome_modelo

# 4. LAUDO DA ARQUITETURA VENCEDORA
print("\n" + "="*80)
print(f"üèÜ ARQUITETURA √ìTIMA ENCONTRADA: {melhor_nome}")
print("="*80)

y_pred = cross_val_predict(melhor_pipe, X, y, cv=cv_strategy)
acc_final = accuracy_score(y, y_pred)
cm = confusion_matrix(y, y_pred)

destaque = " üöÄ [META ULTRAPASSADA: >78%]" if acc_final > 0.78 else ""
print(f"üéØ ACUR√ÅCIA GERAL: {acc_final:.2%} {destaque}\n")

print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
print(f"                   Predito Controle (0) | Predito TEA (1)")
print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")

print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
print(report)

üöÄ FASE 1: VARREDURA DE ARQUITETURAS ROBUSTAS (N=42)
üë• Pacientes validados: 42

‚è≥ Avaliando arquiteturas com imunidade a outliers...

üèÜ ARQUITETURA √ìTIMA ENCONTRADA: Random Forest (Robust k=15)
üéØ ACUR√ÅCIA GERAL: 76.19% 

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        19           |        05
Real TEA (1)      |        05           |        13

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.792     0.792     0.792        24
         TEA      0.722     0.722     0.722        18

    accuracy                          0.762        42
   macro avg      0.757     0.757     0.757        42
weighted avg      0.762     0.762     0.762        42



In [14]:
# ==============================================================================
# 04. FASE 1: AVALIA√á√ÉO DE ALTA DIMENSIONALIDADE (90 FEATURES)
# BLINDAGEM: LOOCV + COMPRESS√ÉO DE VARI√ÇNCIA + GARANTIA DE N=42
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è MOTOR DE MACHINE LEARNING: ALTA DIMENSIONALIDADE (90 FEATURES)")
print("="*80)

# 1. CARREGAMENTO E BLINDAGEM DE DADOS
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
try:
    df = pd.read_csv(caminho_csv)
except FileNotFoundError:
    raise FileNotFoundError(f"‚ùå Erro: Arquivo {caminho_csv} n√£o encontrado.")

SEED = 97
cv_strategy = LeaveOneOut()

# Focaremos na Face Feliz para avaliar o impacto da nova biologia
df_f = df[df['Condicao'].isin(['Face Feliz', 'FF', 'F'])].copy()

# BARREIRA DE SEGURAN√áA: Garante exatamente 42 pacientes, fundindo duplicatas
df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()

print(f"üë• Pacientes validados estritamente para o teste: {df_f.shape[0]}")

y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

# 2. DEFINI√á√ÉO DE ARQUITETURAS RESTRITIVAS E ROBUSTAS
arquiteturas = [
    # Compress√£o de Vari√¢ncia via PCA
    ('SVM Linear (PCA 10)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('pca', PCA(n_components=10, random_state=SEED)), ('clf', SVC(kernel='linear', C=1.0, random_state=SEED))])),
    ('SVM RBF (PCA 15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('pca', PCA(n_components=15, random_state=SEED)), ('clf', SVC(kernel='rbf', C=10.0, random_state=SEED))])),
    
    # Sele√ß√£o Estrita baseada em √Årvores (Extra Trees penaliza overfitting melhor que Random Forest)
    ('Extra Trees (k=10)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=10)), ('clf', ExtraTreesClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    ('Extra Trees (k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', ExtraTreesClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    
    # Random Forest tradicional com controle de profundidade
    ('Random Forest (k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))]))
]

# 3. VALIDA√á√ÉO RIGOROSA (LOOCV)
melhor_acc = 0
melhor_pipe = None
melhor_nome = ""

print("\n‚è≥ Submetendo as 90 caracter√≠sticas ao LOOCV. Avaliando arquiteturas...")
for nome_modelo, pipeline in arquiteturas:
    acc_temp = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
    if acc_temp > melhor_acc:
        melhor_acc = acc_temp
        melhor_pipe = pipeline
        melhor_nome = nome_modelo

# 4. EXIBI√á√ÉO DO RESULTADO E MATRIZ
print("\n" + "="*80)
print(f"üèÜ ARQUITETURA DE MAIOR CAPACIDADE PREDITIVA: {melhor_nome}")
print("="*80)

y_pred = cross_val_predict(melhor_pipe, X, y, cv=cv_strategy)
acc_final = accuracy_score(y, y_pred)
cm = confusion_matrix(y, y_pred)

destaque = " üöÄ [ALVO DE ALTA PERFORMANCE ATINGIDO]" if acc_final > 0.78 else " ‚ö†Ô∏è [ABAIXO DA META DE 78%]"
print(f"üéØ ACUR√ÅCIA GERAL: {acc_final:.2%} {destaque}\n")

print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
print(f"                   Predito Controle (0) | Predito TEA (1)")
print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")

print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
print(report)

‚öôÔ∏è MOTOR DE MACHINE LEARNING: ALTA DIMENSIONALIDADE (90 FEATURES)
üë• Pacientes validados estritamente para o teste: 42

‚è≥ Submetendo as 90 caracter√≠sticas ao LOOCV. Avaliando arquiteturas...

üèÜ ARQUITETURA DE MAIOR CAPACIDADE PREDITIVA: Extra Trees (k=15)
üéØ ACUR√ÅCIA GERAL: 69.05%  ‚ö†Ô∏è [ABAIXO DA META DE 78%]

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        22           |        02
Real TEA (1)      |        11           |        07

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.667     0.917     0.772        24
         TEA      0.778     0.389     0.519        18

    accuracy                          0.690        42
   macro avg      0.722     0.653     0.645        42
weighted avg      0.714     0.690     0.663        42



In [15]:
# ==============================================================================
# 04. FASE 1: A BUSCA FINAL COM ELIMINA√á√ÉO RECURSIVA (RFE)
# META: > 78% | BLINDAGEM: LOOCV + N=42 + PIPELINE ROBUSTO
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import LeaveOneOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è MOTOR DE SELE√á√ÉO AVAN√áADA: ELIMINA√á√ÉO RECURSIVA (RFE)")
print("="*80)

caminho_csv = '../reports/tabela_features_eeg_completa.csv'
df = pd.read_csv(caminho_csv)

SEED = 97
cv_strategy = LeaveOneOut()

# Foco na Face Feliz
df_f = df[df['Condicao'].isin(['Face Feliz', 'FF', 'F'])].copy()
df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()

print(f"üë• Pacientes validados: {df_f.shape[0]}")

y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

# Estimador base para o RFE julgar o trabalho em equipe das vari√°veis
seletor_base = LogisticRegression(solver='liblinear', random_state=SEED)

arquiteturas = [
    ('RF + RFE (k=10)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', RFE(estimator=seletor_base, n_features_to_select=10)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    ('RF + RFE (k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', RFE(estimator=seletor_base, n_features_to_select=15)), ('clf', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED))])),
    ('SVM Linear + RFE (k=12)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', RFE(estimator=seletor_base, n_features_to_select=12)), ('clf', SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=SEED))]))
]

melhor_acc = 0
melhor_pipe = None
melhor_nome = ""

print("\n‚è≥ Submetendo ao RFE. Procurando o 33¬∫ paciente...")
for nome_modelo, pipeline in arquiteturas:
    acc_temp = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
    if acc_temp > melhor_acc:
        melhor_acc = acc_temp
        melhor_pipe = pipeline
        melhor_nome = nome_modelo

print("\n" + "="*80)
print(f"üèÜ ARQUITETURA √ìTIMA ENCONTRADA: {melhor_nome}")
print("="*80)

y_pred = cross_val_predict(melhor_pipe, X, y, cv=cv_strategy)
acc_final = accuracy_score(y, y_pred)
cm = confusion_matrix(y, y_pred)

destaque = " üöÄ [ALVO ATINGIDO: >78%]" if acc_final > 0.78 else " ‚öñÔ∏è [LIMITE BIOL√ìGICO ATINGIDO]"
print(f"üéØ ACUR√ÅCIA GERAL: {acc_final:.2%} {destaque}\n")

print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
print(f"                   Predito Controle (0) | Predito TEA (1)")
print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")

print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
print(report)

‚öôÔ∏è MOTOR DE SELE√á√ÉO AVAN√áADA: ELIMINA√á√ÉO RECURSIVA (RFE)
üë• Pacientes validados: 42

‚è≥ Submetendo ao RFE. Procurando o 33¬∫ paciente...

üèÜ ARQUITETURA √ìTIMA ENCONTRADA: RF + RFE (k=15)
üéØ ACUR√ÅCIA GERAL: 69.05%  ‚öñÔ∏è [LIMITE BIOL√ìGICO ATINGIDO]

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        19           |        05
Real TEA (1)      |        08           |        10

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.704     0.792     0.745        24
         TEA      0.667     0.556     0.606        18

    accuracy                          0.690        42
   macro avg      0.685     0.674     0.676        42
weighted avg      0.688     0.690     0.686        42



In [16]:
# ==============================================================================
# 04. FASE 1: A BUSCA DEFINITIVA COM XGBOOST (O ESTADO DA ARTE)
# META: > 78% | BLINDAGEM: LOOCV + N=42 + PIPELINE + REGULARIZA√á√ÉO
# ==============================================================================

import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut, cross_val_predict, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è MOTOR XGBOOST: TESTANDO GRADIENT BOOSTING COM REGULARIZA√á√ÉO")
print("="*80)

caminho_csv = '../reports/tabela_features_eeg_completa.csv'
try:
    df = pd.read_csv(caminho_csv)
except FileNotFoundError:
    raise FileNotFoundError(f"‚ùå Erro: Arquivo {caminho_csv} n√£o encontrado.")

SEED = 97
cv_strategy = LeaveOneOut()

# Foco na Face Feliz
df_f = df[df['Condicao'].isin(['Face Feliz', 'FF', 'F'])].copy()
df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()

print(f"üë• Pacientes validados para o XGBoost: {df_f.shape[0]}")

y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

# Hiperpar√¢metros estritos para evitar overfitting em amostras pequenas:
# max_depth=2 (√°rvores muito rasas)
# reg_lambda e reg_alpha (penalidades fortes contra decoreba)
xgb_model = XGBClassifier(
    n_estimators=100, 
    max_depth=2, 
    learning_rate=0.05,
    reg_lambda=1.5,
    reg_alpha=0.5,
    use_label_encoder=False, 
    eval_metric='logloss', 
    random_state=SEED
)

arquiteturas = [
    ('XGBoost + KBest (k=10)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=10)), ('clf', xgb_model)])),
    ('XGBoost + KBest (k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', xgb_model)])),
    ('XGBoost + KBest (k=20)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=20)), ('clf', xgb_model)])),
    
    ('XGBoost + PCA (k=10)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('pca', PCA(n_components=10, random_state=SEED)), ('clf', xgb_model)])),
    ('XGBoost + PCA (k=15)', Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('pca', PCA(n_components=15, random_state=SEED)), ('clf', xgb_model)]))
]

melhor_acc = 0
melhor_pipe = None
melhor_nome = ""

print("\n‚è≥ Submetendo a base de 74 vari√°veis ao XGBoost...")
for nome_modelo, pipeline in arquiteturas:
    acc_temp = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
    if acc_temp > melhor_acc:
        melhor_acc = acc_temp
        melhor_pipe = pipeline
        melhor_nome = nome_modelo

print("\n" + "="*80)
print(f"üèÜ ARQUITETURA √ìTIMA ENCONTRADA: {melhor_nome}")
print("="*80)

y_pred = cross_val_predict(melhor_pipe, X, y, cv=cv_strategy)
acc_final = accuracy_score(y, y_pred)
cm = confusion_matrix(y, y_pred)

destaque = " üöÄ [ALVO ATINGIDO: >78%]" if acc_final > 0.78 else " ‚öñÔ∏è [AQU√âM DA META]"
print(f"üéØ ACUR√ÅCIA GERAL: {acc_final:.2%} {destaque}\n")

print("üìä MATRIZ DE CONFUS√ÉO (Real x Predito):")
print(f"                   Predito Controle (0) | Predito TEA (1)")
print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")

print("üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:")
report = classification_report(y, y_pred, target_names=['Controle', 'TEA'], digits=3)
print(report)

‚öôÔ∏è MOTOR XGBOOST: TESTANDO GRADIENT BOOSTING COM REGULARIZA√á√ÉO
üë• Pacientes validados para o XGBoost: 42

‚è≥ Submetendo a base de 74 vari√°veis ao XGBoost...

üèÜ ARQUITETURA √ìTIMA ENCONTRADA: XGBoost + PCA (k=15)
üéØ ACUR√ÅCIA GERAL: 64.29%  ‚öñÔ∏è [AQU√âM DA META]

üìä MATRIZ DE CONFUS√ÉO (Real x Predito):
                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        19           |        05
Real TEA (1)      |        10           |        08

üìà RELAT√ìRIO DE CLASSIFICA√á√ÉO:
              precision    recall  f1-score   support

    Controle      0.655     0.792     0.717        24
         TEA      0.615     0.444     0.516        18

    accuracy                          0.643        42
   macro avg      0.635     0.618     0.617        42
weighted avg      0.638     0.643     0.631        42



In [17]:
# ==============================================================================
# 04. BENCHMARK METODOL√ìGICO: VALIDA√á√ÉO CRUZADA E VOTING CLASSIFIER
# OBJETIVO: Comparar K-Fold vs LOOCV e testar Ensemble Heterog√™neo (Junta M√©dica)
# ==============================================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneOut, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings

warnings.filterwarnings("ignore")

print("="*80)
print("‚öôÔ∏è BENCHMARK PARA A BANCA: K-FOLD vs LOOCV + JUNTA M√âDICA (VOTING)")
print("="*80)

# 1. CARREGAMENTO E BLINDAGEM DOS DADOS (N=42)
caminho_csv = '../reports/tabela_features_eeg_completa.csv'
df = pd.read_csv(caminho_csv)

SEED = 97

df_f = df[df['Condicao'].isin(['Face Feliz', 'FF', 'F'])].copy()
df_f = df_f.groupby(['ID', 'Grupo']).mean(numeric_only=True).reset_index()

y = df_f['Grupo'].apply(lambda x: 1 if 'TEA' in x else 0).values
X = df_f.drop(columns=['ID', 'Grupo'], errors='ignore')

# 2. AS DUAS ARQUITETURAS CAMPE√ÉS
# Modelo A: O Campe√£o Anterior (Random Forest)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED)

# Modelo B: A Junta M√©dica (Voting Classifier com Soft Voting)
# 'soft' significa que eles somam as probabilidades matem√°ticas antes de decidir
clf1 = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=SEED)
clf2 = SVC(kernel='rbf', C=5.0, probability=True, random_state=SEED)
clf3 = ExtraTreesClassifier(n_estimators=100, max_depth=3, random_state=SEED)

voting_model = VotingClassifier(estimators=[('rf', clf1), ('svm', clf2), ('et', clf3)], voting='soft')

arquiteturas = {
    'Random Forest Solo': Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', rf_model)]),
    'Junta M√©dica (Voting)': Pipeline([('i', SimpleImputer()), ('s', RobustScaler()), ('sel', SelectKBest(f_classif, k=15)), ('clf', voting_model)])
}

# 3. AS TR√äS R√âGUAS DE VALIDA√á√ÉO (O que a Banca pediu)
validacoes = {
    'Stratified K-Fold (k=5)': StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
    'Stratified K-Fold (k=10)': StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED),
    'Leave-One-Out (LOOCV)': LeaveOneOut()
}

# 4. EXECU√á√ÉO DO BENCHMARK
resultados = []

print(f"üë• Pacientes validados estritamente: {df_f.shape[0]}")
print("‚è≥ Rodando a matriz de testes pesada. Aguarde...\n")

for nome_val, cv_strategy in validacoes.items():
    print(f"--- Testando Valida√ß√£o: {nome_val} ---")
    for nome_arq, pipeline in arquiteturas.items():
        # Acur√°cia m√©dia
        acc = cross_val_score(pipeline, X, y, cv=cv_strategy).mean()
        resultados.append({'Valida√ß√£o': nome_val, 'Modelo': nome_arq, 'Acur√°cia': acc})
        
        destaque = "üöÄ ALVO ATINGIDO" if acc > 0.78 else ""
        print(f"   {nome_arq}: {acc:.2%} {destaque}")
    print("")

# 5. O LAUDO FINAL E A PROVA DO LOOCV
df_res = pd.DataFrame(resultados)
melhor_teste = df_res.loc[df_res['Acur√°cia'].idxmax()]

print("="*80)
print(f"üèÜ PICO M√ÅXIMO ATINGIDO: {melhor_teste['Acur√°cia']:.2%} ({melhor_teste['Modelo']} via {melhor_teste['Valida√ß√£o']})")
print("="*80)

# 6. MATRIZ DE CONFUS√ÉO PARA A JUNTA M√âDICA (VOTING) COM LOOCV
print("\nüìä LAUDO CL√çNICO: JUNTA M√âDICA + LOOCV (O PADR√ÉO-OURO):")
pipe_voting = arquiteturas['Junta M√©dica (Voting)']
y_pred_voting = cross_val_predict(pipe_voting, X, y, cv=LeaveOneOut())
acc_voting_loocv = accuracy_score(y, y_pred_voting)
cm = confusion_matrix(y, y_pred_voting)

print(f"üéØ ACUR√ÅCIA GERAL LOOCV: {acc_voting_loocv:.2%}\n")
print(f"                   Predito Controle (0) | Predito TEA (1)")
print(f"Real Controle (0) |        {cm[0,0]:02d}           |        {cm[0,1]:02d}")
print(f"Real TEA (1)      |        {cm[1,0]:02d}           |        {cm[1,1]:02d}\n")

report = classification_report(y, y_pred_voting, target_names=['Controle', 'TEA'], digits=3)
print(report)

‚öôÔ∏è BENCHMARK PARA A BANCA: K-FOLD vs LOOCV + JUNTA M√âDICA (VOTING)
üë• Pacientes validados estritamente: 42
‚è≥ Rodando a matriz de testes pesada. Aguarde...

--- Testando Valida√ß√£o: Stratified K-Fold (k=5) ---
   Random Forest Solo: 66.94% 
   Junta M√©dica (Voting): 67.22% 

--- Testando Valida√ß√£o: Stratified K-Fold (k=10) ---
   Random Forest Solo: 64.50% 
   Junta M√©dica (Voting): 60.00% 

--- Testando Valida√ß√£o: Leave-One-Out (LOOCV) ---
   Random Forest Solo: 64.29% 
   Junta M√©dica (Voting): 61.90% 

üèÜ PICO M√ÅXIMO ATINGIDO: 67.22% (Junta M√©dica (Voting) via Stratified K-Fold (k=5))

üìä LAUDO CL√çNICO: JUNTA M√âDICA + LOOCV (O PADR√ÉO-OURO):
üéØ ACUR√ÅCIA GERAL LOOCV: 61.90%

                   Predito Controle (0) | Predito TEA (1)
Real Controle (0) |        19           |        05
Real TEA (1)      |        11           |        07

              precision    recall  f1-score   support

    Controle      0.633     0.792     0.704        24
         TEA   