<a href="https://colab.research.google.com/github/aureavaleria/DataBalancing-Research/blob/main/papers/Artigo%201/V3/Vers%C3%A3o_3_(ajustes_de_hiperpar%C3%A2metros_SVM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from concurrent.futures import ProcessPoolExecutor

In [None]:
# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/aureavaleria/Reprodu-o/main/export.csv')

# Report missing values per column, then drop every row that has at least one.
# NOTE(review): dropna() considers ALL columns, including ones that are not
# used as a feature or in the target below — confirm that is intended.
print("Valores faltantes por coluna:\n", df.isnull().sum())
df.dropna(inplace=True)

# Predictor columns (X)
feature_columns = [
    'Age recode with <1 year olds', 'Sex', 'Race recode (White, Black, Other)',
    'Histologic Type ICD-O-3', 'Grade Recode (thru 2017)', 'Primary Site',
    'Derived AJCC T, 7th ed (2010-2015)', 'Derived AJCC N, 7th ed (2010-2015)',
    'CS tumor size (2004-2015)', 'CEA Pretreatment Interpretation Recode (2010+)',
    'Tumor Deposits Recode (2010+)', 'Marital status at diagnosis',
    'Origin recode NHIA (Hispanic, Non-Hisp)',
]
# Take an explicit copy: X is modified column by column later on, and assigning
# into a selection of `df` raises pandas' SettingWithCopyWarning.
X = df[feature_columns].copy()

# Binary target (y): 1 when liver OR lung metastasis at diagnosis is 'Yes', else 0
y = (df['SEER Combined Mets at DX-liver (2010+)'] == 'Yes') | (df['SEER Combined Mets at DX-lung (2010+)'] == 'Yes')
y = y.astype(int)

Valores faltantes por coluna:
 Patient ID                                         0
Age recode with <1 year olds                       0
Sex                                                0
Race recode (White, Black, Other)                  0
Histologic Type ICD-O-3                            0
Grade Recode (thru 2017)                           0
Primary Site                                       0
Derived AJCC T, 7th ed (2010-2015)                 0
Derived AJCC N, 7th ed (2010-2015)                 0
CS tumor size (2004-2015)                          0
CEA Pretreatment Interpretation Recode (2010+)     0
Tumor Deposits Recode (2010+)                      0
Marital status at diagnosis                        0
Origin recode NHIA (Hispanic, Non-Hisp)            0
SEER Combined Mets at DX-lung (2010+)             15
SEER Combined Mets at DX-liver (2010+)            12
SEER Combined Mets at DX-bone (2010+)             14
ICD-O-3 Hist/behav                                 0
ICD-O-3 Hist/be

In [None]:
# Work on an independent copy so the column assignments below never write into
# a selection of `df` (that is what triggers pandas' SettingWithCopyWarning).
X = X.copy()

# Integer-encode every non-numeric column and keep the fitted encoder per
# column in `label_encoders` so the mapping can be inspected or inverted later.
# Checking "not numeric" instead of `dtype == 'object'` also covers text
# columns that pandas stores with a dedicated string dtype.
label_encoders = {}
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

# Split into train and test sets (stratified so both keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Balance the training set only with SMOTE; the test set keeps its original distribution.
# NOTE(review): SMOTE runs here on the unscaled, integer-encoded features —
# consider whether scaling first (or a categorical-aware sampler) is more appropriate.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Standardize: fit the scaler on the (balanced) training data, apply the same transform to test
scaler = StandardScaler()
X_train_bal = scaler.fit_transform(X_train_bal)
X_test = scaler.transform(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [None]:
# ### 3. Simpler hyperparameter configuration
param_grid = {
    "C": [1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", 0.1]
}
block_size = 3  # Candidates evaluated per block
total_iterations = 9  # Total number of candidates requested
blocks = total_iterations // block_size

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # Fewer folds to keep cross-validation cheap


def fit_block(n_iter, random_state=42):
    """Run one randomized search block over `param_grid`.

    Args:
        n_iter: number of parameter combinations sampled in this block.
        random_state: seed for the candidate sampling. Pass a different value
            per block; with the same seed every block samples the same
            candidates.

    Returns:
        Tuple `(best_params_, best_score_)` of the fitted search, where the
        score is the mean cross-validated AUC-ROC.
    """
    print(f"\nExecutando bloco com {n_iter} iterações...")
    # probability=True is not needed while searching: it only adds an internal
    # calibration step to every fit. It is enabled on the final model below,
    # which is the only place predict_proba is called.
    random_search = RandomizedSearchCV(
        estimator=SVC(),
        param_distributions=param_grid,
        n_iter=n_iter,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
        random_state=random_state,
        verbose=1,
    )
    random_search.fit(X_train_bal, y_train_bal)
    return random_search.best_params_, random_search.best_score_


# Iterate over the blocks. Each block gets its own seed; otherwise all blocks
# would draw the same candidates and only repeat the first block's work.
# (Blocks are sampled independently, so a candidate may still appear in more
# than one block.)
results = []
for i in range(blocks):
    print(f"\nIniciando bloco {i + 1} de {blocks}...")
    params, score = fit_block(block_size, random_state=42 + i)
    results.append((params, score))
    print(f"Bloco {i + 1} concluído: Melhor pontuação AUC-ROC: {score:.4f}")

# ### 4. Results
# Show the best parameters and score found by each block
for i, (params, score) in enumerate(results):
    print(f"\nBloco {i + 1} - Melhores parâmetros: {params}")
    print(f"Bloco {i + 1} - Melhor pontuação (AUC-ROC): {score:.4f}")

# Use the parameters of the best-scoring block (not simply the last one)
best_params = max(results, key=lambda item: item[1])[0]
# random_state makes the probability calibration of the final model reproducible
best_model = SVC(**best_params, probability=True, random_state=42).fit(X_train_bal, y_train_bal)

# Evaluate the model on the held-out test set
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
y_pred = best_model.predict(X_test)

print(f"\nAUC-ROC no conjunto de teste: {roc_auc:.4f}")
print("\nRelatório de classificação no conjunto de teste:")
print(classification_report(y_test, y_pred))



Iniciando bloco 1 de 3...

Executando bloco com 3 iterações...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
