<a href="https://colab.research.google.com/github/aureavaleria/DataBalancing-Research/blob/main/papers/Artigo%201/V5/Teste%204/vers%C3%A3o_5_(teste_04_engenharia_de_atributos).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, average_precision_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

# Load the dataset and drop every row that contains at least one missing value.
# The index is NOT reset afterwards, so positional access (.iloc) must be used
# when rows are selected by integer position later on.
df = pd.read_csv('https://raw.githubusercontent.com/aureavaleria/dataset/refs/heads/main/export.csv')
df.dropna(inplace=True)

# ------------------ FEATURE ENGINEERING ------------------

# 1. Age_group_numeric: ordinal encoding of the age brackets.
#    Labels that are not keys of `age_map` become NaN (Series.map semantics).
age_map = {
    '<1 year': 0, '1-4 years': 1, '5-9 years': 2, '10-14 years': 3, '15-19 years': 4,
    '20-24 years': 5, '25-29 years': 6, '30-34 years': 7, '35-39 years': 8, '40-44 years': 9,
    '45-49 years': 10, '50-54 years': 11, '55-59 years': 12, '60-64 years': 13, '65-69 years': 14,
    '70-74 years': 15, '75-79 years': 16, '80-84 years': 17, '85+ years': 18
}
df['Age_group_numeric'] = df['Age recode with <1 year olds'].map(age_map)

# 2. Log of the tumor size, with the 999 code treated as missing.
#    The cast to float happens BEFORE the replacement so the sentinel is removed
#    whether the column was parsed as text ('999') or as a number (999);
#    replacing the string '999' on a numeric column would silently match nothing.
#    log1p is used so a size of 0 maps to 0 instead of -inf.
df['Tumor_size_log'] = np.log1p(
    df['CS tumor size (2004-2015)'].astype(float).replace(999.0, np.nan)
)
# 3. Ordinalização de estágios T e N
def stage_to_num(stage):
    """
    Convert a T or N stage label into a number.

    The digits found in the label are concatenated and returned as an int
    (e.g. 'T4a' -> 4, 'N0' -> 0). np.nan is returned when the value is null,
    when its text contains 'X' or 'is' (e.g. 'TX', 'NX', 'Tis'), or when it
    contains no digit at all.
    """
    if pd.isnull(stage):
        return np.nan

    label = str(stage)

    # Labels flagged as not assessable / in situ carry no ordinal information.
    if any(marker in label for marker in ('X', 'is')):
        return np.nan

    # Keep only the digit characters; letters such as the 'T' prefix or an
    # 'a'/'b' suffix are discarded.
    digits = ''.join(filter(str.isdigit, label))
    if not digits:
        return np.nan
    return int(digits)

# 3. Ordinal T and N stages (NaN where stage_to_num cannot extract a number).
df['T_stage_num'] = df['Derived AJCC T, 7th ed (2010-2015)'].apply(stage_to_num)
df['N_stage_num'] = df['Derived AJCC N, 7th ed (2010-2015)'].apply(stage_to_num)

# 4. Aggressiveness_score: unweighted sum of T stage, N stage and grade (1-4).
#    Missing or unmapped components contribute 0.
#    This must run before step 9, which overwrites the grade column with
#    label-encoded integers that the text-keyed map below would no longer match.
df['Aggressiveness_score'] = (
    df['T_stage_num'].fillna(0) +
    df['N_stage_num'].fillna(0) +
    df['Grade Recode (thru 2017)'].map({'Well differentiated; Grade I':1, 'Moderately differentiated; Grade II':2,
                                        'Poorly differentiated; Grade III':3, 'Undifferentiated; anaplastic; Grade IV':4}).fillna(0)
)

# 5. CEA_positive: 1 when the text contains "positive" (case-insensitive), else 0.
#    Also has to run before step 9 re-encodes the source column.
df['CEA_positive'] = df['CEA Pretreatment Interpretation Recode (2010+)'].apply(lambda x: 1 if 'positive' in str(x).lower() else 0)

# 6. Tumor_deposits_present: 1 when the text contains "present" or "yes"
#    (case-insensitive), else 0. Also has to run before step 9.
df['Tumor_deposits_present'] = df['Tumor Deposits Recode (2010+)'].apply(lambda x: 1 if 'present' in str(x).lower() or 'yes' in str(x).lower() else 0)

# 7. Histology_simplified: the listed ICD-O-3 codes are grouped as
#    'Adenocarcinoma', everything else as 'Other', then label-encoded.
df['Histology_simplified'] = df['Histologic Type ICD-O-3'].apply(lambda x: 'Adenocarcinoma' if str(x) in ['8140', '8261', '8263', '8210'] else 'Other')
df['Histology_simplified'] = LabelEncoder().fit_transform(df['Histology_simplified'])

# 8. Interaction terms.
# NOTE(review): the raw tumor size is used here (and as a feature below), so the
# 999 code that Tumor_size_log treats as missing is still present - confirm
# whether that is intended.
df['Interaction_TumorSize_Grade'] = df['CS tumor size (2004-2015)'].astype(float) * df['Aggressiveness_score']
df['Interaction_Age_Stage'] = df['Age_group_numeric'] * df['T_stage_num']

# 9. Label-encode the remaining categorical columns in place.
#    Values are cast to str first so every column is encoded as text.
cols_to_label_encode = [
    'Sex', 'Race recode (White, Black, Other)', 'Marital status at diagnosis',
    'Origin recode NHIA (Hispanic, Non-Hisp)', 'Primary Site',
    'Grade Recode (thru 2017)', 'CEA Pretreatment Interpretation Recode (2010+)', 'Tumor Deposits Recode (2010+)'
]
for col in cols_to_label_encode:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# 10. Final feature selection.
feature_cols = [
    'Age_group_numeric', 'Sex', 'Race recode (White, Black, Other)', 'Marital status at diagnosis',
    'Origin recode NHIA (Hispanic, Non-Hisp)', 'Primary Site',
    'Grade Recode (thru 2017)', 'CS tumor size (2004-2015)', 'Tumor_size_log',
    'T_stage_num', 'N_stage_num', 'Aggressiveness_score', 'CEA_positive', 'Tumor_deposits_present',
    'Histology_simplified', 'Interaction_TumorSize_Grade', 'Interaction_Age_Stage'
]

X = df[feature_cols]

# Binary targets; values outside the mapping become NaN.
y_liver = df['SEER Combined Mets at DX-liver (2010+)'].map({'No':0, 'Yes':1, 0:0, 1:1})
y_lung = df['SEER Combined Mets at DX-lung (2010+)'].map({'No':0, 'Yes':1, 0:0, 1:1})

# (Optional) Standardized copy of the features for KNN/SVM.
# The scaler is fitted on ALL rows here, so this frame is for inspection only.
# X's index is passed through explicitly: df.dropna() left gaps in the index,
# and without `index=` the scaled frame would get a fresh 0..n-1 index that no
# longer lines up with y_liver / y_lung.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Quick sanity check of the engineered data.
print(X_scaled.head())
print("y_liver:", y_liver.value_counts())
print("y_lung:", y_lung.value_counts())


   Age_group_numeric       Sex  Race recode (White, Black, Other)  \
0           0.018962 -1.057757                          -1.435123   
1           0.383254 -1.057757                           0.519604   
2           1.476133  0.945397                           0.519604   
3           1.111840 -1.057757                           0.519604   
4           1.476133  0.945397                           0.519604   

   Marital status at diagnosis  Origin recode NHIA (Hispanic, Non-Hisp)  \
0                     0.437285                                -0.297766   
1                     0.437285                                -0.297766   
2                     1.984518                                -0.297766   
3                     1.984518                                -0.297766   
4                     0.953029                                -0.297766   

   Primary Site  Grade Recode (thru 2017)  CS tumor size (2004-2015)  \
0      1.336016                 -0.628324                  -0.

In [None]:
# Raw target columns. This rebinds y_liver / y_lung to the unmapped values.
y_liver = df['SEER Combined Mets at DX-liver (2010+)']
y_lung = df['SEER Combined Mets at DX-lung (2010+)']

# Combined binary target: 1 when either column equals 'Yes', 0 otherwise.
# (The earlier two-column `y = pd.concat(...)` was overwritten immediately
# below without being used, so it was dropped.)
df['Binary Mets'] = ((y_liver == 'Yes') | (y_lung == 'Yes')).astype(int)

y = df['Binary Mets']

print("Valores únicos de y:", y.unique())
print("Shape de X:", X.shape, "Shape de y:", y.shape)

# Ready to be used with StratifiedKFold:
from sklearn.model_selection import StratifiedKFold

Valores únicos de y: [0 1]
Shape de X: (53448, 17) Shape de y: (53448,)


In [None]:
# Classifiers under comparison, each with its hyper-parameters fixed up front.
# Insertion order matters: it is the order in which models are trained and
# reported.
models = {
    "Decision Tree": DecisionTreeClassifier(
        criterion='gini', max_depth=5,
        min_samples_split=2, min_samples_leaf=10,
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, criterion='entropy', max_depth=15,
        min_samples_split=2, min_samples_leaf=5,
        bootstrap=True,
        random_state=42,
    ),
    "SVM": SVC(
        C=10, kernel='poly', degree=3, gamma='scale',
        probability=True,  # enables predict_proba on the fitted model
        random_state=42,
    ),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(
        n_neighbors=11, weights='uniform',
        metric='manhattan', leaf_size=20,
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=6,
        subsample=1.0, colsample_bytree=0.6,
        reg_alpha=0.1, reg_lambda=10.0,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=3,
        subsample=0.8,
        min_samples_split=2, min_samples_leaf=5,
        random_state=42,
    ),
}

# Class-balancing techniques to evaluate (only SMOTE at the moment).
smote_techniques = {"SMOTE": SMOTE(random_state=42)}

# Stratified 5-fold cross-validation: each fold keeps the class proportions,
# and rows are shuffled (with a fixed seed) before splitting.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# SimpleImputer is used below but is not part of the visible import cell;
# importing it here keeps this cell runnable on a fresh kernel.
from sklearn.impute import SimpleImputer

results = []
# Evaluate every (balancing technique, model) pair with stratified k-fold CV.
for smote_name, smote in smote_techniques.items():
    for model_name, model in models.items():
        print(f"\nAplicando {smote_name} com {model_name}")
        fold_metrics = []

        for train_index, test_index in kf.split(X, y):
            # kf.split yields positional indices, hence .iloc.
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Fill missing feature values with the per-column mode, learned on
            # the training fold only and then applied to the test fold.
            imputer = SimpleImputer(strategy='most_frequent')
            X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
            X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

            # Oversample the training fold only; the test fold keeps its
            # original class distribution.
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

            # Standardize using statistics of the resampled training fold.
            scaler = StandardScaler()
            X_train_res = scaler.fit_transform(X_train_res)
            X_test = scaler.transform(X_test)

            # Fit the model and predict labels and positive-class probabilities.
            model.fit(X_train_res, y_train_res)
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1]

            # Per-fold metrics.
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            auc_pr = average_precision_score(y_test, y_pred_proba)
            fold_metrics.append((f1, auc, auc_pr))

        # Average each metric over the folds (column-wise mean).
        avg_f1, avg_auc, avg_auc_pr = np.mean(fold_metrics, axis=0)
        results.append({
            "SMOTE Technique": smote_name,
            "Model": model_name,
            "F1-Score": avg_f1,
            "AUC-ROC": avg_auc,
            "AUC-PR": avg_auc_pr
        })

# Results table.
results_df = pd.DataFrame(results)
print("\nResultados Comparativos:")
print(results_df)

# Save the results to a plain-text report. The encoding is set explicitly so
# the file does not depend on the platform default.
with open("comparacao_smote_resultados.txt", "w", encoding="utf-8") as file:
    for _, row in results_df.iterrows():
        file.write(f"SMOTE Technique: {row['SMOTE Technique']}\n")
        file.write(f"Model: {row['Model']}\n")
        file.write(f"F1-Score: {row['F1-Score']:.4f}\n")
        file.write(f"AUC-ROC: {row['AUC-ROC']:.4f}\n")
        file.write(f"AUC-PR: {row['AUC-PR']:.4f}\n")
        file.write("-" * 50 + "\n")

print("Resultados salvos em 'comparacao_smote_resultados.txt'")


Aplicando SMOTE com Decision Tree

Aplicando SMOTE com Random Forest

Aplicando SMOTE com SVM

Aplicando SMOTE com Naive Bayes

Aplicando SMOTE com KNN

Aplicando SMOTE com XGBoost

Aplicando SMOTE com Gradient Boosting

Resultados Comparativos:
  SMOTE Technique              Model  F1-Score   AUC-ROC    AUC-PR
0           SMOTE      Decision Tree  0.478187  0.811536  0.422620
1           SMOTE      Random Forest  0.494554  0.846725  0.532971
2           SMOTE                SVM  0.500580  0.814183  0.490754
3           SMOTE        Naive Bayes  0.410339  0.758925  0.342819
4           SMOTE                KNN  0.470554  0.798997  0.432645
5           SMOTE            XGBoost  0.457894  0.848984  0.540143
6           SMOTE  Gradient Boosting  0.455077  0.843331  0.528723
Resultados salvos em 'comparacao_smote_resultados.txt'
