In [1]:
import warnings

import matplotlib.pyplot as plt

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix, balanced_accuracy_score, roc_auc_score, \
    classification_report, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from ../data (path is relative to the notebook's working directory)
df = pd.read_csv('../data/lung_cancer_prediction_dataset.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/lung_cancer_prediction_dataset_2.csv'

In [None]:
# Summary of the raw frame: column dtypes and non-null counts
df.info()

In [None]:
# Relative frequency of each class of the target variable
df['Lung_Cancer_Diagnosis'].value_counts(normalize=True)

In [None]:
# Relative frequency of each Cancer_Stage value
df['Cancer_Stage'].value_counts(normalize=True)

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Replace every missing value, in every column, with the string 'Unknown'.
# NOTE: this mutates df in place, so frames displayed by earlier cells are now stale.
df.fillna('Unknown', inplace=True)

In [None]:
# Label-encode every object (string) column into integer codes.
# A separate encoder is fitted per column and stored in `label_encoders`, so the
# integer codes can later be mapped back to the original categories with
# label_encoders[col].inverse_transform(...). A single shared encoder would only
# remember the classes of the last column it was fitted on.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

In [None]:
# Check again for columns that still contain missing values
df.isna().sum()

In [None]:
# Columns kept for modelling. 'Lung_Cancer_Diagnosis' is part of this list
# even though it is the column later separated out as the target.
features = [
    'Population_Size',
    'Country',
    'Age',
    'Gender',
    'Smoker',
    'Years_of_Smoking',
    'Cigarettes_per_Day',
    'Passive_Smoker',
    'Family_History',
    'Lung_Cancer_Diagnosis',
    'Air_Pollution_Exposure',
    'Occupational_Exposure',
    'Indoor_Pollution',
]

In [None]:
# Number of rows that repeat an earlier row when only the selected columns are considered
df[features].duplicated().sum()

In [None]:
# Min-max scale each listed column on its own.
# The same scaler object is refitted for every column, one column at a time,
# so after the loop it only holds the statistics of the last column.
mm_scaler = MinMaxScaler()

columns_to_scale = (
    'Population_Size',
    'Age',
    'Years_of_Smoking',
    'Cigarettes_per_Day',
    'Annual_Lung_Cancer_Deaths',
    'Lung_Cancer_Prevalence_Rate',
)
for column in columns_to_scale:
    df[column] = mm_scaler.fit_transform(df[[column]])


In [None]:
# Keep only the modelling columns and drop repeated rows
df_clean = df.loc[:, features].drop_duplicates()
# Independent copy, so later changes to df_processed never touch df_clean
df_processed = df_clean.copy()

df_processed

In [None]:
import seaborn as sns

# Pairwise correlations between all columns of the processed frame
correlation_matrix = df_processed.corr()

# Annotated heatmap drawn on an explicit axes object
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Rank every column by the magnitude of its correlation with the diagnosis column
diagnosis_corr = correlation_matrix['Lung_Cancer_Diagnosis']
target_correlations = diagnosis_corr.abs().sort_values(ascending=False)
print(target_correlations)

In [None]:
# Separate the feature matrix (X) from the target vector (y)
X = df_processed.drop('Lung_Cancer_Diagnosis', axis=1)
y = df_processed['Lung_Cancer_Diagnosis']

In [None]:
# Sanity check of the modelling inputs: shapes and class counts of the target.
# The feature matrix is bound to upper-case `X`; a lower-case `x` does not exist
# and would raise NameError on a fresh kernel.
print(f"Shape das features: {X.shape}")
print(f"Shape do target: {y.shape}")
print(f"Distribuição do target: {y.value_counts().to_dict()}")

In [None]:
# Split the dataset into train and test sets (30% held out for test),
# stratified on the target, with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [None]:
# Report the size of each split and the class counts of the target inside it
print(f"Tamanho do conjunto de treino: {X_train.shape}")
print(f"Tamanho do conjunto de teste: {X_test.shape}")
print(f"Distribuição no treino: {y_train.value_counts().to_dict()}")
print(f"Distribuição no teste: {y_test.value_counts().to_dict()}")

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Undersample the majority class using the TRAINING split only, so that no
# held-out test row can end up in the resampled training data.
# sampling_strategy=0.5 requests a minority/majority ratio of 0.5 after resampling;
# random_state makes the selection of rows reproducible.
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
x_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Oversample using the TRAINING split only, so that no held-out test row
# (or a perturbed copy of it) can end up in the resampled training data.
# shrinkage=0.1 is passed through to the sampler unchanged;
# random_state makes the generated samples reproducible.
ros = RandomOverSampler(shrinkage=0.1, random_state=42)

x_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [None]:
def evaluate_models(x_data, y_data):
    """Cross-validate three baseline classifiers on the given data.

    KNN, Random Forest and Decision Tree (default hyper-parameters) are each
    scored with 5-fold shuffled cross-validation using accuracy. One summary
    line per model is printed as ``name: mean (+/- 2 * std)``.

    Args:
        x_data: Feature matrix passed to ``cross_val_score``.
        y_data: Target values aligned with ``x_data``.

    Returns:
        dict mapping model name to a dict with keys ``'mean_accuracy'``,
        ``'std_accuracy'`` and ``'data'`` (the ``(x_data, y_data)`` tuple
        the scores were computed on).
    """
    # The stochastic estimators are seeded, like the fold split below, so that
    # repeated runs of the notebook report the same scores.
    models = {
        'KNN': KNeighborsClassifier(),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    results = {}
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for name, model in models.items():
        cv_results = cross_val_score(model, x_data, y_data, cv=kfold, scoring='accuracy')
        mean_accuracy = cv_results.mean()
        std_accuracy = cv_results.std()
        results[name] = {
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
            'data': (x_data, y_data)
        }
        print(f"{name}: {mean_accuracy:.4f} (+/- {std_accuracy * 2:.4f})")
    return results

In [None]:
# Cross-validate the baseline models on each variant of the training data
print("Avaliando Resultados:")

# Original (non-resampled) training split
print("Original:")
results_original = evaluate_models(X_train, y_train)
print("\n")
# Undersampled data
print("Com Amostragem Subamostrada")
results_rus = evaluate_models(x_train_rus, y_train_rus)
print("\n")
# Oversampled data
print("Com Amostragem Sobreamostrada")
results_ros = evaluate_models(x_train_ros, y_train_ros)


In [None]:
# One row of three side-by-side panels for the cross-validation bar charts
fig, axes = plt.subplots(1, 3, figsize=(18, 6))


def plot_cv_results(results, title, ax):
    """Draw a bar chart of mean cross-validation accuracy per model on ``ax``.

    Args:
        results: Mapping of model name to a dict holding ``'mean_accuracy'``
            and ``'std_accuracy'``.
        title: Title set on the axes.
        ax: Axes-like object the chart is drawn on.

    Each bar gets the standard deviation as its error bar and the mean,
    formatted with three decimals, as a text label just above the bar top.
    """
    model_names = list(results)
    mean_scores = []
    score_stds = []
    for model_name in model_names:
        entry = results[model_name]
        mean_scores.append(entry['mean_accuracy'])
        score_stds.append(entry['std_accuracy'])

    bars = ax.bar(model_names, mean_scores, yerr=score_stds, capsize=5, alpha=0.7)
    ax.set_title(title)
    ax.set_ylabel('Acurácia')
    ax.set_ylim(0, 1)
    ax.tick_params(axis='x', rotation=45)

    # Write the numeric value centred slightly above each bar
    for bar, mean_score in zip(bars, mean_scores):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.01
        ax.text(label_x, label_y, f'{mean_score:.3f}', ha='center', va='bottom')


# Left to right: original split, undersampled data, oversampled data
plot_cv_results(results_original, 'Resultados Originais', axes[0])
plot_cv_results(results_rus, 'Resultados com Subamostragem', axes[1])
plot_cv_results(results_ros, 'Resultados com Sobreamostragem', axes[2])

plt.tight_layout()
plt.show()

In [None]:
# TODO: pick these based on the cross-validation score instead of hard-coding them

# Key into `models_params` selecting which estimator gets tuned
traning_model_name = 'rf'
# Training data used for the hyper-parameter search: the oversampled set
X_best_train, y_best_train = x_train_ros, y_train_ros

In [None]:
# Hyper-parameter search configuration, keyed by short model name.
# Each entry holds the estimator to tune ('model'), the grid of candidate
# values ('params') and the feature matrix to fit on ('data').
# The tree-based estimators are seeded so that the search, and therefore the
# selected parameters and scores, are reproducible between runs.
models_params = {
    'rf': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        },
        'data': X_best_train
    },
    'knn': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [6, 8],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'cosine']
        },
        'data': X_best_train
    },
    'dt': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
        },
        'data': X_best_train
    }
}

In [None]:
# Look up the estimator, grid and data configured for the selected model
training_model = models_params[traning_model_name]

print(f"Verificando os melhores parâmetros para o modelo {traning_model_name}")

# Exhaustive search over the grid, scored by accuracy with 5-fold CV on 4 workers
accuracy_scorer = make_scorer(accuracy_score)
grid_search = GridSearchCV(
    estimator=training_model['model'],
    param_grid=training_model['params'],
    scoring=accuracy_scorer,
    cv=5,
    n_jobs=4,
)
grid_search.fit(training_model['data'], y_best_train)

# Snapshot of the search outcome: refitted estimator, winning parameters, mean CV score
best_model = dict(
    model=grid_search.best_estimator_,
    best_params=grid_search.best_params_,
    best_score=grid_search.best_score_,
)

print(f"Melhor parametro para {traning_model_name}: {grid_search.best_params_}")
print(f"Melhor score para {traning_model_name}: {grid_search.best_score_:.4f}\n")
print(f"Training {traning_model_name} model...")

In [None]:
# Display the dict holding the tuned estimator, its parameters and its CV score
best_model

In [None]:
final_results = {}

model = best_model['model']
y_pred = model.predict(X_best_train)
accuracy = accuracy_score(y_best_train, y_pred)
balanced_acc = balanced_accuracy_score(y_best_train, y_pred)

y_pred_proba = model.predict_proba(X_best_train)[:, 1]
auc_score = roc_auc_score(y_best_train, y_pred_proba)

final_results = {
    'accuracy': accuracy,
    'balanced_accuracy': balanced_acc,
    'auc': auc_score,
    'cv_score': best_model['best_score'],
    'best_params': best_model['best_params'],
    'predictions': y_pred,
    'probabilities': y_pred_proba if 'y_pred_proba' in locals() else None,
}

print(f"Acurácia no teste: {accuracy:.4f}")
print(f"Acurácia balanceada: {balanced_acc:.4f}")
if auc_score:
    print(f"AUC-ROC: {auc_score:.4f}")
print(f"Score CV: {best_model['best_score']:.4f}")
print(f"Parâmetros: {best_model['best_params']}")


In [None]:
best_model_name = model
best_model_info = final_results

best_model = model
best_predictions = best_model_info['predictions']
best_probabilities = best_model_info['probabilities']

In [None]:
print("relatório de classificação:")
print(classification_report(y_best_train, best_predictions))

In [None]:
cm = confusion_matrix(y_best_train, best_predictions)
print("matriz de confusão:")

plt.figure(figsize=(8, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predição")
plt.ylabel("Dados Reais")

In [None]:
fpr, tpr, _ = roc_curve(y_best_train, best_probabilities)

plt.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {best_model_info["auc"]:.3f})')
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.xlabel('Taxa de Falsos Positivos')
plt.ylabel('Taxa de Verdadeiros Positivos')
plt.title(f'Curva ROC - {best_model_name}')
plt.legend()
plt.grid(True, alpha=0.3)