COM OPTUNA

In [6]:
# Instalação e importação de Bibliotecas
# pip install pandas scikit-learn optuna

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import optuna

# Carregar os dados e renomear as colunas
data = pd.read_csv("C:/Users/ana_v/OneDrive/Área de Trabalho/GitHub/breastcancer-classification/Wisconsin Repository/wdbc.csv", header=None)
col_names = ["ID", "Diagnosis", "radius1", "texture1", "perimeter1", "area1",
             "smoothness1", "compactness1", "concavity1", "concave_points1",
             "symmetry1", "fractal_dimension1", "radius2", "texture2",
             "perimeter2", "area2", "smoothness2", "compactness2", "concavity2",
             "concave_points2", "symmetry2", "fractal_dimension2", "radius3",
             "texture3", "perimeter3", "area3", "smoothness3", "compactness3",
             "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"]
data.columns = col_names

# Selecionar dados para o modelo
data_model = data.drop(columns=["ID"])
data_model['Diagnosis'] = data_model['Diagnosis'].map({'M': 1, 'B': 0}) 

# Dividir os dados em treino e teste
train_data, test_data = train_test_split(data_model, test_size=0.2, random_state=123)
X_train = train_data.drop(columns=["Diagnosis"])
y_train = train_data["Diagnosis"]
X_test = test_data.drop(columns=["Diagnosis"])
y_test = test_data["Diagnosis"]

# Função objetivo para otimizar o SVM
def objective_svm(trial):
    svc_c = trial.suggest_loguniform('svc_c', 1e-5, 1e2)
    svc_kernel = trial.suggest_categorical('svc_kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    
    model_svm = SVC(C=svc_c, kernel=svc_kernel, random_state=123)
    score = cross_val_score(model_svm, X_train, y_train, cv=10, scoring='accuracy').mean()
    return score

# Função objetivo para otimizar o Random Forest
def objective_rf(trial):
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 50, 200)
    rf_max_depth = trial.suggest_int('rf_max_depth', 10, 50)
    rf_min_samples_split = trial.suggest_int('rf_min_samples_split', 2, 20)
    
    model_rf = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth,
                                      min_samples_split=rf_min_samples_split, random_state=123)
    score = cross_val_score(model_rf, X_train, y_train, cv=10, scoring='accuracy').mean()
    return score

# Função objetivo para otimizar a Regressão Logística
def objective_lr(trial):
    lr_c = trial.suggest_loguniform('lr_c', 1e-5, 1e2)
    lr_solver = trial.suggest_categorical('lr_solver', ['newton-cg', 'lbfgs', 'liblinear', 'saga'])
    
    model_lr = LogisticRegression(C=lr_c, solver=lr_solver, max_iter=10000, random_state=123)
    score = cross_val_score(model_lr, X_train, y_train, cv=10, scoring='accuracy').mean()
    return score

# Estudar otimização com Optuna para SVM
study_svm = optuna.create_study(direction='maximize')
study_svm.optimize(objective_svm, n_trials=10)
best_params_svm = study_svm.best_params
print("Melhores parâmetros para SVM:", best_params_svm)

# Estudar otimização com Optuna para Random Forest
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=10)
best_params_rf = study_rf.best_params
print("Melhores parâmetros para Random Forest:", best_params_rf)

# Estudar otimização com Optuna para Regressão Logística
study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr, n_trials=10)
best_params_lr = study_lr.best_params
print("Melhores parâmetros para Regressão Logística:", best_params_lr)

In [7]:
# Treinar e avaliar o modelo SVM com os melhores parâmetros
best_model_svm = SVC(C=best_params_svm['svc_c'], kernel=best_params_svm['svc_kernel'], random_state=123)
best_model_svm.fit(X_train, y_train)
predictions_svm = best_model_svm.predict(X_test)

print("\nResultados do SVM:")
print("Matriz de Confusão:\n", confusion_matrix(y_test, predictions_svm))
print("Acurácia:", accuracy_score(y_test, predictions_svm))
print("Precisão:", precision_score(y_test, predictions_svm))
print("Recall:", recall_score(y_test, predictions_svm))
print("F1 Score:", f1_score(y_test, predictions_svm))

# Treinar e avaliar o modelo Random Forest com os melhores parâmetros
best_model_rf = RandomForestClassifier(
    n_estimators=best_params_rf['rf_n_estimators'],
    max_depth=best_params_rf['rf_max_depth'],
    min_samples_split=best_params_rf['rf_min_samples_split'],
    random_state=123
)
best_model_rf.fit(X_train, y_train)
predictions_rf = best_model_rf.predict(X_test)

print("\nResultados do Random Forest:")
print("Matriz de Confusão:\n", confusion_matrix(y_test, predictions_rf))
print("Acurácia:", accuracy_score(y_test, predictions_rf))
print("Precisão:", precision_score(y_test, predictions_rf))
print("Recall:", recall_score(y_test, predictions_rf))
print("F1 Score:", f1_score(y_test, predictions_rf))

# Treinar e avaliar o modelo Regressão Logística com os melhores parâmetros
best_model_lr = LogisticRegression(
    C=best_params_lr['lr_c'],
    solver=best_params_lr['lr_solver'],
    max_iter=10000,
    random_state=123
)
best_model_lr.fit(X_train, y_train)
predictions_lr = best_model_lr.predict(X_test)

print("\nResultados da Regressão Logística:")
print("Matriz de Confusão:\n", confusion_matrix(y_test, predictions_lr))
print("Acurácia:", accuracy_score(y_test, predictions_lr))
print("Precisão:", precision_score(y_test, predictions_lr))
print("Recall:", recall_score(y_test, predictions_lr))
print("F1 Score:", f1_score(y_test, predictions_lr))

[I 2024-07-30 18:49:51,167] A new study created in memory with name: no-name-de8793a3-5376-4377-bf3d-8ae4bba8aafe
  svc_c = trial.suggest_loguniform('svc_c', 1e-5, 1e2)
[I 2024-07-30 18:49:51,309] Trial 0 finished with value: 0.6241545893719807 and parameters: {'svc_c': 0.03409027138592591, 'svc_kernel': 'sigmoid'}. Best is trial 0 with value: 0.6241545893719807.
  svc_c = trial.suggest_loguniform('svc_c', 1e-5, 1e2)
[I 2024-07-30 18:49:51,395] Trial 1 finished with value: 0.8196618357487925 and parameters: {'svc_c': 0.0036359893174285293, 'svc_kernel': 'poly'}. Best is trial 1 with value: 0.8196618357487925.
  svc_c = trial.suggest_loguniform('svc_c', 1e-5, 1e2)
[I 2024-07-30 18:49:51,519] Trial 2 finished with value: 0.42661835748792265 and parameters: {'svc_c': 1.5615105171187569, 'svc_kernel': 'sigmoid'}. Best is trial 1 with value: 0.8196618357487925.
  svc_c = trial.suggest_loguniform('svc_c', 1e-5, 1e2)
[I 2024-07-30 18:49:57,699] Trial 3 finished with value: 0.9536714975845412 

Melhores parâmetros para SVM: {'svc_c': 0.46018897157513666, 'svc_kernel': 'linear'}


[I 2024-07-30 18:49:59,286] Trial 0 finished with value: 0.9517391304347826 and parameters: {'rf_n_estimators': 70, 'rf_max_depth': 15, 'rf_min_samples_split': 7}. Best is trial 0 with value: 0.9517391304347826.
[I 2024-07-30 18:50:01,057] Trial 1 finished with value: 0.9472463768115942 and parameters: {'rf_n_estimators': 99, 'rf_max_depth': 10, 'rf_min_samples_split': 4}. Best is trial 0 with value: 0.9517391304347826.
[I 2024-07-30 18:50:04,448] Trial 2 finished with value: 0.9516908212560388 and parameters: {'rf_n_estimators': 200, 'rf_max_depth': 12, 'rf_min_samples_split': 4}. Best is trial 0 with value: 0.9517391304347826.
[I 2024-07-30 18:50:05,640] Trial 3 finished with value: 0.9384541062801933 and parameters: {'rf_n_estimators': 67, 'rf_max_depth': 42, 'rf_min_samples_split': 13}. Best is trial 0 with value: 0.9517391304347826.
[I 2024-07-30 18:50:08,257] Trial 4 finished with value: 0.9494685990338164 and parameters: {'rf_n_estimators': 156, 'rf_max_depth': 47, 'rf_min_sampl

Melhores parâmetros para Random Forest: {'rf_n_estimators': 70, 'rf_max_depth': 15, 'rf_min_samples_split': 7}


[I 2024-07-30 18:50:21,694] Trial 0 finished with value: 0.916376811594203 and parameters: {'lr_c': 0.00017176197510614208, 'lr_solver': 'saga'}. Best is trial 0 with value: 0.916376811594203.
  lr_c = trial.suggest_loguniform('lr_c', 1e-5, 1e2)
[I 2024-07-30 18:50:33,545] Trial 1 finished with value: 0.9493236714975846 and parameters: {'lr_c': 5.041295295998572, 'lr_solver': 'lbfgs'}. Best is trial 1 with value: 0.9493236714975846.
  lr_c = trial.suggest_loguniform('lr_c', 1e-5, 1e2)
[I 2024-07-30 18:50:34,422] Trial 2 finished with value: 0.9383091787439615 and parameters: {'lr_c': 0.0052451606746714945, 'lr_solver': 'newton-cg'}. Best is trial 1 with value: 0.9493236714975846.
  lr_c = trial.suggest_loguniform('lr_c', 1e-5, 1e2)
[I 2024-07-30 18:50:34,465] Trial 3 finished with value: 0.9185990338164253 and parameters: {'lr_c': 0.0006455565890398845, 'lr_solver': 'liblinear'}. Best is trial 1 with value: 0.9493236714975846.
  lr_c = trial.suggest_loguniform('lr_c', 1e-5, 1e2)
[I 202

Melhores parâmetros para Regressão Logística: {'lr_c': 11.447786354722936, 'lr_solver': 'lbfgs'}

Resultados do SVM:
Matriz de Confusão:
 [[73  0]
 [ 2 39]]
Acurácia: 0.9824561403508771
Precisão: 1.0
Recall: 0.9512195121951219
F1 Score: 0.975

Resultados do Random Forest:
Matriz de Confusão:
 [[73  0]
 [ 1 40]]
Acurácia: 0.9912280701754386
Precisão: 1.0
Recall: 0.975609756097561
F1 Score: 0.9876543209876543

Resultados da Regressão Logística:
Matriz de Confusão:
 [[73  0]
 [ 2 39]]
Acurácia: 0.9824561403508771
Precisão: 1.0
Recall: 0.9512195121951219
F1 Score: 0.975
