Importação de bibliotecas

In [11]:
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from collections import Counter
import random
import statistics as st
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression



Classe para treinar e criar um classificador bayesiano

In [12]:

class ClassifierTrainer:
    def __init__(self, datasets, model):
        self.datasets = datasets
        self.model = model
    
    def build_pipeline(self):
        # Criação do pipeline com o escalador e o classificador
        return Pipeline([
            ('scaler', StandardScaler()),
            ('clf', self.model)
        ])

    def find_majority_and_choose_random(self, row):
        counts = Counter(row)
        max_count = max(counts.values())
        majority_numbers = [num for num, count in counts.items() if count == max_count]
        return random.choice(majority_numbers) if len(majority_numbers) > 1 else majority_numbers[0]

    def find_majority_numbers(self, matrix):
        majority = []
        for row in matrix:
            majority.append(self.find_majority_and_choose_random(row))
        return majority

    def train_classifiers_with_random_states(self, n_iterations=30):
        precision_scores, recall_scores, f1_scores, accuracy_scores = [], [], [], []
        total_rows = len(self.datasets)
        labels = self.datasets['diagnosis']

        for _ in tqdm(range(n_iterations)):
            train_indices, test_indices = train_test_split(np.arange(total_rows), test_size=0.2, random_state=None, stratify=labels)
            train_data, test_data = self.datasets.iloc[train_indices], self.datasets.iloc[test_indices]
            X_train, y_train = train_data.drop('diagnosis', axis=1), train_data['diagnosis']
            X_test, y_test = test_data.drop('diagnosis', axis=1), test_data['diagnosis']
            
            # Parâmetros para o KNeighborsClassifier
            # Parâmetros para o LogisticRegression
            param_grid = {
                'clf__penalty': ['l1', 'l2'],  # Tipos de penalidade
                'clf__C': [0.1, 1, 10, 100],  # Valores de regularização
                'clf__solver': ['liblinear', 'saga']  # Solvers compatíveis
            }
            # Criar o pipeline e configurar o GridSearchCV
            pipeline = self.build_pipeline()
            grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')  # 5 folds para validação cruzada
            grid_search.fit(X_train, y_train)

            # Melhor modelo encontrado
            best_model = grid_search.best_estimator_

            # Realizar a predição no conjunto de teste
            y_pred = best_model.predict(X_test)
           
            # Calcular as métricas
            precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
            recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
            accuracy_scores.append(accuracy_score(y_test, y_pred))

        # Exibir as métricas
        self.print_metrics(precision_scores, recall_scores, f1_scores, accuracy_scores, y_test, y_pred)

    def print_metrics(self, precision_scores, recall_scores, f1_scores, accuracy_scores, y_test, majority_labels):
        print(f"Precision Mean: {st.mean(precision_scores)}, Std Dev: {st.stdev(precision_scores)}, CI: {np.percentile(precision_scores, [2.5, 97.5])}")
        print(f"Recall Mean: {st.mean(recall_scores)}, Std Dev: {st.stdev(recall_scores)}, CI: {np.percentile(recall_scores, [2.5, 97.5])}")
        print(f"F1 Mean: {st.mean(f1_scores)}, Std Dev: {st.stdev(f1_scores)}, CI: {np.percentile(f1_scores, [2.5, 97.5])}")
        print(f"Accuracy Mean: {st.mean(accuracy_scores)}, Std Dev: {st.stdev(accuracy_scores)}, CI: {np.percentile(accuracy_scores, [2.5, 97.5])}")
        print(classification_report(y_test, majority_labels))


Importar os datasets 

In [13]:
spectf_heart = fetch_ucirepo(id=96)

# data (as pandas dataframes)
X = spectf_heart.data.features
y = spectf_heart.data.targets


In [14]:
X.head()

Unnamed: 0,F1R,F1S,F2R,F2S,F3R,F3S,F4R,F4S,F5R,F5S,...,F18R,F18S,F19R,F19S,F20R,F20S,F21R,F21S,F22R,F22S
0,59,52,70,67,73,66,72,61,58,52,...,66,56,62,56,72,62,74,74,64,67
1,72,62,69,67,78,82,74,65,69,63,...,65,71,63,60,69,73,67,71,56,58
2,71,62,70,64,67,64,79,65,70,69,...,73,70,66,65,64,55,61,41,51,46
3,69,71,70,78,61,63,67,65,59,59,...,61,61,66,65,72,73,68,68,59,63
4,70,66,61,66,61,58,69,69,72,68,...,67,69,70,66,70,64,60,55,49,41


In [15]:
y.head()

Unnamed: 0,diagnosis
0,1
1,1
2,1
3,1
4,1


In [16]:
df = X.copy()
df['diagnosis'] = y['diagnosis'] 


In [17]:
df.columns

Index(['F1R', 'F1S', 'F2R', 'F2S', 'F3R', 'F3S', 'F4R', 'F4S', 'F5R', 'F5S',
       'F6R', 'F6S', 'F7R', 'F7S', 'F8R', 'F8S', 'F9R', 'F9S', 'F10R', 'F10S',
       'F11R', 'F11S', 'F12R', 'F12S', 'F13R', 'F13S', 'F14R', 'F14S', 'F15R',
       'F15S', 'F16R', 'F16S', 'F17R', 'F17S', 'F18R', 'F18S', 'F19R', 'F19S',
       'F20R', 'F20S', 'F21R', 'F21S', 'F22R', 'F22S', 'diagnosis'],
      dtype='object')

In [18]:
print(len(df))

267


In [19]:
df['diagnosis'].value_counts()

diagnosis
1    212
0     55
Name: count, dtype: int64

In [20]:

# Instanciar o classificador LogisticRegression
log_reg_model = LogisticRegression(max_iter=1000)

# Criar o objeto ClassifierTrainer passando o dataset e o modelo
trainer = ClassifierTrainer(df, log_reg_model)

# Treinar o classificador
trainer.train_classifiers_with_random_states(n_iterations=30)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 30/30 [08:48<00:00, 17.61s/it]

Precision Mean: 0.793868286339681, Std Dev: 0.05315958502878729, CI: [0.69267082 0.87256404]
Recall Mean: 0.8135802469135802, Std Dev: 0.03470749652265343, CI: [0.75416667 0.87037037]
F1 Mean: 0.7936831909127811, Std Dev: 0.03991771451945285, CI: [0.72373926 0.85486844]
Accuracy Mean: 0.8135802469135802, Std Dev: 0.03470749652265343, CI: [0.75416667 0.87037037]
              precision    recall  f1-score   support

           0       0.67      0.36      0.47        11
           1       0.85      0.95      0.90        43

    accuracy                           0.83        54
   macro avg       0.76      0.66      0.69        54
weighted avg       0.82      0.83      0.81        54




