Importação de bibliotecas

In [None]:
from ucimlrepo import fetch_ucirepo
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from collections import Counter
import numpy as np
import statistics as st
from tqdm import tqdm
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


Classe para treinar e criar um classificador bayesiano

In [None]:

class ParzenWindowClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.kdes_ = {label: KernelDensity(bandwidth=self.bandwidth, kernel=self.kernel).fit(X[y == label])
                      for label in self.classes_}
        return self

    def predict(self, X):
        log_probs = np.array([self.kdes_[label].score_samples(X) for label in self.classes_]).T
        return self.classes_[np.argmax(log_probs, axis=1)]

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

    def get_params(self, deep=True):
        return {'bandwidth': self.bandwidth, 'kernel': self.kernel}


In [None]:

class ClassifierTrainer:
    def __init__(self, datasets):
        self.datasets = datasets

    def build_pipeline(self, model):
        return Pipeline([
            ('scaler', StandardScaler()),
            ('clf', model)
        ])

    def find_majority_and_choose_first(self, row):
        counts = Counter(row)  # Conta a frequência de cada classe
        max_count = max(counts.values())  # Salva a frequência máxima
        majority_numbers = [num for num, count in counts.items() if count == max_count]
        return sorted(majority_numbers)[0]  # Retorna a primeira classe ordenada

    def find_majority_numbers(self, matrix):
        return [self.find_majority_and_choose_first(row) for row in matrix]

    def train_classifiers(self, n_iterations=30):
        precision_scores, recall_scores, f1_scores, accuracy_scores = [], [], [], []
        total_rows = len(self.datasets)
        labels = self.datasets['diagnosis']

        for _ in tqdm(range(n_iterations)):
            train_indices, test_indices = train_test_split(np.arange(total_rows), test_size=0.2, random_state=None, stratify=labels)
            train_data, test_data = self.datasets.iloc[train_indices], self.datasets.iloc[test_indices]
            X_train, y_train = train_data.drop('diagnosis', axis=1), train_data['diagnosis']
            X_test, y_test = test_data.drop('diagnosis', axis=1), test_data['diagnosis']

            # Define base models
            knn = KNeighborsClassifier()
            nb = GaussianNB()
            lr = LogisticRegression(max_iter=1000)
            parzen = ParzenWindowClassifier()

            # Set up pipeline and grid search
            knn_predictions = knn.fit(X_train, y_train).predict(X_test)
            nb_predictions = nb.fit(X_train, y_train).predict(X_test)
            lr_predictions = lr.fit(X_train, y_train).predict(X_test)
            parzen_predictions = parzen.fit(X_train, y_train).predict(X_test)

            arr_y_preds = [knn_predictions, nb_predictions, lr_predictions, parzen_predictions]
            predictions = [list(row) for row in zip(*arr_y_preds)]
            majority_labels = self.find_majority_numbers(predictions)


            # Calculate metrics
            precision_scores.append(precision_score(y_test, majority_labels, average='weighted'))
            recall_scores.append(recall_score(y_test, majority_labels, average='weighted'))
            f1_scores.append(f1_score(y_test, majority_labels, average='weighted'))
            accuracy_scores.append(accuracy_score(y_test, majority_labels))

        # Display metrics
        self.print_metrics(precision_scores, recall_scores, f1_scores, accuracy_scores, y_test, majority_labels)

    def print_metrics(self, precision_scores, recall_scores, f1_scores, accuracy_scores, y_test, majority_labels):
        print(f"Precision Mean: {st.mean(precision_scores)}, Std Dev: {st.stdev(precision_scores)}, CI: {np.percentile(precision_scores, [2.5, 97.5])}")
        print(f"Recall Mean: {st.mean(recall_scores)}, Std Dev: {st.stdev(recall_scores)}, CI: {np.percentile(recall_scores, [2.5, 97.5])}")
        print(f"F1 Mean: {st.mean(f1_scores)}, Std Dev: {st.stdev(f1_scores)}, CI: {np.percentile(f1_scores, [2.5, 97.5])}")
        print(f"Accuracy Mean: {st.mean(accuracy_scores)}, Std Dev: {st.stdev(accuracy_scores)}, CI: {np.percentile(accuracy_scores, [2.5, 97.5])}")
        print(classification_report(y_test, majority_labels))

Importar os datasets

In [None]:
spectf_heart = fetch_ucirepo(id=96)

# data (as pandas dataframes)
X = spectf_heart.data.features
y = spectf_heart.data.targets


In [None]:
X.head()

Unnamed: 0,F1R,F1S,F2R,F2S,F3R,F3S,F4R,F4S,F5R,F5S,...,F18R,F18S,F19R,F19S,F20R,F20S,F21R,F21S,F22R,F22S
0,59,52,70,67,73,66,72,61,58,52,...,66,56,62,56,72,62,74,74,64,67
1,72,62,69,67,78,82,74,65,69,63,...,65,71,63,60,69,73,67,71,56,58
2,71,62,70,64,67,64,79,65,70,69,...,73,70,66,65,64,55,61,41,51,46
3,69,71,70,78,61,63,67,65,59,59,...,61,61,66,65,72,73,68,68,59,63
4,70,66,61,66,61,58,69,69,72,68,...,67,69,70,66,70,64,60,55,49,41


In [None]:
y.head()

Unnamed: 0,diagnosis
0,1
1,1
2,1
3,1
4,1


In [None]:
df = X.copy()
df['diagnosis'] = y['diagnosis']


In [None]:
df.columns

Index(['F1R', 'F1S', 'F2R', 'F2S', 'F3R', 'F3S', 'F4R', 'F4S', 'F5R', 'F5S',
       'F6R', 'F6S', 'F7R', 'F7S', 'F8R', 'F8S', 'F9R', 'F9S', 'F10R', 'F10S',
       'F11R', 'F11S', 'F12R', 'F12S', 'F13R', 'F13S', 'F14R', 'F14S', 'F15R',
       'F15S', 'F16R', 'F16S', 'F17R', 'F17S', 'F18R', 'F18S', 'F19R', 'F19S',
       'F20R', 'F20S', 'F21R', 'F21S', 'F22R', 'F22S', 'diagnosis'],
      dtype='object')

In [None]:
print(len(df))

267


In [None]:
df['diagnosis'].value_counts()

diagnosis
1    212
0     55
Name: count, dtype: int64

In [None]:
# Create the ClassifierTrainer with Parzen window model
trainer = ClassifierTrainer(df)

# Train the ParzenWindowClassifier
trainer.train_classifiers(n_iterations=30)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Precision Mean: 0.7933696631978272, Std Dev: 0.051095521099809985, CI: [0.70054442 0.88488981]
Recall Mean: 0.7462962962962963, Std Dev: 0.06509245406916692, CI: [0.62962963 0.86203704]
F1 Mean: 0.7616067718371073, Std Dev: 0.058243062137164056, CI: [0.66001899 0.86364221]
Accuracy Mean: 0.7462962962962963, Std Dev: 0.06509245406916692, CI: [0.62962963 0.86203704]
              precision    recall  f1-score   support

           0       0.26      0.45      0.33        11
           1       0.83      0.67      0.74        43

    accuracy                           0.63        54
   macro avg       0.55      0.56      0.54        54
weighted avg       0.71      0.63      0.66        54




