Importação de bibliotecas

In [49]:
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from collections import Counter
import random
import statistics as st
from sklearn.model_selection import train_test_split


Classe para treinar e criar um classificador bayesiano

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, KFold

class ClassifierTrainer:
    """Cross-validate a classifier over a small hyper-parameter grid.

    The trainer wraps ``model`` in a ``StandardScaler`` -> classifier pipeline,
    evaluates every (k, distance metric) combination with K-fold
    cross-validation, and prints summary statistics of the fold scores.

    Parameters
    ----------
    datasets : pandas.DataFrame
        Feature columns plus one target column.
    model : object
        Classifier exposing ``k`` and ``distance_metric`` attributes as well as
        ``fit``/``predict``.
    target_col : str, default 'diagnosis'
        Name of the target column inside ``datasets``.

    Attributes
    ----------
    results_ : list of dict
        One entry per evaluated configuration holding the mean precision,
        recall, F1 and accuracy over the folds. Filled by
        ``train_classifiers_with_random_states``.
    """

    # Grid used when the caller does not supply one.
    DEFAULT_PARAM_GRID = {
        'k': [3, 5, 7, 9],
        'distance_metric': ['euclidean', 'cityblock', 'chebyshev'],
    }

    def __init__(self, datasets, model, target_col='diagnosis'):
        self.datasets = datasets
        self.model = model
        self.target_col = target_col
        self.results_ = []

    def build_pipeline(self, k, metric):
        """Return a scaler + classifier pipeline configured with ``k`` and ``metric``.

        Note: the hyper-parameters are written onto ``self.model`` itself, so
        every pipeline returned by this method shares the same model instance.
        """
        self.model.k = k
        self.model.distance_metric = metric
        return Pipeline([
            ('scaler', StandardScaler()),
            ('clf', self.model)
        ])

    def find_majority_and_choose_random(self, row):
        """Return the most frequent value in ``row``; ties are broken at random.

        Raises
        ------
        ValueError
            If ``row`` is empty.
        """
        counts = Counter(row)
        if not counts:
            raise ValueError("row must contain at least one prediction")
        max_count = max(counts.values())
        majority_numbers = [num for num, count in counts.items() if count == max_count]
        # Only touch the RNG when there is an actual tie.
        return random.choice(majority_numbers) if len(majority_numbers) > 1 else majority_numbers[0]

    def find_majority_numbers(self, matrix):
        """Apply ``find_majority_and_choose_random`` to every row of ``matrix``."""
        return [self.find_majority_and_choose_random(row) for row in matrix]

    def train_classifiers_with_random_states(self, n_iterations=30, param_grid=None,
                                             n_splits=5, random_state=42):
        """Evaluate every grid configuration with K-fold cross-validation.

        Parameters
        ----------
        n_iterations : int, default 30
            Currently unused; kept so existing calls keep working.
        param_grid : dict, optional
            Mapping with the keys ``'k'`` and ``'distance_metric'``. Defaults
            to ``DEFAULT_PARAM_GRID``.
        n_splits : int, default 5
            Number of cross-validation folds.
        random_state : int, default 42
            Seed for the shuffled fold assignment.

        Raises
        ------
        ValueError
            If the grid yields no configuration to evaluate.

        Side effects
        ------------
        Prints progress, the best configuration (by mean F1), summary
        statistics pooled over all configurations and folds, and a
        classification report built from all out-of-fold predictions.
        Per-configuration means are stored in ``self.results_``.
        """
        grid = self.DEFAULT_PARAM_GRID if param_grid is None else param_grid
        features = self.datasets.drop(self.target_col, axis=1)
        target = self.datasets[self.target_col]
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        precision_scores, recall_scores, f1_scores, accuracy_scores = [], [], [], []
        # Out-of-fold labels/predictions, pooled with the same scope as the
        # score lists so the final report matches the printed summary.
        pooled_true, pooled_pred = [], []
        self.results_ = []

        for k in grid['k']:
            for metric in grid['distance_metric']:
                print(f"Treinando para k={k} e métrica de distância {metric}")
                fold_precision, fold_recall, fold_f1, fold_accuracy = [], [], [], []

                for train_idx, test_idx in kf.split(features):
                    X_train, y_train = features.iloc[train_idx], target.iloc[train_idx]
                    X_test, y_test = features.iloc[test_idx], target.iloc[test_idx]

                    pipeline = self.build_pipeline(k, metric)
                    pipeline.fit(X_train, y_train)
                    y_pred = pipeline.predict(X_test)

                    fold_precision.append(precision_score(y_test, y_pred, average='weighted'))
                    fold_recall.append(recall_score(y_test, y_pred, average='weighted'))
                    fold_f1.append(f1_score(y_test, y_pred, average='weighted'))
                    fold_accuracy.append(accuracy_score(y_test, y_pred))

                    pooled_true.extend(np.asarray(y_test).tolist())
                    pooled_pred.extend(np.asarray(y_pred).tolist())

                precision_scores.extend(fold_precision)
                recall_scores.extend(fold_recall)
                f1_scores.extend(fold_f1)
                accuracy_scores.extend(fold_accuracy)
                self.results_.append({
                    'k': k,
                    'distance_metric': metric,
                    'precision': st.mean(fold_precision),
                    'recall': st.mean(fold_recall),
                    'f1': st.mean(fold_f1),
                    'accuracy': st.mean(fold_accuracy),
                })

        if not self.results_:
            raise ValueError("param_grid produced no configurations to evaluate")

        best = max(self.results_, key=lambda entry: entry['f1'])
        print(f"Melhor configuração: k={best['k']}, métrica de distância "
              f"{best['distance_metric']} (F1 médio: {best['f1']})")

        self.print_metrics(precision_scores, recall_scores, f1_scores, accuracy_scores,
                           pooled_true, pooled_pred)

    def print_metrics(self, precision_scores, recall_scores, f1_scores, accuracy_scores,
                      y_test=None, y_pred=None):
        """Print mean, sample standard deviation and percentile range per metric.

        The value labelled "CI" is the 2.5th-97.5th percentile range of the
        supplied scores. The standard deviation is reported as ``nan`` when
        fewer than two scores are given. When both ``y_test`` and ``y_pred``
        are provided, a classification report for them is printed as well.

        Raises
        ------
        statistics.StatisticsError
            If any score list is empty.
        """
        labelled_scores = (
            ("Precision", precision_scores),
            ("Recall", recall_scores),
            ("F1", f1_scores),
            ("Accuracy", accuracy_scores),
        )
        for label, scores in labelled_scores:
            # statistics.stdev needs at least two data points.
            spread = st.stdev(scores) if len(scores) > 1 else float('nan')
            print(f"{label} Mean: {st.mean(scores)}, Std Dev: {spread}, CI: {np.percentile(scores, [2.5, 97.5])}")
        if y_test is not None and y_pred is not None:
            print(classification_report(y_test, y_pred))




Importar o dataset SPECTF Heart (UCI ML Repository, id=96)

In [51]:
# Load dataset id=96 from the UCI ML Repository through ucimlrepo.
spectf_heart = fetch_ucirepo(id=96)

# Features (X) and targets (y), both exposed as pandas dataframes.
X = spectf_heart.data.features
y = spectf_heart.data.targets


In [52]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,F1R,F1S,F2R,F2S,F3R,F3S,F4R,F4S,F5R,F5S,...,F18R,F18S,F19R,F19S,F20R,F20S,F21R,F21S,F22R,F22S
0,59,52,70,67,73,66,72,61,58,52,...,66,56,62,56,72,62,74,74,64,67
1,72,62,69,67,78,82,74,65,69,63,...,65,71,63,60,69,73,67,71,56,58
2,71,62,70,64,67,64,79,65,70,69,...,73,70,66,65,64,55,61,41,51,46
3,69,71,70,78,61,63,67,65,59,59,...,61,61,66,65,72,73,68,68,59,63
4,70,66,61,66,61,58,69,69,72,68,...,67,69,70,66,70,64,60,55,49,41


In [53]:
# Preview the first rows of the target table.
y.head()

Unnamed: 0,diagnosis
0,1
1,1
2,1
3,1
4,1


In [54]:
# Combine features and target into one frame; copy first so X itself is not modified.
df = X.copy()
df['diagnosis'] = y['diagnosis'] 


In [55]:
# List the column names of the combined frame (features plus 'diagnosis').
df.columns

Index(['F1R', 'F1S', 'F2R', 'F2S', 'F3R', 'F3S', 'F4R', 'F4S', 'F5R', 'F5S',
       'F6R', 'F6S', 'F7R', 'F7S', 'F8R', 'F8S', 'F9R', 'F9S', 'F10R', 'F10S',
       'F11R', 'F11S', 'F12R', 'F12S', 'F13R', 'F13S', 'F14R', 'F14S', 'F15R',
       'F15S', 'F16R', 'F16S', 'F17R', 'F17S', 'F18R', 'F18S', 'F19R', 'F19S',
       'F20R', 'F20S', 'F21R', 'F21S', 'F22R', 'F22S', 'diagnosis'],
      dtype='object')

In [56]:
# Number of rows in the combined frame.
print(len(df))

267


In [57]:
# Class distribution of the target column.
df['diagnosis'].value_counts()

diagnosis
1    212
0     55
Name: count, dtype: int64

In [58]:
!pip install numpy




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [59]:

class Classifier_KNN:
    r"""k-nearest-neighbours classifier with a Bayes-style decision rule.

    For a query point the ``k`` closest training samples are located. Each
    class c is then scored as ``(k_c / n_c) * P(c)``, where ``k_c`` is the
    number of neighbours labelled c, ``n_c`` the number of training samples
    labelled c, and ``P(c) = n_c / n`` the prior estimated in ``fit``. The
    class with the highest score is predicted; on equal scores the first class
    in ``self.classes`` order wins.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours to consult.
    distance_metric : {'euclidean', 'cityblock', 'chebyshev'}, default 'euclidean'
        Distance used to rank the training samples.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.class_priors = {}  # P(omega_i), filled by fit
        self.distance_metric = distance_metric  # 'euclidean', 'cityblock' or 'chebyshev'

    def fit(self, X, y):
        """Store the training data and estimate the class priors.

        ``X`` may be any 2-D array-like (ndarray, DataFrame, nested list) and
        ``y`` any 1-D array-like (or single-column frame); both are converted
        to NumPy arrays so later lookups are purely positional.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its length differs from ``y``.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y).ravel()
        if X_arr.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X_arr.shape}")
        if len(X_arr) == 0:
            raise ValueError("Cannot fit on an empty training set")
        if len(X_arr) != len(y_arr):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X_arr)} != {len(y_arr)}"
            )

        self.X_train = X_arr
        self.y_train = y_arr
        self.classes, counts = np.unique(y_arr, return_counts=True)
        # Rebuild the priors from scratch so classes seen in an earlier fit do
        # not linger when the same instance is refitted.
        self.class_priors = {
            c: int(n_c) / len(y_arr) for c, n_c in zip(self.classes, counts)
        }
        return self

    def get_params(self, deep=True):
        """Return the hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {'k': self.k, 'distance_metric': self.distance_metric}

    def set_params(self, **params):
        """Set each given parameter as an attribute and return ``self``."""
        for param, value in params.items():
            setattr(self, param, value)
        return self

    def _euclidean_distance(self, x1, x2):
        """Euclidean distance between the points ``x1`` and ``x2``."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def _cityblock_distance(self, x1, x2):
        """Cityblock (Manhattan) distance between the points ``x1`` and ``x2``."""
        return np.sum(np.abs(x1 - x2))

    def _chebyshev_distance(self, x1, x2):
        """Chebyshev distance between the points ``x1`` and ``x2``."""
        return np.max(np.abs(x1 - x2))

    def _calculate_distance(self, x1, x2):
        """Distance between two points using the configured metric.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported names.
        """
        if self.distance_metric == 'euclidean':
            return self._euclidean_distance(x1, x2)
        elif self.distance_metric == 'cityblock':
            return self._cityblock_distance(x1, x2)
        elif self.distance_metric == 'chebyshev':
            return self._chebyshev_distance(x1, x2)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def _distances_to_train(self, x):
        """Distances from ``x`` to every training row, computed in one vectorised pass."""
        if self.distance_metric not in ('euclidean', 'cityblock', 'chebyshev'):
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")
        abs_diffs = np.abs(self.X_train - np.asarray(x, dtype=float))
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(abs_diffs ** 2, axis=1))
        if self.distance_metric == 'cityblock':
            return np.sum(abs_diffs, axis=1)
        return np.max(abs_diffs, axis=1)

    def _get_nearest_neighbors(self, x):
        """Return the positions of the ``k`` training samples closest to ``x``.

        Raises
        ------
        AttributeError
            If the model has not been fitted.
        ValueError
            If ``k`` is smaller than 1 or the metric is unknown.
        """
        if not hasattr(self, 'X_train'):
            raise AttributeError("Classifier_KNN is not fitted yet; call fit first")
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")
        distances = self._distances_to_train(x)
        # A stable sort makes equal distances resolve by training order, so
        # repeated runs pick the same neighbours.
        return np.argsort(distances, kind='stable')[:self.k]

    def _likelihood(self, neighbors, c):
        """Return k_c / n_c for class ``c`` given the neighbour positions."""
        # k_c: neighbours that belong to class c
        kj = int(np.sum(self.y_train[neighbors] == c))
        # n_c: training samples that belong to class c
        nj = int(np.sum(self.y_train == c))
        if nj == 0:
            # Class absent from the training data: it cannot be supported.
            return 0.0
        return kj / nj

    def _posterior(self, x):
        r"""Return the (unnormalised) posterior P(\omega_i | x) for every class."""
        neighbors = self._get_nearest_neighbors(x)
        return {
            c: self._likelihood(neighbors, c) * self.class_priors[c]
            for c in self.classes
        }

    def predict(self, X_test):
        """Predict a class label for every row of ``X_test``.

        Returns
        -------
        numpy.ndarray
            One label per row; an empty array for empty input.

        Raises
        ------
        ValueError
            If ``X_test`` is non-empty and not 2-dimensional.
        """
        X_arr = np.asarray(X_test, dtype=float)
        if X_arr.size == 0:
            return np.array([])
        if X_arr.ndim != 2:
            raise ValueError(f"X_test must be 2-dimensional, got shape {X_arr.shape}")

        predictions = []
        for x in X_arr:
            posteriors = self._posterior(x)
            # Class with the largest posterior score.
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)

In [60]:
# Instantiate the KNN-based Bayesian classifier with its defaults (k=3, euclidean).
model = Classifier_KNN() 
# Build a trainer from the combined dataframe df and the chosen model.
trainer = ClassifierTrainer(df, model=model)
# Run the evaluation: 5-fold cross-validation for every (k, distance metric) pair
# in the trainer's grid. NOTE: the method's n_iterations=30 default is not used by
# its body, so no "30 partitions" are involved.
trainer.train_classifiers_with_random_states()

Treinando para k=3 e métrica de distância euclidean
Treinando para k=3 e métrica de distância cityblock
Treinando para k=3 e métrica de distância chebyshev
Treinando para k=5 e métrica de distância euclidean
Treinando para k=5 e métrica de distância cityblock
Treinando para k=5 e métrica de distância chebyshev
Treinando para k=7 e métrica de distância euclidean
Treinando para k=7 e métrica de distância cityblock
Treinando para k=7 e métrica de distância chebyshev
Treinando para k=9 e métrica de distância euclidean
Treinando para k=9 e métrica de distância cityblock
Treinando para k=9 e métrica de distância chebyshev
Precision Mean: 0.7446236054038455, Std Dev: 0.06269246364586813, CI: [0.63761744 0.85235086]
Recall Mean: 0.7451432564640111, Std Dev: 0.051156999096056095, CI: [0.6504717  0.85149371]
F1 Mean: 0.7417751189497471, Std Dev: 0.05493896913573125, CI: [0.65660597 0.84137049]
Accuracy Mean: 0.7451432564640111, Std Dev: 0.051156999096056095, CI: [0.6504717  0.85149371]
         