In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import logging
import datetime

In [2]:
# Configure logging.
# force=True makes basicConfig() replace any handlers already attached to the
# root logger. Without it, basicConfig() is a silent no-op whenever the root
# logger already has handlers (for example after this cell has been run once,
# or when the front-end installed its own), and the level/format below would
# be ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
# Logger used by the remaining cells of this notebook.
logger = logging.getLogger(__name__)

In [3]:
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    data. Prediction picks the class with the largest log-posterior
    (sum of per-feature Gaussian log-densities plus the log class prior).

    Parameters
    ----------
    epsilon : float, default 1e-10
        Non-negative constant added to every variance before it is used as a
        divisor or passed to ``log``, so that constant features (variance 0)
        do not cause a division by zero.

    Attributes (all ``None`` until ``fit`` has been called)
    ----------
    classes : ndarray of shape (n_classes,), sorted unique training labels.
    mean    : ndarray of shape (n_classes, n_features), per-class feature means.
    var     : ndarray of shape (n_classes, n_features), per-class feature
              variances (population variance, as returned by ``ndarray.var``).
    priors  : ndarray of shape (n_classes,), class frequencies in the training set.
    """

    # Resolved here instead of reading a notebook-global `logger`, so the class
    # does not depend on another cell having been executed first. With the same
    # module name this is the very same logger object as
    # `logging.getLogger(__name__)` created elsewhere.
    _logger = logging.getLogger(__name__)

    def __init__(self, epsilon=1e-10):
        if epsilon < 0:
            raise ValueError(f"epsilon must be non-negative; got {epsilon}")
        self.epsilon = epsilon
        self.classes = None
        self.mean = None
        self.var = None
        self.priors = None

    def fit(self, X, y):
        """Estimate per-class means, variances and priors from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D with one label per row of
            ``X``, or the training set is empty.
        """
        log = self._logger

        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got shape {X.shape}"
            )
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"y must be 1-D with one label per row of X; "
                f"got X shape {X.shape} and y shape {y.shape}"
            )
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")

        classes = np.unique(y)
        n_classes = len(classes)

        log.info("Starting training with %d samples, %d features", n_samples, n_features)
        log.info("Detected classes: %s", classes)

        # Build the parameters in locals and publish them only once everything
        # succeeded, so a failure cannot leave the model half-updated.
        mean = np.zeros((n_classes, n_features))
        var = np.zeros((n_classes, n_features))
        priors = np.zeros(n_classes)

        # Calculate mean, variance, and prior for each class
        for idx, c in enumerate(classes):
            X_c = X[y == c]
            mean[idx, :] = X_c.mean(axis=0)
            var[idx, :] = X_c.var(axis=0)
            priors[idx] = len(X_c) / n_samples

            log.info("Class %s statistics:", c)
            log.info("- Prior probability: %.3f", priors[idx])
            log.info("- Mean range: [%.3f, %.3f]", mean[idx].min(), mean[idx].max())
            log.info("- Variance range: [%.3f, %.3f]", var[idx].min(), var[idx].max())

        self.classes = classes
        self.mean = mean
        self.var = var
        self.priors = priors

    def _calculate_likelihood(self, X, mean, var):
        """Return the Gaussian log-likelihood of every row of ``X`` for one class.

        Despite the name, the value is a *log*-likelihood: the per-feature
        log-densities are summed over axis 1, giving one number per sample.
        ``mean`` and ``var`` are the 1-D parameter vectors of a single class.
        """
        # Smoothed variance avoids division by zero / log(0) for constant features.
        smoothed_var = var + self.epsilon
        exponent = -0.5 * np.square(X - mean) / smoothed_var
        return np.sum(exponent - 0.5 * np.log(2 * np.pi * smoothed_var), axis=1)

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,) holding entries of ``self.classes``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        ValueError
            If ``X`` is not 2-D or its feature count differs from the
            training data.
        """
        if self.classes is None:
            raise RuntimeError("NaiveBayesClassifier is not fitted; call fit() first")

        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got shape {X.shape}"
            )
        # Checked explicitly: a mismatching but broadcastable shape (e.g. a
        # single column) would otherwise silently produce meaningless scores.
        if X.shape[1] != self.mean.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} features, but the model was fitted "
                f"with {self.mean.shape[1]}"
            )

        log = self._logger
        n_samples = X.shape[0]
        log.info("Predicting %d samples", n_samples)

        # One column of log-posteriors per class: log-likelihood + log-prior
        posteriors = np.column_stack([
            self._calculate_likelihood(X, self.mean[idx], self.var[idx])
            + np.log(self.priors[idx])
            for idx in range(len(self.classes))
        ])

        # Get class with highest posterior
        predictions = np.argmax(posteriors, axis=1)

        log.info("Prediction complete")
        return self.classes[predictions]

In [4]:
# Fetch the Breast Cancer Wisconsin dataset. The full `data` object is kept
# because the feature names are read from it further down.
logger.info("Loading Breast Cancer Wisconsin dataset")
data = load_breast_cancer()
X = data.data      # feature matrix
y = data.target    # class labels

2024-11-21 10:08:23,663 - INFO - Loading Breast Cancer Wisconsin dataset


In [5]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
logger.info(f"Train set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

2024-11-21 10:08:39,038 - INFO - Train set size: 455, Test set size: 114


In [6]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
logger.info("Data scaling complete")

2024-11-21 10:08:48,225 - INFO - Data scaling complete


In [7]:
# Fit the Naive Bayes model on the scaled training split
logger.info("Training Naive Bayes classifier")
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X=X_train_scaled, y=y_train)

2024-11-21 10:08:58,834 - INFO - Training Naive Bayes classifier
2024-11-21 10:08:58,837 - INFO - Starting training with 455 samples, 30 features
2024-11-21 10:08:58,838 - INFO - Detected classes: [0 1]
2024-11-21 10:08:58,840 - INFO - Class 0 statistics:
2024-11-21 10:08:58,842 - INFO - - Prior probability: 0.371
2024-11-21 10:08:58,843 - INFO - - Mean range: [-0.076, 1.026]
2024-11-21 10:08:58,845 - INFO - - Variance range: [0.461, 1.911]
2024-11-21 10:08:58,848 - INFO - Class 1 statistics:
2024-11-21 10:08:58,848 - INFO - - Prior probability: 0.629
2024-11-21 10:08:58,849 - INFO - - Mean range: [-0.606, 0.045]
2024-11-21 10:08:58,850 - INFO - - Variance range: [0.036, 1.296]


In [8]:
# Predict labels for the held-out (scaled) test split
logger.info("Making predictions on test set")
y_pred = nb_classifier.predict(X=X_test_scaled)

# Accuracy: fraction of test samples whose predicted label matches the true one
accuracy = (y_pred == y_test).mean()
logger.info(f"Test set accuracy: {accuracy:.3f}")

# Per-feature "importance" as used here: the absolute gap between the fitted
# means of the second and first class. NOTE(review): this heuristic looks only
# at the means and ignores the per-class variances.
feature_importance = np.absolute(nb_classifier.mean[1] - nb_classifier.mean[0])
for feature_name, importance in zip(data.feature_names, feature_importance):
    logger.info(f"Feature '{feature_name}' importance: {importance:.3f}")

2024-11-21 10:09:25,214 - INFO - Making predictions on test set
2024-11-21 10:09:25,217 - INFO - Predicting 114 samples
2024-11-21 10:09:25,219 - INFO - Prediction complete
2024-11-21 10:09:25,221 - INFO - Test set accuracy: 0.965
2024-11-21 10:09:25,223 - INFO - Feature 'mean radius' importance: 1.486
2024-11-21 10:09:25,224 - INFO - Feature 'mean texture' importance: 0.861
2024-11-21 10:09:25,226 - INFO - Feature 'mean perimeter' importance: 1.515
2024-11-21 10:09:25,228 - INFO - Feature 'mean area' importance: 1.439
2024-11-21 10:09:25,230 - INFO - Feature 'mean smoothness' importance: 0.776
2024-11-21 10:09:25,231 - INFO - Feature 'mean compactness' importance: 1.222
2024-11-21 10:09:25,232 - INFO - Feature 'mean concavity' importance: 1.414
2024-11-21 10:09:25,234 - INFO - Feature 'mean concave points' importance: 1.610
2024-11-21 10:09:25,235 - INFO - Feature 'mean symmetry' importance: 0.720
2024-11-21 10:09:25,236 - INFO - Feature 'mean fractal dimension' importance: 0.030
2024