In [14]:
from sklearn.datasets import (make_classification, load_breast_cancer)
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# ___Naive Bayes Classifiers___
-------------------

In [1]:
# Naive Bayes classifiers -> Based on simple probabilistic models of how the data in each class might have been generated.
# They make the ("naive") assumption that the features are conditionally independent of one another, given the class.

# Since they are based on simple probabilistic rules, learning is very fast; only a few statistics per feature need to be estimated.
# The tradeoff is that their generalization performance is often worse than that of more sophisticated models.

In [2]:
# There are three flavours of Naive Bayes classifiers available in sklearn.
    # 1) Bernoulli -> binary features (present / absent), does not count frequencies.
    # 2) Multinomial -> discrete count features (e.g. how many times each word occurs in a document).
    # 3) Gaussian -> continuous / real valued features.
        # Estimates the mean and the standard deviation of each feature, for each class.

## ___Gaussian Naive Bayes Models___
------------

In [3]:
# For predictions, the classifier compares the features of the given data point with the feature statistics of each class, and 
# predicts the class that best matches the data point.
# Gaussian Naive Bayes models assume that the data of each class is generated by a simple Gaussian distribution.
# The prediction process essentially involves computing, for each class, the probability of that class's Gaussian distribution having generated this data point.
# The classifier then picks the class with the highest probability.

In [7]:
# Synthetic classification problem: 10,000 samples with two informative features,
# weakly separated classes (class_sep) and 10% label noise (flip_y).
x, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=0.5,
    flip_y=0.1,
    random_state=0,
)

In [8]:
# Sanity check on the generated feature matrix: (number of samples, number of features).
x.shape

(10000, 2)

In [9]:
# Hold out 25% of the synthetic data for testing.
# random_state pins the shuffle so the split (and therefore the reported score) is reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size = 0.75, random_state = 0)

In [10]:
# Create the estimator, then fit it on the training split (fit() returns the fitted estimator).
nbClassifier = GaussianNB()
nbClassifier = nbClassifier.fit(train_x, train_y)

In [12]:
# Mean accuracy on the held-out test split. Performance is only okay-ish on this synthetic data,
# which was generated with low class separation (class_sep) and label noise (flip_y).

nbClassifier.score(test_x, test_y)

0.7644

In [13]:
# Naive Bayes classifiers implement a method called .partial_fit() in addition to .fit()
# This helps to train the model incrementally, which is particularly useful when dealing with huge datasets.

In [15]:
# Naive bayes models on real world data.

In [18]:
# Load the breast cancer dataset and keep 80% of it for training.
# random_state pins the shuffle so the split (and therefore the reported score) is reproducible across runs.
bcancer = load_breast_cancer()
train_x, test_x, train_y, test_y = train_test_split(bcancer.data, bcancer.target, train_size = 0.8, random_state = 0)

In [19]:
# Sizes of the train and test feature matrices: (number of samples, number of features).
train_x.shape, test_x.shape

((455, 30), (114, 30))

In [20]:
# Create a fresh estimator, then fit it on the breast cancer training split (fit() returns the fitted estimator).
nbClassifier = GaussianNB()
nbClassifier = nbClassifier.fit(train_x, train_y)

In [21]:
# Mean accuracy on the held-out breast cancer test split.
nbClassifier.score(test_x, test_y)

0.9298245614035088

In [22]:
# However, the performance is quite remarkable on this real-world dataset!!

In [23]:
# Gaussian Naive Bayes models are typically used with high dimensional datasets,
# where each data point can have thousands of features.

# Bernoulli and Multinomial models are widely used in text classification problems!
# Where the feature vectors contain a huge number of tokens, and the feature matrices are commonly sparse!

# Naive Bayes models are mathematically related to linear models, and share similar merits and demerits!