In [None]:
from math import sqrt
from math import pi
from math import exp
from sklearn import naive_bayes, datasets
import numpy as np

Implementamos una función para separar el dataset por valores de las clases

In [None]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The class label of a row is its last element.

    Args:
        dataset: Indexable collection of rows supporting ``len()``; every row
            must be indexable with ``-1`` and its last element hashable.

    Returns:
        dict: Maps each class label to the list of complete rows (label
        included) carrying that label, in their original order.
    """
    grouped = {}
    records = (dataset[index] for index in range(len(dataset)))
    for record in records:
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(record[-1], []).append(record)
    return grouped

Calculamos la media de una lista de números

In [None]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: Sized iterable of numeric values.

    Returns:
        The sum of the values divided by how many there are.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = len(numbers)
    return total / float(count)

Calculamos la desviación estándar de una lista de números

In [None]:
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by ``n - 1``.

    Args:
        numbers: Sized iterable of numeric values.

    Returns:
        float: Square root of the sample variance.

    Raises:
        ZeroDivisionError: If ``numbers`` has exactly one element (the
            ``n - 1`` denominator is zero) or is empty.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

Se calcula la media, desviación estándar y conteo para cada columna dentro del dataset

In [None]:
def summarize_dataset(dataset):
    """Compute ``(mean, stdev, count)`` for every feature column of ``dataset``.

    The last element of each row is the class label. That column is dropped
    *before* any statistic is computed, so labels do not have to be numeric
    and no work is spent on a summary that would be discarded anyway.

    Args:
        dataset: Iterable of equally long rows laid out as
            ``[feature_1, ..., feature_k, label]``.

    Returns:
        list: One ``(mean, stdev, count)`` tuple per feature column, in
        column order. An empty ``dataset`` yields an empty list.

    Raises:
        ZeroDivisionError: If ``dataset`` holds a single row, because the
            sample standard deviation divides by ``n - 1``.
    """
    # zip(*dataset) transposes rows into columns; the slice removes the label column.
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column), len(column)) for column in feature_columns]

Se divide el dataset por clase y, para cada clase, se calculan los estadísticos (media, desviación estándar y conteo) de cada columna

In [None]:
def summarize_by_class(dataset):
    """Summarize the feature columns of ``dataset`` separately for each class.

    Args:
        dataset: Collection of rows whose last element is the class label.

    Returns:
        dict: Maps each class label to the list returned by
        ``summarize_dataset`` for the rows of that class.
    """
    rows_by_class = separate_by_class(dataset)
    return {
        label: summarize_dataset(class_rows)
        for label, class_rows in rows_by_class.items()
    }

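Se calcula la función de densidad de probabilidad gaussiana evaluada en x: $f(x) = \frac{1}{\sqrt{2\pi}\,\sigma}\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$, donde $\mu$ es la media y $\sigma$ la desviación estándar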

In [None]:
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the Gaussian.
        stdev: Standard deviation of the Gaussian.

    Returns:
        float: ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_deviation = (x - mean) ** 2
    variance = stdev ** 2
    decay = exp(-(squared_deviation / (2 * variance)))
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    return normalizer * decay

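Se calcula, para un registro dado, la puntuación de cada clase: la probabilidad a priori de la clase multiplicada por la densidad gaussiana de cada variable. Estos valores no están normalizados, por lo que no suman 1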

In [None]:
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described in ``summaries``.

    For each class the score is the class prior (its row count divided by the
    total row count) multiplied by the Gaussian density of every feature
    value. The scores are not normalized.

    Args:
        summaries: dict mapping each class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Indexable record; ``row[i]`` is the value of feature ``i``.
            Elements beyond the summarized features are ignored.

    Returns:
        dict: Maps each class label to its score for ``row``.
    """
    # The row count of a class is read from its first feature summary.
    class_sizes = {label: stats[0][2] for label, stats in summaries.items()}
    total_rows = sum(class_sizes.values())

    probabilities = {}
    for label, stats in summaries.items():
        score = class_sizes[label] / float(total_rows)
        for index, (mu, sigma, _count) in enumerate(stats):
            score *= calculate_probability(row[index], mu, sigma)
        probabilities[label] = score
    return probabilities

Se define un dataset

In [None]:
# Toy dataset: every row holds two numeric features followed by the class label (0 or 1).
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

Se calculan las probabilidades para cada registro

In [None]:
# Build the per-class (mean, stdev, count) summaries from the toy dataset, then
# print the unnormalized score of every class for each row of that same dataset.
summaries = summarize_by_class(dataset)
for row in dataset:
    print(calculate_class_probabilities(summaries, row))

In [None]:
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    Args:
        summaries: Per-class feature summaries, as accepted by
            ``calculate_class_probabilities``.
        row: Record to classify.

    Returns:
        The label whose score is largest; on ties the label encountered first
        wins. ``None`` when ``summaries`` contains no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    winner_score = -1
    for label, score in scores.items():
        # Skip candidates that do not strictly beat the current winner.
        if winner is not None and not score > winner_score:
            continue
        winner, winner_score = label, score
    return winner

In [None]:
# Rebuild the per-class summaries and print the predicted label for each row
# of the toy dataset (the rows the summaries were computed from).
summaries = summarize_by_class(dataset)
for row in dataset:
    print(predict(summaries, row))

Vamos a trabajar con el dataset de cáncer de seno

In [None]:
# Load the breast cancer dataset bundled with scikit-learn.
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target
# Append the target as a trailing column so each row reads [features..., label].
data = np.concatenate((X, y[:, np.newaxis]), axis=1)
data[:5]

Vamos a utilizar un clasificador Naïve Bayes que supone una distribución Gaussiana de los datos numéricos, ya que los valores de las variables independientes son continuos.

In [None]:
# Fit a Gaussian Naive Bayes classifier on the full feature matrix X and labels y.
# The value returned by fit() is kept as modeloGNB and used by the following cells.
gnb = naive_bayes.GaussianNB()
modeloGNB = gnb.fit(X, y)

Tenemos el conteo de clases

In [None]:
# Per-class counts recorded by the fitted model.
modeloGNB.class_count_

Tenemos la probabilidad a priori de las clases

In [None]:
# Prior probability of each class as stored by the fitted model.
modeloGNB.class_prior_

Tenemos la media

In [None]:
# Means stored by the fitted model.
modeloGNB.theta_

Tenemos la varianza de cada variable por clase (`var_` almacena varianzas, no desviaciones estándar)

In [None]:
# NOTE: var_ holds variances, not standard deviations; apply np.sqrt to obtain
# the latter.
modeloGNB.var_

Realizamos la predicción

In [None]:
# Predict on cancer.data, the same array the model was fitted on (X = cancer.data),
# so the error count reported below is measured on training data, not held-out data.
y_pred = modeloGNB.predict(cancer.data)

In [None]:
# Count the records whose predicted label differs from the true label.
n_errors = (y != y_pred).sum()
n_records = cancer.data.shape[0]
print("El modelo de Naïve Bayes se equivocó en %d de los %d registros que componen el dataset original"
      % (n_errors, n_records))