In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn import metrics, linear_model
from scipy import stats
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt

import dmining

## Construct the spam problem

In [2]:
df_spam = dmining.load_spam_data()
problem_spam = dmining.Problem.from_data_frame(df_spam)
problem_train, problem_test = problem_spam.train_test_split(0.25, 0)

## Manual Gaussian Naive Bayes

In [3]:
group_proba = dmining.group_probability(problem_train.target)
group_proba

{False: 0.6078260869565217, True: 0.39217391304347826}

In [4]:
scalers = {}
for k, v in group_proba.items():
    sc = preprocessing.StandardScaler()
    sc.fit(problem_train.data[problem_train.target == k])
    scalers[k] = sc

Suppose that for each class k, the features are independent. So $Pr(feature | class = k) = \prod_i Pr(feature_i | class = k)$.

In [5]:
descriminant = pd.DataFrame(0., index=problem_test.index, columns=group_proba.keys())
for k, sc in scalers.items():
    data_k = sc.transform(problem_test.data)
    prob_k = stats.norm.pdf(data_k)
    descriminant[k] = group_proba[k] * np.prod(prob_k, axis=1)

In [6]:
test_predict = descriminant.idxmax(axis=1)

In [7]:
print metrics.classification_report(problem_test.target, test_predict)

             precision    recall  f1-score   support

      False       0.87      0.92      0.89       691
       True       0.87      0.79      0.83       460

avg / total       0.87      0.87      0.87      1151



## Gaussian Naive Bayesian from scikit-learn

In [8]:
from sklearn import naive_bayes

In [9]:
gaussian_nb = naive_bayes.GaussianNB()

In [10]:
gaussian_nb.fit(problem_train.data, problem_train.target)

GaussianNB(priors=None)

In [11]:
y_predict = gaussian_nb.predict(problem_test.data)

In [12]:
print metrics.classification_report(problem_test.target, y_predict)

             precision    recall  f1-score   support

      False       0.94      0.72      0.81       691
       True       0.69      0.93      0.79       460

avg / total       0.84      0.80      0.80      1151



In [17]:
x = np.random.random(12).reshape(4, 3)
x

array([[ 0.39351171,  0.23225529,  0.23354321],
       [ 0.74879882,  0.19547051,  0.56008605],
       [ 0.9878556 ,  0.48276179,  0.13372494],
       [ 0.94656448,  0.61272792,  0.94587472]])

In [19]:
np.rollaxis(x, 0)

array([[ 0.39351171,  0.23225529,  0.23354321],
       [ 0.74879882,  0.19547051,  0.56008605],
       [ 0.9878556 ,  0.48276179,  0.13372494],
       [ 0.94656448,  0.61272792,  0.94587472]])

In [20]:
np.rollaxis?

In [21]:
np.atleast_2d?