In [3]:

import warnings

from abc import ABCMeta, abstractmethod


import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import binarize
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import label_binarize
from sklearn.utils import check_X_y, check_array, deprecated
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils.fixes import logsumexp
from sklearn.utils.multiclass import _check_partial_fit_first_call
from sklearn.utils.validation import check_is_fitted, check_non_negative, column_or_1d
from sklearn.utils.validation import _check_sample_weight

# Names exported by a star-import of this module.
# NOTE(review): only MultinomialNB is defined in the visible part of this
# file; the other four are presumably defined elsewhere -- confirm, because
# `from <module> import *` fails on any listed name that does not exist.
__all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB',
           'CategoricalNB']


class _BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    """Abstract base class for naive Bayes estimators.

    Subclasses implement ``_joint_log_likelihood`` (and normally override
    ``_check_X``); the public prediction methods are derived from those two.
    """

    @abstractmethod
    def _joint_log_likelihood(self, X):
        """Compute the unnormalized posterior log probability of X.

        That is ``log P(c) + log P(x|c)`` for every row x of X. The
        prediction methods reduce and normalize the result along ``axis=1``,
        so it must hold one row per sample and one column per class, i.e.
        have shape (n_samples, n_classes).

        ``predict``, ``predict_proba`` and ``predict_log_proba`` pass in
        exactly what ``_check_X`` returned.
        """

    def _check_X(self, X):
        """Validate X before prediction; the base version is a no-op."""
        # Deliberately not an @abstractmethod: per the original note, the
        # deprecated public alias sklearn.naive_bayes.BayesNB is kept until
        # 0.24, and third-party classes derived from it must keep working
        # without defining this hook.
        return X

    def predict(self, X):
        """
        Classify each test vector in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        C : ndarray of shape (n_samples,)
            The entry of ``classes_`` with the highest joint log likelihood
            for each sample.
        """
        check_is_fitted(self)
        checked = self._check_X(X)
        scores = self._joint_log_likelihood(checked)
        best_class = np.argmax(scores, axis=1)
        return self.classes_[best_class]

    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Log-probability of each sample for each class in the model.
            Columns follow the order of the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        scores = self._joint_log_likelihood(self._check_X(X))
        # Normalize by P(x) = P(f_1, ..., f_n): subtract, row by row, the
        # log of the summed exponentiated class scores.
        log_norm = np.atleast_2d(logsumexp(scores, axis=1)).T
        return scores - log_norm

    def predict_proba(self, X):
        """
        Return probability estimates for the test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Probability of each sample for each class in the model.
            Columns follow the order of the attribute :term:`classes_`.
        """
        log_proba = self.predict_log_proba(X)
        return np.exp(log_proba)




# Floor for the smoothing parameter: _BaseDiscreteNB._check_alpha warns and
# raises any smaller alpha up to this value to avoid numeric errors.
_ALPHA_MIN = 1e-10


class _BaseDiscreteNB(_BaseNB):
    """Abstract base class for naive Bayes on discrete/categorical data.

    This class drives fitting; a concrete subclass must provide

    - ``_count(X, Y)`` to update the counters created by ``_init_counters``,
    - ``_update_feature_log_prob(alpha)`` to set ``feature_log_prob_``,
    - ``_joint_log_likelihood(X)``,

    as well as the ``alpha``, ``fit_prior`` and ``class_prior`` attributes
    that are read here.
    """

    def _check_X(self, X):
        """Validate prediction input, accepting dense or CSR sparse data."""
        return check_array(X, accept_sparse='csr')

    def _check_X_y(self, X, y):
        """Validate training input, accepting dense or CSR sparse X."""
        return check_X_y(X, y, accept_sparse='csr')

    def _update_class_log_prior(self, class_prior=None):
        """Set ``class_log_prior_``.

        An explicit ``class_prior`` wins; otherwise the empirical prior is
        derived from ``class_count_`` when ``fit_prior`` is true, and a
        uniform prior is used when it is false.

        Raises
        ------
        ValueError
            If ``class_prior`` does not have one entry per class.
        """
        n_classes = len(self.classes_)
        if class_prior is not None:
            if len(class_prior) != n_classes:
                raise ValueError("Number of priors must match number of"
                                 " classes.")
            self.class_log_prior_ = np.log(class_prior)
        elif self.fit_prior:
            with warnings.catch_warnings():
                # silence the warning when count is 0 because class was not yet
                # observed
                warnings.simplefilter("ignore", RuntimeWarning)
                log_class_count = np.log(self.class_count_)

            # empirical prior, with sample_weight taken into account
            self.class_log_prior_ = (log_class_count -
                                     np.log(self.class_count_.sum()))
        else:
            self.class_log_prior_ = np.full(n_classes, -np.log(n_classes))

    def _check_alpha(self):
        """Validate ``self.alpha`` and return the value to smooth with.

        Returns
        -------
        alpha : scalar or ndarray
            ``self.alpha``, with every entry below ``_ALPHA_MIN`` raised to
            ``_ALPHA_MIN`` (a warning is issued when that happens).

        Raises
        ------
        ValueError
            If any alpha is negative, or if alpha is an ndarray whose first
            dimension differs from ``n_features_``.
        """
        if np.min(self.alpha) < 0:
            raise ValueError('Smoothing parameter alpha = %.1e. '
                             'alpha should be > 0.' % np.min(self.alpha))
        if isinstance(self.alpha, np.ndarray):
            if not self.alpha.shape[0] == self.n_features_:
                raise ValueError("alpha should be a scalar or a numpy array "
                                 "with shape [n_features]")
        if np.min(self.alpha) < _ALPHA_MIN:
            warnings.warn('alpha too small will result in numeric errors, '
                          'setting alpha = %.1e' % _ALPHA_MIN)
            return np.maximum(self.alpha, _ALPHA_MIN)
        return self.alpha

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incrementally fit on a batch of samples.

        Counts from this batch are added to the counters kept from earlier
        calls, then the log probabilities are recomputed.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors.
        y : array-like of shape (n_samples,)
            Target values.
        classes : array-like, default=None
            All class labels that can appear in ``y``. It is handed to
            ``_check_partial_fit_first_call`` and its length sizes the
            counters, so it must be given on the first call.
        sample_weight : array-like of shape (n_samples,), default=None
            Per-sample weights; ``None`` means every sample counts once.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If the number of features differs from earlier calls, or if X
            and y disagree on the number of samples. Invalid
            ``sample_weight`` is rejected by ``_check_sample_weight``.
        """
        X, y = self._check_X_y(X, y)
        _, n_features = X.shape

        if _check_partial_fit_first_call(self, classes):
            # This is the first call to partial_fit:
            # initialize various cumulative counters
            n_effective_classes = len(classes) if len(classes) > 1 else 2
            self._init_counters(n_effective_classes, n_features)
            self.n_features_ = n_features
        elif n_features != self.n_features_:
            msg = "Number of features %d does not match previous data %d."
            raise ValueError(msg % (n_features, self.n_features_))

        Y = label_binarize(y, classes=self.classes_)
        if Y.shape[1] == 1:
            # Binary problems come back as a single indicator column; add
            # the complementary column so there is one column per class.
            Y = np.concatenate((1 - Y, Y), axis=1)

        if X.shape[0] != Y.shape[0]:
            msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
            raise ValueError(msg % (X.shape[0], y.shape[0]))

        # label_binarize() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently
        Y = Y.astype(np.float64, copy=False)
        if sample_weight is not None:
            # Validate against X instead of relying on broadcasting, which
            # silently accepted length-1 weights and could apply column
            # vectors to classes rather than to samples.
            sample_weight = _check_sample_weight(sample_weight, X)
            Y *= sample_weight[:, np.newaxis]

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        self._count(X, Y)

        # XXX: OPTIM: we could introduce a public finalization method to
        # be called by the user explicitly just once after several consecutive
        # calls to partial_fit and prior any call to predict[_[log_]proba]
        # to avoid computing the smooth log probas at each call to partial fit
        alpha = self._check_alpha()
        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        return self

    def fit(self, X, y, sample_weight=None):
        """Fit the classifier from scratch on X, y.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors.
        y : array-like of shape (n_samples,)
            Target values.
        sample_weight : array-like of shape (n_samples,), default=None
            Per-sample weights; ``None`` means every sample counts once.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            Raised by the validation helpers for invalid X, y or
            ``sample_weight``, and by ``_check_alpha`` /
            ``_update_class_log_prior`` for invalid hyper-parameters.
        """
        X, y = self._check_X_y(X, y)
        _, n_features = X.shape
        self.n_features_ = n_features

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            # Binary problems come back as a single indicator column; add
            # the complementary column so there is one column per class.
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently;
        # this means we also don't have to cast X to floating point
        if sample_weight is not None:
            Y = Y.astype(np.float64, copy=False)
            # Validate against X instead of relying on broadcasting, which
            # silently accepted length-1 weights and could apply column
            # vectors to classes rather than to samples.
            sample_weight = _check_sample_weight(sample_weight, X)
            Y *= sample_weight[:, np.newaxis]

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]

        self._init_counters(n_effective_classes, n_features)
        self._count(X, Y)
        alpha = self._check_alpha()
        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        return self

    def _init_counters(self, n_effective_classes, n_features):
        """Create zeroed ``class_count_`` and ``feature_count_`` arrays."""
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                       dtype=np.float64)

    # XXX The following is a stopgap measure; we need to set the dimensions
    # of class_log_prior_ and feature_log_prob_ correctly.
    def _get_coef(self):
        """Return ``feature_log_prob_``, dropping row 0 for two classes."""
        return (self.feature_log_prob_[1:]
                if len(self.classes_) == 2 else self.feature_log_prob_)

    def _get_intercept(self):
        """Return ``class_log_prior_``, dropping entry 0 for two classes."""
        return (self.class_log_prior_[1:]
                if len(self.classes_) == 2 else self.class_log_prior_)

    # Read-only views exposed under linear-model style attribute names.
    coef_ = property(_get_coef)
    intercept_ = property(_get_intercept)

    def _more_tags(self):
        """Return the estimator tags declared by this class."""
        return {'poor_score': True}


class MultinomialNB(_BaseDiscreteNB):
    """Naive Bayes classifier for multinomially distributed features.

    Parameters
    ----------
    alpha : float or ndarray, default=1.0
        Additive smoothing value added to every feature count.
    fit_prior : bool, default=True
        Whether class priors are estimated from the training data; when
        false a uniform prior is used.
    class_prior : array-like of shape (n_classes,), default=None
        Fixed class priors; when given they take precedence over
        ``fit_prior``.

    Notes
    -----
    The ``print`` calls in the private methods are debugging output that
    exposes the intermediate quantities of the model.
    NOTE(review): consider switching them to ``logging`` once the
    investigation is done, since they fire on every fit and predict call.
    """

    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None):
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior = class_prior

    def _more_tags(self):
        """Return the estimator tags declared by this class."""
        return {'requires_positive_X': True}

    def _count(self, X, Y):
        """Accumulate raw (unsmoothed) feature and class counts.

        ``Y`` holds one column per class, so ``Y.T @ X`` sums the feature
        values of the samples of each class. Smoothing is applied later,
        in ``_update_feature_log_prob``.
        """
        check_non_negative(X, "MultinomialNB (input X)")
        self.feature_count_ += safe_sparse_dot(Y.T, X)
        self.class_count_ += Y.sum(axis=0)

        print("count")
        print(self.feature_count_)
        #print(self.class_count_)

    def _update_feature_log_prob(self, alpha):
        """Apply smoothing to raw counts and recompute log probabilities."""
        smoothed_fc = self.feature_count_ + alpha
        print("smoothed fc")
        print(smoothed_fc)
        smoothed_cc = smoothed_fc.sum(axis=1)
        print("smoothed_cc")
        print(smoothed_cc)
        # Per-class log normalizer as a column so it broadcasts over the
        # features; computed once and shared by the debug print and the
        # actual update.
        log_smoothed_cc = np.log(smoothed_cc.reshape(-1, 1))
        print(log_smoothed_cc)

        self.feature_log_prob_ = np.log(smoothed_fc) - log_smoothed_cc
        print("feature_iiiiiiiiiilog_prob")
        print(self.feature_log_prob_)

    def _joint_log_likelihood(self, X):
        """Calculate the posterior log probability of the samples X.

        Returns an array with one row per sample and one column per class.
        """
        print("jlllllllllllllll")
        print(self.class_log_prior_)
        # Evaluate the product once; it used to be computed a second time
        # just for the debug print.
        jll = (safe_sparse_dot(X, self.feature_log_prob_.T) +
               self.class_log_prior_)
        print(jll)
        return jll



import seaborn as sn
import matplotlib.pyplot as plt
import pandas
from sklearn.metrics import accuracy_score
from sklearn import naive_bayes as nb
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset and discard the columns that are not used as features.
dataframe = pandas.read_table('/content/mixed200.txt')
data = dataframe.drop(["No.", "StdPageRank", "VarPageRank"], axis=1)
dataset = data.values

# Sizes of the sequential train / test partitions.
training = 57748
testing = 24749

# Number of rows to use (training + testing).
l = 82497
X = dataset[:, :]

# Binary target: 1 where column 9 equals 1, otherwise 0.
labels = []
for i in range(l):
    labels.append(1 if X[i][9] == 1 else 0)

# The first nine columns are the model inputs.
features = X[:l, :9]

# Class balance of the whole dataset.
print(labels.count(0))
print(labels.count(1))

# Sequential split: the first `training` rows train the model and the
# following `testing` rows evaluate it.
train = features[:training]
test = features[training:training + testing]
train_labels = labels[:training]
test_labels = labels[training:training + testing]

# Initialize the classifier.
gnb = MultinomialNB()

# Train the classifier, then score it on the held-out rows.
print(len(train_labels))
model = gnb.fit(train, train_labels)
preds = gnb.predict(test)
print("----------------------------MNB--------------------------")
print(accuracy_score(test_labels, preds))



23663
58834
57748
count
[[7.20184000e+05 1.39657100e+06 2.40487000e+05 1.47190000e+04
  2.34285000e+05 1.47360000e+04 4.03109598e+02 3.55455770e+03
  2.21042790e+02]
 [1.03530300e+06 1.82424200e+06 1.71753000e+05 4.10530000e+04
  1.67663000e+05 4.10510000e+04 1.76403744e+03 2.66697401e+03
  6.30614075e+02]]
smoothed fc
[[7.20185000e+05 1.39657200e+06 2.40488000e+05 1.47200000e+04
  2.34286000e+05 1.47370000e+04 4.04109598e+02 3.55555770e+03
  2.22042790e+02]
 [1.03530400e+06 1.82424300e+06 1.71754000e+05 4.10540000e+04
  1.67664000e+05 4.10520000e+04 1.76503744e+03 2.66797401e+03
  6.31614075e+02]]
smoothed_cc
[2625169.71008624 3286135.62551802]
[[14.7806561 ]
 [15.00522285]]
feature_iiiiiiiiiilog_prob
[[-1.2933927  -0.63112488 -2.39023063 -5.18369371 -2.41635823 -5.18253949
  -8.77896998 -6.6043889  -9.37778599]
 [-1.15501719 -0.58854719 -2.95140435 -4.3825793  -2.97550559 -4.38262802
  -7.52929567 -7.11614819 -8.55694428]]
jlllllllllllllll
[-1.25030703 -0.3374563 ]
[[ -98.68792502  -

23663
58834
57748
count
[[7.20184000e+05 1.39657100e+06 2.40487000e+05 1.47190000e+04
  2.34285000e+05 1.47360000e+04 4.03109598e+02 3.55455770e+03
  2.21042790e+02]
 [1.03530300e+06 1.82424200e+06 1.71753000e+05 4.10530000e+04
  1.67663000e+05 4.10510000e+04 1.76403744e+03 2.66697401e+03
  6.30614075e+02]]
[16540. 41208.]
smoothed fc
[[7.20185000e+05 1.39657200e+06 2.40488000e+05 1.47200000e+04
  2.34286000e+05 1.47370000e+04 4.04109598e+02 3.55555770e+03
  2.22042790e+02]
 [1.03530400e+06 1.82424300e+06 1.71754000e+05 4.10540000e+04
  1.67664000e+05 4.10520000e+04 1.76503744e+03 2.66797401e+03
  6.31614075e+02]]
smoothed_cc
[2625169.71008624 3286135.62551802]
[[14.7806561 ]
 [15.00522285]]
feature_log_prob
[[-1.2933927  -0.63112488 -2.39023063 -5.18369371 -2.41635823 -5.18253949
  -8.77896998 -6.6043889  -9.37778599]
 [-1.15501719 -0.58854719 -2.95140435 -4.3825793  -2.97550559 -4.38262802
  -7.52929567 -7.11614819 -8.55694428]]
jlllllllllllllll
[[ -98.68792502  -95.55158502]
 [-174.