# 多项式模型

当特征是离散的时候，使用多项式模型。

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB


# 预处理
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import sys
import warnings

warnings.simplefilter('ignore')

In [2]:
def load_data():
    data = load_breast_cancer()
    return data.data,data.target

def load_data_split():
    X,y = load_data()
    X_train,X_test,y_train,y_test = train_test_split(X,y)
    return X_train,X_test,y_train,y_test

def standardize(X_):
    X_ = StandardScaler().fit_transform(X_)
    return X_

def load_data_split_standarize():
    X_train,X_test,y_train,y_test = load_data_split()
    return standardize(X_train),standardize(X_test),y_train,y_test

def load_data_standarize_discret():
    X_train,X_test,y_train,y_test = load_data_split_standarize()
    kd = KBinsDiscretizer(encode='ordinal').fit(X_train)
    X_train = kd.transform(X_train)
    X_test = kd.transform(X_test)
    return X_train,X_test,y_train,y_test

In [3]:
class MultinomialNB_amy:
    '''
    Naive Bayes classifier for multinomial models
    The multinomial Naive Bayes classifier is suitable for classification with
    discrete features

    Parameters
    ----------
    alpha : float, optional (default=1.0)
            Setting alpha = 0 for no smoothing
            Setting 0 < alpha < 1 is called Lidstone smoothing
            Setting alpha = 1 is called Laplace smoothing 
    fit_prior : boolean
            Whether to learn class prior probabilities or not.
            If false, a uniform prior will be used.
    class_prior : 
            array-like, size (n_classes,)
            Prior probabilities of the classes. If specified the priors are not
            adjusted according to the data.
            类的先验概率
    '''

    def __init__(
        self
        #, alpha = 1.0
        #, fit_prior = True
        #, class_prior = None
    ):
        #self.alpha = alpha
        #self.fit_prior = fit_prior
        self.class_prior_ = None # class_prior
        self.classes_ = None
        self.conditional_prob_ = None

    def _calculate_feature_prob(
        self
        , X_features_i
        , feature_uniq
    ):
        '''
        计算对应特征的条件概率
        '''
        # 样本总数 
        n = len(X_features_i)
        # 特征数
        feature_n = len(feature_uniq)
        # 存在于该集中特征的每个值的个数
        feature_X_i_uniq ,feature_X_i_counts = np.unique(X_features_i ,return_counts=True)
        # 初始化所有特征值的条件概率
        feature_prob = {}
        # 计算所有特征值的条件概率
        for uniq in feature_uniq:
            f_uniq_cnt = feature_X_i_counts[feature_X_i_uniq == uniq]
            f_uniq_cnt = f_uniq_cnt[0] if len(f_uniq_cnt) > 0 else 0
            prob = (f_uniq_cnt + 1.) / (n + feature_n)
            feature_prob[uniq] = prob
        return feature_prob
        
    def fit(
        self
        , X
        , y
    ):
        n = len(y)
        # 类别
        self.classes_, classes_counts = np.unique(y,return_counts=True)
        # 计算类的先验概率: P(y=ck)
        self.class_prior_ = classes_counts / (n + len(self.classes_))
        # 计算条件概率: P( xj | y = ck )
        self.conditional_prob_ = {}  # like { c0:{ x0:{ value0:0.2, value1:0.8 }, x1:{} }, c1:{...} }
        for c in self.classes_:
            self.conditional_prob_[c] = {}
            for i in range(X.shape[1]):  # for each feature
                features = X[:, i] # 取特征i
                feature_uniq = np.unique(features) # 取所有特征的值
                X_features_i = features[y==c] # 取在c标签下的特征值
                self.conditional_prob_[c][i] = self._calculate_feature_prob(
                    X_features_i
                    , feature_uniq
                ) # 计算条件概率
        return self

    def predict_single(
        self
        , x):
        max_p = 0.0
        max_k = 0
        class_prior_map = dict(zip(self.classes_,self.class_prior_))            
        for k in class_prior_map.keys():
            # 类的先验概率
            c_p = class_prior_map[k]
            for f_i in self.conditional_prob_.keys():
                # 对应类k的条件概率
                c_p =c_p * self.conditional_prob_[k][f_i][x[f_i]]
            if(c_p > max_p):
                max_k = k
                max_p = c_p
        
        return max_k
    
    def predict(
        self
        , X
    ):
        y = np.empty(len(X))
        for i,x in enumerate(X):
            y[i] = self.predict_single(x)
        return y
    
    def score(
        self
        ,X
        ,y
    ):
        return np.sum(y==self.predict(X))/len(y)

In [4]:
X_train,X_test,y_train,y_test = load_data_standarize_discret()
mnb = MultinomialNB_amy().fit(X_train,y_train)
mnb.predict(X_test)
print('my score : ' , mnb.score(X_test,y_test))#应该没错
#mnb.conditional_prob_
skmnb=MultinomialNB().fit(X_train,y_train)
print('sklearn score :' ,skmnb.score(X_test,y_test))

my score :  0.8951048951048951
sklearn score : 0.7832167832167832


In [5]:
X_train,X_test,y_train,y_test = load_data_split_standarize()
skgnb = GaussianNB().fit(X_train,y_train)
print('sk score : ' , skgnb.score(X_test,y_test))

sk score :  0.9230769230769231


In [6]:
class GaussianNB_amy():
    def __init__(
        self):
        self.classes_prob_map_ = None
        self.feature_std_mean_ = None
      
    def fit(
        self
        , X
        , y):
        # 样本数
        n = len(y)
        # 特征数
        features_n = X.shape[1]
        # 类的先验概率
        class_uniq,class_cnts = np.unique(y,return_counts=True)
        class_prob = class_cnts / n + len(class_uniq)
        self.classes_prob_map_ = dict(zip(class_uniq,class_prob))
        # 计算每个类对应特征的方差和均值
        feature_std_mean = {}
        # 所有的类别
        for c in self.classes_prob_map_.keys():
            f = {}
            # 对每个特征计算均值方差
            for i in range(X.shape[1]):
                f[i] = {}
                X_feature_i = X[: ,i]
                X_c_feature_i = X_feature_i[y==c]
                # c类第i个特征的均值方差
                f[i]['mean'] = np.mean(X_c_feature_i)
                f[i]['std'] = np.std(X_c_feature_i)
            feature_std_mean[c] = f
        self.feature_std_mean_ =feature_std_mean
        return self
         
    def predict_single(
        self
        , x
    ):
        max_p = 0.0
        max_c = 0
        for c in self.classes_prob_map_.keys():
            c_p = self.classes_prob_map_[c] 
            for i in range(len(x)):
                mean = self.feature_std_mean_[c][i]['mean']
                std = self.feature_std_mean_[c][i]['std']
                c_p *= (1. / (np.sqrt(2 * np.pi) * std)) * np.exp( (-(x[i] - mean)**2) / (2 * (std ** 2)))
            if(c_p > max_p):
                max_c = c
                max_p = c_p
        return max_c
    
    def predict(
        self
        ,X
    ):
        y = np.empty(len(X))
        for i,x in enumerate(X):
            y[i] = self.predict_single(x)
        return y
    
    def score(
        self
        ,X
        ,y
    ):
        return np.sum(y==self.predict(X))/len(y)

In [7]:
X_train,X_test,y_train,y_test = load_data_split_standarize()
skgnb = GaussianNB().fit(X_train,y_train)
print('sk score : ' , skgnb.score(X_test,y_test))
amygnb = GaussianNB_amy().fit(X_train,y_train)
print('my score : ' , amygnb.score(X_test,y_test))

sk score :  0.9300699300699301
my score :  0.9230769230769231
