In [1]:
import os
import numpy as np

######朴素贝叶斯
class NBayes(object):
    """Naive Bayes classifier for discrete (categorical) features.

    Class priors and per-feature conditional probabilities are estimated by
    counting, with additive (Laplace) smoothing whose strength is ``self.ls``.
    """

    # Set up attributes
    def __init__(self):
        self.trainSet = 0               # training features, stored by train()
        self.trainLabel = 0             # training labels, stored by train()
        self.yProba = {}                # prior probabilities: {label: P(y)}
        self.xyProba = {}               # conditionals: {label: {feature idx: {value: P(x|y)}}}
        self.ySet = {}                  # number of training samples per label
        self.ls = 1                     # Laplace smoothing coefficient
        self.n_samples = 0              # number of training samples
        self.n_features = 0             # number of training features

    # Compute the prior probability P(y)
    def calPy(self, y, LS=True):
        """
        Estimate the prior probability of every label, i.e. its share of ``y``.

        The results are stored in ``self.yProba`` (probabilities) and
        ``self.ySet`` (raw counts).

        Parameters
        ----------
        y : 1D array-like
            trainLabel.
        LS : bool, optional
            Whether to apply Laplace smoothing. The default is True.

        Returns
        -------
        None.

        """
        labels = np.asarray(y).ravel()
        classes = np.unique(labels)
        # With smoothing every class receives `ls` pseudo-counts, so the
        # denominator must grow by ls * K for the priors to sum to 1.
        alpha = self.ls if LS else 0
        denominator = labels.size + alpha * classes.size

        priors = {}
        counts = {}
        for label in classes:
            count = int(np.count_nonzero(labels == label))
            counts[label] = count
            priors[label] = (count + alpha) / denominator
        self.yProba = priors
        self.ySet = counts
        return

    # Compute the conditional probability P(x|y)
    def calPxy(self, X, y, LS=True):
        """
        Estimate the conditional probability of every feature value per label.

        The result is stored in ``self.xyProba`` as
        ``{label: {feature index: {value: P(value | label)}}}``.

        Parameters
        ----------
        X : 2D array-like
            trainSet. A pandas DataFrame (accessed through ``.iloc``) or
            anything ``np.asarray`` turns into a 2D array.
        y : 1D array-like
            trainLabel.
        LS : bool, optional
            Whether to apply Laplace smoothing. The default is True.

        Returns
        -------
        None.

        """
        labels = np.asarray(y).ravel()
        isFrame = hasattr(X, "iloc")
        data = X if isFrame else np.asarray(X)

        # A feature's values and distinct levels do not depend on the label,
        # so extract them once instead of once per label.
        columns = []
        for xIdx in range(data.shape[1]):
            column = np.asarray(data.iloc[:, xIdx]) if isFrame else data[:, xIdx]
            columns.append((column, np.unique(column)))

        Pxy = {}
        for yi in np.unique(labels):                    # level 1: the label
            rows = np.nonzero(labels == yi)[0]
            Pxy[yi] = {}
            for xIdx, (column, levels) in enumerate(columns):
                Xiyi = column[rows]                     # feature values of samples labelled yi
                Pxy[yi][xIdx] = {                       # level 2: feature, level 3: feature value
                    xi: self.classifyProba(xi, Xiyi, levels.size, LS)
                    for xi in levels
                }
        self.xyProba = Pxy
        return

    # Probability of a discrete value, computed directly from counts
    def classifyProba(self, x, xArr, XiSetCount, LS=True):
        """
        Return the (optionally smoothed) relative frequency of ``x`` in ``xArr``.

        Parameters
        ----------
        x : scalar
            The value whose probability is wanted.
        xArr : 1D array-like
            Observed values of one feature.
        XiSetCount : int
            Number of distinct values the feature can take.
        LS : bool, optional
            Whether to apply Laplace smoothing. The default is True.

        Returns
        -------
        float
            ``(count + ls) / (len(xArr) + ls * XiSetCount)`` when smoothing,
            otherwise ``count / len(xArr)`` (0.0 when ``xArr`` is empty).

        """
        values = np.asarray(xArr)
        alpha = self.ls if LS else 0
        denominator = values.size + alpha * XiSetCount
        if denominator == 0:
            # Nothing observed and no smoothing: avoid dividing by zero.
            return 0.0
        return (int(np.count_nonzero(values == x)) + alpha) / denominator

    # Training
    def train(self, X, y):
        """
        Fit the priors and the conditional probabilities on ``X`` and ``y``.

        Parameters
        ----------
        X : 2D array-like
            trainSet.
        y : 1D array-like
            trainLabel.

        Returns
        -------
        None.

        """
        self.n_samples, self.n_features = np.shape(X)
        # Prior probabilities
        self.calPy(y)
        print('P(y)训练完毕!')
        # Conditional probabilities
        self.calPxy(X, y)
        print('P(x|y)训练完毕!')
        self.trainSet = X
        self.trainLabel = y
        return

In [2]:
class Aode(NBayes):
    """Averaged One-Dependence Estimators (AODE) classifier.

    Every discrete feature acts in turn as a "super-parent": the model stores
    the joint prior P(y, pa) and, for every other feature, the conditional
    P(x | y, pa). Discrete features are estimated by smoothed counts,
    continuous ones by a Gaussian (mean, standard deviation).
    """

    # Floor used wherever no usable estimate exists, and added to the Gaussian
    # standard deviation and density to keep them strictly positive.
    _PROBA_FLOOR = 1.0e-5

    def AodeTrain(self, X, y, columnsMark):
        """
        Fit the AODE tables on ``X`` and ``y``.

        After training:

        * ``self.yProba[yi][paIdx][pai]`` is ``{'count', 'ratio'}`` for the
          joint prior of label ``yi`` and super-parent value ``pai``;
        * ``self.xyProba[yi][paIdx][pai][xiIdx]`` is, for a discrete feature,
          ``{value: {'count', 'ratio'}}`` and, for a continuous feature, the
          tuple ``(mean, std)``.

        Parameters
        ----------
        X : pandas DataFrame
            trainSet; columns are accessed positionally through ``.iloc``.
        y : 1D array-like
            trainLabel, in the same row order as ``X``.
        columnsMark : sequence
            One flag per feature: 0 marks a discrete feature, 1 a continuous
            one. Features flagged 1 are never used as super-parent.

        Returns
        -------
        None.

        """
        self.n_samples, self.n_features = X.shape
        # Work on a plain array so that rows are always selected by position,
        # whatever index a pandas Series label vector may carry.
        labels = np.asarray(y).ravel()
        yset = np.unique(labels)
        # Column values and their distinct levels are loop invariants.
        columns = [np.asarray(X.iloc[:, j]) for j in range(self.n_features)]
        levels = [np.unique(column) for column in columns]

        Pypa = {}       # joint prior of label and super-parent value
        Pxypa = {}      # conditionals given label and super-parent value
        # Level 1: the labels
        for yi in yset:
            Pypa[yi] = {}
            Pxypa[yi] = {}
            classMask = labels == yi

            # Level 2: the super-parent features; continuous features cannot
            # be super-parents, discrete ones can
            for paIdx in range(self.n_features):
                if columnsMark[paIdx] == 1:
                    continue
                paset = levels[paIdx]
                Pypa[yi][paIdx] = {}
                Pxypa[yi][paIdx] = {}

                # Level 3: the values of the super-parent; select the rows
                # having this label and this super-parent value
                for pai in paset:
                    rows = np.nonzero((columns[paIdx] == pai) & classMask)[0]
                    # Joint prior of label and super-parent value
                    Pypa[yi][paIdx][pai] = self.__calyproba(
                        labels[rows], self.n_samples, len(yset), len(paset))

                    # Level 4: every other feature; discrete ones are counted,
                    # continuous ones are summarised by mean and std
                    childStats = {}
                    for xiIdx in range(self.n_features):
                        if xiIdx == paIdx:
                            continue
                        Xarr = X.iloc[rows, xiIdx]
                        if columnsMark[xiIdx] == 0:
                            childStats[xiIdx] = self.__categorytrain(Xarr, levels[xiIdx])
                        else:
                            childStats[xiIdx] = self.__continuoustrain(Xarr)
                    Pxypa[yi][paIdx][pai] = childStats

        print('P(y,pa)训练完毕!')
        print('P(x|y,pa)训练完毕!')
        self.yProba = Pypa
        self.xyProba = Pxypa
        self.trainSet = X
        self.trainLabel = y
        self.columnsMark = columnsMark
        return

    # Conditional probabilities of a discrete feature
    def __categorytrain(self, Xarr, xiset):
        """Return ``{value: {'count', 'ratio'}}`` for every value in ``xiset``.

        ``count`` is the smoothed number of occurrences in ``Xarr`` and
        ``ratio`` the smoothed probability from ``classifyProba``.
        """
        observed = np.asarray(Xarr)
        pxypa = {}
        for xivalue in xiset:
            pxypa[xivalue] = {
                'count': int(np.count_nonzero(observed == xivalue)) + self.ls,
                'ratio': self.classifyProba(xivalue, Xarr, len(xiset)),
            }
        return pxypa

    # Mean and standard deviation of a continuous feature
    def __continuoustrain(self, Xarr):
        """Return ``(mean, std)`` of ``Xarr`` as computed by its own methods."""
        pxypa = (Xarr.mean(), Xarr.std())
        return pxypa

    # Joint prior probability
    def __calyproba(self, yarr, ysum, ysetsum, pasetsum):
        """Return the smoothed joint prior of a label and a super-parent value.

        ``yarr`` holds the labels of the matching rows, ``ysum`` is the total
        number of samples, ``ysetsum`` the number of labels and ``pasetsum``
        the number of values of the super-parent feature.
        """
        count = len(yarr) + self.ls
        # Each (label, parent value) cell gets `ls` pseudo-counts, hence the
        # ls factor in the denominator.
        return {
            'count': count,
            'ratio': count / (ysum + self.ls * ysetsum * pasetsum),
        }

    # Probability factors shared by both prediction methods
    def __spodeFactors(self, X, i, yi, Padict, minSet):
        """Yield, per usable super-parent, the factors of its joint score.

        For sample ``i`` of ``X`` and label ``yi`` each yielded list starts
        with P(y, pa) and continues with P(x | y, pa) for every other feature.
        A super-parent is skipped when its value was not seen in training or
        when its smoothed count does not exceed ``minSet``.
        """
        floor = self._PROBA_FLOOR
        for paIdx, Pavaluedict in Padict.items():
            pavalue = X.iloc[i, paIdx]
            Statsdict = Pavaluedict.get(pavalue)
            if Statsdict is None or Statsdict['count'] <= minSet:
                continue
            factors = [Statsdict['ratio']]
            for xiIdx, xiparams in self.xyProba[yi][paIdx][pavalue].items():
                xi = X.iloc[i, xiIdx]
                if isinstance(xiparams, dict):
                    # Discrete feature; a value unseen in training gets the floor
                    entry = xiparams.get(xi)
                    factors.append(floor if entry is None else entry['ratio'])
                elif np.isnan(xiparams[0]) or np.isnan(xiparams[1]):
                    # Too few rows to estimate the Gaussian parameters
                    factors.append(floor)
                else:
                    miu = xiparams[0]
                    sigma = xiparams[1] + floor
                    density = (np.exp(-(xi - miu) ** 2 / (2 * sigma ** 2))
                               / (np.power(2 * np.pi, 0.5) * sigma))
                    factors.append(density + floor)
            yield factors

    # Prediction
    def aodepredict(self, X, minSet=0):
        """
        Return the unnormalised AODE score of every label for every sample.

        Parameters
        ----------
        X : pandas DataFrame
            Samples to score, with the same column layout as the training set.
        minSet : number, optional
            Super-parent values whose smoothed count is not greater than this
            are ignored. The default is 0.

        Returns
        -------
        numpy.ndarray
            Shape ``(n_samples, n_labels)``; columns follow the order of
            ``self.yProba``. Each entry is the sum over super-parents of
            P(y, pa) * prod P(x | y, pa).

        """
        n_samples = X.shape[0]
        proba = np.zeros((n_samples, len(self.yProba)))
        for i in range(n_samples):
            for idx, (yi, Padict) in enumerate(self.yProba.items()):
                sumvalue = 0.
                for factors in self.__spodeFactors(X, i, yi, Padict, minSet):
                    subvalue = 1
                    for factor in factors:
                        subvalue *= factor
                    sumvalue += subvalue
                proba[i, idx] = sumvalue
        return proba

    # Prediction in log space
    def aodepredictLog(self, X, minSet=0):
        """
        Return the logarithm of the scores computed by ``aodepredict``.

        Each super-parent term is accumulated as a sum of logs and the terms
        are combined with log-sum-exp, so the result is the log of the sum
        over super-parents. A label for which every super-parent is skipped
        scores ``-inf`` (the log of the 0 that ``aodepredict`` returns).

        Parameters
        ----------
        X : pandas DataFrame
            Samples to score, with the same column layout as the training set.
        minSet : number, optional
            Super-parent values whose smoothed count is not greater than this
            are ignored. The default is 0.

        Returns
        -------
        numpy.ndarray
            Shape ``(n_samples, n_labels)``; columns follow the order of
            ``self.yProba``.

        """
        n_samples = X.shape[0]
        proba_log = np.zeros((n_samples, len(self.yProba)))
        for i in range(n_samples):
            for idx, (yi, Padict) in enumerate(self.yProba.items()):
                sumvalue = -np.inf
                for factors in self.__spodeFactors(X, i, yi, Padict, minSet):
                    subvalue = 0.
                    for factor in factors:
                        subvalue += np.log(factor)
                    # log(exp(sumvalue) + exp(subvalue)) without leaving log space
                    sumvalue = np.logaddexp(sumvalue, subvalue)
                proba_log[i, idx] = sumvalue
        return proba_log