# 最大熵模型

![最大熵模型](../images/最大墒模型/最大墒模型学习.jpg)
![最大熵模型](../images/最大墒模型/最大墒模型学习2.jpg)
![最大熵模型](../images/最大墒模型/最大墒模型学习3.jpg)
![最大熵模型](../images/最大墒模型/最大墒模型学习4.jpg)

![迭代尺度算法](../images/最大墒模型/改进的迭代尺度法.jpg)
![迭代尺度算法](../images/最大墒模型/改进的迭代尺度法2.jpg)
![迭代尺度算法](../images/最大墒模型/改进的迭代尺度法3.jpg)
![迭代尺度算法](../images/最大墒模型/改进的迭代尺度法4.jpg)
![迭代尺度算法](../images/最大墒模型/改进的迭代尺度法5.jpg)

![改进的迭代尺度法](../images/最大墒模型/改进的迭代尺度法6.jpg)
![改进的迭代尺度法](../images/最大墒模型/改进的迭代尺度法7.jpg)
![改进的迭代尺度法](../images/最大墒模型/改进的迭代尺度法8.jpg)

In [1]:
import time
import numpy as np
from collections import defaultdict
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
X = data['data']
y = data['target']

# Split into train/test parts. No split size or seed is passed, so
# train_test_split's defaults apply (NOTE(review): the split is presumably
# not reproducible between runs -- confirm against the scikit-learn docs).
train_X, test_X, train_y, test_y = train_test_split(X, y)

# Report how many samples ended up in each part.
print('train size: %s' % len(train_X))
print('test size: %s' % len(test_X))

train size: 426
test size: 143


In [3]:
class MaxEntropy:
    """Binary maximum-entropy classifier trained by iterative scaling.

    Every distinct ``(feature value, label)`` pair observed in column ``j`` of
    the training data is treated as one indicator feature with its own weight.
    Training repeatedly applies the update
    ``w_k += (1 / M) * log(E_empirical[f_k] / E_model[f_k])``.

    Labels must be ``0`` or ``1``: the conditional probability is normalised
    over exactly the two labels ``y`` and ``1 - y``.

    Note that the model is trained inside the constructor.

    Args:
        train_X: 2-D array of training samples; must expose ``.shape`` and
            support ``train_X[i][j]`` indexing (e.g. a NumPy array).
        train_y: Sequence of 0/1 labels aligned with ``train_X``.
        iters: Number of training iterations.
        M: Positive scaling constant that divides every weight update.
            Defaults to 426, the value that used to be hard-coded.

    Raises:
        ValueError: If ``M`` is not positive.
    """

    def __init__(self, train_X, train_y, iters=10, M=426):
        if M <= 0:
            raise ValueError('M must be positive, got %r' % (M,))

        self.train_X = train_X
        self.train_y = train_y
        self.N_features = train_X.shape[1]
        self.N_samples = train_X.shape[0]
        self.M = M

        # Per-feature occurrence counts of every (value, label) pair.
        self.fixy_dict = self.__caculate_fixy()
        # Total number of distinct (feature, value, label) triples, i.e. the
        # number of weights of the model.
        self.pair_xy = sum(len(counts) for counts in self.fixy_dict)
        self.w = [0] * self.pair_xy

        self.xy2id_dict, self.id2xy_dict = self.__createSearchDict()
        self.Ep_xy = self.__caculate_Ep_xy()
        self.iters = iters
        self.__train()

    def __caculate_fixy(self):
        """Count how often each (value, label) pair occurs, per feature column.

        Returns:
            A list with one ``defaultdict(int)`` per feature column, mapping
            ``(value, label)`` to its number of occurrences in the training set.
        """
        fixy_dict = [defaultdict(int) for _ in range(self.N_features)]
        for i in range(self.N_samples):
            for j in range(self.N_features):
                fixy_dict[j][(self.train_X[i][j], self.train_y[i])] += 1
        return fixy_dict

    def __createSearchDict(self):
        """Assign a unique weight index to every (feature, value, label) triple.

        Returns:
            A tuple ``(xy2id_dict, id2xy_dict)``: the first is a list with one
            dict per feature mapping ``(value, label)`` to its index, the
            second maps the index back to ``(value, label)``.
        """
        xy2id_dict = [{} for _ in range(self.N_features)]
        id2xy_dict = {}
        index = 0
        for feature in range(self.N_features):
            for pair in self.fixy_dict[feature]:
                xy2id_dict[feature][pair] = index
                id2xy_dict[index] = pair
                index += 1
        return xy2id_dict, id2xy_dict

    def __calculate_Pwyx(self, X, y):
        """Compute the model probability Pw(y | X) for a single sample.

        Args:
            X: Feature values of one sample, indexable by feature position.
            y: Label (0 or 1) whose probability is requested.

        Returns:
            The probability of ``y`` normalised over the labels ``y`` and
            ``1 - y``. Pairs never seen during training contribute no weight.
        """
        own_score = 0
        other_score = 0
        for i in range(self.N_features):
            lookup = self.xy2id_dict[i]
            idx = lookup.get((X[i], y))
            if idx is not None:
                own_score += self.w[idx]
            idx = lookup.get((X[i], 1 - y))
            if idx is not None:
                other_score += self.w[idx]

        # exp(a) / (exp(a) + exp(b)) rewritten as 1 / (1 + exp(b - a)); the
        # value is the same but large weights can no longer produce inf / inf.
        return 1.0 / (1.0 + np.exp(other_score - own_score))

    def __caculate_Epxy(self):
        """Compute the expectation of each feature under the current model.

        For every training sample (each weighted ``1 / N_samples``) the model
        probability of each label is added to the entries of the known
        (value, label) pairs that match the sample's feature values.

        Returns:
            A list of length ``pair_xy`` with one expectation per weight.
        """
        Epxy = [0] * self.pair_xy
        for i in range(self.N_samples):
            sample = self.train_X[i]
            Pwxy = [self.__calculate_Pwyx(sample, 0),
                    self.__calculate_Pwyx(sample, 1)]
            for feature in range(self.N_features):
                for y in range(2):
                    idx = self.xy2id_dict[feature].get((sample[feature], y))
                    if idx is not None:
                        Epxy[idx] += (1 / self.N_samples) * Pwxy[y]
        return Epxy

    def __caculate_Ep_xy(self):
        """Compute the expectation of each feature under the empirical distribution.

        Returns:
            A list of length ``pair_xy`` holding, for every known pair, its
            occurrence count divided by the number of training samples.
        """
        Ep_xy = [0] * self.pair_xy
        for feature in range(self.N_features):
            for pair, count in self.fixy_dict[feature].items():
                idx = self.xy2id_dict[feature][pair]
                Ep_xy[idx] = count / self.N_samples
        return Ep_xy

    def __train(self):
        """Run ``iters`` weight updates, printing one progress line per iteration."""
        for i in range(self.iters):
            iterStart = time.time()
            Epxy = self.__caculate_Epxy()
            self.w = [
                weight + (1 / self.M) * np.log(empirical / expected)
                for weight, empirical, expected in zip(self.w, self.Ep_xy, Epxy)
            ]
            iterEnd = time.time()
            # Elapsed time is end - start (the operands used to be swapped).
            print('iter:%d:%d, time:%d' % (i, self.iters, iterEnd - iterStart))

    def __predict(self, test):
        """Return the more probable label (0 or 1) for one sample; ties go to 0."""
        test = list(test)
        result = [self.__calculate_Pwyx(test, label) for label in range(2)]
        return result.index(max(result))

    def predict(self, test_X):
        """Predict a label for every sample in ``test_X``.

        Args:
            test_X: Iterable of samples, each an iterable of feature values.

        Returns:
            A list with one predicted label (0 or 1) per sample.
        """
        return [self.__predict(sample) for sample in test_X]

In [4]:
# Train the model (training runs inside the constructor) for 100 iterations.
me = MaxEntropy(train_X, train_y, iters=100)

# Compare predictions with the true labels and report the fraction that match.
# assumes test_y is a NumPy array so that `list == array` compares
# element-wise -- TODO confirm
predictions = me.predict(test_X)
n_correct = sum(predictions == test_y)
accuracy = n_correct / len(test_y)
print('算法准确率: %s' % accuracy)

iter:0:100, time:0
iter:1:100, time:0
iter:2:100, time:0
iter:3:100, time:0
iter:4:100, time:0
iter:5:100, time:0
iter:6:100, time:0
iter:7:100, time:0
iter:8:100, time:0
iter:9:100, time:0
iter:10:100, time:0
iter:11:100, time:0
iter:12:100, time:0
iter:13:100, time:0
iter:14:100, time:0
iter:15:100, time:0
iter:16:100, time:0
iter:17:100, time:0
iter:18:100, time:0
iter:19:100, time:0
iter:20:100, time:0
iter:21:100, time:0
iter:22:100, time:0
iter:23:100, time:0
iter:24:100, time:0
iter:25:100, time:0
iter:26:100, time:0
iter:27:100, time:0
iter:28:100, time:0
iter:29:100, time:0
iter:30:100, time:0
iter:31:100, time:0
iter:32:100, time:0
iter:33:100, time:0
iter:34:100, time:0
iter:35:100, time:0
iter:36:100, time:0
iter:37:100, time:0
iter:38:100, time:0
iter:39:100, time:0
iter:40:100, time:0
iter:41:100, time:0
iter:42:100, time:0
iter:43:100, time:0
iter:44:100, time:0
iter:45:100, time:0
iter:46:100, time:0
iter:47:100, time:0
iter:48:100, time:0
iter:49:100, time:0
iter:50:10