In [6]:
import numpy as np

def loaddata():
    """Return the toy training set as ``(X, y)`` NumPy arrays.

    Each row of ``X`` holds two categorical features: a number in 1..3 and a
    size code in {'S', 'M', 'L'}. Because the rows mix ints and strings,
    ``np.array`` stores every entry as a string, so the first column comes
    back as '1', '2', '3' rather than as integers.

    Returns:
        X: array of shape (15, 2) with string entries.
        y: integer array of shape (15,) with class labels in {-1, 1}.
    """
    samples = [
        (1, 'S'), (1, 'M'), (1, 'M'), (1, 'S'), (1, 'S'),
        (2, 'S'), (2, 'M'), (2, 'M'), (2, 'L'), (2, 'L'),
        (3, 'L'), (3, 'M'), (3, 'M'), (3, 'L'), (3, 'L'),
    ]
    targets = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    return np.array(samples), np.array(targets)

In [7]:
def Train(trainset, train_labels):
    """Fit a categorical naive Bayes model with Laplace (add-one) smoothing.

    Args:
        trainset: 2-D array-like of shape (m, n); each column is one
            categorical feature. Plain nested lists are accepted and are
            converted with ``np.asarray``.
        train_labels: 1-D array-like of length m with the class label of
            each row.

    Returns:
        A tuple ``(prior_probability, conditional_probability, labels)``:
        prior_probability: dict mapping label -> smoothed prior,
            ``(count(label) + 1) / (m + number_of_classes)``.
        conditional_probability: dict mapping
            ``(label, feature_idx, value)`` -> smoothed likelihood,
            ``(count(value within label) + 1) /
            (count(label) + number_of_distinct_values_of_that_feature)``.
        labels: set of the distinct labels found in ``train_labels``.
    """
    # Coerce up front so list inputs support .shape and boolean-mask indexing.
    trainset = np.asarray(trainset)
    train_labels = np.asarray(train_labels)

    m = trainset.shape[0]
    n = trainset.shape[1]

    labels = set(train_labels)
    num_classes = len(labels)

    # The set of values a feature can take depends on the whole training set,
    # not on the class, so compute it once instead of once per label.
    feature_values = [set(trainset[:, feature_idx]) for feature_idx in range(n)]

    prior_probability = {}
    conditional_probability = {}

    for label in labels:
        class_samples = trainset[train_labels == label]
        class_count = len(class_samples)

        # Smoothed prior: add one pseudo-sample per class.
        prior_probability[label] = (class_count + 1) / (m + num_classes)

        for feature_idx, values in enumerate(feature_values):
            column = class_samples[:, feature_idx]
            denominator = class_count + len(values)

            # Every value seen anywhere in the training set gets an entry, so
            # a value that never occurs within this class still receives the
            # smoothed probability 1 / denominator (count == 0).
            for value in values:
                count = np.sum(column == value)
                conditional_probability[(label, feature_idx, value)] = (
                    (count + 1) / denominator
                )

    return prior_probability, conditional_probability, labels


In [8]:
def predict(data):
    """Classify one sample with the trained naive Bayes model.

    Reads the module-level ``train_labels_set``, ``prior_probability`` and
    ``conditional_probability`` (the values returned by ``Train``), prints the
    normalized per-class scores, and returns the winning label.

    Args:
        data: sequence of feature values, one per feature column, in the same
            column order as the training matrix.

    Returns:
        The label whose normalized score is largest.
    """
    # The training matrix mixes ints and strings, so np.array stored every
    # entry as a string. A raw sample such as [2, 'S'] would therefore never
    # match the stored key '2' and the feature would be silently ignored.
    # Coerce the sample the same way and accept either spelling.
    coerced = np.asarray(data)

    # Fallback likelihood for a value that was never seen during training.
    unseen_probability = 1e-5

    result = {}
    for label in train_labels_set:
        score = prior_probability[label]  # start from the prior

        for feature_idx, (raw_value, coerced_value) in enumerate(zip(data, coerced)):
            # Try the value as given first, then its coerced form.
            for candidate in (raw_value, coerced_value):
                key = (label, feature_idx, candidate)
                if key in conditional_probability:
                    score *= conditional_probability[key]
                    break
            else:
                score *= unseen_probability

        result[label] = score

    # Normalize so the printed scores sum to 1 (does not change the argmax).
    total = sum(result.values())
    if total > 0:
        for label in result:
            result[label] /= total

    print('result =', result)
    # First maximum in iteration order, same tie-breaking as a stable
    # descending sort.
    return max(result, key=result.get)

In [9]:
# Fit the model on the toy dataset. The three result names stay at module
# level because predict() reads them as globals.
X, y = loaddata()
prior_probability, conditional_probability, train_labels_set = Train(X, y)
print("先验概率:", prior_probability)
print("条件概率示例:")
# Show only the first five entries of the conditional-probability table.
preview = list(conditional_probability.items())[:5]
for key, value in preview:
    print(f"{key}: {value:.4f}")

# Classify one hand-picked sample and report the predicted label.
test_sample = [2, 'S']
print("\n测试样本:", test_sample)
r_label = predict(test_sample)
print('预测结果 =', r_label)

先验概率: {np.int64(1): 0.5882352941176471, np.int64(-1): 0.4117647058823529}
条件概率示例:
(np.int64(1), 0, np.str_('2')): 0.3333
(np.int64(1), 0, np.str_('1')): 0.2500
(np.int64(1), 0, np.str_('3')): 0.4167
(np.int64(1), 1, np.str_('S')): 0.1667
(np.int64(1), 1, np.str_('L')): 0.4167

测试样本: [2, 'S']
result = {np.int64(1): np.float64(0.34883720930232553), np.int64(-1): np.float64(0.6511627906976744)}
预测结果 = -1
