In [20]:
# Generate a synthetic data set for the Apriori algorithm
def loadDataSet(numTransactions = 50, numItems = 10 , maxItemsPerTransaction = 10):
    """
    Generate a random transaction data set for the Apriori algorithm.

    Parameters:
    - numTransactions: number of transactions to generate
    - numItems: number of distinct items; items are the integers 1..numItems
    - maxItemsPerTransaction: upper bound on the number of items in one
      transaction (effectively capped at numItems, because a transaction
      holds distinct items)

    Returns:
    A list of transactions, each transaction being a non-empty set of items.

    Raises:
    ValueError: if transactions are requested but numItems or
    maxItemsPerTransaction is smaller than 1.
    """
    import random

    if numTransactions <= 0:
        return []

    # random.sample() raises ValueError when asked for more elements than the
    # population contains, so never request more than numItems.
    largestTransaction = min(maxItemsPerTransaction, numItems)
    if largestTransaction < 1:
        raise ValueError(
            "numItems and maxItemsPerTransaction must both be at least 1"
        )

    population = range(1, numItems + 1)
    dataset = []
    for _ in range(numTransactions):
        numItemsInTransaction = random.randint(1, largestTransaction)
        dataset.append(set(random.sample(population, numItemsInTransaction)))

    return dataset
# Build every candidate 1-itemset from the transaction database
def createC1(dataSet):
    """
    Return the candidate 1-itemsets found in dataSet.

    Parameters:
        dataSet: iterable of transactions, each an iterable of hashable,
                 mutually comparable items

    Returns:
        A list of single-element frozensets, one per distinct item, ordered
        by ascending item value.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    uniqueItems = set()
    for transaction in dataSet:
        uniqueItems.update(transaction)
    return [frozenset([item]) for item in sorted(uniqueItems)]
# Filter candidates into frequent itemsets.
# D: transaction database, CK: candidate itemsets, minSupport: minimum support
def scanD(D, CK, minSupport):
    """
    Count how often each candidate occurs in D and keep the frequent ones.

    Parameters:
        D: list of transactions (sets)
        CK: list of candidate itemsets (frozensets)
        minSupport: minimum fraction of transactions that must contain a
                    candidate for it to be kept

    Returns:
        (retList, supportData): the frequent itemsets, listed in reverse of
        the order in which they were first seen, and a dict mapping each
        frequent itemset to its support.
    """
    occurrences = {}
    for transaction in D:
        for candidate in CK:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    transactionCount = float(len(D))
    supportData = {}
    for candidate, hits in occurrences.items():
        ratio = hits / transactionCount
        if ratio >= minSupport:
            supportData[candidate] = ratio

    # Equivalent to inserting every kept itemset at the front of the list.
    retList = list(supportData)[::-1]
    return retList, supportData
# Join frequent itemsets pairwise to produce the next level of candidates
def aprioriGen(Lk,k):
    """
    Build candidate k-itemsets by joining pairs of frequent (k-1)-itemsets.

    Two itemsets are joined when their first k-2 items, taken in sorted
    order, are identical.

    Parameters:
        Lk: list of frequent itemsets (frozensets), all of size k-1
        k: size of the candidate itemsets to produce

    Returns:
        A list of candidate itemsets (frozensets).
    """
    # Sort BEFORE slicing: a set has no defined iteration order, so slicing
    # first would compare arbitrary items and could miss or duplicate joins.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
# Apriori algorithm driver
def apriori(dataSet, minSupport=0.5):
    """
    Find all frequent itemsets in dataSet.

    Parameters:
        dataSet: iterable of transactions
        minSupport: minimum support threshold, 0.5 by default

    Returns:
        (L, supportData): L[i] is the list of frequent itemsets of size i+1
        (the last entry is always an empty list, which ends the search), and
        supportData maps each frequent itemset to its support.
    """
    candidates = createC1(dataSet)
    transactions = [set(transaction) for transaction in dataSet]
    frequent, supportData = scanD(transactions, candidates, minSupport)

    L = [frequent]
    k = 2
    # The most recently found level is always L[-1] (same as L[k-2]).
    while L[-1]:
        candidates = aprioriGen(L[-1], k)
        frequent, levelSupport = scanD(transactions, candidates, minSupport)
        supportData.update(levelSupport)
        L.append(frequent)
        k += 1
    return L, supportData
# Rule generation
def generateRules(L, supportData, minConf=0.7):
    """
    Generate association rules from the frequent itemsets.

    Parameters:
        L: list of frequent-itemset lists, L[i] holding itemsets of size i+1
        supportData: dict mapping itemsets to their support
        minConf: minimum confidence, 0.7 by default

    Returns:
        bigRuleList: list of (antecedent, consequent, confidence) tuples for
        every rule that meets the minimum confidence
    """

    bigRuleList = []  # collects every accepted rule
    # L[0] holds 1-itemsets, which cannot be split into a rule, so start at 1.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            # Split freqSet into single-item candidate consequents.
            H1 = [frozenset([item]) for item in freqSet]
            # Always evaluate the single-item consequents first. Handing H1
            # straight to rulesFromConseq for larger itemsets would only test
            # consequents of two or more items and silently drop rules such
            # as {a, b} ---> {c}.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Larger consequents are only worth trying when the itemset has
            # more than two items and at least two consequents survived.
            if i > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
# Evaluate candidate rules against the minimum confidence
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """
    Keep the consequents in H whose rule reaches the minimum confidence.

    Each accepted rule is printed and appended to brl as an
    (antecedent, consequent, confidence) tuple.

    Parameters:
        freqSet: frequent itemset the rules are derived from
        H: list of candidate consequents (frozensets)
        supportData: dict mapping itemsets to their support
        brl: list that receives the accepted rules (mutated in place)
        minConf: minimum confidence, 0.7 by default

    Returns:
        prunedH: list of the consequents that were accepted
    """

    acceptedConsequents = []
    for consequent in H:
        antecedent = freqSet - consequent
        # confidence = support(whole itemset) / support(antecedent)
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            print(antecedent, '--->', consequent, 'conf:', confidence)
            brl.append((antecedent, consequent, confidence))
            acceptedConsequents.append(consequent)
    return acceptedConsequents
# Grow rules from a set of consequents
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """
    Recursively try larger consequents built from the ones in H.

    Parameters:
        freqSet: frequent itemset the rules are derived from
        H: non-empty list of candidate consequents, all of the same size
        supportData: dict mapping itemsets to their support
        brl: list that receives the accepted rules (mutated in place)
        minConf: minimum confidence, 0.7 by default

    Returns:
        None
    """

    consequentSize = len(H[0])
    # Stop once a consequent one item larger would leave no antecedent.
    if len(freqSet) <= consequentSize + 1:
        return

    # Merge the current consequents into consequents one item larger,
    # then keep only those whose rule is confident enough.
    largerConsequents = aprioriGen(H, consequentSize + 1)
    survivors = calcConf(freqSet, largerConsequents, supportData, brl, minConf)

    # At least two survivors are needed to merge into a still larger one.
    if len(survivors) > 1:
        rulesFromConseq(freqSet, survivors, supportData, brl, minConf)


# Demo: build a random data set, mine its frequent itemsets, then derive rules.
dataSet=loadDataSet()
# Uncomment to inspect the generated transactions:
#print("Dataset:")
#for transaction in dataSet:
#    print(transaction)
# apriori() is called with its default minSupport of 0.5.
L,supportData=apriori(dataSet)
print(L)
# Each accepted rule is printed by calcConf as it is found.
rules = generateRules(L,supportData,minConf=0.7)



[[frozenset({1}), frozenset({9}), frozenset({8}), frozenset({7}), frozenset({6}), frozenset({5}), frozenset({4}), frozenset({3}), frozenset({2})], [frozenset({1, 2}), frozenset({6, 7})], []]
frozenset({2}) ---> frozenset({1}) conf: 0.78125
frozenset({1}) ---> frozenset({2}) conf: 0.78125
frozenset({7}) ---> frozenset({6}) conf: 0.8928571428571428
frozenset({6}) ---> frozenset({7}) conf: 0.8333333333333334


K-means

In [14]:
import numpy as np

def kmeans(X, k, max_iters=100):
    """
    Cluster the rows of X into k groups with Lloyd's K-means algorithm.

    Parameters:
        X: array-like of shape (n_samples, n_features)
        k: number of clusters; must not exceed the number of samples
        max_iters: maximum number of centroid updates, 100 by default

    Returns:
        (labels, centroids): labels[i] is the index of the centroid nearest
        to sample i, and centroids is a (k, n_features) array. The labels are
        always computed against the returned centroids.
    """
    # Accept plain lists as well as arrays.
    X = np.asarray(X)

    def nearest_centroid(centers):
        # Broadcast (n, 1, d) - (k, d) -> (n, k, d), then reduce to an
        # (n, k) distance matrix and pick the closest centroid per sample.
        return np.argmin(np.linalg.norm(X[:, np.newaxis] - centers, axis=-1), axis=-1)

    # Randomly pick k distinct samples as the initial centroids.
    centroids = X[np.random.choice(len(X), size=k, replace=False)]
    # Assign up front so labels exist even when max_iters <= 0.
    labels = nearest_centroid(centroids)

    for _ in range(max_iters):
        # Move each centroid to the mean of its members. A cluster with no
        # members keeps its previous centroid; the mean of an empty slice
        # would be NaN and poison every later distance computation.
        new_centroids = np.array([
            X[labels == i].mean(axis=0) if np.any(labels == i) else centroids[i]
            for i in range(k)
        ])

        # Converged: the update step no longer moves any centroid.
        if np.all(new_centroids == centroids):
            break

        centroids = new_centroids
        labels = nearest_centroid(centroids)
    print(f'X shape : {X[:, np.newaxis].shape}')
    print(f'centroids : {centroids.shape}')
    return labels, centroids

# Sample data: six 2-D points
X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])

# Run K-means with two clusters
labels, centroids = kmeans(X, k=2)

# Print the resulting sample labels and centroid positions
print("样本标签:", labels)
print("质心位置:", centroids)


X shape : (6, 1, 2)
centroids : (2, 2)
样本标签: [1 1 1 0 0 0]
质心位置: [[4. 2.]
 [1. 2.]]


In [16]:
import numpy as np

def distance(p1, p2):
    """
    Return the Euclidean distance between two points.

    Parameters:
        p1, p2: array-likes of the same length (any number of coordinates)

    Returns:
        The distance as a NumPy float.
    """
    # Convert to float so integer (especially unsigned) inputs cannot wrap
    # around on subtraction, and so every coordinate takes part.
    diff = np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

# Data set: ten 2-D points
data = np.array([[3, 4], [3, 6], [7, 3], [4, 7], [3, 8], [8, 5], [4, 5], [4, 1], [7, 4], [5, 5]])

# Initial cluster centres (two of the data points)
centroids = np.array([[3, 6], [4, 1]])

# Run at most three K-means iterations, printing the intermediate results
for iteration in range(3):
    # Distance from every data point (rows) to every cluster centre (columns)
    distances = np.array([[distance(data[i], centroids[j]) for j in range(len(centroids))] for i in range(len(data))])

    # Assign each data point to its nearest cluster centre
    labels = np.argmin(distances, axis=1)

    # Recompute each cluster centre as the mean of its assigned points
    # NOTE(review): a cluster that receives no points would yield a NaN centre here.
    new_centroids = np.array([data[labels == j].mean(axis=0) for j in range(len(centroids))])

    # Print the results of this iteration
    print(f"第 {iteration+1} 轮迭代结果：")
    print("数据点到质心的距离：")
    print(distances)
    print("簇质心的计算过程：")
    print(new_centroids)
    print("数据点所属的簇：")
    print(labels)
    print("----------------------------------------")

    # Stop early once the centres no longer change (exact equality check)
    if np.all(new_centroids == centroids):
        break

    centroids = new_centroids


第 1 轮迭代结果：
数据点到质心的距离：
[[2.         3.16227766]
 [0.         5.09901951]
 [5.         3.60555128]
 [1.41421356 6.        ]
 [2.         7.07106781]
 [5.09901951 5.65685425]
 [1.41421356 4.        ]
 [5.09901951 0.        ]
 [4.47213595 4.24264069]
 [2.23606798 4.12310563]]
簇质心的计算过程：
[[4.28571429 5.71428571]
 [6.         2.66666667]]
数据点所属的簇：
[0 0 1 0 0 0 0 1 1 0]
----------------------------------------
第 2 轮迭代结果：
数据点到质心的距离：
[[2.14285714 3.2829526 ]
 [1.31707778 4.48454135]
 [3.83857967 1.05409255]
 [1.31707778 4.77260702]
 [2.62250854 6.11918658]
 [3.78234351 3.07318149]
 [0.76930926 3.07318149]
 [4.72293579 2.60341656]
 [3.21031501 1.66666667]
 [1.01015254 2.53859104]]
簇质心的计算过程：
[[3.66666667 5.83333333]
 [6.5        3.25      ]]
数据点所属的簇：
[0 0 1 0 0 1 0 1 1 0]
----------------------------------------
第 3 轮迭代结果：
数据点到质心的距离：
[[1.95078332 3.57945527]
 [0.68718427 4.45112345]
 [4.37480158 0.55901699]
 [1.21335165 4.50693909]
 [2.26691175 5.90021186]
 [4.4127341  2.30488611]
 [0.89752747 3.0