In [37]:
import re
import math
import numpy as np
import pandas as pd
from os import listdir
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk

# 確保已經下載了 NLTK 的停用詞
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\morri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# Stemmer
STEMMER = PorterStemmer()

# Stop words
STOP_WORDS = stopwords.words("english")

# Corpus file path
CORPUS_FILE_PATH = "./data/IRTM/"

# Doc Size
DOC_SIZE = 1095


In [39]:
def doc_preprocessing(doc: str) -> list:
    # 清理文本並轉換為小寫
    doc = re.sub(r"\s+", " ", doc)
    doc = re.sub(r"[^\w\s]", "", doc)
    doc = doc.lower()
    # 分詞
    words = doc.split(" ")
    # 詞幹提取
    stemming = [STEMMER.stem(word) for word in words]
    # 移除停用詞
    token_list = [word for word in stemming if word not in STOP_WORDS]
    return token_list


In [40]:
def get_tf_and_df(corpus: list):
    tf_list = []
    df_dict = {}
    
    for each in corpus:
        document_id, document = each
        document_word_list = doc_preprocessing(document)
        tf = {}
        for word in document_word_list:
            tf[word] = tf.get(word, 0) + 1
        tf_list.append([document_id, tf])
        
        for term in tf:
            df_dict[term] = df_dict.get(term, 0) + 1
                
    # 按照詞彙排序 DF 字典
    df_dict = dict(sorted(df_dict.items(), key=lambda x: x[0]))
    
    return tf_list, df_dict


In [41]:
def get_index_dict(df_dict: dict) -> dict:
    # 構建詞彙-索引映射
    index_dict = {term: idx for idx, term in enumerate(df_dict)}
    return index_dict  # (word: index)


In [42]:
def get_tf_vector(tf_list, index_dict):
    tf_vectors = []
    for each in tf_list:
        document_id, tf_dict = each
        tf_vector = np.zeros(len(index_dict), dtype=float)
        for word, count in tf_dict.items():
            if word in index_dict:  # 確保詞彙在索引字典中
                tf_vector[index_dict[word]] = count
            else:
                print(f"Word '{word}' not found in index_dict.")
        tf_vectors.append([document_id, tf_vector])
    return tf_vectors

def get_tf_idf_vector(tf_vectors, df_dict, index_dict):
    idf_vector = np.zeros(len(index_dict), dtype=float)
    for word, df in df_dict.items():
        if word in index_dict:
            idf = math.log(len(tf_vectors) / df, 10)
            idf_vector[index_dict[word]] = idf
        else:
            print(f"Word '{word}' not found in index_dict.")
    tf_idf_vectors = []
    for tf_vector in tf_vectors:
        tf_idf = tf_vector[1] * idf_vector
        norm = np.linalg.norm(tf_idf)
        tf_idf_unit = tf_idf / norm if norm != 0 else tf_idf
        tf_idf_vectors.append(tf_idf_unit)
        # Debugging output
        if len(tf_idf_unit) != len(index_dict):
            print(f"Document {tf_vector[0]} has inconsistent tf_idf_unit length: {len(tf_idf_unit)}")
    try:
        return np.vstack(tf_idf_vectors)
    except ValueError as e:
        print("Error stacking TF-IDF vectors:", e)
        # 打印每個向量的長度
        for i, vec in enumerate(tf_idf_vectors):
            print(f"Document {i} vector length: {len(vec)}")
        return np.array(tf_idf_vectors, dtype=object)


In [43]:
# 加載文檔
files = listdir(CORPUS_FILE_PATH)
files = [f for f in files if f.endswith(".txt") and f[0] != "."]

# 按照文件名排序
files.sort(key=lambda x: int(x[:-4]))

# 初始化語料庫列表：[[id, document], ...]
corpus = []

# 讀取文件
for file in files:
    with open(CORPUS_FILE_PATH + file, "r", encoding='utf-8') as f:
        try:
            document_id = int(file[:-4]) - 1  # 使用整數ID，從0開始
            document = f.read()
            corpus.append([document_id, document])
        except ValueError as ve:
            print(f"Error processing file {file}: {ve}")

print(f"Loaded {len(corpus)} documents.")



Loaded 1095 documents.


In [44]:
# 計算 TF 和 DF
tf_list, df_dict = get_tf_and_df(corpus)

print(f"Calculated TF for {len(tf_list)} documents.")
print(f"Vocabulary size: {len(df_dict)}")


Calculated TF for 1095 documents.
Vocabulary size: 14407


In [45]:
# 建立索引字典
index_dict = get_index_dict(df_dict)

print(f"Index dictionary created with {len(index_dict)} terms.")


Index dictionary created with 14407 terms.


In [46]:
# 生成 TF 向量
tf_vectors = get_tf_vector(tf_list, index_dict)

print(f"Generated TF vectors for {len(tf_vectors)} documents.")


Generated TF vectors for 1095 documents.


In [47]:
# 生成 TF-IDF 向量
tf_idf_vectors = get_tf_idf_vector(tf_vectors, df_dict, index_dict)

# 檢查是否返回了正確的 2D NumPy 數組
if isinstance(tf_idf_vectors, np.ndarray) and tf_idf_vectors.ndim == 2:
    doc_vectors = tf_idf_vectors
    print("TF-IDF vectors generated and normalized.")
    print(f"doc_vectors shape: {doc_vectors.shape}")
else:
    print("TF-IDF vectors are not in a proper 2D NumPy array format.")
    # 根據需要進行後續處理或停止執行


TF-IDF vectors generated and normalized.
doc_vectors shape: (1095, 14407)


In [48]:
def cosine(doc_x, doc_y, doc_vectors):
    vector_x = doc_vectors[doc_x]
    vector_y = doc_vectors[doc_y]
    cosine_sim = float(np.dot(vector_x, vector_y))
    return cosine_sim


In [49]:
def write_result(hac_dict, cluster_num):
    with open(f"./{cluster_num}.txt", "w") as f:
        for k, v in hac_dict.items():
            for doc_id in sorted(v):
                f.write(f"{doc_id+1}\n")
            f.write("\n")
    
    # 在 Notebook 中顯示結果
    from IPython.display import display, Markdown
    markdown_text = f"### 聚類結果（聚類數量：{cluster_num}）\n"
    for cluster_id, docs in hac_dict.items():
        docs_sorted = sorted([doc_id + 1 for doc_id in docs])
        markdown_text += f"**聚類 {cluster_id + 1}**: {docs_sorted}\n\n"
    display(Markdown(markdown_text))

# 初始化聚類字典
hac_dict = {i: [i] for i in range(DOC_SIZE)}
# 初始化活躍聚類的標記
active_clusters = set(range(DOC_SIZE))

# 建立初始相似度矩陣
print("Building initial similarity matrix...")
similarity_matrix = {}
for i in range(DOC_SIZE):
    for j in range(i + 1, DOC_SIZE):
        sim = cosine(i, j, doc_vectors)
        similarity_matrix[(i, j)] = sim

print("Initial similarity matrix built.")


Building initial similarity matrix...
Initial similarity matrix built.


In [50]:
# Cell 14: 聚類過程

print("Starting clustering process...")
target_clusters = [20, 13, 8]

while len(hac_dict) > min(target_clusters):
    # 找到最相似的聚類對
    max_pair = None
    max_sim = -1
    for pair, sim in similarity_matrix.items():
        if sim > max_sim:
            max_sim = sim
            max_pair = pair
    
    if max_pair is None:
        print("No more valid cluster pairs found.")
        break
    
    cluster1, cluster2 = max_pair
    print(f"Merging clusters {cluster1} and {cluster2} with similarity {max_sim:.4f}")
    
    # 合併聚類
    hac_dict[cluster1].extend(hac_dict[cluster2])
    del hac_dict[cluster2]
    active_clusters.remove(cluster2)
    
    # 更新相似度矩陣
    for other in list(active_clusters):
        if other == cluster1:
            continue
        # 計算新聚類與其他聚類之間的最小相似度（全鏈法）
        # 找到 cluster1 和 other 之間的最小相似度
        pair1 = tuple(sorted((cluster1, other)))
        pair2 = tuple(sorted((cluster2, other)))
        sim1 = similarity_matrix.get(pair1, -1)
        sim2 = similarity_matrix.get(pair2, -1)
        new_sim = min(sim1, sim2) if sim1 != -1 and sim2 != -1 else max(sim1, sim2)
        
        # 更新相似度矩陣
        new_pair = tuple(sorted((cluster1, other)))
        similarity_matrix[new_pair] = new_sim
        
        # 移除已合併的聚類對
        if pair2 in similarity_matrix:
            del similarity_matrix[pair2]
    
    # 移除與 cluster2 相關的所有聚類對
    keys_to_remove = [pair for pair in similarity_matrix if cluster2 in pair]
    for key in keys_to_remove:
        del similarity_matrix[key]
    
    # 檢查是否達到目標聚類數量
    current_cluster_num = len(hac_dict)
    if current_cluster_num in target_clusters:
        print(f"Writing result for {current_cluster_num} clusters...")
        write_result(hac_dict, cluster_num=current_cluster_num)
    
    # 定期打印進度
    if current_cluster_num % 100 == 0 or current_cluster_num in target_clusters:
        print(f"Current number of clusters: {current_cluster_num}")

print("Clustering complete.")


Starting clustering process...
Merging clusters 7 and 8 with similarity 1.0000
Merging clusters 475 and 476 with similarity 1.0000
Merging clusters 100 and 105 with similarity 1.0000
Merging clusters 210 and 211 with similarity 1.0000
Merging clusters 210 and 212 with similarity 1.0000
Merging clusters 563 and 564 with similarity 1.0000
Merging clusters 563 and 594 with similarity 1.0000
Merging clusters 563 and 595 with similarity 1.0000
Merging clusters 620 and 621 with similarity 1.0000
Merging clusters 661 and 662 with similarity 1.0000
Merging clusters 731 and 732 with similarity 1.0000
Merging clusters 791 and 795 with similarity 1.0000
Merging clusters 847 and 848 with similarity 1.0000
Merging clusters 47 and 48 with similarity 1.0000
Merging clusters 304 and 308 with similarity 1.0000
Merging clusters 942 and 943 with similarity 1.0000
Merging clusters 194 and 228 with similarity 1.0000
Merging clusters 194 and 229 with similarity 1.0000
Merging clusters 925 and 927 with simil

### 聚類結果（聚類數量：20）
**聚類 1**: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 87, 88, 89, 90, 93, 94, 95, 96, 97, 98, 99, 101, 103, 105, 106, 107, 108, 109, 110, 111, 112, 114, 116, 117, 118, 119, 120, 121, 122, 124, 125, 126, 127, 128, 226, 232, 259, 631, 632, 649, 664, 688, 689, 712, 729]

**聚類 11**: [11, 19, 29, 113, 115, 169, 278, 301, 316, 317, 321, 324, 325, 338, 341, 357, 369, 372, 377, 381, 383, 384, 386, 388, 389, 396, 400, 402, 405, 419, 422, 423, 425, 429, 431, 435, 444, 451, 460, 464, 467, 468, 476, 477, 479, 482, 489, 494, 524, 543, 625, 811]

**聚類 31**: [31, 44, 70, 83, 86, 92, 100, 102, 305, 309, 315, 320, 326, 327, 328, 331, 334, 340, 344, 345, 351, 355, 358, 360, 362, 365, 370, 371, 375, 376, 380, 382, 391, 392, 393, 394, 395, 398, 403, 404, 408, 421, 424, 433, 436, 439, 440, 441, 519, 528, 591]

**聚類 42**: [42, 68, 91, 104, 123, 129, 149, 153, 156, 189, 243, 244, 256, 271, 656, 909]

**聚類 130**: [130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 143, 144, 145, 146, 148, 150, 151, 152, 154, 155, 157, 158, 159, 160, 161, 162, 163, 165, 167, 168, 170, 171, 172, 173, 174, 177, 178, 180, 181, 182, 184, 185, 186, 187, 188, 190, 191, 192, 195, 196, 197, 199, 201, 202, 203, 206, 207, 208, 210, 215, 217, 219, 221, 223, 224, 225, 227, 229, 230, 231, 233, 235, 236, 237, 238, 239, 242, 246, 247, 249, 251, 252, 253, 257, 261, 263, 265, 267, 270, 272, 273, 276, 277, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 318, 323, 329, 330, 332, 333, 335, 336, 342, 343, 348, 349, 352, 354, 359, 364, 366, 367, 374, 378, 379, 385, 387, 390, 399, 407, 409, 410, 412, 417, 418, 427, 432, 434, 437, 448, 449, 455, 456, 458, 459, 461, 462, 463, 465, 478, 484, 486, 488, 510, 511, 518, 521, 522, 525, 549, 551, 552, 560, 561, 562, 617, 618, 620, 623, 628, 630, 634, 645, 647, 650, 655, 666, 672, 679, 686, 690, 691, 692, 693, 694, 711, 714, 717, 728, 742, 746, 747, 761, 762, 763, 765, 768, 769, 772, 773, 775, 776, 777, 779, 782, 783, 785, 787, 788, 789, 790, 808, 899, 901, 902, 904, 906, 1079]

**聚類 142**: [142, 147, 166, 175, 193, 194, 198, 200, 204, 205, 209, 211, 212, 213, 214, 216, 218, 220, 222, 228, 234, 264, 266, 268, 269, 274, 294, 298, 299, 302, 303, 307, 311, 312, 313, 314, 319, 322, 361, 368, 373, 411, 414, 415, 446, 447, 452, 453, 457, 469, 470, 471, 474, 475, 750, 758, 892]

**聚類 164**: [164, 176, 179, 550, 778, 810]

**聚類 183**: [183, 635, 648, 702, 704, 705, 706, 708, 709, 852, 865, 916, 925, 926, 927, 928, 938, 1067]

**聚類 240**: [240, 241, 245, 248, 250, 254, 255, 258, 260, 275, 279, 295, 297, 300, 306, 339, 346, 347, 350, 353, 363, 406, 413, 416, 438, 442, 454, 472, 473, 481, 483, 487, 490, 491, 492, 493, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 512, 514, 515, 516, 517, 652, 715, 721, 725, 749, 753, 809, 816, 883]

**聚類 262**: [262, 296, 304, 308, 337, 397, 401, 426, 443, 445, 450, 466, 480, 513, 533, 534, 597, 624, 638, 642, 651, 653, 654, 657, 658, 659, 660, 661, 662, 663, 665, 667, 668, 669, 670, 671, 673, 674, 675, 676, 677, 678, 681, 682, 684, 685, 695, 696, 697, 698, 699, 701, 703, 707, 710, 713, 734, 745, 793]

**聚類 310**: [310, 878, 1015, 1016, 1048, 1055, 1058, 1082, 1089, 1091, 1093, 1094]

**聚類 356**: [356, 420, 428, 430, 485, 520, 523, 526, 527, 529, 530, 531, 532, 536, 537, 538, 539, 540, 541, 544, 545, 546, 547, 548, 553, 554, 555, 556, 557, 558, 559, 563, 564, 565, 566, 567, 568, 569, 570, 572, 577, 579, 580, 585, 587, 590, 592, 593, 594, 595, 596, 627, 641, 680, 683, 716, 718, 719, 720, 722, 723, 724, 726, 727, 738, 814, 836, 863, 900, 908, 913, 914, 917, 918, 919, 920, 921, 922, 924, 929, 930, 933, 936, 940, 946, 950, 966, 984, 986, 989, 992, 994, 1000, 1008, 1017, 1022, 1049, 1061, 1064, 1065, 1070, 1073, 1074, 1095]

**聚類 535**: [535, 542, 571, 573, 574, 575, 576, 578, 581, 582, 583, 584, 586, 588, 589, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 619, 621, 622, 626, 629, 633, 636, 637, 639, 640, 643, 644, 687, 736, 737, 739, 741, 743, 748, 792, 796, 827, 854, 860, 1034, 1036, 1076]

**聚類 646**: [646, 751, 794, 798, 801, 823, 831, 845, 846, 853, 880, 895, 898, 903, 923, 931, 932, 934, 935, 937, 939, 954, 955, 959, 985, 1018, 1021, 1024, 1026, 1029]

**聚類 700**: [700, 730, 735, 740, 744, 887, 890, 951, 960, 991]

**聚類 731**: [731, 732, 733, 752, 754, 755, 756, 757, 759, 760, 764, 766, 767, 770, 771, 774, 780, 784, 786, 791, 795, 797, 800, 802, 803, 804, 805, 806, 807, 855, 856, 869, 884, 945, 947, 949, 952, 953, 958, 961, 962, 963, 964, 965, 967, 972, 974, 975, 977, 979, 980, 982, 983, 990, 993, 1009, 1051]

**聚類 781**: [781, 799, 812, 815, 839, 840, 841, 842, 848, 849, 861, 877, 885, 886, 888, 894, 896, 897, 907, 910, 911, 912, 915, 941, 942, 948, 956, 957, 968, 969, 970, 971, 973, 976, 978, 981, 987, 996, 997, 1001, 1002, 1004, 1010, 1032, 1044, 1050, 1057, 1059, 1060]

**聚類 813**: [813, 817, 818, 819, 820, 821, 822, 824, 825, 826, 828, 829, 830, 832, 833, 834, 835, 837, 838, 843, 844, 847, 850, 851, 857, 858, 859, 862, 864, 866, 867, 868, 870, 872, 873, 874, 876, 879, 881, 882, 891, 893, 905, 943, 944, 988, 1006, 1023, 1025, 1027, 1028, 1030, 1031, 1033, 1035, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1045, 1046, 1047, 1052, 1053, 1054, 1056, 1062, 1063, 1066, 1068, 1069, 1071, 1072, 1078, 1080, 1081, 1083, 1084, 1085, 1086, 1087, 1088, 1090]

**聚類 871**: [871, 875]

**聚類 889**: [889, 995, 998, 999, 1003, 1005, 1007, 1011, 1012, 1013, 1014, 1019, 1020, 1075, 1077, 1092]



Current number of clusters: 20
Merging clusters 182 and 534 with similarity 0.0022
Merging clusters 182 and 355 with similarity 0.0004
Merging clusters 41 and 129 with similarity 0.0003
Merging clusters 0 and 10 with similarity 0.0000
Merging clusters 0 and 30 with similarity 0.0000
Merging clusters 0 and 41 with similarity 0.0000
Merging clusters 0 and 141 with similarity 0.0000
Writing result for 13 clusters...


### 聚類結果（聚類數量：13）
**聚類 1**: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 178, 180, 181, 182, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 242, 243, 244, 246, 247, 249, 251, 252, 253, 256, 257, 259, 261, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 276, 277, 278, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 298, 299, 301, 302, 303, 305, 307, 309, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 338, 340, 341, 342, 343, 344, 345, 348, 349, 351, 352, 354, 355, 357, 358, 359, 360, 361, 362, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 398, 399, 400, 402, 403, 404, 405, 407, 408, 409, 410, 411, 412, 414, 415, 417, 418, 419, 421, 422, 423, 424, 425, 427, 429, 431, 432, 433, 434, 435, 436, 437, 439, 440, 441, 444, 446, 447, 448, 449, 451, 452, 453, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 467, 468, 469, 470, 471, 474, 475, 476, 477, 478, 479, 482, 484, 486, 488, 489, 494, 510, 511, 518, 519, 521, 522, 524, 525, 528, 543, 549, 551, 552, 560, 561, 562, 591, 617, 618, 620, 623, 625, 628, 630, 631, 632, 634, 645, 647, 649, 650, 655, 656, 664, 666, 672, 679, 686, 688, 689, 690, 691, 692, 693, 694, 711, 712, 714, 717, 728, 729, 742, 746, 747, 750, 758, 761, 762, 763, 765, 768, 769, 772, 773, 775, 776, 777, 779, 782, 783, 785, 787, 788, 789, 790, 808, 811, 892, 899, 901, 902, 904, 906, 909, 1079]

**聚類 164**: [164, 176, 179, 550, 778, 810]

**聚類 183**: [183, 356, 420, 428, 430, 485, 520, 523, 526, 527, 529, 530, 531, 532, 535, 536, 537, 538, 539, 540, 541, 542, 544, 545, 546, 547, 548, 553, 554, 555, 556, 557, 558, 559, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 592, 593, 594, 595, 596, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 619, 621, 622, 626, 627, 629, 633, 635, 636, 637, 639, 640, 641, 643, 644, 648, 680, 683, 687, 702, 704, 705, 706, 708, 709, 716, 718, 719, 720, 722, 723, 724, 726, 727, 736, 737, 738, 739, 741, 743, 748, 792, 796, 814, 827, 836, 852, 854, 860, 863, 865, 900, 908, 913, 914, 916, 917, 918, 919, 920, 921, 922, 924, 925, 926, 927, 928, 929, 930, 933, 936, 938, 940, 946, 950, 966, 984, 986, 989, 992, 994, 1000, 1008, 1017, 1022, 1034, 1036, 1049, 1061, 1064, 1065, 1067, 1070, 1073, 1074, 1076, 1095]

**聚類 240**: [240, 241, 245, 248, 250, 254, 255, 258, 260, 275, 279, 295, 297, 300, 306, 339, 346, 347, 350, 353, 363, 406, 413, 416, 438, 442, 454, 472, 473, 481, 483, 487, 490, 491, 492, 493, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 512, 514, 515, 516, 517, 652, 715, 721, 725, 749, 753, 809, 816, 883]

**聚類 262**: [262, 296, 304, 308, 337, 397, 401, 426, 443, 445, 450, 466, 480, 513, 533, 534, 597, 624, 638, 642, 651, 653, 654, 657, 658, 659, 660, 661, 662, 663, 665, 667, 668, 669, 670, 671, 673, 674, 675, 676, 677, 678, 681, 682, 684, 685, 695, 696, 697, 698, 699, 701, 703, 707, 710, 713, 734, 745, 793]

**聚類 310**: [310, 878, 1015, 1016, 1048, 1055, 1058, 1082, 1089, 1091, 1093, 1094]

**聚類 646**: [646, 751, 794, 798, 801, 823, 831, 845, 846, 853, 880, 895, 898, 903, 923, 931, 932, 934, 935, 937, 939, 954, 955, 959, 985, 1018, 1021, 1024, 1026, 1029]

**聚類 700**: [700, 730, 735, 740, 744, 887, 890, 951, 960, 991]

**聚類 731**: [731, 732, 733, 752, 754, 755, 756, 757, 759, 760, 764, 766, 767, 770, 771, 774, 780, 784, 786, 791, 795, 797, 800, 802, 803, 804, 805, 806, 807, 855, 856, 869, 884, 945, 947, 949, 952, 953, 958, 961, 962, 963, 964, 965, 967, 972, 974, 975, 977, 979, 980, 982, 983, 990, 993, 1009, 1051]

**聚類 781**: [781, 799, 812, 815, 839, 840, 841, 842, 848, 849, 861, 877, 885, 886, 888, 894, 896, 897, 907, 910, 911, 912, 915, 941, 942, 948, 956, 957, 968, 969, 970, 971, 973, 976, 978, 981, 987, 996, 997, 1001, 1002, 1004, 1010, 1032, 1044, 1050, 1057, 1059, 1060]

**聚類 813**: [813, 817, 818, 819, 820, 821, 822, 824, 825, 826, 828, 829, 830, 832, 833, 834, 835, 837, 838, 843, 844, 847, 850, 851, 857, 858, 859, 862, 864, 866, 867, 868, 870, 872, 873, 874, 876, 879, 881, 882, 891, 893, 905, 943, 944, 988, 1006, 1023, 1025, 1027, 1028, 1030, 1031, 1033, 1035, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1045, 1046, 1047, 1052, 1053, 1054, 1056, 1062, 1063, 1066, 1068, 1069, 1071, 1072, 1078, 1080, 1081, 1083, 1084, 1085, 1086, 1087, 1088, 1090]

**聚類 871**: [871, 875]

**聚類 889**: [889, 995, 998, 999, 1003, 1005, 1007, 1011, 1012, 1013, 1014, 1019, 1020, 1075, 1077, 1092]



Current number of clusters: 13
Merging clusters 0 and 163 with similarity 0.0000
Merging clusters 0 and 182 with similarity 0.0000
Merging clusters 0 and 239 with similarity 0.0000
Merging clusters 0 and 261 with similarity 0.0000
Merging clusters 0 and 309 with similarity 0.0000
Writing result for 8 clusters...


### 聚類結果（聚類數量：8）
**聚類 1**: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 734, 736, 737, 738, 739, 741, 742, 743, 745, 746, 747, 748, 749, 750, 753, 758, 761, 762, 763, 765, 768, 769, 772, 773, 775, 776, 777, 778, 779, 782, 783, 785, 787, 788, 789, 790, 792, 793, 796, 808, 809, 810, 811, 814, 816, 827, 836, 852, 854, 860, 863, 865, 878, 883, 892, 899, 900, 901, 902, 904, 906, 908, 909, 913, 914, 916, 917, 918, 919, 920, 921, 922, 924, 925, 926, 927, 928, 929, 930, 933, 936, 938, 940, 946, 950, 966, 984, 986, 989, 992, 994, 1000, 1008, 1015, 1016, 1017, 1022, 1034, 1036, 1048, 1049, 1055, 1058, 1061, 1064, 1065, 1067, 1070, 1073, 1074, 1076, 1079, 1082, 1089, 1091, 1093, 1094, 1095]

**聚類 646**: [646, 751, 794, 798, 801, 823, 831, 845, 846, 853, 880, 895, 898, 903, 923, 931, 932, 934, 935, 937, 939, 954, 955, 959, 985, 1018, 1021, 1024, 1026, 1029]

**聚類 700**: [700, 730, 735, 740, 744, 887, 890, 951, 960, 991]

**聚類 731**: [731, 732, 733, 752, 754, 755, 756, 757, 759, 760, 764, 766, 767, 770, 771, 774, 780, 784, 786, 791, 795, 797, 800, 802, 803, 804, 805, 806, 807, 855, 856, 869, 884, 945, 947, 949, 952, 953, 958, 961, 962, 963, 964, 965, 967, 972, 974, 975, 977, 979, 980, 982, 983, 990, 993, 1009, 1051]

**聚類 781**: [781, 799, 812, 815, 839, 840, 841, 842, 848, 849, 861, 877, 885, 886, 888, 894, 896, 897, 907, 910, 911, 912, 915, 941, 942, 948, 956, 957, 968, 969, 970, 971, 973, 976, 978, 981, 987, 996, 997, 1001, 1002, 1004, 1010, 1032, 1044, 1050, 1057, 1059, 1060]

**聚類 813**: [813, 817, 818, 819, 820, 821, 822, 824, 825, 826, 828, 829, 830, 832, 833, 834, 835, 837, 838, 843, 844, 847, 850, 851, 857, 858, 859, 862, 864, 866, 867, 868, 870, 872, 873, 874, 876, 879, 881, 882, 891, 893, 905, 943, 944, 988, 1006, 1023, 1025, 1027, 1028, 1030, 1031, 1033, 1035, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1045, 1046, 1047, 1052, 1053, 1054, 1056, 1062, 1063, 1066, 1068, 1069, 1071, 1072, 1078, 1080, 1081, 1083, 1084, 1085, 1086, 1087, 1088, 1090]

**聚類 871**: [871, 875]

**聚類 889**: [889, 995, 998, 999, 1003, 1005, 1007, 1011, 1012, 1013, 1014, 1019, 1020, 1075, 1077, 1092]



Current number of clusters: 8
Clustering complete.


In [51]:
# Cell 15: 檢查結果文件是否存在

import os

cluster_numbers = [20, 13, 8]
for num in cluster_numbers:
    file_path = f"./{num}.txt"
    if os.path.exists(file_path):
        print(f"聚類數量 {num} 的結果文件已生成：{file_path}")
    else:
        print(f"聚類數量 {num} 的結果文件不存在。")


聚類數量 20 的結果文件已生成：./20.txt
聚類數量 13 的結果文件已生成：./13.txt
聚類數量 8 的結果文件已生成：./8.txt
