In [None]:
import os
import re
import string
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.corpus import stopwords
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances_chunked

# Make sure the NLTK resources used below are installed.
# 'punkt' serves older NLTK releases; recent releases (3.9+) make
# word_tokenize load 'punkt_tab' instead, so both are requested.
# 'stopwords' backs stopwords.words('english').
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# 1. Text preprocessing
# Translation table that deletes every ASCII punctuation character; built once
# instead of once per document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# English stop-word set, loaded lazily on the first call so that defining the
# function does not require the NLTK corpus to be present yet.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one raw document into a space-separated token string.

    Steps, in order:
      1. drop the third line of the document (only when it has more than
         two lines);
      2. lowercase the text;
      3. delete all ASCII punctuation;
      4. tokenise with ``nltk.word_tokenize``;
      5. discard English stop words and the literal token ``'embed'``.

    Args:
        text: Raw document contents.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none remain).

    NOTE(review): punctuation is removed before stop-word filtering, so a
    contraction such as "don't" becomes "dont" and will not match a stop-word
    entry that contains an apostrophe. Confirm whether that is intended.
    """
    lines = text.split('\n')
    if len(lines) > 2:
        # Index 2 is the third line.
        del lines[2]
        text = '\n'.join(lines)

    text = text.lower().translate(_PUNCTUATION_TABLE)

    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        word for word in tokens
        if word != 'embed' and word not in stop_words
    )

# Start the overall timer
overall_start_time = time.time()

# Read the documents from the folder.
# `corpus[i]` holds the text of the document named `file_names[i]`; the two
# lists are only ever appended to together so they stay aligned.
folder_path = "/Users/sophiehuang/Documents/113-1/113-1-IRTM/all_lyrics"  # folder that holds the documents
corpus = []
file_names = []
# os.listdir() returns entries in arbitrary order; sorting makes the document
# order (and therefore the clustering output) reproducible between runs.
for idx, file_name in enumerate(sorted(os.listdir(folder_path))):
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path, file_name)
        try:
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
            except UnicodeDecodeError:
                # Fall back to latin-1, which maps every byte to a character
                # and therefore cannot raise a decode error itself.
                with open(file_path, 'r', encoding='latin-1') as file:
                    text = file.read()
        except OSError:
            # Unreadable file (permissions, vanished, ...): skip it without
            # recording its name, so names and texts cannot get out of step.
            print(f"Error reading file: {file_name}. Skipping.")
        else:
            file_names.append(os.path.splitext(file_name)[0])  # strip the extension
            corpus.append(text)
    # Progress is counted over directory entries, not only .txt files.
    if (idx + 1) % 100 == 0:
        print(f"Completed processing {idx + 1} files.")

# Text preprocessing: run every raw document through preprocess_text
start_time = time.time()
processed_corpus = list(map(preprocess_text, corpus))
print(f"Text preprocessing completed in {time.time() - start_time:.2f} seconds.")

# 2. TF-IDF: learn the vocabulary / IDF weights and vectorise the cleaned
# documents in one pass (one row per document)
start_time = time.time()
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    processed_corpus
)
print(f"TF-IDF computation completed in {time.time() - start_time:.2f} seconds.")

# Compute the pairwise Euclidean distances directly from the sparse matrix
def _condensed_euclidean(matrix):
    """Return pairwise Euclidean distances between rows in condensed form.

    The result has the same layout as ``scipy.spatial.distance.pdist``:
    the upper triangle of the distance matrix, row by row, i.e.
    (0,1), (0,2), ..., (0,n-1), (1,2), ...

    Unlike ``pdist(matrix.toarray())`` this never densifies the feature
    matrix; distances are produced a slab of rows at a time by
    ``pairwise_distances_chunked``, which accepts sparse input.

    NOTE: scikit-learn's Euclidean distance routine can differ from
    ``pdist`` by floating-point rounding.

    Args:
        matrix: (n_docs, n_features) array or scipy sparse matrix.

    Returns:
        1-D float64 array of length ``n_docs * (n_docs - 1) // 2``.
    """
    n_docs = matrix.shape[0]
    condensed = np.empty(n_docs * (n_docs - 1) // 2, dtype=np.float64)
    row = 0      # index of the document whose distances are being copied
    offset = 0   # write position inside `condensed`
    for block in pairwise_distances_chunked(matrix, metric='euclidean'):
        # `block` is a contiguous slab of rows of the full distance matrix.
        for distances in block:
            width = n_docs - row - 1
            # Keep only the entries to the right of the diagonal.
            condensed[offset:offset + width] = distances[row + 1:]
            offset += width
            row += 1
    return condensed


start_time = time.time()
distance_matrix = _condensed_euclidean(tfidf_matrix)
print(f"Distance matrix computation completed in {time.time() - start_time:.2f} seconds.")

# 3. HAC (Hierarchical Agglomerative Clustering) with complete linkage
start_time = time.time()
linkage_matrix = linkage(distance_matrix, method='complete')
print(f"HAC computation completed in {time.time() - start_time:.2f} seconds.")

# Draw the dendrogram
def plot_dendrogram(linkage_matrix, file_names):
    """Show the clustering hierarchy as a dendrogram in a new 10x7 figure.

    Args:
        linkage_matrix: Linkage matrix as returned by
            ``scipy.cluster.hierarchy.linkage``.
        file_names: Leaf labels, one per clustered document.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix, labels=file_names)

    # Decorate the axes the dendrogram was just drawn on.
    axes = plt.gca()
    axes.set_title("Hierarchical Agglomerative Clustering Dendrogram")
    axes.set_xlabel("Documents")
    axes.set_ylabel("Distance")

    plt.show()

plot_dendrogram(linkage_matrix, file_names)

# Cut the hierarchy into flat clusters
start_time = time.time()
k = 16  # requested number of clusters
# With criterion='maxclust', k is the maximum number of flat clusters formed.
clusters = fcluster(linkage_matrix, k, criterion='maxclust')
print(f"Clustering completed in {time.time() - start_time:.2f} seconds.")

# Group document names by cluster id (clusters appear in the order in which
# their first member is encountered)
cluster_dict = {}
for doc_name, cluster_id in zip(file_names, clusters):
    cluster_dict.setdefault(cluster_id, []).append(doc_name)

# Save the result to a text file: one line per cluster, made of the cluster id
# followed by its member names, all separated by single spaces
start_time = time.time()
output_file = "cluster_results.txt"
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{cluster} " + " ".join(docs) + "\n"
        for cluster, docs in cluster_dict.items()
    )
print(f"Results saved in {time.time() - start_time:.2f} seconds.")

# Total elapsed time
print(f"Overall process completed in {time.time() - overall_start_time:.2f} seconds.")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sophiehuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophiehuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Completed processing 100 files.
Completed processing 200 files.
Completed processing 300 files.
Completed processing 400 files.
Completed processing 500 files.
Completed processing 600 files.
Completed processing 700 files.
Completed processing 800 files.
Completed processing 900 files.
Completed processing 1000 files.
Completed processing 1100 files.
Completed processing 1200 files.
Completed processing 1300 files.
Completed processing 1400 files.
Completed processing 1500 files.
Completed processing 1600 files.
Completed processing 1700 files.
Completed processing 1800 files.
Completed processing 1900 files.
Completed processing 2000 files.
Completed processing 2100 files.
Completed processing 2200 files.
Completed processing 2300 files.
Completed processing 2400 files.
Completed processing 2500 files.
Completed processing 2600 files.
Completed processing 2700 files.
Completed processing 2800 files.
Completed processing 2900 files.
Completed processing 3000 files.
Completed processin

Completed processing 24900 files.
Completed processing 25000 files.
Completed processing 25100 files.
Completed processing 25200 files.
Completed processing 25300 files.
Completed processing 25400 files.
Completed processing 25500 files.
Completed processing 25600 files.
Completed processing 25700 files.
Completed processing 25800 files.
Completed processing 25900 files.
Completed processing 26000 files.
Completed processing 26100 files.
Completed processing 26200 files.
Completed processing 26300 files.
Completed processing 26400 files.
Completed processing 26500 files.
Completed processing 26600 files.
Completed processing 26700 files.
Completed processing 26800 files.
Completed processing 26900 files.
Completed processing 27000 files.
Completed processing 27100 files.
Completed processing 27200 files.
Completed processing 27300 files.
Completed processing 27400 files.
Completed processing 27500 files.
Completed processing 27600 files.
Completed processing 27700 files.
Completed proc