use for testing process_raw_data.py

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
# the data path is project_root/data/merged_data.csv
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
MERGED_DATA_PATH = os.path.join(ROOT, 'data/merged_data.csv')

In [2]:
df = pd.read_csv(MERGED_DATA_PATH)
df.head(5)

Unnamed: 0,Author,Title,Publication,Date,Place,URL,Full_text,OCR_noise_ratio,Year,Decade,genre,Processed_text
0,Null,Theatre,The Times,1785-11-14,"London, England",https://link.gale.com/apps/doc/CS50463086/TTDA...,"he. saturday evening the tragedy of , the game...",0.013021,1785,1780,theater,saturday evening tragedy gamester perform time...
1,Null,Theatre,The Times,1785-02-26,"London, England",https://link.gale.com/apps/doc/CS50593882/TTDA...,"-a. , their majesties, the princess royal, eli...",0.010256,1785,1780,theater,majesty princess royal elizabeth augusta young...
2,Null,Theatre,The Times,1785-11-21,"London, England",https://link.gale.com/apps/doc/CS34472309/TTDA...,.theatre. though pope lias ccnforcd farquhar f...,0.024867,1785,1780,theater,pope lia ccnforcd farquhar iitg pert character...
3,Null,Theatre,The Times,1785-12-09,"London, England",https://link.gale.com/apps/doc/CS50463113/TTDA...,"theatre. ycfferday evening s new comic. opera,...",0.018051,1785,1780,theater,theatre ycfferday evening s new comic opera ca...
4,Null,Theatre,The Times,1785-10-24,"London, England",https://link.gale.com/apps/doc/CS33947992/TTDA...,"t kt l a t l e. a new' farce, called; appearan...",0.027823,1785,1780,theater,t kt l t l new farce call appearance ii perfor...


In [3]:
from bertopic import BERTopic
import umap
import hdbscan
from sentence_transformers import SentenceTransformer
# 假设你已经加载了 merged_df
# import pandas as pd
# merged_df = pd.read_csv(...) 

# 将文本和日期转换为列表格式
# 将处理后的评论和“十年”时间作为训练数据
docs = df['Processed_text'].tolist()
timestamps = pd.to_numeric(df["Decade"], errors="coerce").astype("Int64").tolist()

# 聚类相关的参数
n_neighbors=15
n_components=5
min_cluster_size=30
min_samples=5

# 定义模型保存的路径
# this is the baseline
MODEL_PATH = f"my_bertopic_model_{n_neighbors}_{n_components}_{min_cluster_size}_{min_samples}"

# --- 检查模型是否已经存在 ---
if os.path.exists(MODEL_PATH):
    print(f"正在从 '{MODEL_PATH}' 加载已缓存的模型...")
    # 如果模型文件存在，直接加载
    topic_model = BERTopic.load(MODEL_PATH)
    print("模型加载完成！")
else:
    print(f"未找到缓存的模型。开始训练新模型...")
    # --- 1. 定义BERTopic的各个组件 (如果模型不存在，则运行此部分) ---
    
    # 嵌入模型
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # 降维模型
    umap_model = umap.UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=0.1, metric='cosine', random_state=42)

    # 聚类模型
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    # --- 2. 创建并配置BERTopic模型 ---
    topic_model = BERTopic(
      embedding_model=embedding_model,
      umap_model=umap_model,
      hdbscan_model=hdbscan_model,
      language="english",
      calculate_probabilities=True,
      verbose=True,
      nr_topics=None        # 自动合并相似主题
    )

    # --- 3. 训练模型 ---
    print("开始训练BERTopic模型，这可能需要一些时间...")
    # 注意：为了让加载后的模型能处理新文档，你需要在这里计算并存储文本的嵌入
    # 方法是先用 embedding_model 计算 embeddings，再传入 fit_transform
    embeddings = embedding_model.encode(docs, show_progress_bar=True)
    topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
    
    # --- 3.1 减少离群值 (Crucial Step) ---
    # 使用 embeddings 策略将离群值重新分配给最近的主题
    print("正在尝试减少离群值...")
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings", embeddings=embeddings)
    topic_model.update_topics(docs, topics=new_topics)
    
    print("模型训练完成！")

    # --- 4. 保存模型 ---
    print(f"正在将模型缓存到 '{MODEL_PATH}'...")
    topic_model.save(MODEL_PATH)
    print("模型已成功保存。")

# --- 模型使用 ---
# 现在，无论模型是新训练的还是从缓存加载的，你都可以直接使用 topic_model 对象了
print("\n模型已准备就绪，可以进行后续分析。")
print(f"共找到 {len(topic_model.get_topic_info())-1} 个主题。")

# 示例：查看前5个主题
print(topic_model.get_topic_info().head(6))

# 示例：获取某个主题的关键词
print("\nTopic 0 的关键词:")
print(topic_model.get_topic(0))

未找到缓存的模型。开始训练新模型...
开始训练BERTopic模型，这可能需要一些时间...


Batches:   0%|          | 0/1571 [00:00<?, ?it/s]

2026-01-09 09:56:48,446 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-09 09:57:26,734 - BERTopic - Dimensionality - Completed ✓
2026-01-09 09:57:26,737 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

正在尝试减少离群值...




模型训练完成！
正在将模型缓存到 'my_bertopic_model_15_5_30_5'...
模型已成功保存。

模型已准备就绪，可以进行后续分析。
共找到 137 个主题。
   Topic  Count                              Name  \
0      0   3967     0_dance_ballet_dancer_company   
1      1   2703  1_exhibition_picture_gallery_art   
2      2   1744           2_opera_signor_act_sing   
3      3   1338       3_book_milford_fiction_edit   
4      4   1758     4_sonata_recital_chopin_minor   
5      5   2224        5_character_act_theatre_mr   

                                      Representation  \
0  [dance, ballet, dancer, company, choreography,...   
1  [exhibition, picture, gallery, art, painting, ...   
2  [opera, signor, act, sing, verdi, voice, singe...   
3  [book, milford, fiction, edit, publish, transl...   
4  [sonata, recital, chopin, minor, pianist, tone...   
5  [character, act, theatre, mr, lady, play, mrs,...   

                                 Representative_Docs  
0  [umbrella weather storm val bourne drive force...  
1  [art exhibition interesting num

In [4]:
# --- Temporal analysis ---
topics_over_time = topic_model.topics_over_time(
    docs=docs,
    timestamps=timestamps,
    global_tuning=True,
    nr_bins=None  
)

topic_model.visualize_topics_over_time(topics_over_time)

23it [02:11,  5.73s/it]


1.抽样检查每个主题的10条文档，观察是否有无意义词和OCR噪声

2.为主题添加标签

3.（如果有的话）清除噪声

In [5]:
# 查看主题
topic_model.get_topic(0)

[('dance', np.float64(0.02834930356843291)),
 ('ballet', np.float64(0.023861930375929322)),
 ('dancer', np.float64(0.0181672748457941)),
 ('company', np.float64(0.011283951730219172)),
 ('choreography', np.float64(0.008294901158613838)),
 ('choreographer', np.float64(0.007774904920639097)),
 ('dancing', np.float64(0.007518784925993236)),
 ('sadler', np.float64(0.006336088206229855)),
 ('new', np.float64(0.006144701442103632)),
 ('royal', np.float64(0.0059863772463605875))]

In [6]:
# 自动生成主题词
topic_model.generate_topic_labels()

['0_dance_ballet_dancer',
 '1_exhibition_picture_gallery',
 '2_opera_signor_act',
 '3_book_milford_fiction',
 '4_sonata_recital_chopin',
 '5_character_act_theatre',
 '6_concert_tho_herr',
 '7_promenade_concerto_concert',
 '8_novel_book_story',
 '9_poetry_poet_poem',
 '10_play_theatre_man',
 '11_theatre_miss_play',
 '12_drury_lane_scene',
 '13_hall_recital_piano',
 '14_shostakovich_russian_tchaikovsky',
 '15_theatre_night_character',
 '16_philharmonic_orchestra_symphony',
 '17_choir_choral_sing',
 '18_miss_mr_play',
 '19_theatre_mr_miss',
 '20_band_jazz_album',
 '21_orchestra_concert_symphony',
 '22_prince_mr_miss',
 '23_dublin_irish_abbey',
 '24_song_miss_recital',
 '25_shakespeare_play_theatre',
 '26_quartet_string_movement',
 '27_mozart_concerto_symphony',
 '28_lady_mrs_hon',
 '29_song_recital_voice',
 '30_theatre_council_company',
 '31_philharmonia_symphony_orchestra',
 '32_garrick_mr_miss',
 '33_bach_choir_cantata',
 '34_lyceum_irving_mr',
 '35_student_college_academy',
 '36_recita

In [7]:
# 可视化检查关键词
topic_model.visualize_barchart(top_n_topics=20)

In [8]:
topic_model.visualize_hierarchy()