In [1]:
import os
import numpy as np
import chromadb
from sentence_transformers import SentenceTransformer
import re
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device




'cuda'

In [2]:
# 设置路径
chunk_folder = "Chunk_file_folder"
vector_store_path = "vector_store"

# 读取所有 chunk 文件
chunk_files = [f for f in os.listdir(chunk_folder)]

In [3]:
embedding_model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True, device=device)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.weight', 'classifier.bias'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
client = chromadb.PersistentClient(path=vector_store_path)
collection = client.get_or_create_collection(name="psychology_chunks")

In [5]:
def process_chunk_file(file_path): 
    """读取 chunk 文件并返回文本列表，保持原始换行结构"""
    with open(file_path, "r", encoding="utf-8") as f:
        chunks = []
        current_chunk = []
        for line in f:
            if line.startswith("chunk "):  # 检测新 chunk 起始
                if current_chunk:
                    chunks.append("\n".join(current_chunk))  # 保持原始换行
                    current_chunk = []
            elif line.strip() and not line.startswith("-"):
                current_chunk.append(line.strip())  
        if current_chunk:
            chunks.append("\n".join(current_chunk))  # 最后一个 chunk 也要加入
    return chunks


def process_table_file(file_path):
    """读取 _tables.txt 文件并转换为 Markdown 表格格式"""
    with open(file_path, "r", encoding="utf-8") as f:
        table_texts = []
        current_table = []
        table_headers = None

        for line in f:
            if re.fullmatch(r"=+", line.strip()):  # 识别表格分隔符
                if current_table:
                    # 处理表格：使用 Markdown 格式
                    table_str = "\n".join([" | ".join(row) for row in current_table])
                    if table_headers:
                        table_str = table_headers + "\n" + "-" * len(table_headers) + "\n" + table_str
                    table_texts.append(table_str)
                    current_table = []
            elif line.strip():
                cols = line.strip().split()  # 可能是列数据
                if not table_headers:
                    table_headers = " | ".join(cols)  # 记录表头
                else:
                    current_table.append(cols)

        if current_table:
            table_str = "\n".join([" | ".join(row) for row in current_table])
            if table_headers:
                table_str = table_headers + "\n" + "-" * len(table_headers) + "\n" + table_str
            table_texts.append(table_str)

    return table_texts



In [6]:

# 处理所有 chunk 文件，并向量化存储
success_count = 0

for chunk_file in chunk_files:
    chunk_path = os.path.join(chunk_folder, chunk_file)
    print(f"Processing: {chunk_file}")

    # 根据文件类型处理不同的 chunk 文件
    if chunk_file.endswith("_text.txt"):  
        chunks = process_chunk_file(chunk_path)
    #elif chunk_file.endswith("_tables.txt"):  
        #chunks = process_table_file(chunk_path)
    else:
        continue  # 其他文件忽略

    # 如果 chunks 为空，跳过处理
    if not chunks:
        print(f"Warning: {chunk_file} has no valid content. Skipping...")
        continue

    try:
        # 生成 embedding
        chunk_embeddings = embedding_model.encode(chunks)

        # 存入 ChromaDB
        for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings)):
            collection.add(
                ids=[f"{chunk_file}_{i}"],
                embeddings=[embedding.tolist()],
                metadatas=[{"text": chunk, "source": chunk_file}]
            )
            success_count += 1  # 记录成功存储的条数

        print(f"Finished: {chunk_file}, stored {len(chunks)} chunks in vector store")

    except Exception as e:
        print(f"Error processing {chunk_file}: {str(e)}")

print(f"All done! Successfully stored {success_count} chunks. Ready for retrieval.")

Processing: Complex PTSD_ From Surviving to Thriving_tables.txt
Processing: Complex PTSD_ From Surviving to Thriving_text.txt
Finished: Complex PTSD_ From Surviving to Thriving_text.txt, stored 2679 chunks in vector store
Processing: GPMHSC-Suicide-prevention-and-first-aid-resource-for-GPs_tables.txt
Processing: GPMHSC-Suicide-prevention-and-first-aid-resource-for-GPs_text.txt
Finished: GPMHSC-Suicide-prevention-and-first-aid-resource-for-GPs_text.txt, stored 87 chunks in vector store
Processing: therapists_guide_to_brief_cbtmanual_tables.txt
Processing: therapists_guide_to_brief_cbtmanual_text.txt
Finished: therapists_guide_to_brief_cbtmanual_text.txt, stored 1271 chunks in vector store
All done! Successfully stored 4037 chunks. Ready for retrieval.


In [7]:
# 计算 chunk 长度分布
chunk_lengths = [len(chunk.split()) for chunk in chunks]

# 打印一些统计信息
print(f"Total chunks: {len(chunk_lengths)}")
print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.2f} words")
print(f"Shortest chunk: {min(chunk_lengths)} words")
print(f"Longest chunk: {max(chunk_lengths)} words")

# 如果有很多 chunk 低于 50 个单词，可能需要调整切分
short_chunks = [length for length in chunk_lengths if length < 50]
print(f"Chunks with < 50 words: {len(short_chunks)} ({len(short_chunks) / len(chunk_lengths) * 100:.2f}%)")


Total chunks: 1271
Average chunk length: 32.73 words
Shortest chunk: 1 words
Longest chunk: 427 words
Chunks with < 50 words: 969 (76.24%)


In [8]:
query_text = "In psychological therapy, emotional regulation is a crucial aspect. Patients often experience anxiety, depression, or anger, which may stem from childhood trauma, workplace stress, or interpersonal conflicts.Common emotional regulation techniques include,Mindfulness Meditation: Focusing on the present moment to reduce anxiety about the past and future."  # 你要查询的文本
query_embedding = embedding_model.encode([query_text])  # 计算查询文本的 embedding

# 执行向量检索
results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=10  # 取回最相似的 5 条记录
)

# 打印查询结果
print("\n==== Top 5 Retrieved Chunks ====")
for i, (retrieved_text, score) in enumerate(zip(results["metadatas"][0], results["distances"][0])):
    print(f"\nResult {i+1} (Score: {score:.4f}):")
    print(retrieved_text["text"])
print("================================\n")



==== Top 5 Retrieved Chunks ====

Result 1 (Score: 0.6367):
My therapist’s modeling that anger, sadness, fear, and depression were emotions that could
be healthily expressed helped me to renounce the pain-repressing, emotional perfectionism in
which I was mired.

Result 2 (Score: 0.6637):
Patients with problems that are largely emotional; for example, a person who feels
incompetent at work and often feels that others are overly critical may be reacting
to emotions (e.g., depression).

Result 3 (Score: 0.6928):
Unpredictable shifts in your emotional weather are typically problematic in Cptsd.

Result 4 (Score: 0.7031):
Emotional intelligence about the healthy and
functional aspects of anger, sadness, and fear lies fallow. De-Minimizing Emotional Abandonment
As with physical abuse, effective work on the wounds of verbal and emotional abuse can
sometimes open the door to de-minimizing the awful impact of emotional neglect. I sometimes
feel the most for my clients who were “only” neglecte