# 基于jieba库分词和roberta进行词频统计

## 所有文本词频统计

In [None]:
import os
import pandas as pd
import jieba
import concurrent.futures
from collections import Counter
import tqdm

def load_stopwords(filepath, seed_dic_path):
    """Register the seed dictionary with jieba and read the stop-word file.

    Args:
        filepath: Path of the UTF-8 stop-word file; every line becomes one
            entry of the returned set.
        seed_dic_path: Path handed to ``jieba.load_userdict``.

    Returns:
        A set containing each line of the stop-word file.
    """
    jieba.load_userdict(seed_dic_path)
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        raw_text = stopword_file.read()
    lines = raw_text.splitlines()
    return set(lines)

def process_text_file(filepath, stopwords, word_freq):
    """Tokenize one text file and add its surviving tokens to ``word_freq``.

    A token is counted when it is not in ``stopwords``, is longer than one
    character once surrounding whitespace is stripped, and does not consist
    solely of digits.

    Args:
        filepath: Path of the UTF-8 text file to read.
        stopwords: Container of tokens to ignore.
        word_freq: Counter-like object whose ``update`` method receives the
            kept tokens; it is modified in place.
    """
    def _is_countable(token):
        # Same three conditions, in the same order, as the filter contract above.
        if token in stopwords:
            return False
        if len(token.strip()) <= 1:
            return False
        return not token.isdigit()

    with open(filepath, 'r', encoding='utf-8') as text_file:
        text = text_file.read()
        tokens = jieba.cut_for_search(text)
        word_freq.update(filter(_is_countable, tokens))

def process_text_files_concurrently(directory, stopwords):
    """Count word frequencies over every ``.txt`` file under ``directory``.

    Files are processed by a thread pool. Each task fills its own Counter and
    the main thread merges the finished counters, so no Counter is mutated by
    more than one thread. A file whose task raises is skipped and reported
    after the run instead of being dropped silently.

    Args:
        directory: Root directory that is walked recursively.
        stopwords: Container of tokens passed through to ``process_text_file``.

    Returns:
        A Counter with the merged frequencies of all files that were
        processed successfully.
    """
    word_freq = Counter()
    # Collect the path of every .txt file below the root directory.
    txt_files = [
        os.path.join(root, name)
        for root, _, files in os.walk(directory)
        for name in files
        if name.endswith('.txt')
    ]
    failures = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = {}
        for filepath in txt_files:
            partial = Counter()
            future = executor.submit(process_text_file, filepath, stopwords, partial)
            pending[future] = (filepath, partial)

        with tqdm.tqdm(total=len(txt_files), desc="Processing files") as pbar:
            for future in concurrent.futures.as_completed(pending):
                # Pop so a merged partial Counter can be released right away.
                filepath, partial = pending.pop(future)
                error = future.exception()
                if error is None:
                    word_freq.update(partial)
                else:
                    failures.append((filepath, error))
                pbar.update()

    # Report after the progress bar has closed so the output is not interleaved.
    for filepath, error in failures:
        print(f'文件处理失败，已跳过: {filepath} ({error!r})')

    return word_freq


def del_word(word_freq_df, freq=5, sort_store=False):
    """Filter the word-frequency table and save it to ``cipingdata/word_freq.csv``.

    Rows whose ``频次`` is below ``freq`` are removed, as are rows whose
    ``词语`` contains ``%`` or ``.``.

    Args:
        word_freq_df: DataFrame with the columns ``词语`` and ``频次``.
        freq: Minimum frequency a word needs in order to be kept.
        sort_store: When true, the unfiltered table is first written to
            ``cipingdata/word_freq_beforefilt.csv``.
    """
    if sort_store:
        word_freq_df.to_csv('cipingdata/word_freq_beforefilt.csv', index=False, encoding='utf-8')

    frequent = word_freq_df[word_freq_df['频次'] >= freq]
    # Raw string: "\." is not a valid escape in a normal string literal.
    # na=False keeps the mask strictly boolean so "~" works even when a word
    # is missing.
    has_symbol = frequent['词语'].str.contains(r'%|\.', na=False)
    filtered = frequent[~has_symbol]
    filtered.to_csv('cipingdata/word_freq.csv', index=False, encoding='utf-8')

    print('词频统计结果已保存至 cipingdata/word_freq.csv')

# Main program: build the corpus-wide word-frequency table.
if __name__ == "__main__":
    firm_annanls_path = 'cipingdata/testtxt' # directory holding the annual-report text files
    stopwords_dir = 'cipingdata/stopwords.txt' # path of the stop-word file
    seed_dic_path = 'cipingdata/seed_dict.csv' # path of the seed-word dictionary

    stopwords = load_stopwords(stopwords_dir, seed_dic_path)
    word_freq = process_text_files_concurrently(firm_annanls_path, stopwords)

    # Convert the counts to a DataFrame sorted by descending frequency.
    word_freq_df = pd.DataFrame(word_freq.items(), columns=['词语', '频次'])
    word_freq_df = word_freq_df.sort_values(by='频次', ascending=False)
    del_word(word_freq_df,freq = 5,sort_store = False) # drop words seen fewer than 5 times and words containing % or .

## roberta余弦相似度构建词典

## 输出近似词列表——需自行筛选优化近似词

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F
import pandas as pd
import tqdm

def get_word_vector(word: str, tokenizer, model, device) -> torch.Tensor:
    """Return the embedding of ``word`` as the mean of the model's last hidden state.

    Args:
        word: Text to embed.
        tokenizer: Callable that encodes ``word`` with ``return_tensors='pt'``;
            its result must offer ``.to(device)`` and unpack as keyword
            arguments for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the encoded inputs are moved to.

    Returns:
        The hidden states averaged over dimension 1, with size-one dimensions
        squeezed away.
    """
    encoded = tokenizer(word, return_tensors='pt').to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze()

def cosine_similarity(vec1: torch.Tensor, vec2: torch.Tensor) -> float:
    """Return the cosine similarity of two vectors as a Python float.

    Both tensors get a leading dimension of size one before being compared,
    and the single resulting value is extracted with ``item()``.
    """
    lhs = torch.unsqueeze(vec1, 0)
    rhs = torch.unsqueeze(vec2, 0)
    score = F.cosine_similarity(lhs, rhs)
    return score.item()

def initialize_model(model_path: str):
    """Load the tokenizer and model and place the model on the chosen device.

    CUDA is used when ``torch.cuda.is_available()`` reports it, otherwise the
    CPU.

    Args:
        model_path: Identifier or path passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model, device)`` tuple.
    """
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path)
    model.to(device)

    return tokenizer, model, device

def load_data(seed_dict_path: str, word_freq_filtered_path: str):
    """Load the seed dictionary and the list of filtered corpus words.

    Args:
        seed_dict_path: Header-less CSV holding the seed words.
        word_freq_filtered_path: CSV with a ``词语`` column.

    Returns:
        A ``(seed_dict, word_list)`` tuple: the seed dictionary as a
        DataFrame and the ``词语`` column as a list of strings.
    """
    seed_dict = pd.read_csv(seed_dict_path, encoding='utf-8', header=None)
    # Without these options pandas turns words such as "NA" or "null" into
    # NaN floats, which cannot be tokenized later on.
    word_freq_filtered = pd.read_csv(
        word_freq_filtered_path,
        encoding='utf-8',
        dtype={'词语': str},
        keep_default_na=False,
    )
    word_freq_filtered_list = word_freq_filtered['词语'].tolist()
    return seed_dict, word_freq_filtered_list

def get_word_vectors(word_list, tokenizer, model, device, batch_size=32):
    """Embed every word in ``word_list`` in batches.

    Each embedding is the mean of the last hidden state over the word's real
    tokens. Padding positions are excluded through the attention mask, so a
    word's vector does not depend on the other words in its batch and agrees
    with the unpadded pooling done by ``get_word_vector``.

    Args:
        word_list: List of strings to embed.
        tokenizer: Tokenizer called with ``padding=True`` and
            ``truncation=True``; its output must contain ``attention_mask``.
        model: Model whose output exposes ``last_hidden_state``.
        device: Device the encoded batch is moved to.
        batch_size: Number of words encoded per forward pass.

    Returns:
        A dict mapping each word to its embedding tensor.
    """
    word_vectors = {}
    for start in tqdm.tqdm(range(0, len(word_list), batch_size), desc="Calculating word vectors in batches"):
        word_batch = word_list[start:start + batch_size]
        inputs = tokenizer(word_batch, return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
            # Weight each position by its mask value (1 = real token, 0 = pad)
            # and divide by the number of real tokens.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        for word, embedding in zip(word_batch, embeddings):
            word_vectors[word] = embedding
        torch.cuda.empty_cache()
    return word_vectors

def process_ai_dict(seed_dict, word_vectors, tokenizer, model, device):
    """Find, for every seed word, the ten most similar corpus words.

    Args:
        seed_dict: Object whose column ``0`` yields the seed words.
        word_vectors: Dict mapping corpus words to their embedding tensors.
        tokenizer: Forwarded to ``get_word_vector``.
        model: Forwarded to ``get_word_vector``.
        device: Forwarded to ``get_word_vector``.

    Returns:
        A list with one dict per seed word, holding the keys ``Word``,
        ``Top_10_Similar_Words`` and ``Similarity_Scores``; the last two are
        ordered by descending similarity.
    """
    rows = []
    for seed_word in tqdm.tqdm(seed_dict[0], desc="Processing AI dictionary"):
        seed_vector = get_word_vector(seed_word, tokenizer, model, device)

        scored = []
        for candidate, candidate_vector in word_vectors.items():
            scored.append((candidate, cosine_similarity(seed_vector, candidate_vector)))
        # A stable sort keeps the original relative order of equal scores.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        best = scored[:10]

        rows.append({
            'Word': seed_word,
            'Top_10_Similar_Words': [candidate for candidate, _ in best],
            'Similarity_Scores': [score for _, score in best],
        })
    return rows


def main(model_path,seed_dict_path,word_freq_filtered_path,similar_words_path,output_path):
    """Build the candidate word list from the seed words and their neighbours.

    The per-seed-word similarity table is written to ``similar_words_path``.
    The union of the seed words and all of their top similar words is written
    to ``output_path`` as a header-less, single-column CSV.

    Args:
        model_path: Model identifier or path passed to ``initialize_model``.
        seed_dict_path: Path of the seed dictionary CSV.
        word_freq_filtered_path: Path of the filtered word-frequency CSV.
        similar_words_path: Output path of the similarity table.
        output_path: Output path of the merged word list.
    """
    tokenizer, model, device = initialize_model(model_path)
    seed_dict, word_freq_filtered_list = load_data(seed_dict_path, word_freq_filtered_path)
    word_vectors = get_word_vectors(word_freq_filtered_list, tokenizer, model, device)
    similar_words_dict = process_ai_dict(seed_dict, word_vectors, tokenizer, model, device)

    similar_words_df = pd.DataFrame(similar_words_dict)
    similar_words_df.to_csv(similar_words_path, index=False, encoding='utf-8')

    # Work from the in-memory results instead of re-reading the CSV, so the
    # stringified lists never have to be parsed back with eval().
    # The seed words go into a column that is named '词语' explicitly: passing
    # a Series named 'Word' together with columns=['词语'] selects a column
    # that does not exist and silently drops every seed word.
    seed_words = [row['Word'] for row in similar_words_dict]
    words = pd.DataFrame(seed_words, columns=['词语'], dtype=object).drop_duplicates()

    # All distinct similar words, sorted for a reproducible order.
    similar_words_list = sorted({
        similar_word
        for row in similar_words_dict
        for similar_word in row['Top_10_Similar_Words']
    })
    similar_words_list = pd.DataFrame(similar_words_list, columns=['词语'])

    words_df = pd.merge(words,similar_words_list,how='outer',on='词语')
    words_df.to_csv(output_path, index=False, encoding='utf-8', header=False)

    print(f'\n所有近似词已输出至 {output_path} \n共计 {len(similar_words_list)} 个词语')
    print('请用户自行筛选，删除不相关词语')
    print('也可以利用下面提供的代码进行筛选')


# Script entry point: configure the input/output paths and run the pipeline.
if __name__ == "__main__":
    model_path = 'hfl/chinese-roberta-wwm-ext'  # model identifier passed to from_pretrained
    seed_dict_path = 'cipingdata/seed_dict.csv'  # seed-word dictionary
    word_freq_path = 'cipingdata/word_freq.csv'  # filtered word-frequency table (input)
    similar_words_path = 'cipingdata/similar_words.csv'  # per-seed-word similarity table (output)
    output_path = 'cipingdata/words_all.csv'  # merged word list (output)

    main(model_path,seed_dict_path,word_freq_path,similar_words_path,output_path)

### 无关词剔除

In [None]:
import pandas as pd
def filter_rubbish_words(words_list, rubbish_words=None):
    """Remove unwanted words from ``cipingdata/words_all.csv``.

    Words from ``words_list`` that are not rubbish words are inner-joined
    with the current content of ``cipingdata/words_all.csv``, and the result
    overwrites that file (single column, no header).

    Args:
        words_list: Candidate words to keep.
        rubbish_words: Words to remove. When omitted or empty, the built-in
            default list below is used and its size is printed.
    """
    # Fall back to the default rubbish-word list when none was supplied.
    if not rubbish_words:
        rubbish_words = [
            '电子产品','现代农业','知识','网上超市','验算','节能',
            '电智','计算方法','整合营销','录音机','语音','客服部',
            '用户服务','提取液','表意','网络营销','特约服务','市场营销',
            '金融监管','智慧之门','运动用品','风控','风控全','知识表示',
            '感应式','汽车音响','传感器','专业音响','风控部','计算长度',
            '变矩器','重复性','推进改革','衰减系数','服务中心','英语教学',
            '中国银联','热线服务','特约服务','网上商城','注重实效','指代',
            '工智','智慧网','运输系统','勤奋学习','RPK','有问必答','网商小贷',
            '表意','特殊性','现代医学','GCK','意指','知识','周期性地',
            '生活必需品','RBR','读取数据','泛用','含义','问答','提取液',
            '孝敬老人','电子商城','脱虚向实','保健用品','间断性','老有所养',
            '电智','真实世界','知识型','新营销','验算','促进改革','强化班',
            '提问','用户服务','努力学习','微笑服务','客服',
            '时尚家居','节约能源','节能','具有特征','养老','电子产品',
            '交易系统','术语','虚心学习','计算','理论知识','物联新','金属产品',
            'UDEM','题库系统','家居生活','应答机','音响器','科普知识','现实主义',
            '灯管影响','防护系统','及风控','养老险','尊老敬老','掌上电脑',
            '课后复习','养老院','金属制品','证券监管','短期投资','医学知识',
            '外资保险','认真学习','通识性','CCAR','客户服务','HVLP',
            '电动汽车','LATALimited','记忆','代名词','TCXO','QDLP',
            '绿色生态','定量分析','m3a','灯光音响','市场监管','智化',
            'LMEshield','节能降耗','物联云商','环保','集中学习','分束器',
            '固有特征','红外摄像机','形状记忆','可视化','VB1','多媒体通信',
            '学习材料','TLP','应答器','识别','问题解答','安全监管','成本计算',
            'YBR','层次性','RPCB','节省能源','色谱分析','有源音箱','数据表示',
            '相控阵','生态农业','惯性导航','生物芯片','学科知识','咨询服务',
            '向量','温度传感器','据智研','计算速度','人机界面','激光雷达',
            '人工控制','数码产品','网上银行','科学知识','科学实践','音频芯片',
            '节能产品','电子政务','基本知识','交通运输业','检索系统','数学计算',
            '数据库','自动自发','手机芯片','计算机芯片','学习效果','计算机控制',
            '工控微机','中枢神经','密集式','科技知识','物联科','神经学',
            '养老保险','识别性','高智','NVME','NVMe','高科技产品',
            '统计数据','变量','分布式应用','NVMeSSD','计算力','芯片业',
        ]
        print(f'采用默认垃圾词列表，共计 {len(rubbish_words)} 个词语')

    # A set gives constant-time membership tests while filtering.
    blocked = set(rubbish_words)

    words = pd.read_csv('cipingdata/words_all.csv', encoding='utf-8', header=None, names=['词语'])
    words_list = [word for word in words_list if word not in blocked]
    words_df = pd.DataFrame(words_list, columns=['词语'])
    # Keep only the words that are still present in the file on disk.
    words_df = pd.merge(words_df, words, how='inner', on='词语')

    words_df.to_csv('cipingdata/words_all.csv', index=False, encoding='utf-8',header=False)


# Driver for the optional rubbish-word removal step.
print(f'若用户已经在 words_all.csv 中手动删除了不相关词语，请跳过此步骤！')
# Alternatively, the code below can be used to remove rubbish words.
# Note: the default rubbish-word list was written for the AI dictionary; use it only where it fits.

rubbish_words = [] # rubbish-word list; an empty list makes filter_rubbish_words use its built-in default
words_path = 'cipingdata/words_all.csv' # path of the word list
words_list = pd.read_csv(words_path, encoding='utf-8', header=None)[0].tolist()

filter_rubbish_words(words_list = words_list, rubbish_words=rubbish_words)

## 形成整体词频统计

In [None]:
import pandas as pd
import jieba
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def process_file(file_info):
    """Count the AI keywords that occur in one report file.

    Reads the module-level ``words`` (AI keyword list) and ``stopwords``.

    Args:
        file_info: ``(company_name, year, report_path)`` tuple.

    Returns:
        ``(company_name, year, ai_word_count)``. The count is 0 when
        ``report_path`` does not exist.
    """
    company_name, year, report_path = file_info
    ai_word_count = 0

    if os.path.exists(report_path):
        with open(report_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Tokenize and drop duplicate tokens.
        # NOTE(review): because tokens are de-duplicated, the result is the
        # number of distinct AI keywords in the report, not the total number
        # of occurrences - confirm this is the intended measure.
        tokens = set(jieba.cut_for_search(content))

        # The tokens get their own name: calling them ``words`` shadowed the
        # module-level keyword list and made every token count as a keyword.
        ai_keywords = set(words)

        # Count tokens that are AI keywords, skipping stop words, tokens
        # shorter than two characters and purely numeric tokens.
        ai_word_count = sum(
            1
            for token in tokens
            if len(token.strip()) > 1
            and not token.isdigit()
            and token not in stopwords
            and token in ai_keywords
        )

    return company_name, year, ai_word_count

def count_ai_words_parallel(firm_annals_path):
    """Run ``process_file`` over every ``.txt`` report using a thread pool.

    The company name is the name of the directory containing the file, with
    spaces removed; the year is the file name without its extension.

    Args:
        firm_annals_path: Root directory that is walked recursively.

    Returns:
        A list with one ``process_file`` result per report, in the order the
        files were discovered.
    """
    file_infos = []
    for folder, _subdirs, filenames in tqdm(os.walk(firm_annals_path), desc="Processing files"):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            company_name = os.path.basename(folder).replace(' ', '')  # strip spaces
            year = os.path.splitext(filename)[0]
            file_infos.append((company_name, year, os.path.join(folder, filename)))

    with ThreadPoolExecutor() as executor:
        outcomes = executor.map(process_file, file_infos)
        results = list(tqdm(outcomes, total=len(file_infos), desc="Counting AI words"))

    return results

def update_word_counts(results, words_count, columns=None):
    """Merge per-file keyword counts into the ``words_count`` table.

    For each result, the count is added to the rows of ``words_count`` whose
    company and year match (in place). Results with no matching row are
    summed per (company, year) and appended as new rows in first-seen order.

    Args:
        results (list of tuples): ``[(com_name, year, AI_words_count), ...]``.
        words_count (pd.DataFrame): Existing table to update.
        columns (list of str): Column names for company, year and count, in
            that order. Defaults to
            ``['com_name', 'year', 'AI_words_count']``.

    Returns:
        pd.DataFrame: The updated table.
    """
    if columns is None:
        columns = ['com_name', 'year', 'AI_words_count']

    # Counts for keys that are not yet in the table. They are appended with a
    # single concat at the end instead of one concat per result, which copied
    # the whole table each time.
    pending = {}

    for company_name, year, ai_word_count in results:
        key = (company_name, year)
        if key in pending:
            pending[key] += ai_word_count
            continue

        mask = (words_count[columns[0]] == company_name) & (words_count[columns[1]] == year)
        if mask.any():
            words_count.loc[mask, columns[2]] += ai_word_count
        else:
            pending[key] = ai_word_count

    if pending:
        new_rows = pd.DataFrame(
            [[company, year, count] for (company, year), count in pending.items()],
            columns=columns,
        )
        words_count = pd.concat([words_count, new_rows], ignore_index=True)

    return words_count


# Load the AI keyword list and the stop words.
words = pd.read_csv('cipingdata/words_all.csv', encoding='utf-8', header=None)[0].tolist()
# Read through a context manager so the stop-word file handle is closed.
with open('cipingdata/stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = set(stopwords_file.read().splitlines())
jieba.load_userdict('cipingdata/seed_dict.csv')

# Start from an empty result table.
words_count = pd.DataFrame(columns=['com_name', 'year', 'AI_words_count'])

# Count the AI keywords in every report.
firm_annals_path = 'cipingdata/testtxt'
results = count_ai_words_parallel(firm_annals_path)
words_count = update_word_counts(results, words_count)

# Save the result.
words_count.to_excel('words_count_result.xlsx', index=False)