In [13]:
import jieba
import jieba.posseg as psg
import collections
import math
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
# Set the base directory. The relative paths used further down (the corpus
# folder, "stopwords.txt", the .npy cache and the CSV output) resolve against it.
# NOTE(review): this absolute path is specific to one Windows machine; the
# script fails here on any other machine unless the path is edited.
os.chdir(r"C:\Users\AGN's SP\OneDrive\课件\自然语言处理\tfidf")
# Read every file found under a directory.
def read_txt(filepath):
    """Read all files under ``filepath`` as UTF-8 text.

    Args:
        filepath: Directory to walk; files in sub-directories are included.

    Returns:
        A ``(names, contents)`` tuple of two parallel lists: the bare file
        name of each file that was read, and the text of that file. Both
        lists are empty when the directory is empty or does not exist
        (``os.walk`` yields nothing for a missing path).

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    names = []
    contents = []
    for root, _dirs, files in os.walk(filepath):
        for fname in files:
            # Join against the directory currently being visited so that
            # nested files resolve correctly and the path separator is right
            # on every operating system.
            with open(os.path.join(root, fname), encoding='utf-8') as f:
                # Each line still carries its own '\n', so this join doubles
                # every line break. Kept unchanged so the text handed to the
                # tokenizer stays the same as before.
                contents.append('\n'.join(f.readlines()))
            # Record the name only after a successful read so that names and
            # contents always have the same length and order.
            names.append(fname)
    return names, contents
# Load the stop-word dictionary.
def getstopword(path):
    """Return the de-duplicated stop words stored one per line in ``path``.

    Only the newline character is stripped from each line, so any other
    leading or trailing whitespace in an entry is preserved. Four whitespace
    tokens (space, non-breaking space, newline, tab) are always included.

    Args:
        path: UTF-8 text file containing one stop word per line.

    Returns:
        A list of unique stop words, in no particular order.
    """
    whitespace_tokens = (' ', '\xa0', '\n', '\t')
    with open(path, encoding='utf-8') as handle:
        unique_words = {raw_line.strip('\n') for raw_line in handle}
    unique_words.update(whitespace_tokens)
    return list(unique_words)
# Tokenize a single document.
def fenci(content):
    """Split ``content`` into a list of tokens using ``jieba.lcut``.

    Args:
        content: The document; it is converted with ``str()`` before cutting.

    Returns:
        The list of tokens produced by ``jieba.lcut``.
    """
    # A part-of-speech filter (keeping only tags such as 'n', 'nr', 'ns',
    # 'nt', 'nz') used to be sketched here but is disabled, so every token
    # is returned.
    text = str(content)
    return jieba.lcut(text)
# Tokenize every document.
def fenci_all(contents):
    """Apply ``fenci`` to each document and collect the results.

    Args:
        contents: Iterable of documents.

    Returns:
        A list with one token list per document, in input order.
    """
    return [fenci(document) for document in contents]
# Remove stop words and purely numeric tokens.
def getridofsw(content_fenci, stopword):
    """Filter one tokenized document.

    Args:
        content_fenci: Iterable of tokens; each one is converted with
            ``str()`` before being tested.
        stopword: Collection of stop words. A ``set`` or ``frozenset`` is
            used as-is; any other iterable is copied into a set once.

    Returns:
        A list of the tokens (as ``str``) that are not stop words and for
        which ``str.isdigit()`` is false, in their original order.
    """
    # Membership tests on a list cost O(len(stopword)) per token; a set makes
    # each test O(1) without changing which tokens are kept.
    if isinstance(stopword, (set, frozenset)):
        stop_lookup = stopword
    else:
        stop_lookup = set(stopword)
    return [
        text
        for text in map(str, content_fenci)
        if text not in stop_lookup and not text.isdigit()
    ]
# The helpers below are not used by main(); they are wrapped in a string
# literal so they are never defined or executed, to reduce time overhead.
'''
def Count(content):
    return collections.Counter(content)

def contain_fre(word, contents_list):
    return sum([1 for count in contents_list if word in count])

def idf(words, contents_list):
    idf={}
    for word in words:
        idf[word] = 1 + contain_fre(word, contents_list)
    return idf

def tf(word, count):
    return count[word]

def tfidf(word, count, contents_list,idf):
    return count[word] * math.log(len(contents_list)/idf[word])
'''

def set_up(yuliao_path=r"携程", stopword_path=r"stopwords.txt"):
    """Build the corpus: read the files, tokenize them, drop stop words.

    Args:
        yuliao_path: Directory holding the corpus files. Defaults to the
            folder this script has always used.
        stopword_path: Stop-word file with one word per line. Defaults to
            the file this script has always used.

    Returns:
        A ``(files_name, contents, processed_contents)`` tuple: the file
        names and raw texts as returned by ``read_txt``, and one filtered
        token list per document.
    """
    files_name, contents = read_txt(yuliao_path)
    # Build the lookup set once here: getridofsw tests every token of every
    # document against it, and a list would make each test a linear scan.
    stopword = set(getstopword(stopword_path))
    processed_contents = [
        getridofsw(content_fenci, stopword)
        for content_fenci in fenci_all(contents)
    ]
    return files_name, contents, processed_contents

_CORPUS_DIR = r"携程"
_CACHE_FILE = 'contents_processed.npy'
_OUTPUT_FILE = 'top500.csv'
_TOP_N = 500


def _load_cached_passages(cache_path):
    """Return the cached token lists as a list, or None if no usable cache exists."""
    import pickle  # local import: only needed to name the unpickling error

    try:
        # NOTE(security): allow_pickle=True unpickles the file's contents.
        # Only load a cache file that this script wrote itself.
        cached = np.load(cache_path, allow_pickle=True)
    except (OSError, ValueError, EOFError, pickle.UnpicklingError):
        # Missing, unreadable or corrupt cache: the caller rebuilds it.
        return None
    return list(cached)


def _save_passages(cache_path, passages):
    """Save the token lists to ``cache_path`` as a 1-D object array."""
    # Documents have different lengths. Passing the ragged list straight to
    # np.save raises ValueError on NumPy >= 1.24, so box each token list
    # into an explicitly 1-D object array first.
    boxed = np.empty(len(passages), dtype=object)
    for idx, tokens in enumerate(passages):
        boxed[idx] = tokens
    np.save(cache_path, boxed)


def _load_corpus():
    """Return ``(files_name, passages)``, rebuilding the cache when needed."""
    passages = _load_cached_passages(_CACHE_FILE)
    if passages is not None:
        files_name, _contents = read_txt(_CORPUS_DIR)
        if len(files_name) == len(passages):
            return files_name, passages
        # The corpus changed since the cache was written; fall through and
        # rebuild so names and token lists stay aligned.
    files_name, _contents, passages = set_up()
    _save_passages(_CACHE_FILE, passages)
    return files_name, passages


def _rank_words(passages, files_name):
    """Score every word and return ``(word, score)`` pairs, best first.

    A word's score is its highest term frequency in any single document
    multiplied by ``log(N / (1 + df))``, where ``N`` is the number of
    documents and ``df`` the number of documents containing the word. A word
    that occurs in every document therefore gets a slightly negative score.
    """
    doc_total = len(passages)
    max_tf = {}    # word -> highest term frequency seen in any document
    doc_freq = {}  # word -> number of documents containing the word

    with tqdm(range(doc_total)) as progress:
        for idx in progress:
            label = files_name[idx] if idx < len(files_name) else idx
            progress.set_description("正在计算{}".format(label))
            counts = collections.Counter(passages[idx])
            token_total = sum(counts.values())
            for word, freq in counts.items():
                tf_value = freq / token_total
                # Keep the maximum tf: the idf factor is the same for a word
                # in every document, so the best tf gives its best score.
                if tf_value > max_tf.get(word, 0):
                    max_tf[word] = tf_value
                doc_freq[word] = doc_freq.get(word, 0) + 1

    scores = {
        word: tf_value * math.log(doc_total / (1 + doc_freq[word]))
        for word, tf_value in max_tf.items()
    }
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)


def main():
    """Load (or build) the corpus, rank its words and export the top 500.

    The tokenized corpus is cached in ``contents_processed.npy`` so later
    runs skip tokenization. The ranking is printed and written to
    ``top500.csv``; only the top 500 rows are written, matching the file
    name and the printed slice.

    Raises:
        PermissionError: If ``top500.csv`` cannot be written, typically
            because it is open in another program.
    """
    print("---------------------加载语料库--------------------")
    files_name, passages = _load_corpus()
    print("-------------------语料库加载完毕------------------")
    print("--------------------文章导入完毕-------------------")
    print("--------------------关键词计算中-------------------")

    top_words = _rank_words(passages, files_name)[:_TOP_N]
    print('前500：', top_words)

    table = pd.DataFrame(top_words)
    try:
        table.to_csv(_OUTPUT_FILE, index=False)
    except PermissionError:
        # On Windows this happens when the CSV is still open elsewhere
        # (for example in Excel); tell the user before re-raising.
        print("Cannot write {!r}: close it in any other program and run "
              "again.".format(_OUTPUT_FILE))
        raise
    

# Run the keyword extraction only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()

---------------------加载语料库--------------------
-------------------语料库加载完毕------------------
--------------------文章导入完毕-------------------
--------------------关键词计算中-------------------


正在计算普达措碧塔海点评(携程).txt: 100%|████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 13.72it/s]


前500： [('楼主', 0.020706994415210102), ('quot', 0.003573057446649652), ('马克', 0.003078890803637343), ('请问', 0.0030264068760691683), ('湖是', 0.002366925805149915), ('湖泊', 0.001981994281603683), ('小亮', 0.0019454155666117942), ('收藏', 0.0018476685930322856), ('LZ', 0.0017522195004923728), ('travels', 0.001752130296671624), ('宿', 0.0016454248282513693), ('加油', 0.0016349915264446304), ('lz', 0.0015662982955094822), ('分享', 0.001483978815180254), ('秘鲁', 0.0014285792514408405), ('mark', 0.0014111582850004488), ('强帖', 0.0014111582850004488), ('晨雾', 0.0013986379757704042), ('栎树', 0.0013908468351163741), ('留名', 0.0013809599430594792), ('取票', 0.0013409508678562962), ('碧绿', 0.0013279695591836848), ('html', 0.0013160753050436545), ('you', 0.0013160753050436545), ('ctrip', 0.0013160753050436545), ('初冬', 0.0013081333910072099), ('泡面', 0.0012969437110745296), ('详细', 0.0012640394498102518), ('精彩', 0.0012339558690148155), ('关注', 0.001187183052144203), ('岗', 0.0011834629025749574), ('古城', 0.001178577683382133

PermissionError: [Errno 13] Permission denied: 'top500.csv'