In [4]:
import numpy as np
import re
import jieba
import newspaper
import pandas as pd
from tqdm import tqdm

url = 'http://www.southcn.com/'          # 南方网
stop_word_path = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/停词库/stop_word_for_chinese.txt"
corpus = "/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP训练营笔记/南方网.csv"
stop_words = list(map(lambda x : x.replace("\n",'') ,open(stop_word_path,'r').readlines()))


def get_news_paper(url,filepath):
    south_paper = newspaper.build(url,language='zh',memoize_articles = False)    # 构建新闻源
    strings = "{}{}{}{}{}{}".format("品牌:",south_paper.brand,"描述:",south_paper.description,"共计:",len(south_paper.articles))
    news_title = []
    news_text = []
    news = south_paper.articles
    for i in tqdm(range(len(news)),desc=strings):    # 以新闻链接的长度为循环次数
        paper = news[i]
        try :
            paper.download()
            paper.parse()
            news_title.append(paper.title)     # 将新闻题目以列表形式逐一储存
            news_text.append(paper.text)       # 将新闻正文以列表形式逐一储存
        except:
            news_title.append('NULL')          # 如果无法访问，以NULL替代
            news_text.append('NULL')          
            continue
    # 建立数据表存储爬取的新闻信息
    south_paper_data = pd.DataFrame({'title':news_title,'text':news_text})
    print('新闻采集完成，采集共计',len(south_paper_data),'篇')
    south_paper_data.to_csv(filepath)

#构造列表删除临时元素
def del_element(strings,symbles=''):
    srcrep = {i:'' for i in symbles }
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)

#过滤停用词
def filter_stop_word(strings,stop_word):
    return ''.join([char for char in strings if char not in stop_words])

#读取本地存储文本
def read_txt(corpus):
    return np.array([str(word) for word in pd.read_csv(corpus).text])

#分词
def split_word(original,temp_del='\n'):
    result = []
    for paper in tqdm(original,desc='已分词文章数量'):
        temp_split_words = np.array(list(jieba.cut(del_element(filter_stop_word(paper,stop_words),temp_del))))
        result.append(temp_split_words)
    return np.array(result)

#新闻采集
get_news_paper(url,corpus)

def data_preprocessing(corpus):
    # 读取原文
    read_original = read_txt(corpus) 
    # 导入文章并分词
    init_paper = split_word(read_original,stop_words)
    # 所有单词降维到一维
    all_words = np.array([j for i in init_paper for j in i])
    # 单词去重
    word_vector = np.unique(all_words)
    # 测量共有词汇量
    m = all_words.size
    word_dict = dict()
    for word in tqdm(word_vector,desc='构建频率词典'):
        prob = (all_words==word).dot(np.ones(m))/m
        temp = {word:prob}
        word_dict.update(temp)
    return word_dict,word_vector,read_original,init_paper
print("共计单词:",len(word_dict))

#构造倒排表
def inverted_index(paper,word_vector):
    result = dict()
    n = -1
    for i in tqdm(paper,desc='倒排表当前排序的文章'):
        n += 1
        for j in i:
            if j in word_vector:
                if j in result:
                    result[j] = result[j]+[n]
                else:
                    result.update({j:[n]})
    return {i:list(set(result[i])) for i in result}

#倒排表运行
Inverted_Index_List = inverted_index(init_paper,word_vector)

# 搜索倒排表
def search_inverted_index(strings,Inverted_Index_List):
    words_for_search = []
    split_word_for_search = [word for word in jieba.cut_for_search(strings) if word not in stop_words]
    print(split_word_for_search)
    for word in tqdm(split_word_for_search,desc='搜索倒排表'):
        if word in Inverted_Index_List:
            #print("\n搜索单词:",word,"\n文章序列:",Inverted_Index_List[word])
            words_for_search+=Inverted_Index_List[word]
    return np.unique(np.array(words_for_search)),split_word_for_search

# 构建文章频率特征词向量
def feature_dictionary_editor(words):
    words_list = list(word_dict) #特征向量
    feature_dict = dict(zip(words_list,np.zeros(len(words_list)))) # 特征字典
    for word in words:
        if word in words_list:
            feature_dict[word]+=1
    return np.array([frequency for word,frequency in feature_dict.items()])

#余弦相似度
def cosine(s1,s2):
    return s1.dot(s2)/(np.linalg.norm(s1) * np.linalg.norm(s2))

#搜索入口函数 
def search(key,Inverted_Index_List):
    search_paper_index,search_word = search_inverted_index(key,Inverted_Index_List) #倒排表删选
    search_result = dict() 
    search_prob = feature_dictionary_editor(search_word) #搜索内容的词向量
    change_word_vector_from_words = init_paper[search_paper_index]
    change_paper_from_words = read_original[search_paper_index]
    for i in tqdm(range(len(change_paper_from_words)),desc='已经搜索数量'):
        word_arr,paper = change_word_vector_from_words[i],change_paper_from_words[i]
        paper_prob = feature_dictionary_editor(word_arr) #倒排表当前文章的词向量
        cos = cosine(paper_prob,search_prob) #余弦相似度
        parameter = cos 
        search_result.update({parameter:paper}) #更新搜索结果
    search_result_arr = np.array(sorted(search_result.items(), key=lambda x: x[0], reverse=True)) #对结果按照余弦值排序  
    table = pd.DataFrame({'cos':search_result_arr[:,0],'newspaper':search_result_arr[:,0]})
    return table #dict(zip(search_result_arr[:,0],search_result_arr[:,1]))

品牌:southcn描述:南方网/南方新闻网是经中共广东省委，广东省人民政府批准建设的新闻宣传网站。南方网/南方新闻网由广东省委宣传部主办主管并作为南方报业传媒集团之成员单位，获国务院新闻办公室批准从事登载新闻业务并被确定为全国重点新闻网站之一。南方网/南方新闻网作为华南地区最大型的新闻融合平台，是国内外网民认识、了解广东最权威、最快捷的途径。共计:1097: 100%|██████████| 1097/1097 [06:43<00:00,  2.72it/s]


TypeError: object of type 'numpy.int64' has no len()

In [5]:
search("世界广东",Inverted_Index_List)

NameError: name 'search' is not defined