In [1]:
import numpy as np
import torch
import re
import jieba
import newspaper
import pandas as pd
from tqdm import tqdm
import os.path
from os import listdir
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

In [2]:
url = 'https://new.qq.com/omn/20200730/20200730A0TCON00.html'      # 南方网
south_paper = newspaper.build(url, language='zh')    

In [3]:
url = 'http://www.southcn.com/'          # 南方网

# 寻找文件

In [4]:
def find_file(key_word,dir = os.getcwd()):
    file_paths = [os.path.join(dir, f) for f in listdir(dir) if os.path.isfile(os.path.join(dir, f)) and key_word in os.path.join(dir, f)][0]
    return file_paths

# 摘取新闻

In [5]:
def get_news_paper(url,filepath):
    south_paper = newspaper.build(url,language='zh',memoize_articles = False)    # 构建新闻源
    strings = "{}{}{}{}{}{}".format("品牌:",south_paper.brand,"描述:",south_paper.description,"共计:",len(south_paper.articles))
    news_title = []
    news_text = []
    news = south_paper.articles
    for i in tqdm(range(len(news)),desc=strings):    # 以新闻链接的长度为循环次数
        paper = news[i]
        try :
            paper.download()
            paper.parse()
            news_title.append(paper.title)     # 将新闻题目以列表形式逐一储存
            news_text.append(paper.text)       # 将新闻正文以列表形式逐一储存
        except:
            news_title.append('NULL')          # 如果无法访问，以NULL替代
            news_text.append('NULL')          
            continue
    # 建立数据表存储爬取的新闻信息
    south_paper_data = pd.DataFrame({'title':news_title,'text':news_text})
    south_paper_data = south_paper_data.drop_duplicates(subset=['text'], keep ='first')
    south_paper_data.reset_index(drop=True)
    south_paper_data.to_csv(filepath,mode="a",header=False)
    print("{}{}{}".format("共计采集到<",news.shape[0],">篇新闻"))
    return south_paper_data


# 静态配置

In [6]:
corpus = find_file("南方网 3.csv")
stop_word_path = find_file("stop_word_for_chinese.txt","/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/停词库/")
stop_word_path

'/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/NLP数据集合/停词库/stop_word_for_chinese.txt'

# 本地csv去重

In [None]:
def drop_duplicates_csv(file_path):
    location_table = pd.read_csv(file_path)
    start = location_table.shape[0]
    location_table = location_table.drop_duplicates(subset=['text'], keep ='first')
    location_table = location_table.reset_index(drop=True)
    location_table.to_csv(file_path)
    end = location_table.shape[0]
    return start-end
drop_duplicates_csv(corpus)

(1217, 3)

# 开始采集新闻数据

In [7]:
news = get_news_paper(url,corpus)

品牌:southcn描述:南方网/南方新闻网是经中共广东省委，广东省人民政府批准建设的新闻宣传网站。南方网/南方新闻网由广东省委宣传部主办主管并作为南方报业传媒集团之成员单位，获国务院新闻办公室批准从事登载新闻业务并被确定为全国重点新闻网站之一。南方网/南方新闻网作为华南地区最大型的新闻融合平台，是国内外网民认识、了解广东最权威、最快捷的途径。共计:1102:   0%|          | 0/1102 [00:00<?, ?it/s]Building prefix dict from /usr/local/lib/python3.7/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/sl/q8x6_03132dfk7rktf00yh880000gn/T/jieba.cache
Loading model cost 1.0406560897827148 seconds.
Prefix dict has been built succesfully.
品牌:southcn描述:南方网/南方新闻网是经中共广东省委，广东省人民政府批准建设的新闻宣传网站。南方网/南方新闻网由广东省委宣传部主办主管并作为南方报业传媒集团之成员单位，获国务院新闻办公室批准从事登载新闻业务并被确定为全国重点新闻网站之一。南方网/南方新闻网作为华南地区最大型的新闻融合平台，是国内外网民认识、了解广东最权威、最快捷的途径。共计:1102: 100%|██████████| 1102/1102 [10:45<00:00,  1.71it/s]


# 数据预处理

In [24]:
#临时删除文本元素
def del_element(strings,symbles):
    srcrep = {i:'' for i in symbles }
    rep = dict((re.escape(k), v) for k, v in srcrep.items())
    pattern = re.compile("|".join(rep.keys()))
    return pattern.sub(lambda m: rep[re.escape(m.group(0))], strings)

#加载停用词
stop_words = stop_words = open(stop_word_path,'r').read().split('\n')+['\n']

#过滤停用词
def filter_stop_word(paper,stop_words):
    return np.array(list(filter(lambda x: x not in stop_words,jieba.cut(del_element(paper,'\n')))))

#读取本地新闻
def read_txt(corpus):
    return np.array([re.sub('\n','',str(word)) for word in tqdm(pd.read_csv(corpus).text,desc='加载文章')])

#只要中文
def just_chinese(strings):
    regStr = ".*?([\u4E00-\u9FA5]+).*?"
    expr = ''.join(re.findall(regStr, strings))
    if expr:
        return expr
    return '\n'

#分词
def split_word(original,temp_del=stop_words):
    result = []
    for paper in tqdm(original,desc='分词文章'):
        chinese = just_chinese(paper)
        temp_split_words = filter_stop_word(chinese,stop_words)
        result.append(temp_split_words)
    return np.array(result)

# 排序字典
def sort_dict(dict_items):
    sorted_tuple = np.array(sorted(dict_items.items(), key=lambda x: x[0], reverse=True))
    return dict(zip(sorted_tuple[:,0],sorted_tuple[:,1]))

'''数据预处理函数'''
def data_preprocessing(corpus):
    # 读取原文
    read_original = read_txt(corpus) 
    # 倒入文章并分词
    init_paper = split_word(read_original,stop_words)
    # 所有单词降维到一维
    all_words = np.array([j for i in tqdm(init_paper,desc='词列表降维') for j in i])
    # 单词去重
    word_vector = np.unique(all_words)
    # 测量共有词汇量
    m = all_words.size
    init_word_dict = {word:(all_words==word).dot(np.ones(m))/m for word in tqdm(word_vector,desc='构建频率词典')}
    #构建排序字典和特征向量 
    word_dict = sort_dict(init_word_dict)
    word_vector = np.array(list(word_dict)) 
    return word_dict,word_vector,read_original,init_paper

In [25]:
word_dict,word_vector,read_original,init_paper = data_preprocessing(corpus)

加载文章: 100%|██████████| 1217/1217 [00:00<00:00, 140840.11it/s]
分词文章: 100%|██████████| 1217/1217 [00:50<00:00, 24.28it/s]
词列表降维: 100%|██████████| 1217/1217 [00:00<00:00, 6845.88it/s]
构建频率词典: 100%|██████████| 56256/56256 [06:01<00:00, 155.53it/s]


# TF-ITF 词向量

In [26]:
def TF(paper_words,word_vector):
    m = word_vector.size
    init_TF = np.zeros(m)
    for word in paper_words:
        if word in word_vector:
            index_ = np.argwhere(word_vector==word)[0][0]
            init_TF[index_] += 1
    return init_TF

def IDF(paper_words_list,word_vector):
    m = word_vector.size
    init_IDF = np.zeros(m)
    N = paper_words_list.shape
    n = -1
    for word in tqdm(word_vector,desc = 'IDF词汇'):
        n += 1
        for paper_arr in paper_words_list:
            if word in paper_arr:
                init_IDF[n] += 1
    return np.log(N/(init_IDF+1))

def TFIDF(paper_words_list,word_vector):
    IDF_arr = IDF(init_paper,word_vector)
    TF_arr = np.array([TF(paper,word_vector) for paper in tqdm(paper_words_list,desc = 'TF矩阵')])
    return TF_arr*IDF_arr,IDF_arr

# 对数据源做TFIDF编码

In [27]:
code_of_TFIDF,IDF = TFIDF(init_paper,word_vector)

IDF词汇: 100%|██████████| 56256/56256 [09:21<00:00, 100.20it/s]
TF矩阵: 100%|██████████| 1217/1217 [07:57<00:00,  2.55it/s]


# 特征字典编辑器

In [28]:
# 构建文章频率特征词向量
def feature_dictionary_editor(words):
    words_list = list(word_dict) #特征向量
    feature_dict = dict(zip(words_list,np.zeros(len(words_list)))) # 特征字典
    for word in words:
        if word in words_list:
            feature_dict[word]+=1
    return np.array([frequency for word,frequency in feature_dict.items()])

In [29]:
code_of_TFIDF.shape

(1217, 56256)

In [30]:
arr = init_paper[2]

In [31]:
def try_index(n,arr):
    try:
        return (init_paper[n]==arr).all()
    except Exception:
        return False

def find_index(arr):
    for i in range(init_paper.shape[0]):
        if try_index(i,arr):
            return i
find_index(arr)

2

# 构造倒排表

In [32]:
def inverted_index(paper,word_vector):
    result = dict()
    n = -1
    for i in tqdm(paper,desc='倒排表当前排序的文章'):
        n += 1
        for j in i:
            if j in word_vector:
                if j in result:
                    result[j] = result[j]+[n]
                else:
                    result.update({j:[n]})
    return {i:list(set(result[i])) for i in result}

Inverted_Index_List = inverted_index(init_paper,word_vector)

倒排表当前排序的文章: 100%|██████████| 1217/1217 [03:55<00:00,  5.17it/s]


# 搜索引擎模块

In [33]:
# 搜索倒排表
def search_inverted_index(strings,Inverted_Index_List):
    words_for_search = []
    split_word_for_search = [word for word in jieba.cut_for_search(strings) if word not in stop_words]
    print(split_word_for_search)
    for word in split_word_for_search:
        if word in Inverted_Index_List:
            print("\n搜索单词:",word,"\n文章序列:",Inverted_Index_List[word])
            words_for_search+=Inverted_Index_List[word]
    return np.unique(np.array(words_for_search)),split_word_for_search

#余弦相似度
def cosine(s1,s2):
    return s1.dot(s2)/(np.linalg.norm(s1) * np.linalg.norm(s2))



# 特征向量搜索入口函数

In [34]:
 
def search(key,Inverted_Index_List):
    search_paper_index,search_word = search_inverted_index(key,Inverted_Index_List)
    search_result = []
    search_prob = feature_dictionary_editor(search_word) #搜索内容的词向量
    change_word_vector_from_words = init_paper[search_paper_index]
    change_paper_from_words = read_original[search_paper_index]
    for i in tqdm(range(len(change_paper_from_words)),desc='已经搜索数量'):
        word_arr,paper = change_word_vector_from_words[i],change_paper_from_words[i]
        paper_prob = feature_dictionary_editor(word_arr) #倒排表当前文章的词向量
        cos = cosine(paper_prob,search_prob) #余弦相似度
        parameter = cos 
        search_result.append([cos,paper])

    search_result_arr = np.array(search_result)
    result_table = pd.DataFrame({"cos":search_result_arr[:,0],"newspaper":search_result_arr[:,1]})
    sort_table = result_table.sort_values(["cos"],ascending=False).reset_index(drop=True)
    file_path = "{}{}{}".format(os.getcwd(),"/",key)
    sort_table.to_csv(file_path)
    return sort_table

In [35]:
TFsearch = search("字节跳动",Inverted_Index_List)
TFsearch

已经搜索数量:   0%|          | 0/6 [00:00<?, ?it/s]['字节', '跳动']

搜索单词: 字节 
文章序列: [337, 1177, 1053, 1214, 1023]

搜索单词: 跳动 
文章序列: [609, 337, 1177, 1053, 1214, 1023]
已经搜索数量: 100%|██████████| 6/6 [00:02<00:00,  2.99it/s]


Unnamed: 0,cos,newspaper
0,0.2472340882170743,8月3日中午，字节跳动创始人张一鸣发送公司全员信，回应了TikTok美国业务面临被CFIUS（美国外资投资委员会）强制要求出售的问题。8月3日中午，字节跳动创始人张一鸣发送公司全员信，回应了T...
1,0.0996889572558453,分别都是哪些企业？中国有227家企业上榜最新的胡润全球高成长性企业榜，大湾区上榜者有33家。8月4日，胡润研究院发布2020年高成长性企业榜单，列出了全球成立于2000年之后、价值10亿美元以...
2,0.0536924844171219,8月4日，胡润研究院发布了“2020年全球高成长性企业”榜单显示，全球上榜有586家企业，上榜企业总价值12.9万亿，企业平均年龄仅9岁。中国仍是全球高成长性企业诞生的热土，共227家企业入围...
3,0.0410132391597531,详情·钟南山成为共和国勋章建议人选·台风“黑格比”将带来强降水·广州地铁八号线北延段今年内开通·支持深圳等升直辖市论文作者回应·大连疫情可能始于海产品加工车间·录取通知书不得投递至快件箱自提点...
4,0.029450003202688,7月26日，理想汽车向美国SEC更新招股书，宣布IPO股票公开发行规模为9500万股ADS，发行价区间为8美元至10美元。按照目前公布的发行价格，此次理想汽车IPO融资总额将达到12.54亿美...
5,0.0125117352552837,自4月底公告股票停牌、宣布引战以来，泰禾股票先后拉出4个涨停板，股价从最低点4.06元一度追涨至7.7元，最大涨幅高达89.6%，呈现出一度爆棚的投资者信心。只听楼梯响，不见人下来。泰禾集团（...


# ITITF 向量搜索入口函数 

In [37]:

def search_TFITF(key,Inverted_Index_List):
    search_paper_index,search_word = search_inverted_index(key,Inverted_Index_List)
    search_result = [] 
    TFITF_search = TF(search_word,word_vector)*IDF #搜索内容的词向量
    change_word_vector_from_words = init_paper[search_paper_index]
    change_paper_from_words = read_original[search_paper_index]
    for i in tqdm(range(len(change_paper_from_words)),desc='已经搜索数量'):
        word_arr,paper = change_word_vector_from_words[i],change_paper_from_words[i]
        TFIDF_ROW = code_of_TFIDF[find_index(word_arr)]#倒排表当前文章的词向量z s
        cos = cosine(TFIDF_ROW,TFITF_search) #余弦相似度
        parameter = cos 
        search_result.append([cos,paper])
    search_result_arr = np.array(search_result)
    result_table = pd.DataFrame({"cos":search_result_arr[:,0],"newspaper":search_result_arr[:,1]})
    sort_table = result_table.sort_values(["cos"],ascending=False).reset_index(drop=True)
    file_path = "{}{}{}".format(os.getcwd(),"/",key)
    sort_table.to_csv(file_path)
    return sort_table

TFITFsearch  = search_TFITF(input(),Inverted_Index_List)
TFITFsearch

已经搜索数量: 100%|██████████| 11/11 [00:00<00:00, 392.25it/s]['字节', '跳动', '特朗普']

搜索单词: 字节 
文章序列: [337, 1177, 1053, 1214, 1023]

搜索单词: 跳动 
文章序列: [609, 337, 1177, 1053, 1214, 1023]

搜索单词: 特朗普 
文章序列: [836, 1190, 1068, 504, 1053, 767]



Unnamed: 0,cos,newspaper
0,0.3229172319362852,8月3日中午，字节跳动创始人张一鸣发送公司全员信，回应了TikTok美国业务面临被CFIUS（美国外资投资委员会）强制要求出售的问题。8月3日中午，字节跳动创始人张一鸣发送公司全员信，回应了T...
1,0.1239793226802535,分别都是哪些企业？中国有227家企业上榜最新的胡润全球高成长性企业榜，大湾区上榜者有33家。8月4日，胡润研究院发布2020年高成长性企业榜单，列出了全球成立于2000年之后、价值10亿美元以...
2,0.0885912311837257,8月4日，胡润研究院发布了“2020年全球高成长性企业”榜单显示，全球上榜有586家企业，上榜企业总价值12.9万亿，企业平均年龄仅9岁。中国仍是全球高成长性企业诞生的热土，共227家企业入围...
3,0.0633635952533991,详情·钟南山成为共和国勋章建议人选·台风“黑格比”将带来强降水·广州地铁八号线北延段今年内开通·支持深圳等升直辖市论文作者回应·大连疫情可能始于海产品加工车间·录取通知书不得投递至快件箱自提点...
4,0.0362702850389491,7月26日，理想汽车向美国SEC更新招股书，宣布IPO股票公开发行规模为9500万股ADS，发行价区间为8美元至10美元。按照目前公布的发行价格，此次理想汽车IPO融资总额将达到12.54亿美...
5,0.0318721943151762,“法轮功”是被中国政府依法取缔的邪教组织。这样一个反社会、反科学、反人类的邪教团体，却长期受到国外资助，混迹于香港街头，依托香港为跳板进行非法活动。但在香港国安法制定以后，部分“法轮功”分子告...
6,0.0283827470438771,汪文斌在当日例行记者会上说，一段时间以来，美方在拿不出任何证据的情况下，泛化国家安全概念，滥用国家力量，无理打压特定的非美国企业，这违背市场经济原则，也违反世贸组织开放、透明、非歧视原则，是赤...
7,0.0187134229239045,我觉得中美贸易摩擦问题的发生有着深刻复杂的国际背景，当然也是中美关系的一部分。2019年4月中旬，“百名法学家百场报告会”省商务厅专场暨南粤法治报告会第五十八讲拟定在广州举办。现任中国社会科学...
8,0.0162403274374621,新冠肺炎疫情在全球大流行，让不少似是而非的言论借助汲取疫情教训流行开来。疫情究竟改变了什么？世界到底应汲取什么教训？疫情暴发后，污名化也流行开来，这在提示我们，似是而非的言论成为又一种病毒，损...
9,0.0140569278106001,自4月底公告股票停牌、宣布引战以来，泰禾股票先后拉出4个涨停板，股价从最低点4.06元一度追涨至7.7元，最大涨幅高达89.6%，呈现出一度爆棚的投资者信心。只听楼梯响，不见人下来。泰禾集团（...


In [19]:
import numpy as np
a = np.mat(np.arange(100).reshape(10,10))    
b = np.arange(100)

def checktype(mat):
    try:
        if isinstance(mat,(np.ndarray)):
            if len(mat.shape)>=2:
                return mat
            else:
                print('shape error')
        else:
            print('type error')
    except Exception as error:
        return error

checktype(a)

In [29]:
data = {"name":"test","lable":lambda data : data.dot(np.ones(10)),"data":np.arange(100).reshape(10,10)}
e = isinstance(data,dict)
type(data['lable']),(data["data"])

(function,
 array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
        [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
        [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
        [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
        [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]]))

In [35]:
def load(data):
    if isinstance(data,dict):
        if isinstance(data["data"],np.ndarray):
            return data['lable'](data["data"])
load(data)

array([ 45., 145., 245., 345., 445., 545., 645., 745., 845., 945.])