In [2]:
import jieba
import codecs
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize
from sklearn.externals import joblib
import re
from scipy.sparse import vstack
from tqdm import tqdm

## 加载数据
尝试将 train 和 test 文件都切词之后，计算每个 word 的 tf-idf，最后对于每个句子，选其中的 top10 或者 top20 个词作为候选关键词

然后根据词性，名词作为关键词的可能性比较大，再次筛选，最终得出候选关键词

再根据训练集给出的 实体，候选实体与真实实体进行对比，是实体标记为1，不是实体标记为0

再进行二分类，构造特征进行训练，由无监督变为有监督训练，提高准确率

In [2]:
def loadData(filePath):
    f = codecs.open(filePath,'r', 'utf-8')
    data = []
    for line in f.readlines():
        news = json.loads(line.strip())
        data.append(news)
    return data

In [4]:
# 1. train tfIdf as core entity score model
trainData = loadData('../data/coreEntityEmotion_train.txt')

In [5]:
trainData[0]

{'newsId': '4e36d02a',
 'coreEntityEmotions': [{'entity': '3d', 'emotion': 'POS'},
  {'entity': '工业', 'emotion': 'POS'},
  {'entity': '机器视觉', 'emotion': 'POS'}],
 'title': 'sia智慧工厂展，誉洋以“智”取胜',
 'content': '第十七届上海国际工业自动化及机器人展与上海智能工厂展览会于2019年3月1日圆满落下帷幕。展会展出规模90762平方米，国内外参展厂商达1000多家，并吸引超过100000名专业观众前来参观。此次展会全面展示了工业自动化技术，工业装配与传输技术、工业机器人整机与零部件、机器视觉工业应用、agv无人搬运、智慧工厂解决方案、工业自动化全面解决方案等产品领域。无论是展会规模、展示范围以及专业观众的人数，都实现了快速的增长，极大地推动了我国制造业的转型升级和跨越发展！誉洋3d机器视觉引发展会参观热潮在大连誉洋工业智能的展位，机器人自动抓取物件让人眼前一亮。机器人在3d视觉的引导下精准定位杂乱无序的目标，并实现准确快速抓取，整个过程井然有序，无需任何人工干预。誉洋现场工程师介绍，制造企业采用誉洋kineye®3d机器视觉系统，以往繁琐、枯燥的物料搬运工作交给了机器人，这样不但解放了人力，还提升了物流效率。这套3d机器视觉已在国内多家知名企业成功实施应用，得到了一致认可与好评。kineye®3d机器视觉实现企业智能制造誉洋工业智能有限公司自成立以来，一直专注于工业智能设备的研发、生产制造和服务，创新理念伴随企业不断成长。今天的誉洋已与中国多家企业达成合作伙伴关系，成为中国领先的智能设备供应商之一。以前瞻思维引领企业创新步伐，针对制造企业面临改革升级的痛点问题，誉洋与欧洲科研机构联合研发的kineye®3d机器视觉系统，拥有比肩世界的技术水准，可实现对制造业生产线的智能化改造。不仅如此，誉洋还可以提供柔性生产系统、智能仓储、智能制造设备等整套智能硬件、软件和一站式解决方案。'}

## 提取所有实体

In [6]:
entity = set()
emotion = set()
chars = {}
data = []
min_count = 2

with open('../data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l.strip())
        data.append(
            {
                'newsId':a['newsId'],
                'title': a['title'],
                'content': a['content'],
                'coreEntityEmotions': [(i['entity'], i['emotion']) for i in a['coreEntityEmotions']],
            }
        )
        for c in a['content']:
            chars[c] = chars.get(c, 0) + 1
        for c in a['title']:
            chars[c] = chars.get(c, 0) + 1
        for c in a['coreEntityEmotions']:
            entity.add(c['entity'])
            emotion.add(c['emotion'])

40000it [00:19, 2015.57it/s]


In [488]:
data_ = []

with open('../data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l.strip())
        data_.append(
            {
                'title': a['title'],
                'content': a['content'],
            }
        )


0it [00:00, ?it/s][A
132it [00:00, 1265.79it/s][A
661it [00:00, 1639.67it/s][A
1214it [00:00, 2066.90it/s][A
1981it [00:00, 2639.24it/s][A
2744it [00:00, 3283.30it/s][A
3776it [00:00, 4126.09it/s][A
4454it [00:01, 2708.16it/s][A
5316it [00:01, 3409.47it/s][A
6585it [00:01, 4367.47it/s][A
7401it [00:01, 4736.05it/s][A
8149it [00:01, 4402.31it/s][A
9002it [00:01, 5129.58it/s][A
9699it [00:01, 5548.38it/s][A
10661it [00:01, 6353.22it/s][A
11806it [00:02, 7331.19it/s][A
12813it [00:02, 7982.40it/s][A
13735it [00:02, 8115.00it/s][A
15445it [00:02, 9623.97it/s][A
16897it [00:02, 10706.41it/s][A
18216it [00:02, 11341.43it/s][A
19474it [00:02, 11322.99it/s][A
20693it [00:02, 11490.87it/s][A
22472it [00:02, 12856.44it/s][A
23853it [00:02, 11866.32it/s][A
25122it [00:03, 11522.57it/s][A
26662it [00:03, 12439.23it/s][A
28175it [00:03, 13139.31it/s][A
29544it [00:03, 13207.84it/s][A
30903it [00:03, 11776.10it/s][A
32241it [00:03, 12215.01it/s][A
33643it [00:03, 1265

In [540]:
def filter_text(text):
    re_tag0 = re.compile('</?\w+[^>]*>')  # HTML标签
    re_tag1 = re.compile(r'http://[a-zA-Z0-9.?/&=:]*',re.S)
    re_tag2 = re.compile(r'https://[a-zA-Z0-9.?/&=:]*',re.S)
    new_text = re.sub(re_tag0,"",text)
    new_text = re.sub(re_tag1,"",text)
    new_text = re.sub(re_tag2,"",text)
    new_text = re.sub("-+", "-", new_text)  # 合并-
    new_text = re.sub("———+", "——", new_text)  # 合并-
    return new_text

In [573]:
len(filter_data)

40000

In [572]:
%%time
filter_data = []
for data in data_:
    sen = [filter_text(data['title']) + filter_text(data['content'])]
    filter_data.extend(sen)

CPU times: user 2.24 s, sys: 1.05 s, total: 3.29 s
Wall time: 4.79 s


In [227]:
train_entity_dic = {}
with open('../data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l.strip())
        train_entity_dic[a['newsId']] = [i['entity'] for i in a['coreEntityEmotions']]


0it [00:00, ?it/s][A
1369it [00:00, 13686.42it/s][A
3480it [00:00, 15283.20it/s][A
5462it [00:00, 15806.14it/s][A
6620it [00:00, 9569.66it/s] [A
8590it [00:00, 11315.02it/s][A
10812it [00:00, 13267.61it/s][A
12363it [00:00, 12796.69it/s][A
14251it [00:00, 13860.66it/s][A
15803it [00:01, 14319.77it/s][A
18091it [00:01, 16127.89it/s][A
19850it [00:01, 16375.71it/s][A
21923it [00:01, 17383.10it/s][A
23958it [00:01, 18175.73it/s][A
25848it [00:01, 18209.71it/s][A
27982it [00:01, 19047.88it/s][A
29933it [00:01, 15104.29it/s][A
31604it [00:02, 14036.71it/s][A
33651it [00:02, 15254.98it/s][A
35874it [00:02, 16838.41it/s][A
37693it [00:02, 15174.43it/s][A
39336it [00:02, 14663.26it/s][A
40000it [00:02, 15603.37it/s][A

In [558]:
len(train_entity_dic)

40000

In [228]:
train_entity_dic

{'4e36d02a': ['3d', '工业', '机器视觉'],
 'cb8e8b79': ['数据', '卫星', '可视化'],
 'f85c18e2': ['物流', '货运', '信息'],
 '889199b4': ['路面', '水泥', '病害'],
 'a811d20e': ['阳线', '股票', '涨势'],
 '4e798999': ['evga', '电源', '组件'],
 '1f5679fb': ['水泥', '病害', '路面'],
 '1a8339ea': ['信托', '房地产', '股权'],
 '51203867': ['租购并举', '公寓资产', '租客'],
 '5bb9047a': ['心理', '心理咨询师', '小程序开发'],
 '478af383': ['标普500', '摩根大通', '地缘政治'],
 'f0a7d8d9': ['移动支付', '人民币', '中华人民共和国'],
 'afee8716': ['周芷若', '杨逍', '纪晓芙'],
 'efd34fe8': ['廊坊市', '建筑面积', '土地'],
 '73ce42e1': ['宜采网', '简讯', '工厂'],
 '783adc02': ['集合竞价', '大盘', '江苏天鼎'],
 '960b8e04': ['股债双牛', '债券', '薛掌柜'],
 'd7cb8b85': ['一线', '价格', '走势'],
 '66fb9c0c': ['特斯拉', '人民币', '商业王道'],
 'cd510c0d': ['2019胡润全球白手起家女富豪榜', '胡润研究院', '吴亚军'],
 '80800a17': ['短线', '京威股份', '低价'],
 '2322249a': ['实验室', '方案', '领导'],
 'ac148da5': ['印度标准局', '印度', 'bis'],
 'fe0ae621': ['亚马逊', '淘宝', '店铺'],
 '530ccf21': ['广告机', '商场', '消费者'],
 '7885555d': ['关键词', '着陆页', '百度'],
 'c494a2ab': ['dna', 'hla', '基因'],
 '779320d5': ['闵志华', '重庆水利电力职

In [197]:
with codecs.open('../runs/train_entity_dic.json', 'w', encoding='utf-8') as f:
    json.dump(train_entity_dic, f, indent=4, ensure_ascii=False)

In [8]:
with open('../runs/entity.txt', 'w', encoding='utf-8') as f:
    for v in entity:
        f.write(v+'\n')

In [9]:
id2emotion = {i:j for i,j in enumerate(emotion)}
emotion2id = {j:i for i,j in id2emotion.items()}

with open('../runs/emotion.json', 'w', encoding='utf-8') as f:
    json.dump([id2emotion, emotion2id], f, indent=4, ensure_ascii=False)

In [10]:
len(data)

40000

In [11]:
data[0]

{'newsId': '4e36d02a',
 'title': 'sia智慧工厂展，誉洋以“智”取胜',
 'content': '第十七届上海国际工业自动化及机器人展与上海智能工厂展览会于2019年3月1日圆满落下帷幕。展会展出规模90762平方米，国内外参展厂商达1000多家，并吸引超过100000名专业观众前来参观。此次展会全面展示了工业自动化技术，工业装配与传输技术、工业机器人整机与零部件、机器视觉工业应用、agv无人搬运、智慧工厂解决方案、工业自动化全面解决方案等产品领域。无论是展会规模、展示范围以及专业观众的人数，都实现了快速的增长，极大地推动了我国制造业的转型升级和跨越发展！誉洋3d机器视觉引发展会参观热潮在大连誉洋工业智能的展位，机器人自动抓取物件让人眼前一亮。机器人在3d视觉的引导下精准定位杂乱无序的目标，并实现准确快速抓取，整个过程井然有序，无需任何人工干预。誉洋现场工程师介绍，制造企业采用誉洋kineye®3d机器视觉系统，以往繁琐、枯燥的物料搬运工作交给了机器人，这样不但解放了人力，还提升了物流效率。这套3d机器视觉已在国内多家知名企业成功实施应用，得到了一致认可与好评。kineye®3d机器视觉实现企业智能制造誉洋工业智能有限公司自成立以来，一直专注于工业智能设备的研发、生产制造和服务，创新理念伴随企业不断成长。今天的誉洋已与中国多家企业达成合作伙伴关系，成为中国领先的智能设备供应商之一。以前瞻思维引领企业创新步伐，针对制造企业面临改革升级的痛点问题，誉洋与欧洲科研机构联合研发的kineye®3d机器视觉系统，拥有比肩世界的技术水准，可实现对制造业生产线的智能化改造。不仅如此，誉洋还可以提供柔性生产系统、智能仓储、智能制造设备等整套智能硬件、软件和一站式解决方案。',
 'coreEntityEmotions': [('3d', 'POS'), ('工业', 'POS'), ('机器视觉', 'POS')]}

In [12]:
with codecs.open('../runs/all_train_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

In [13]:
with codecs.open('../runs/all_chars.json', 'w', encoding='utf-8') as f:
    chars = {i:j for i,j in chars.items() if j >= min_count}
    id2char = {i+2:j for i,j in enumerate(chars)} # padding: 0, unk: 1
    char2id = {j:i for i,j in id2char.items()}
    json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)

## 分词

In [5]:
import os
files = os.listdir('../字典')
files

['歌手.txt',
 '手机型号.txt',
 '百度明星.txt',
 'val_keywords.txt',
 'person.txt',
 '实体名词.txt',
 '美食.txt',
 '显卡.txt',
 '自定义词典.txt',
 'FIFA.txt',
 '漫漫看_明星.txt',
 '网络流行新词.txt',
 '篮球.txt',
 'NBA.txt',
 '动漫.txt',
 'origin_zimu.txt',
 '明星.txt',
 '创造101.txt',
 '足球.txt',
 '流行歌.txt',
 '电影.txt',
 '电视剧.txt',
 '出现的作品名字.txt',
 '百度热点人物+手机+软件.txt']

In [595]:
stop_words = []
with open('../data/stop_words.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip('\n')
        stop_words.append(line)

In [7]:
import jieba
for file_name in files:
    jieba.load_userdict('../字典/'+file_name)
jieba.load_userdict('../runs/entity.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/gn/q_dy30hj2_l93fy7_4h_b2b40000gn/T/jieba.cache
Loading model cost 1.035 seconds.
Prefix dict has been built succesfully.


In [8]:
import jieba.posseg as psg
import jieba.analyse
jieba.analyse.set_stop_words('../data/stop_words.txt')

### train
太费时了，刚开始以为是jieba和jieba.posseg切词慢，后来注释掉后面的过滤代码之后，发现秒切

写的代码时间复杂度太高。

避免使用for循环，就算使用for循环，也要进一步优化剪枝

尽量使用map，filter，lambda等函数

#### 分词，包括词性，存储在字典中，保存在文件里

In [26]:
%%time
train_data = []
train_cut_dic = {}
i = 0
for x in data:
    if len(x.items()) == 4:
        try:
            title = x['title']
            content = x['content']

            title_words = psg.cut(title)
            content_words = psg.cut(content)
            
            title_filter = [word for word in title_words if word not in stop_words]
            content_filter = [word for word in content_words if word not in stop_words]
            
            for word,flag in title_filter:
                train_cut_dic[word] = flag
            for word,flag in content_filter:
                train_cut_dic[word] = flag

            temp = {}
            temp['newsId'] = x['newsId']
            temp['title'] = title_filter
            temp['content'] = content_filter
            entitys = [c[0] for c in x['coreEntityEmotions']]
            temp['entities'] = entitys
            train_data.append(temp)
            if i % 500 == 0:
                print(i, ' 条数据')
            i += 1
        except KeyError:
            print(x)
            pass

CPU times: user 4h 21min 24s, sys: 4min 8s, total: 4h 25min 33s
Wall time: 15h 35min 5s


In [33]:
data_path = '../runs/all_train_data.json'
def load_train_data(data_path):
    with open(data_path, 'r', encoding='utf-8') as load_f:
        data = json.load(load_f)
        return data
def cut_train(data_path):
    final_train_data = []
    train_data = load_train_data(data_path)
    i = 0
    for x in train_data:
        if len(x.items()) == 4:
            try:
                title = x['title']
                content = x['content']

                title_words = jieba.cut(title)
                content_words = jieba.cut(content)
                
                title_filter = filter(lambda x: x not in stop_words, title_words)
                content_filter = filter(lambda x: x not in stop_words, content_words)

                temp = {}
                temp['newsId'] = x['newsId']
                temp['title'] = list(title_filter)
                temp['content'] = list(content_filter)
                entitys = [c[0] for c in x['coreEntityEmotions']]
                temp['entities'] = entitys
                final_train_data.append(temp)

                if i % 500 == 0:
                    print(i,' data finish')
                i+=1
            except KeyError:
                print(x)
                pass
    return final_train_data

In [597]:
stop_words.index('\\n')

38

In [602]:
%%time
filter_data_ = []
i = 0
for data in filter_data:
    words = jieba.cut(data)
    words = filter(lambda x: x not in stop_words, words)
    filter_data_.append(list(words))
    if i % 500 == 0:
        print(i)
    i+=1

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
CPU times: user 12min 53s, sys: 1.67 s, total: 12min 54s
Wall time: 12min 57s


In [611]:
sentences = []
for data in filter_data_:
    line = ' '.join(data)
    sentences.append(line)                   

In [612]:
len(sentences)

40000

In [613]:
sentences[0]

'sia 智慧工厂 誉洋以 第十七届 工业自动化 机器人 智能工厂 展览会 2019 90762 平方米 国内外 1000 100000 工业自动化 传输技术 工业机器人 零部件 机器视觉 agv 智慧工厂 解决方案 工业自动化 解决方案 无论是 制造业 机器视觉 机器人 眼前一亮 机器人 精准定位 杂乱无序 井然有序 人工干预 工程师 kine 机器视觉 机器人 解放了 机器视觉 知名企业 kine 机器视觉 智能制造 有限公司 智能设备 合作伙伴 智能设备 供应商 科研机构 kine 机器视觉 技术水准 制造业 生产线 智能化 不仅如此 智能仓储 智能制造 一站式 解决方案'

In [615]:
with open('../runs/filter_data.txt', 'w', encoding='utf-8') as f:
    for data in sentences:
        f.write(data+"\n")

In [None]:

def cut_train(data_path):
    final_train_data = []
    train_data = load_train_data(data_path)
    i = 0
    for x in train_data:
        if len(x.items()) == 4:
            try:
                title = x['title']
                content = x['content']

                title_words = jieba.cut(title)
                content_words = jieba.cut(content)
                
                title_filter = filter(lambda x: x not in stop_words, title_words)
                content_filter = filter(lambda x: x not in stop_words, content_words)

                temp = {}
                temp['newsId'] = x['newsId']
                temp['title'] = list(title_filter)
                temp['content'] = list(content_filter)
                entitys = [c[0] for c in x['coreEntityEmotions']]
                temp['entities'] = entitys
                final_train_data.append(temp)

                if i % 500 == 0:
                    print(i,' data finish')
                i+=1
            except KeyError:
                print(x)
                pass
    return final_train_data

In [34]:
%%time
final_train_data = cut_train(data_path)
final_train_data[0]

0  data finish
500  data finish
1000  data finish
1500  data finish
2000  data finish
2500  data finish
3000  data finish
3500  data finish
4000  data finish
4500  data finish
5000  data finish
5500  data finish
6000  data finish
6500  data finish
7000  data finish
7500  data finish
8000  data finish
8500  data finish
9000  data finish
9500  data finish
10000  data finish
10500  data finish
11000  data finish
11500  data finish
12000  data finish
12500  data finish
13000  data finish
13500  data finish
14000  data finish
14500  data finish
15000  data finish
15500  data finish
16000  data finish
16500  data finish
17000  data finish
17500  data finish
18000  data finish
18500  data finish
19000  data finish
19500  data finish
20000  data finish
20500  data finish
21000  data finish
21500  data finish
22000  data finish
22500  data finish
23000  data finish
23500  data finish
24000  data finish
24500  data finish
25000  data finish
25500  data finish
26000  data finish
26500  data finis

In [35]:
final_train_data[0]

{'newsId': '4e36d02a',
 'title': ['sia', '智慧工厂', '展', '誉洋以', '智', '取胜'],
 'content': ['第十七届',
  '上海',
  '国际',
  '工业自动化',
  '机器人',
  '展与',
  '上海',
  '智能工厂',
  '展览会',
  '2019',
  '年',
  '月',
  '日',
  '圆满',
  '落下',
  '帷幕',
  '展会',
  '展出',
  '规模',
  '90762',
  '平方米',
  '国内外',
  '参展',
  '厂商',
  '达',
  '1000',
  '多家',
  '吸引',
  '超过',
  '100000',
  '名',
  '专业',
  '观众',
  '前来',
  '参观',
  '展会',
  '展示',
  '工业自动化',
  '技术',
  '工业',
  '装配',
  '传输技术',
  '工业机器人',
  '整机',
  '零部件',
  '机器视觉',
  '工业',
  'agv',
  '无人',
  '搬运',
  '智慧工厂',
  '解决方案',
  '工业自动化',
  '解决方案',
  '产品',
  '领域',
  '无论是',
  '展会',
  '规模',
  '展示',
  '专业',
  '观众',
  '人数',
  '快速',
  '增长',
  '推动',
  '我国',
  '制造业',
  '转型',
  '升级',
  '跨越',
  '发展',
  '誉洋',
  '3d',
  '机器视觉',
  '引发',
  '展会',
  '参观',
  '热潮',
  '大连',
  '誉洋',
  '工业',
  '智能',
  '展位',
  '机器人',
  '自动',
  '抓取',
  '物件',
  '眼前一亮',
  '机器人',
  '3d',
  '视觉',
  '引导',
  '精准定位',
  '杂乱无序',
  '目标',
  '准确',
  '快速',
  '抓取',
  '过程',
  '井然有序',
  '无需',
  '人工干预',
  '誉洋',
  '现场',
  '工程师',
  '介绍',
  '制造

In [36]:
with open('../runs/all_train_cut.json', 'w', encoding='utf-8') as f:
    json.dump(final_train_data, f, indent=4, ensure_ascii=False)

In [67]:
final_train_data[0]

{'newsId': '4e36d02a',
 'title': ['sia', '智慧工厂', '展', '誉洋以', '智', '取胜'],
 'content': ['第十七届',
  '上海',
  '国际',
  '工业自动化',
  '机器人',
  '展与',
  '上海',
  '智能工厂',
  '展览会',
  '2019',
  '年',
  '月',
  '日',
  '圆满',
  '落下',
  '帷幕',
  '展会',
  '展出',
  '规模',
  '90762',
  '平方米',
  '国内外',
  '参展',
  '厂商',
  '达',
  '1000',
  '多家',
  '吸引',
  '超过',
  '100000',
  '名',
  '专业',
  '观众',
  '前来',
  '参观',
  '展会',
  '展示',
  '工业自动化',
  '技术',
  '工业',
  '装配',
  '传输技术',
  '工业机器人',
  '整机',
  '零部件',
  '机器视觉',
  '工业',
  'agv',
  '无人',
  '搬运',
  '智慧工厂',
  '解决方案',
  '工业自动化',
  '解决方案',
  '产品',
  '领域',
  '无论是',
  '展会',
  '规模',
  '展示',
  '专业',
  '观众',
  '人数',
  '快速',
  '增长',
  '推动',
  '我国',
  '制造业',
  '转型',
  '升级',
  '跨越',
  '发展',
  '誉洋',
  '3d',
  '机器视觉',
  '引发',
  '展会',
  '参观',
  '热潮',
  '大连',
  '誉洋',
  '工业',
  '智能',
  '展位',
  '机器人',
  '自动',
  '抓取',
  '物件',
  '眼前一亮',
  '机器人',
  '3d',
  '视觉',
  '引导',
  '精准定位',
  '杂乱无序',
  '目标',
  '准确',
  '快速',
  '抓取',
  '过程',
  '井然有序',
  '无需',
  '人工干预',
  '誉洋',
  '现场',
  '工程师',
  '介绍',
  '制造

In [None]:
tran_dic_path = '../runs/train_dic.json'
with codecs.open(tran_dic_path, 'w', encoding='utf-8') as f:
    json.dump(final_train_cut_dic, f, indent=4, ensure_ascii=False)

### test

In [56]:
def load_test():
    test_chars = {}
    test_data = []
    min_count = 2

    with open('../data/coreEntityEmotion_test_stage1.txt', encoding='utf-8') as f:
        for l in tqdm(f):
            a = json.loads(l.strip())
            test_data.append(
                {
                    'newsId': a['newsId'],
                    'title': a['title'],
                    'content': a['content'],
                }
            )
            for c in a['content']:
                test_chars[c] = test_chars.get(c, 0) + 1
            for c in a['title']:
                test_chars[c] = test_chars.get(c, 0) + 1

    with codecs.open('../runs/test_chars.json', 'w', encoding='utf-8') as f:
        chars = {i: j for i, j in test_chars.items() if j >= min_count}
        id2char = {i + 2: j for i, j in enumerate(chars)}  # padding: 0, unk: 1
        char2id = {j: i for i, j in id2char.items()}
        json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)
    return test_data

In [57]:
test_data = load_test()

40000it [00:17, 2225.92it/s]


In [63]:
def cut_test(test_data):
    final_test_data = []
    i = 0
    for x in test_data:
        if len(x.items()) == 3:
            try:
                title = x['title']
                content = x['content']

                title_words = jieba.cut(title)
                content_words = jieba.cut(content)

                title_filter = filter(lambda x: x not in stop_words and len(x.strip()) > 0, title_words)
                content_filter = filter(lambda x: x not in stop_words and len(x.strip()) > 0, content_words)

                temp = {}
                temp['newsId'] = x['newsId']
                temp['title'] = list(title_filter)
                temp['content'] = list(content_filter)
                final_test_data.append(temp)
                if i%500 == 0:
                    print(i,' data finish')
                i+=1
            except KeyError:
                print(x)
                pass
    return final_test_data

In [64]:
%%time
final_test_data = cut_test(test_data)
final_test_data[0]

0  data finish
500  data finish
1000  data finish
1500  data finish
2000  data finish
2500  data finish
3000  data finish
3500  data finish
4000  data finish
4500  data finish
5000  data finish
5500  data finish
6000  data finish
6500  data finish
7000  data finish
7500  data finish
8000  data finish
8500  data finish
9000  data finish
9500  data finish
10000  data finish
10500  data finish
11000  data finish
11500  data finish
12000  data finish
12500  data finish
13000  data finish
13500  data finish
14000  data finish
14500  data finish
15000  data finish
15500  data finish
16000  data finish
16500  data finish
17000  data finish
17500  data finish
18000  data finish
18500  data finish
19000  data finish
19500  data finish
20000  data finish
20500  data finish
21000  data finish
21500  data finish
22000  data finish
22500  data finish
23000  data finish
23500  data finish
24000  data finish
24500  data finish
25000  data finish
25500  data finish
26000  data finish
26500  data finis

In [65]:
final_test_data[4]

{'newsId': 'b6514ace',
 'title': ['管理', '多个', '云', '环境'],
 'content': ['安全性',
  '控制',
  '兼容性',
  '关键性',
  '论点',
  'it',
  '管理人员',
  '迁移',
  '公共',
  '云',
  '基础架构',
  '服务',
  '云计算',
  'iaas',
  '提供',
  '好处',
  '传统',
  '内部',
  '部署',
  '计算',
  '模型',
  '相比',
  '服务',
  '配置',
  '支付',
  '方式',
  '差异',
  '迁移',
  '公共',
  'iaas',
  'it',
  '管理人员',
  '接受',
  '专注',
  '云服务',
  '新',
  '管理层',
  '多个',
  '云',
  '部署',
  '事情',
  '变得',
  '拥有',
  '一套',
  '强大',
  '配置管理',
  '工具',
  'it',
  '部门',
  '发现',
  '添加',
  '第二个',
  '第三个',
  '云计算',
  '服务提供商',
  '改变',
  'it',
  '管理人员',
  '应',
  '需求',
  '制定',
  '云计算',
  '决策',
  '如果没有',
  '某种',
  '类型',
  '控制',
  '监控',
  'iaas',
  '云',
  '环境',
  'it',
  '经理',
  '盲目',
  '运行',
  '面临',
  '预算',
  '风险',
  'it',
  '经理',
  '永远',
  '奢侈',
  '关闭',
  '内部',
  '托管',
  '数据中心',
  '工作',
  '负载',
  '转移',
  '单个',
  '云计算',
  '提供商',
  '不可避免',
  '工作',
  '负载',
  '保留',
  '内部',
  '支持',
  '平台',
  '遗留',
  '应用程序',
  '应用程序',
  '打印机',
  '管理',
  '公私',
  '混合',
  '方法',
  '云计算',
  '环境',
  '管理',
  '变得复杂',
  '

In [66]:
cut_test_path = '../runs/all_test_cut.json'
with codecs.open(cut_test_path, 'w', encoding='utf-8') as f:
    json.dump(final_test_data, f, indent=4, ensure_ascii=False)

## 特征抽取
### 构建TF-IDF词典

In [119]:
#! -*- coding:utf-8 -*-
def cut_data():
    files = os.listdir('../字典')
    for file_name in files:
        jieba.load_userdict('../字典/' + file_name)
    jieba.load_userdict('../runs/entity.txt')

    orig_data = []
    k = 0
    with open('../data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
        for l in tqdm(f):
            a = json.loads(l.strip())
            orig_data.append(
                {
                    'newsId': a['newsId'],
                    'title': a['title'],
                    'content': a['content']
                }
            )
            k += 1
            if k%20==0:
                print('k', k)
            if k==100:
                break

    stop_words = []
    with open('../data/stop_words.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("\n", "")
            stop_words.append(line)

    small_word_dic = {}
    all_docs = []
    i = 0
    for data in orig_data:
        sentence = []
        title = filter_text(data['title'])
        title = title.replace("\n", "")
        content = filter_text(data['content'])
        content = content.replace("\n", "")
        titles = jieba.cut(title)
        contents = jieba.cut(content)
        titles = filter(lambda x: x not in stop_words, titles)
        contents = filter(lambda x: x not in stop_words, contents)
        sentence.append(list(titles))
        sentence.append(list(contents))
        small_word_dic[data['newsId']] = sentence
        if i % 500 == 0:
            print(i, ' data finish')
        i += 1

    with codecs.open('../runs/small_word_dic.json', 'w', encoding='utf-8') as f:
        json.dump(small_word_dic, f, indent=4, ensure_ascii=False)
    return small_word_dic

In [120]:
small_word_dic = cut_data()

0it [00:00, ?it/s]


k 20
k 40
k 60
k 80
k 100
0  data finish


In [122]:
len(small_word_dic)

100

In [121]:
small_word_dic['4e36d02a']

[['sia', '智慧工厂', '展', '誉洋以', '智', '取胜'],
 ['第十七届',
  '上海',
  '国际',
  '工业自动化',
  '机器人',
  '展与',
  '上海',
  '智能工厂',
  '展览会',
  '无人',
  '搬运',
  '智慧工厂',
  '解决方案',
  '工业自动化',
  '解决方案',
  '产品',
  '领域',
  '无论是',
  '展会',
  '规模',
  '展示',
  '专业',
  '观众',
  '人数',
  '快速',
  '增长',
  '推动',
  '我国',
  '制造业',
  '转型',
  '升级',
  '跨越',
  '发展',
  '誉洋',
  '3d',
  '机器视觉',
  '引发',
  '展会',
  '参观',
  '热潮',
  '大连',
  '誉洋',
  '工业',
  '智能',
  '展位',
  '机器人',
  '自动',
  '抓取',
  '物件',
  '眼前一亮',
  '机器人',
  '3d',
  '视觉',
  '引导',
  '精准定位',
  '杂乱无序',
  '目标',
  '准确',
  '快速',
  '抓取',
  '过程',
  '井然有序',
  '无需',
  '人工干预',
  '誉洋',
  '现场',
  '工程师',
  '介绍',
  '制造',
  '企业',
  '采用',
  '誉洋',
  'kine',
  'ye',
  '®',
  '3d',
  '机器视觉',
  '系统',
  '以往',
  '繁琐',
  '枯燥',
  '物料',
  '搬运',
  '工作',
  '交给',
  '机器人',
  '解放了',
  '人力',
  '提升',
  '物流',
  '效率',
  '这套',
  '3d',
  '机器视觉',
  '国内',
  '多家',
  '知名企业',
  '成功',
  '实施',
  '认可',
  '好评',
  'kine',
  'ye',
  '®',
  '3d',
  '机器视觉',
  '企业',
  '智能制造',
  '誉洋',
  '工业',
  '智能',
  '有限公司',
  '成立',
  '专

In [497]:
#! -*- coding:utf-8 -*-
def cut_test_data():
    files = os.listdir('../字典')
    for file_name in files:
        jieba.load_userdict('../字典/' + file_name)
    jieba.load_userdict('../runs/entity.txt')

    orig_data = []
    k = 0
    with open('../data/coreEntityEmotion_test_stage1.txt', encoding='utf-8') as f:
        for l in tqdm(f):
            a = json.loads(l.strip())
            orig_data.append(
                {
                    'newsId': a['newsId'],
                    'title': a['title'],
                    'content': a['content']
                }
            )
            k += 1
            if k%20==0:
                print('k', k)
            if k==100:
                break

    stop_words = []
    with open('../data/stop_words.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("\n", "")
            stop_words.append(line)

    small_word_dic = {}
    all_docs = []
    i = 0
    for data in orig_data:
        sentence = []
        title = filter_text(data['title'])
        title = title.replace("\n", "")
        content = filter_text(data['content'])
        content = content.replace("\n", "")
        titles = jieba.cut(title)
        contents = jieba.cut(content)
        titles = filter(lambda x: x not in stop_words, titles)
        contents = filter(lambda x: x not in stop_words, contents)
        sentence.append(list(titles))
        sentence.append(list(contents))
        small_word_dic[data['newsId']] = sentence
        if i % 500 == 0:
            print(i, ' data finish')
        i += 1

    with codecs.open('../runs/small_word_test_dic.json', 'w', encoding='utf-8') as f:
        json.dump(small_word_dic, f, indent=4, ensure_ascii=False)
    return small_word_dic

In [498]:
small_word_test_dic = cut_test_data()

0it [00:00, ?it/s]


k 20
k 40
k 60
k 80
k 100
0  data finish


In [500]:
len(small_word_test_dic)

100

In [499]:
small_word_test_dic['a8df14bc']

[['现阶段', '国内', '各类', '资管', '计划', '运作', '模式', '区别'],
 ['资管',
  '计划',
  '投资者',
  '提供',
  '高',
  '收益',
  '是因为',
  '它少',
  '中间环节',
  '融资',
  '项目',
  '走',
  '信托',
  '通道',
  '信托公司',
  '2%',
  '高',
  '费用',
  '代理',
  '销售',
  '机构',
  '1%',
  '高',
  '费用',
  '一般而言',
  '信托',
  '产品',
  '最终',
  '募集',
  '资金',
  '至少',
  '5%',
  '要用',
  '中间环节',
  '资管',
  '计划',
  '绕开',
  '信托公司',
  '费用',
  '节省',
  '融资成本',
  '体现',
  '投资者',
  '投资收益',
  '资管',
  '计划',
  '收益率',
  '高',
  '基金',
  '子公司',
  '资管',
  '计划',
  '业务',
  '分为',
  '通道',
  '类',
  '业务',
  '基金',
  '子公司',
  '主动',
  '管理',
  '类',
  '业务',
  '通道',
  '类',
  '业务',
  '包括',
  '管理',
  '人为',
  '第三方',
  '通道',
  '业务',
  '资管',
  '嵌套',
  '信托',
  '类',
  '业务',
  '市场',
  '常见',
  '资管',
  '业务',
  '包括',
  '基金',
  '子公司',
  '资管',
  '券商',
  '资管',
  '资管',
  '银行',
  '资管',
  '熟悉',
  '信托',
  '资管',
  '范畴',
  '赘述',
  '基金',
  '子公司',
  '资管',
  '政策法规',
  '2012',
  '年',
  '月',
  '26',
  '日',
  '证监会',
  '公布',
  '基金',
  '管理',
  '公司',
  '特定',
  '客户',
  '资产管理',
  '业务',
  '试点',
  '办法',
  '10',
  

In [19]:
#! -*- coding:utf-8 -*-

import re
from tqdm import tqdm
import jieba
import codecs
import json
import os

def filter_text(text):
    re_tag0 = re.compile('</?\w+[^>]*>')  # HTML标签
    re_tag1 = re.compile(r'http://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.S)
    re_tag2 = re.compile(r'https://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.S)
    re_tag3 = re.compile('(?<=\>).*?(?=\<)')
    re_tag4 = re.compile('购买链接')
    re_tag5 = re.compile('京东：')
    re_tag6 = re.compile('淘宝：')
    re_tag7 = re.compile(r'\d.*?w|\d.*?v|\d.*?a|\d.*?亿元|\d.*?元|\d.*?plus')
    new_text = re.sub(re_tag0,"",text)
    new_text = re.sub(re_tag1,"",new_text)
    new_text = re.sub(re_tag2,"",new_text)
    new_text = re.sub(re_tag3,"",new_text)
    new_text = re.sub(re_tag4,"",new_text)
    new_text = re.sub(re_tag5,"",new_text)
    new_text = re.sub(re_tag6,"",new_text)
    new_text = re.sub(re_tag7,"",new_text)
    new_text = re.sub("-+", "-", new_text)  # 合并-
    new_text = re.sub("———+", "——", new_text)  # 合并-
    return new_text


def filter_cut_data():
    files = os.listdir('../字典')
    for file_name in files:
        jieba.load_userdict('../字典/' + file_name)
    jieba.load_userdict('../runs/entity.txt')

    orig_data = []
    k = 0
    with open('../data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
        for l in tqdm(f):
            a = json.loads(l.strip())
            orig_data.append(
                {
                    'newsId': a['newsId'],
                    'title': a['title'],
                    'content': a['content']
                }
            )
            k += 1
            if k%20==0:
                print('k', k)
            if k==100:
                break

    stop_words = []
    with open('../data/stop_words.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("\n", "")
            stop_words.append(line)

    all_word_dic = {}
    all_docs = []
    i = 0
    for data in orig_data:
        sentence = []
        line = filter_text(data['title']) + filter_text(data['content'])
        line = line.replace("\n", "")
        words = jieba.cut(line)
        words = filter(lambda x: x not in stop_words, words)
        sentence.extend(list(words))
        all_word_dic[data['newsId']] = sentence
        all_docs.append(sentence)
        if i % 500 == 0:
            print(i, ' data finish')
        i += 1

    with codecs.open('../runs/small_train_test_dic.json', 'w', encoding='utf-8') as f:
        json.dump(all_word_dic, f, indent=4, ensure_ascii=False)

    with codecs.open('../runs/small_docs.json', 'w', encoding='utf-8') as f:
        json.dump(all_docs, f, indent=4, ensure_ascii=False)
    
    return all_word_dic, all_docs

In [509]:
#! -*- coding:utf-8 -*-

import re
from tqdm import tqdm
import jieba
import codecs
import json
import os

def filter_text(text):
    re_tag0 = re.compile('</?\w+[^>]*>')  # HTML标签
    re_tag1 = re.compile(r'http://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.S)
    re_tag2 = re.compile(r'https://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',re.S)
    re_tag3 = re.compile('(?<=\>).*?(?=\<)')
    re_tag4 = re.compile('购买链接')
    re_tag5 = re.compile('京东：')
    re_tag6 = re.compile('淘宝：')
    re_tag7 = re.compile(r'\d.*?w|\d.*?v|\d.*?a|\d.*?亿元|\d.*?元|\d.*?plus')
    new_text = re.sub(re_tag0,"",text)
    new_text = re.sub(re_tag1,"",new_text)
    new_text = re.sub(re_tag2,"",new_text)
    new_text = re.sub(re_tag3,"",new_text)
    new_text = re.sub(re_tag4,"",new_text)
    new_text = re.sub(re_tag5,"",new_text)
    new_text = re.sub(re_tag6,"",new_text)
    new_text = re.sub(re_tag7,"",new_text)
    new_text = re.sub("-+", "-", new_text)  # 合并-
    new_text = re.sub("———+", "——", new_text)  # 合并-
    return new_text


def filter_cut_data():
    files = os.listdir('../字典')
    for file_name in files:
        jieba.load_userdict('../字典/' + file_name)
    jieba.load_userdict('../runs/entity.txt')

    orig_data = []
    k = 0
    with open('../data/coreEntityEmotion_test_stage1.txt', 'r', encoding='utf-8') as f:
        for l in tqdm(f):
            a = json.loads(l.strip())
            orig_data.append(
                {
                    'newsId': a['newsId'],
                    'title': a['title'],
                    'content': a['content']
                }
            )
            k += 1
            if k%20==0:
                print('k', k)
            if k==100:
                break

    stop_words = []
    with open('../data/stop_words.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.replace("\n", "")
            stop_words.append(line)

    all_word_dic = {}
    all_docs = []
    i = 0
    for data in orig_data:
        sentence = []
        line = filter_text(data['title']) + filter_text(data['content'])
        line = line.replace("\n", "")
        words = jieba.cut(line)
        words = filter(lambda x: x not in stop_words, words)
        sentence.extend(list(words))
        all_word_dic[data['newsId']] = sentence
        all_docs.append(sentence)
        if i % 500 == 0:
            print(i, ' data finish')
        i += 1

    with codecs.open('../runs/small_test_dic.json', 'w', encoding='utf-8') as f:
        json.dump(all_word_dic, f, indent=4, ensure_ascii=False)

    with codecs.open('../runs/small_test_docs.json', 'w', encoding='utf-8') as f:
        json.dump(all_docs, f, indent=4, ensure_ascii=False)
    
    return all_word_dic, all_docs

In [510]:
test_word_dic, test_docs = filter_cut_data()

0it [00:00, ?it/s]


k 20
k 40
k 60
k 80
k 100
0  data finish


In [20]:
all_word_dic, all_docs = filter_cut_data()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/gn/q_dy30hj2_l93fy7_4h_b2b40000gn/T/jieba.cache
Loading model cost 1.302 seconds.
Prefix dict has been built succesfully.
0it [00:00, ?it/s]


k 20
k 40
k 60
k 80
k 100
0  data finish


In [22]:
all_docs[0]

['sia',
 '智慧工厂',
 '展',
 '誉洋以',
 '智',
 '取胜',
 '第十七届',
 '上海',
 '国际',
 '工业自动化',
 '机器人',
 '展与',
 '上海',
 '智能工厂',
 '展览会',
 '无人',
 '搬运',
 '智慧工厂',
 '解决方案',
 '工业自动化',
 '解决方案',
 '产品',
 '领域',
 '无论是',
 '展会',
 '规模',
 '展示',
 '专业',
 '观众',
 '人数',
 '快速',
 '增长',
 '推动',
 '我国',
 '制造业',
 '转型',
 '升级',
 '跨越',
 '发展',
 '誉洋',
 '3d',
 '机器视觉',
 '引发',
 '展会',
 '参观',
 '热潮',
 '大连',
 '誉洋',
 '工业',
 '智能',
 '展位',
 '机器人',
 '自动',
 '抓取',
 '物件',
 '眼前一亮',
 '机器人',
 '3d',
 '视觉',
 '引导',
 '精准定位',
 '杂乱无序',
 '目标',
 '准确',
 '快速',
 '抓取',
 '过程',
 '井然有序',
 '无需',
 '人工干预',
 '誉洋',
 '现场',
 '工程师',
 '介绍',
 '制造',
 '企业',
 '采用',
 '誉洋',
 'kine',
 'ye',
 '®',
 '3d',
 '机器视觉',
 '系统',
 '以往',
 '繁琐',
 '枯燥',
 '物料',
 '搬运',
 '工作',
 '交给',
 '机器人',
 '解放了',
 '人力',
 '提升',
 '物流',
 '效率',
 '这套',
 '3d',
 '机器视觉',
 '国内',
 '多家',
 '知名企业',
 '成功',
 '实施',
 '认可',
 '好评',
 'kine',
 'ye',
 '®',
 '3d',
 '机器视觉',
 '企业',
 '智能制造',
 '誉洋',
 '工业',
 '智能',
 '有限公司',
 '成立',
 '专注',
 '工业',
 '智能设备',
 '研发',
 '生产',
 '制造',
 '服务',
 '创新',
 '理念',
 '伴随',
 '企业',
 '成长',
 '誉洋',
 '中国',
 '多家',
 '企业',


In [21]:
all_word_dic.keys()

dict_keys(['4e36d02a', 'cb8e8b79', 'f85c18e2', '889199b4', 'a811d20e', '4e798999', '1f5679fb', '1a8339ea', '51203867', '5bb9047a', '478af383', 'f0a7d8d9', 'afee8716', 'efd34fe8', '73ce42e1', '783adc02', '960b8e04', 'd7cb8b85', '66fb9c0c', 'cd510c0d', '80800a17', '2322249a', 'ac148da5', 'fe0ae621', '530ccf21', '7885555d', 'c494a2ab', '779320d5', '46f7801f', 'ed3a5d29', 'b32613ba', '905a9ab3', 'bc3d1606', '59c0e2d8', '9d207e9f', '4dc75398', 'e43652a2', 'e69d3922', '19e8c4a1', '3f3e2630', '049c541c', '22745d83', '342a6448', '96778d66', '114ad54b', '85a079a4', '9c5ba3e7', 'db268d92', '27c7e128', '1203712d', '057046f1', '97c6069e', '3a8570b3', 'eab07684', '3d1a201b', 'f7bd97f7', '36b4f3b5', '68e8e571', '630bd6c1', '5350d73e', 'ec2653a7', 'e34d2aed', '78d7a4ba', '52ea3732', '78e67428', 'e9799988', 'f07ef591', 'eafaa899', '09002bb5', 'b97ab275', '5f2de988', '254203ca', 'b8caaa52', 'e3c6a6f4', '317494c0', 'bd7f602c', '922c203c', '579edaad', 'cdf7d348', 'b4bb3ad0', '5b655e03', '39bf0855', 'e5ec

In [511]:
f = codecs.open('../runs/small_test_dic.json', 'r', 'utf-8')
final_train_test_dic_1 = json.load(f)

In [512]:
final_train_test_dic_1['da1f7a3b']

['用益',
 '信托',
 '理财',
 '周刊',
 '集合',
 '信托',
 '发行',
 '规模',
 '超',
 '300',
 '亿',
 '基础产业',
 '信托',
 '收益',
 '飙至',
 '9.35',
 '夺魁',
 '产品',
 '动态',
 '每周',
 '高净值',
 '理财产品',
 '精选',
 '产品分析',
 '信托',
 '发行',
 '市场',
 '评述',
 '市场',
 '概述',
 '本周',
 '集合',
 '信托',
 '发行',
 '市场',
 '稳步',
 '回暖',
 '发行',
 '规模',
 '突破',
 '公开',
 '资料',
 '统计',
 '本周',
 '环比',
 '8.39%',
 '本周',
 '集合',
 '产品',
 '总',
 '发行',
 '规模',
 '超过',
 '300',
 '亿',
 '发行',
 '状况',
 '稳步增长',
 '成立',
 '市场',
 '飙升',
 '相比',
 '本周',
 '集合',
 '产品',
 '发行',
 '市场热度',
 '稍逊',
 '规模',
 '稳定',
 '防风险',
 '通道',
 '大环境',
 '解决',
 '存续',
 '产品',
 '到期',
 '风险',
 '累积',
 '信托公司',
 '工作',
 '重心',
 '产品',
 '发行',
 '受制于',
 '风控',
 '落地',
 '项目',
 '增多',
 '资管',
 '发行',
 '市场',
 '评述',
 '市场',
 '概述',
 '本周',
 '券商',
 '集合',
 '资管',
 '产品',
 '发行',
 '数量',
 '回升',
 '发行',
 '规模',
 '降幅',
 '较大',
 '公开',
 '资料',
 '统计',
 '本周',
 '券商',
 '资管',
 '产品',
 '发行',
 '规模',
 '上周',
 '下降',
 '本周',
 '基金',
 '子公司',
 '集合',
 '资管',
 '产品',
 '发行',
 '规模',
 '上周',
 '增长',
 '资管',
 '细则',
 '期货',
 '资管',
 '产品',
 '设立',
 '门槛',
 '提高',
 '至后',
 '期货',
 '资管',
 '产品',


In [513]:
len(final_train_test_dic_1)

100

In [564]:
f = codecs.open('../runs/test_docs.json', 'r', 'utf-8')
test_docs = json.load(f)

In [567]:
test_docs[0]

2024

In [569]:
len(test_docs)

40000

In [568]:
f = codecs.open('../runs/train_docs.json', 'r', 'utf-8')
train_docs = json.load(f)

In [572]:
all_docs = []
all_docs.extend(test_docs)
all_docs.extend(train_docs)

In [573]:
len(all_docs)

80000

In [514]:
#! -*- coding:utf-8 -*-

import codecs
import json
import math

# 词在文章中出现次数 / 该文章总词数
def tf(word, doc):
    count = sum(1 for w in doc if w==word)
    return count / len(doc)
# log(总文档数 / (包含该词文档数+1))
def idf(word, docs):
    count = sum(1 for doc in docs if word in doc)
    return math.log(len(docs) / (1 + count))
# tf*idf
def tfidf(word, doc, docs):
    return tf(word, doc) * idf(word, docs)
# tfidf矩阵
def tfidf_metrix(docs=[]):
    s = set(word for doc in docs for word in doc)
    res = list([tfidf(word,doc,docs) for word in s] for doc in docs)
    return res

def word_tfidf_dic():
    f = codecs.open('../runs/small_test_dic.json', 'r', 'utf-8')
    final_train_test_dic = json.load(f)
    docs = final_train_test_dic.items()
    tfidf_dics = {}
    idf_dics  ={}
    tf_dics = {}
    i = 0
    for key,doc in final_train_test_dic.items():
        tfidf_dic = {}
        for word in doc:
            tfidf_dic[word] = tfidf(word, doc, docs)
            idf_dics[word] = idf(word, docs)
            tf_dics[word] = tf(word, doc)
        if i%20 == 0:
            print(i, ' doc finish')
        i += 1
        tfidf_dics[key] = tfidf_dic
    return tfidf_dics, idf_dics, tf_dics

def save(tfidf_dics, idf_dics, tf_dics):
    with codecs.open('../runs/test_tfidf_dics.json', 'w', encoding='utf-8') as f:
        json.dump(tfidf_dics, f, indent=4, ensure_ascii=False)
    with codecs.open('../runs/test_idf_dics.json', 'w', encoding='utf-8') as f:
        json.dump(idf_dics, f, indent=4, ensure_ascii=False)
    with codecs.open('../runs/test_tf_dics.json', 'w', encoding='utf-8') as f:
        json.dump(tf_dics, f, indent=4, ensure_ascii=False)

In [515]:
test_tfidf_dics, test_idf_dics, test_tf_dics = word_tfidf_dic()

0  doc finish
20  doc finish
40  doc finish
60  doc finish
80  doc finish


In [517]:
test_tfidf_dics['4a76109a']

{'拿下': 0.013746776674591319,
 '30': 0.013746776674591319,
 '00后': 0.013746776674591319,
 '走': 0.013746776674591319,
 '狙击': 0.013746776674591319,
 '4k': 0.09622743672213924,
 '概念': 0.054987106698365276,
 '真': 0.027493553349182638,
 '龙头': 0.027493553349182638,
 '今日': 0.027493553349182638,
 '大盘': 0.054987106698365276,
 '又一次': 0.013746776674591319,
 '走出': 0.027493553349182638,
 '创新': 0.027493553349182638,
 '高后': 0.027493553349182638,
 '冲高回落': 0.027493553349182638,
 '长': 0.027493553349182638,
 '上影线': 0.054987106698365276,
 '本轮': 0.027493553349182638,
 '行情': 0.054987106698365276,
 '中': 0.04124033002377396,
 'k线': 0.04124033002377396,
 '形态': 0.06873388337295659,
 '短线': 0.054987106698365276,
 '调整': 0.06873388337295659,
 '后期': 0.013746776674591319,
 '再创新高': 0.013746776674591319,
 '这就是': 0.013746776674591319,
 '典型': 0.013746776674591319,
 '牛市': 0.08248066004754792,
 '仙人指路': 0.04124033002377396,
 '特征': 0.013746776674591319,
 '跳空缺口': 0.013746776674591319,
 '形式': 0.013746776674591319,
 '站上': 0.0137

In [24]:
#! -*- coding:utf-8 -*-

import codecs
import json
import math

# 词在文章中出现次数 / 该文章总词数
def tf(word, doc):
    count = sum(1 for w in doc if w.strip()==word.strip())
    return count / len(doc)
# log(总文档数 / (包含该词文档数+1))
def idf(word, docs):
    count = sum(1 for doc in docs if word in doc)
    return math.log(len(docs) / (1 + count))
# tf*idf
def tfidf(word, doc, docs):
    return tf(word, doc) * idf(word, docs)
# tfidf矩阵
def tfidf_metrix(docs=[]):
    s = set(word for doc in docs for word in doc)
    res = list([tfidf(word,doc,docs) for word in s] for doc in docs)
    return res

def word_tfidf_dic():
    f = codecs.open('../runs/small_train_test_dic.json', 'r', 'utf-8')
    final_train_test_dic = json.load(f)
    docs = final_train_test_dic.items()
    tfidf_dics = {}
    idf_dics  ={}
    tf_dics = {}
    i = 0
    for key,doc in final_train_test_dic.items():
        tfidf_dic = {}
        for word in doc:
            tfidf_dic[word] = tfidf(word, doc, docs)
            idf_dics[word] = idf(word, docs)
            tf_dics[word] = tf(word, doc)
        if i%20 == 0:
            print(i, ' doc finish')
        i += 1
        tfidf_dics[key] = tfidf_dic
    return tfidf_dics, idf_dics, tf_dics

def save(tfidf_dics, idf_dics, tf_dics):
    with codecs.open('../runs/tfidf_dics.json', 'w', encoding='utf-8') as f:
        json.dump(tfidf_dics, f, indent=4, ensure_ascii=False)
    with codecs.open('../runs/idf_dics.json', 'w', encoding='utf-8') as f:
        json.dump(idf_dics, f, indent=4, ensure_ascii=False)
    with codecs.open('../runs/tf_dics.json', 'w', encoding='utf-8') as f:
        json.dump(tf_dics, f, indent=4, ensure_ascii=False)

In [25]:
tfidf_dics, idf_dics, tf_dics = word_tfidf_dic()

0  doc finish
20  doc finish
40  doc finish
60  doc finish
80  doc finish


In [26]:
save(tfidf_dics, idf_dics, tf_dics)

## 提取每个样本Top10的关键词

In [27]:
len(tfidf_dics)

100

In [146]:
def topn_tfidf_dics(tfidf_dics, n=10):
    top_tfidf_dics = {}
    for newsId,tfidf_doc in tfidf_dics.items():
        top_tfidf_dic = {}
        sorted_dict = sorted(tfidf_doc.items(), key=lambda item: item[1], reverse=True)
        for word,tfidf in sorted_dict[:n]:
            top_tfidf_dic[word] = tfidf
        top_tfidf_dics[newsId] = top_tfidf_dic
    return top_tfidf_dics

In [150]:
top10_tfidf_dics = topn_tfidf_dics(tfidf_dics, n=10)
top10_tfidf_dics

{'4e36d02a': {'誉洋': 0.1959646887654507,
  '3d': 0.14697351657408803,
  '企业': 0.14697351657408803,
  '机器视觉': 0.1224779304784067,
  '机器人': 0.09798234438272535,
  '解决方案': 0.07348675828704401,
  '工业': 0.07348675828704401,
  '智能': 0.07348675828704401,
  '制造': 0.07348675828704401,
  'kine': 0.07348675828704401},
 'cb8e8b79': {'可视化': 0.2574933867434202,
  '数据可视化': 0.09903591797823855,
  '领域': 0.09903591797823855,
  '运行': 0.09903591797823855,
  '数据': 0.09903591797823855,
  '卫星': 0.09903591797823855,
  '态势': 0.08913232618041468,
  '设备': 0.06932514258476698,
  '智能': 0.06932514258476698,
  '应用领域': 0.05942155078694312},
 'f85c18e2': {'行业': 0.09569184802053178,
  '越来越': 0.09569184802053178,
  '货运': 0.07176888601539884,
  '年': 0.05980740501283237,
  '高': 0.05980740501283237,
  '特别': 0.05980740501283237,
  '服务': 0.05980740501283237,
  '大车': 0.04784592401026589,
  '货源': 0.04784592401026589,
  '竞争': 0.04784592401026589},
 '889199b4': {'路面': 0.6765889846712362,
  '水泥': 0.5892871801975282,
  '修补': 0.3492

In [518]:
top10_test_tfidf_dics = topn_tfidf_dics(test_tfidf_dics, n=10)
top10_test_tfidf_dics

{'a8df14bc': {'资管': 0.17378000701841856,
  '基金': 0.139779570862641,
  '子公司': 0.117112613425456,
  '投资': 0.10577913470686348,
  '产品': 0.10200130846733263,
  '业务': 0.09822348222780179,
  '信托': 0.09444565598827097,
  '信托公司': 0.07555652479061677,
  '集合': 0.06422304607202425,
  '券商': 0.060445219832493416},
 'da1f7a3b': {'产品': 0.17119591769472461,
  '信托': 0.15407632592525214,
  '发行': 0.10271755061683475,
  '资管': 0.08559795884736231,
  '投资': 0.074184897667714,
  '市场': 0.06847836707788983,
  '收益': 0.06277183648806568,
  '规模': 0.05135877530841738,
  '本周': 0.05135877530841738,
  '门槛': 0.05135877530841738},
 '4a76109a': {'4k': 0.09622743672213924,
  '牛市': 0.08248066004754792,
  '形态': 0.06873388337295659,
  '调整': 0.06873388337295659,
  '点': 0.06873388337295659,
  '概念': 0.054987106698365276,
  '大盘': 0.054987106698365276,
  '上影线': 0.054987106698365276,
  '行情': 0.054987106698365276,
  '短线': 0.054987106698365276},
 'b035435d': {'供电': 0.23284568356119564,
  'ups': 0.20697394094328503,
  '电源': 0.1811021

In [519]:
top15_test_tfidf_dics = topn_tfidf_dics(test_tfidf_dics, n=15)
top15_test_tfidf_dics

{'a8df14bc': {'资管': 0.17378000701841856,
  '基金': 0.139779570862641,
  '子公司': 0.117112613425456,
  '投资': 0.10577913470686348,
  '产品': 0.10200130846733263,
  '业务': 0.09822348222780179,
  '信托': 0.09444565598827097,
  '信托公司': 0.07555652479061677,
  '集合': 0.06422304607202425,
  '券商': 0.060445219832493416,
  '证券公司': 0.060445219832493416,
  '资金': 0.05666739359296257,
  '管理': 0.05288956735343174,
  '投资者': 0.049111741113900896,
  '资产管理': 0.04533391487437006},
 'da1f7a3b': {'产品': 0.17119591769472461,
  '信托': 0.15407632592525214,
  '发行': 0.10271755061683475,
  '资管': 0.08559795884736231,
  '投资': 0.074184897667714,
  '市场': 0.06847836707788983,
  '收益': 0.06277183648806568,
  '规模': 0.05135877530841738,
  '本周': 0.05135877530841738,
  '门槛': 0.05135877530841738,
  '高': 0.05135877530841738,
  '理财产品': 0.04565224471859323,
  '区别': 0.04565224471859323,
  '天': 0.04565224471859323,
  '集合': 0.03994571412876908},
 '4a76109a': {'4k': 0.09622743672213924,
  '牛市': 0.08248066004754792,
  '形态': 0.06873388337295659,


In [149]:
top15_tfidf_dics = topn_tfidf_dics(tfidf_dics, n=15)
top15_tfidf_dics

{'4e36d02a': {'誉洋': 0.1959646887654507,
  '3d': 0.14697351657408803,
  '企业': 0.14697351657408803,
  '机器视觉': 0.1224779304784067,
  '机器人': 0.09798234438272535,
  '解决方案': 0.07348675828704401,
  '工业': 0.07348675828704401,
  '智能': 0.07348675828704401,
  '制造': 0.07348675828704401,
  'kine': 0.07348675828704401,
  'ye': 0.07348675828704401,
  '®': 0.07348675828704401,
  '系统': 0.07348675828704401,
  '智慧工厂': 0.048991172191362675,
  '上海': 0.048991172191362675},
 'cb8e8b79': {'可视化': 0.2574933867434202,
  '数据可视化': 0.09903591797823855,
  '领域': 0.09903591797823855,
  '运行': 0.09903591797823855,
  '数据': 0.09903591797823855,
  '卫星': 0.09903591797823855,
  '态势': 0.08913232618041468,
  '设备': 0.06932514258476698,
  '智能': 0.06932514258476698,
  '应用领域': 0.05942155078694312,
  '决策': 0.05942155078694312,
  '商业智能': 0.04951795898911927,
  '展示': 0.04951795898911927,
  '案例': 0.04951795898911927,
  '硬件': 0.04951795898911927},
 'f85c18e2': {'行业': 0.09569184802053178,
  '越来越': 0.09569184802053178,
  '货运': 0.07176888

### 根据词性筛选关键词
```python
n 名词
nr 人名
ns 地名
nt 机构名
nz 专有名词
t 时间词
```

In [520]:
top10_test_tfidf_dics['4a76109a'].keys()

dict_keys(['4k', '牛市', '形态', '调整', '点', '概念', '大盘', '上影线', '行情', '短线'])

In [521]:
f = codecs.open('../runs/train_dic.json', 'r', 'utf-8')
word_flag_dic = json.load(f)

In [522]:
print([i for i in word_flag_dic.values()][:5])

['eng', 'x', 'vg', 'l', 'n']


In [36]:
def flag_filter(top10_tfidf_dics, word_flag_dic):
    flag_filter = ['n', 'nr', 'ns', 'nt', 'nz', 't', 'x']
    for newsId, tfidf_dic in top10_tfidf_dics.items():
        common_words = top10_tfidf_dics[newsId].keys() & word_flag_dic.keys()
        dif_words = top10_tfidf_dics[newsId].keys() - word_flag_dic.keys()
        for dif in dif_words:
            top10_tfidf_dics[newsId].pop(dif)
        for word in common_words:
            if word_flag_dic[word] not in flag_filter:
                top10_tfidf_dics[newsId].pop(word)
    return top10_tfidf_dics

In [37]:
top10_tfidf_dics = flag_filter(top10_tfidf_dics, word_flag_dic)
top10_tfidf_dics

{'4e36d02a': {'誉洋': 0.1959646887654507,
  '3d': 0.14697351657408803,
  '企业': 0.14697351657408803,
  '机器视觉': 0.1224779304784067,
  '机器人': 0.09798234438272535,
  '解决方案': 0.07348675828704401,
  '工业': 0.07348675828704401,
  '智能': 0.07348675828704401},
 'cb8e8b79': {'可视化': 0.2574933867434202,
  '数据可视化': 0.09903591797823855,
  '领域': 0.09903591797823855,
  '数据': 0.09903591797823855,
  '卫星': 0.09903591797823855,
  '态势': 0.08913232618041468,
  '智能': 0.06932514258476698},
 'f85c18e2': {'行业': 0.09569184802053178,
  '货运': 0.07176888601539884,
  '大车': 0.04784592401026589,
  '货源': 0.04784592401026589},
 '889199b4': {'路面': 0.6765889846712362,
  '水泥': 0.5892871801975282,
  '材料': 0.21825451118426975,
  '硬化': 0.21825451118426975,
  '水平': 0.21825451118426975,
  '病害': 0.13095270671056183},
 'a811d20e': {'市场': 0.0637393797368594,
  '趋势': 0.0637393797368594,
  '股票': 0.04780453480264455,
  '昨天': 0.04780453480264455},
 '4e798999': {'电源': 0.36771320876711,
  'evga': 0.12257106958903666,
  '功率': 0.0875507639921

In [523]:
top10_test_tfidf_dics = flag_filter(top10_test_tfidf_dics, word_flag_dic)
top10_test_tfidf_dics

{'a8df14bc': {'资管': 0.17378000701841856,
  '基金': 0.139779570862641,
  '子公司': 0.117112613425456,
  '产品': 0.10200130846733263,
  '业务': 0.09822348222780179,
  '信托': 0.09444565598827097,
  '信托公司': 0.07555652479061677,
  '券商': 0.060445219832493416},
 'da1f7a3b': {'产品': 0.17119591769472461,
  '信托': 0.15407632592525214,
  '资管': 0.08559795884736231,
  '市场': 0.06847836707788983,
  '收益': 0.06277183648806568,
  '规模': 0.05135877530841738,
  '本周': 0.05135877530841738,
  '门槛': 0.05135877530841738},
 '4a76109a': {'4k': 0.09622743672213924,
  '牛市': 0.08248066004754792,
  '形态': 0.06873388337295659,
  '概念': 0.054987106698365276,
  '大盘': 0.054987106698365276,
  '上影线': 0.054987106698365276,
  '行情': 0.054987106698365276},
 'b035435d': {'ups': 0.20697394094328503,
  '电源': 0.18110219832537439,
  '机房': 0.15523045570746377,
  '地线': 0.15523045570746377,
  '计算机系统': 0.12935871308955313,
  '专用': 0.12935871308955313,
  '市电': 0.07761522785373189},
 'b6514ace': {'云计算': 0.1853187197580721,
  'it': 0.1667868477822649,


In [50]:
len(top10_tfidf_dics)

100

In [39]:
print([i for i in top10_tfidf_dics.values()][0])

{'誉洋': 0.1959646887654507, '3d': 0.14697351657408803, '企业': 0.14697351657408803, '机器视觉': 0.1224779304784067, '机器人': 0.09798234438272535, '解决方案': 0.07348675828704401, '工业': 0.07348675828704401, '智能': 0.07348675828704401}


In [524]:
with codecs.open('../runs/top10_test_tfidf_dics.json', 'w', encoding='utf-8') as f:
    json.dump(top10_test_tfidf_dics, f, indent=4, ensure_ascii=False)
with codecs.open('../runs/top15_test_tfidf_dics.json', 'w', encoding='utf-8') as f:
    json.dump(top15_test_tfidf_dics, f, indent=4, ensure_ascii=False)

In [40]:
with codecs.open('../runs/top10_tfidf_dics.json', 'w', encoding='utf-8') as f:
    json.dump(top10_tfidf_dics, f, indent=4, ensure_ascii=False)

In [151]:
with codecs.open('../runs/top15_tfidf_dics.json', 'w', encoding='utf-8') as f:
    json.dump(top15_tfidf_dics, f, indent=4, ensure_ascii=False)

## 构造特征
前10的TF-IDF值找到的关键词，如果在训练集的实体中，标记为1，否则标记为0

In [45]:
train_entity_dic = {}
with open('../data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
    for l in tqdm(f):
        a = json.loads(l.strip())
        train_entity_dic[a['newsId']] = [i['entity'] for i in a['coreEntityEmotions']]

40000it [00:20, 1935.62it/s] 


In [195]:
train_entity_dic['5bb9047a']

['心理', '心理咨询师', '小程序开发']

In [47]:
with codecs.open('../runs/train_entity_dic.json', 'w', encoding='utf-8') as f:
    json.dump(train_entity_dic, f, indent=4, ensure_ascii=False)

In [48]:
f = codecs.open('../runs/train_entity_dic.json', 'r', 'utf-8')
train_entity_dic_ = json.load(f)

In [49]:
train_entity_dic_['cb8e8b79']

['数据', '卫星', '可视化']

In [366]:
model.wv.vocab.keys()

dict_keys(['sia', '智慧工厂', '展', '誉洋以', '智', '取胜', '第十七届', '上海', '国际', '工业自动化', '机器人', '展与', '智能工厂', '展览会', '2019', '年', '月', '日', '圆满', '落下', '帷幕', '展会', '展出', '规模', '90762', '平方米', '国内外', '参展', '厂商', '达', '1000', '多家', '吸引', '超过', '100000', '名', '专业', '观众', '前来', '参观', '展示', '技术', '工业', '装配', '传输技术', '工业机器人', '整机', '零部件', '机器视觉', 'agv', '无人', '搬运', '解决方案', '产品', '领域', '无论是', '人数', '快速', '增长', '推动', '我国', '制造业', '转型', '升级', '跨越', '发展', '誉洋', '3d', '引发', '热潮', '大连', '智能', '展位', '自动', '抓取', '物件', '眼前一亮', '视觉', '引导', '精准定位', '杂乱无序', '目标', '准确', '过程', '井然有序', '无需', '人工干预', '现场', '工程师', '介绍', '制造', '企业', '采用', 'kine', 'ye', '®', '系统', '以往', '繁琐', '枯燥', '物料', '工作', '交给', '解放了', '人力', '提升', '物流', '效率', '这套', '国内', '知名企业', '成功', '实施', '认可', '好评', '智能制造', '有限公司', '成立', '专注', '智能设备', '研发', '生产', '服务', '创新', '理念', '伴随', '成长', '中国', '达成', '合作伙伴', '关系', '领先', '供应商', '前瞻', '思维', '引领', '步伐', '面临', '改革', '痛点', '欧洲', '科研机构', '联合', '拥有', '比肩', '世界', '技术水准', '生产线', '智能化', '改造', '不仅如此', '提供', '柔性', '智能仓储',

In [357]:
x = ['数据', '卫星', '可视化', '5', '建好']
model_words = model.wv.vocab.keys()

In [358]:
c = list(x - model_words)
c

['建好', '5']

In [360]:
for k in c:
    x.remove(k)
x

['数据', '卫星', '可视化']

In [337]:
x

{'5', '建好'}

In [561]:
def simlar_calcute(x, y, model_words):
    diff_x = x - model_words
    diff_y = y - model_words
    if len(diff_x)>=1:
        print('diff_x:',diff_x)
    if len(diff_y)>=1:
        print('diff_y:',diff_y)
    for dif in diff_x:
        if dif in x:
            x.remove(dif)
    for dif in diff_y:
        if dif in y:
            y.pop(dif)
    x_mean = sum(np.array([model[x1] for x1 in x]))/len(x)
    y_mean = sum(np.array([model[y1] for y1 in y]))/len(y)
    res = np.dot(x_mean, y_mean) / (np.linalg.norm(x_mean) * np.linalg.norm(y_mean))
    return round(res, 8)

In [384]:
def subject_freq_count(sub_list, title, content):
    '''
    统计每个主体出现的次数
    '''
    subject_count_dict = {}
    for sub in sub_list:
        # 文中出现的主体
        if sub not in subject_count_dict:
            # 初始化出现次数都为1
            subject_count_dict[sub] = 1
            if len(set([sub]) & set(title))>=1:
                # title中，权重设置为3
                subject_count_dict[sub] += 3
            if len(set([sub]) & set(content))>=1:
                # 文本中，权重设置为1
                subject_count_dict[sub] += 1
    return subject_count_dict

In [153]:
top10_tfidf_dics['cb8e8b79']

{'可视化': 0.2574933867434202,
 '数据可视化': 0.09903591797823855,
 '领域': 0.09903591797823855,
 '运行': 0.09903591797823855,
 '数据': 0.09903591797823855,
 '卫星': 0.09903591797823855,
 '态势': 0.08913232618041468,
 '设备': 0.06932514258476698,
 '智能': 0.06932514258476698,
 '应用领域': 0.05942155078694312}

In [166]:
sub_list = top10_tfidf_dics['cb8e8b79'].keys()
title = small_word_dic['cb8e8b79'][0]
content = small_word_dic['cb8e8b79'][1]

In [175]:
sub_list&title

{'应用领域', '数据可视化'}

In [176]:
set(['应用领域']) & set(title)

{'应用领域'}

In [162]:
sub_list = list(sub_list)

['可视化', '数据可视化', '领域', '运行', '数据', '卫星', '态势', '设备', '智能', '应用领域']

In [178]:
subject_count_dict = subject_freq_count(sub_list, title, content)
subject_count_dict

{'可视化': 2,
 '数据可视化': 5,
 '领域': 2,
 '运行': 2,
 '数据': 2,
 '卫星': 2,
 '态势': 2,
 '设备': 2,
 '智能': 2,
 '应用领域': 5}

In [180]:
first_offset = first_index('可视化', title,content)
first_offset

0.8939393939393939

In [385]:
def left_right_entropy(sub, title, content):
    title_content = "".join(title+content)
    stop_word = ['【', '】', ')', '(', '、', '，', '“', '”', '。', '>', '<', '\n', '《', '》', ' ', '-', '！', '？', '.',
                 '\'', '[', ']', '：', '/', '.', '"', '\u3000', '’', '．', ',', '…', '?']
    for sw in stop_word:
        title_content = title_content.replace(sw, "")
    lr = re.findall('(.)%s(.)' % sub, title_content)

    from collections import Counter
    def entropy(alist):
        f = dict(Counter(alist))
        ent = (-1) * sum([i / len(alist) * math.log(i / len(alist)) for i in f.values()])
        return ent

    if lr:
        left_entropy = entropy([w[0] for w in lr])
        right_entropy = entropy([w[1] for w in lr])
        if left_entropy == -0.0:
            left_entropy = 0.5
        if right_entropy == -0.0:
            right_entropy = 0.5
    else:
        left_entropy = 0.4
        right_entropy = 0.4
    return left_entropy,right_entropy

In [192]:
left_entropy,right_entropy = left_right_entropy('数据', title, content)
left_entropy,right_entropy

(2.9364272168250984, 2.08874543934893)

In [386]:
def first_index(sub, title,content):
    if len(set([sub]) & set(title))>=1:
        subject_start = title.index(sub)
        first_offset = 1-subject_start/len(title)
    else:
        try:
            subject_start = content.index(sub)
            first_offset = 1-subject_start/len(content)
        except:
            first_offset = 0.5
    return first_offset

In [531]:
model = FastText.load('../runs/fasttext_model')

In [552]:
top15_test_tfidf_dics['a8df14bc']

{'资管': 0.17378000701841856,
 '基金': 0.139779570862641,
 '子公司': 0.117112613425456,
 '投资': 0.10577913470686348,
 '产品': 0.10200130846733263,
 '业务': 0.09822348222780179,
 '信托': 0.09444565598827097,
 '信托公司': 0.07555652479061677,
 '集合': 0.06422304607202425,
 '券商': 0.060445219832493416,
 '证券公司': 0.060445219832493416,
 '资金': 0.05666739359296257,
 '管理': 0.05288956735343174,
 '投资者': 0.049111741113900896,
 '资产管理': 0.04533391487437006}

In [557]:
# 1、主体 和 title 关键词的相似度
# # 2、主体 和 tfidf的topN词相似度
# 3、主体 和 text_rank词[:15]的相似度
# 4、主体是否在title中
# 5、title中是否有其他主体
# 6、主体的TF/IDF/IF-IDF
# 7、文章中是否包含其他主体
# 8、主体最高出现次数/该主体出现的次数
# 9、1-主体首次出现位置占全文的比例
# 10、文章的长度
# 11、主体个数
# 12、主体的左右熵
def generate_test_features(top10_tfidf_dics, top15_tfidf_dics):
    import random
    f = codecs.open('../runs/train_entity_dic.json', 'r', 'utf-8')
    train_entity_dic = json.load(f)
    label_dic = []
    ids = top10_tfidf_dics.keys()
    model_words = model.wv.vocab.keys()
    for newsId in ids:
        keys = top10_tfidf_dics[newsId].keys()
        temp = []
        sub_title_sim = 0.5
        sub_content_sim = 0.5
        sub_tfidf_top_sim = 0.5
        sub_is_in_title = 0
        title_has_other_sub = 0
        title_has_other_sub = 1
        title = small_word_test_dic[newsId][0]
        content = small_word_test_dic[newsId][1]
        title_content = title+content
        # title中是否有其他主体
        title_has_other_sub = 1 if len(keys & set(title)) >= 2 else 0
        # 文章中是否包含其他主体
        content_has_other_sub = 1 if len(keys & set(content)) >= 2 else 0
        subject_count_dict = subject_freq_count(keys, title, content)
        # 文章的长度
        len_doc = (len(title)+len(content))/100
        # 每篇文章中主体总数
        sub_in_doc_sum = (len(keys))/10
        label = 0
        for sub in keys:
            # 主体和title相似度
            sub_title_sim = simlar_calcute([sub], title, model_words)
            # 主体和content相似度
            sub_content_sim = simlar_calcute([sub], content, model_words)
            # 主体 和 tfidf的topN词相似度
            sub_tfidf_top_sim = simlar_calcute([sub], top15_tfidf_dics[newsId], model_words)
            # 主体是否在title中
            sub_is_in_title = 1 if len(set([sub]) & set(title))>=1 else 0
            # 主体在文章中出现的次数
            count = subject_count_dict[sub]
            # 主体首次出现位置占全文的比例
            first_offset = first_index(sub, title, content)
            # 主体的左右熵
            left_entropy,right_entropy = left_right_entropy(sub, title, content)
            temp.append([newsId,# id 
                         sub, # sub
                         test_tf_dics[sub], # tf
                         test_idf_dics[sub], # idf
                         top10_tfidf_dics[newsId][sub], # tfidf
                         title_has_other_sub, #标题中是否有其它主体
                         content_has_other_sub, # 文章中是否包含其他主体
                         len_doc, #文章的长度
                         sub_in_doc_sum, # 每篇文章中主体总数
                         sub_title_sim, # 主体和title相似度
                         sub_content_sim, # 主体和content相似度
                         sub_tfidf_top_sim, # 主体 和 tfidf的topN词相似度
                         sub_is_in_title, # 主体是否在title中
                         count, # 主体在文章中出现的次数
                         first_offset, # 主体首次出现位置占全文的比例
                         left_entropy, # 主体的左熵
                         right_entropy])# 主体的右熵
        label_dic.append(temp)
    random.shuffle(label_dic)
    return label_dic

In [562]:
test_feature = generate_test_features(top10_test_tfidf_dics, top15_test_tfidf_dics)
test_feature[0]

  
  from ipykernel import kernelapp as app


diff_y: {'菌类'}
diff_x: {'菌类'}


ZeroDivisionError: division by zero

In [534]:
# 1、主体 和 title 关键词的相似度
# # 2、主体 和 tfidf的topN词相似度
# 3、主体 和 text_rank词[:15]的相似度
# 4、主体是否在title中
# 5、title中是否有其他主体
# 6、主体的TF/IDF/IF-IDF
# 7、文章中是否包含其他主体
# 8、主体最高出现次数/该主体出现的次数
# 9、1-主体首次出现位置占全文的比例
# 10、文章的长度
# 11、主体个数
# 12、主体的左右熵
def generate_features(top10_tfidf_dics, top15_tfidf_dics):
    import random
    f = codecs.open('../runs/train_entity_dic.json', 'r', 'utf-8')
    train_entity_dic = json.load(f)
    label_dic = []
    common_ids = top10_tfidf_dics.keys() & train_entity_dic.keys()
    model_words = model.wv.vocab.keys()
    for newsId in common_ids:
        common_keys = top10_tfidf_dics[newsId].keys() & train_entity_dic[newsId]
        diff_keys = top10_tfidf_dics[newsId].keys() - train_entity_dic[newsId] 
        keys = common_keys|diff_keys
        temp = []
        sub_title_sim = 0.5
        sub_content_sim = 0.5
        sub_tfidf_top_sim = 0.5
        sub_is_in_title = 0
        title_has_other_sub = 0
        title_has_other_sub = 1
        title = small_word_dic[newsId][0]
        content = small_word_dic[newsId][1]
        title_content = title+content
        # title中是否有其他主体
        title_has_other_sub = 1 if len(keys & set(title)) >= 2 else 0
        # 文章中是否包含其他主体
        content_has_other_sub = 1 if len(keys & set(content)) >= 2 else 0
        subject_count_dict = subject_freq_count(keys, title, content)
        # 文章的长度
        len_doc = (len(title)+len(content))/100
        # 每篇文章中主体总数
        sub_in_doc_sum = (len(keys))/10
        label = 0
        for sub in keys:
            if len(set([sub]) & set(common_keys))>=1:
                label = 1
            if len(set([sub]) & set(diff_keys))>=1:
                label = 0
            # 主体和title相似度
            sub_title_sim = simlar_calcute([sub], title, model_words)
            # 主体和content相似度
            sub_content_sim = simlar_calcute([sub], content, model_words)
            # 主体 和 tfidf的topN词相似度
            sub_tfidf_top_sim = simlar_calcute([sub], top15_tfidf_dics[newsId], model_words)
            # 主体是否在title中
            sub_is_in_title = 1 if len(set([sub]) & set(title))>=1 else 0
            # 主体在文章中出现的次数
            count = subject_count_dict[sub]
            # 主体首次出现位置占全文的比例
            first_offset = first_index(sub, title, content)
            # 主体的左右熵
            left_entropy,right_entropy = left_right_entropy(sub, title, content)
            temp.append([newsId,# id 
                         sub, # sub
                         tf_dics[sub], # tf
                         idf_dics[sub], # idf
                         top10_tfidf_dics[newsId][sub], # tfidf
                         title_has_other_sub, #标题中是否有其它主体
                         content_has_other_sub, # 文章中是否包含其他主体
                         len_doc, #文章的长度
                         sub_in_doc_sum, # 每篇文章中主体总数
                         sub_title_sim, # 主体和title相似度
                         sub_content_sim, # 主体和content相似度
                         sub_tfidf_top_sim, # 主体 和 tfidf的topN词相似度
                         sub_is_in_title, # 主体是否在title中
                         count, # 主体在文章中出现的次数
                         first_offset, # 主体首次出现位置占全文的比例
                         left_entropy, # 主体的左熵
                         right_entropy, # 主体的右熵
                         label])#label
        label_dic.append(temp)
    random.shuffle(label_dic)
    return label_dic

In [403]:
label_dic_ = generate_features(top10_tfidf_dics, top15_tfidf_dics)
label_dic_

  if sys.path[0] == '':
  del sys.path[0]


[[['579edaad',
   '美女',
   0.022388059701492536,
   4.605170185988092,
   0.10310082505943488,
   1,
   1,
   2.68,
   1.0,
   0.97944653,
   0.9761411,
   0.9366692,
   1,
   5,
   0.4444444444444444,
   1.945910149055313,
   1.945910149055313,
   0],
  ['579edaad',
   '身边',
   0.003367003367003367,
   4.605170185988092,
   0.10310082505943488,
   1,
   1,
   2.68,
   1.0,
   0.98194754,
   0.9889204,
   0.98766506,
   0,
   2,
   0.8957528957528957,
   1.242453324894,
   1.7917594692280547,
   0],
  ['579edaad',
   '胖妞',
   0.022388059701492536,
   4.605170185988092,
   0.10310082505943488,
   1,
   1,
   2.68,
   1.0,
   0.9564287,
   0.963069,
   0.9100428,
   1,
   5,
   0.6666666666666667,
   1.7917594692280547,
   1.7917594692280547,
   0],
  ['579edaad',
   'baby',
   0.05223880597014925,
   4.605170185988092,
   0.24056859180534806,
   1,
   1,
   2.68,
   1.0,
   0.99327177,
   0.97252303,
   0.9492123,
   1,
   5,
   0.5555555555555556,
   2.5400363038209806,
   2.4036404106

In [404]:
len(label_dic_)

100

In [407]:
len(label_dic_[0])

10

In [409]:
label_dic_[0][0]

['579edaad',
 '美女',
 0.022388059701492536,
 4.605170185988092,
 0.10310082505943488,
 1,
 1,
 2.68,
 1.0,
 0.97944653,
 0.9761411,
 0.9366692,
 1,
 5,
 0.4444444444444444,
 1.945910149055313,
 1.945910149055313,
 0]

In [133]:
def label_entity(top10_tfidf_dics):
    import random
    f = codecs.open('../runs/train_entity_dic.json', 'r', 'utf-8')
    train_entity_dic = json.load(f)
    label_dic = []
    common_ids = top10_tfidf_dics.keys() & train_entity_dic.keys()
    for newsId in common_ids:
        common_keys = top10_tfidf_dics[newsId].keys() & train_entity_dic[newsId]
        diff_keys = top10_tfidf_dics[newsId].keys() - train_entity_dic[newsId] 
        temp = []
        for com_key in common_keys:
            temp.append([newsId, com_key, tf_dics[com_key], idf_dics[com_key], top10_tfidf_dics[newsId][com_key], 1])
        for diff_key in diff_keys:
            temp.append([newsId, com_key, tf_dics[diff_key], idf_dics[diff_key], top10_tfidf_dics[newsId][diff_key], 0])
        label_dic.append(temp)
    random.shuffle(label_dic)
    return label_dic

In [57]:
label_dic = label_entity(top10_tfidf_dics)
label_dic

[[['5f2de988', '黄金', 0.056574572309436015, 1],
  ['5f2de988', '林涵艺', 0.04525965784754881, 1],
  ['5f2de988', '涵艺', 0.04525965784754881, 0],
  ['5f2de988', '12', 0.41865183508982656, 0]],
 [['78e67428', '传感器', 0.10890605169566434, 0],
  ['78e67428', '半导体', 0.04667402215528472, 0],
  ['78e67428', '▷', 0.062232029540379624, 0],
  ['78e67428', '汽车', 0.14002206646585416, 0]],
 [['4dc75398', 'axonius', 0.06366594727172478, 1],
  ['4dc75398', 'iot', 0.05659195313042202, 1],
  ['4dc75398', 'forescout', 0.08488792969563302, 1],
  ['4dc75398', '网络安全', 0.07073994141302753, 0],
  ['4dc75398', '可视化', 0.07073994141302753, 0],
  ['4dc75398', '物联网', 0.05659195313042202, 0],
  ['4dc75398', '技术', 0.06366594727172478, 0],
  ['4dc75398', '公司', 0.08488792969563302, 0]],
 [['049c541c', '界面', 0.25184524454622376, 1],
  ['049c541c', '联系人', 0.3957568128583516, 1],
  ['049c541c', '手机', 0.4317347049363836, 1],
  ['049c541c', '方法', 0.14391156831212787, 0],
  ['049c541c', '数据', 0.14391156831212787, 0],
  ['049c541

In [410]:
with codecs.open('../runs/label_dic.json', 'w', encoding='utf-8') as f:
    json.dump(label_dic, f, indent=4, ensure_ascii=False)

In [78]:
with codecs.open('../runs/label_data.json', 'w', encoding='utf-8') as f:
    json.dump(label_data, f, indent=4, ensure_ascii=False)

In [69]:
np.array(label_dic[0])

array([['5f2de988', '黄金', '0.056574572309436015', '1'],
       ['5f2de988', '林涵艺', '0.04525965784754881', '1'],
       ['5f2de988', '涵艺', '0.04525965784754881', '0'],
       ['5f2de988', '12', '0.41865183508982656', '0']], dtype='<U20')

In [412]:
label_data_ = []
for item in label_dic_:
    for line in item:
        label_data_.append(line)

In [413]:
with open('../runs/feature_dics.txt', 'r', encoding='utf-8') as f:
    

TypeError: Object of type 'float32' is not JSON serializable

In [70]:
label_data = []
for item in label_dic:
    for line in item:
        label_data.append(line)

In [393]:
len(label_data_)

1000

In [138]:
import random
random.shuffle(label_data_)

In [None]:
newsId,# id 
sub, # sub
tf, # tf
idf, # idf
tfidf, # tfidf
title_has_other_sub, #标题中是否有其它主体
content_has_other_sub, # 文章中是否包含其他主体
len_doc, #文章的长度
sub_in_doc_sum, # 每篇文章中主体总数
sub_title_sim, # 主体和title相似度
sub_content_sim, # 主体和content相似度
sub_tfidf_top_sim, # 主体 和 tfidf的topN词相似度
sub_is_in_title, # 主体是否在title中
count, # 主体在文章中出现的次数
first_offset, # 主体首次出现位置占全文的比例
left_entropy, # 主体的左熵
right_entropy, # 主体的右熵
label#label

In [414]:
import pandas as pd
df = pd.DataFrame(label_data_, columns=['newsId', 'word', 'tf', 'idf', 'tfidf', 
                                        'title_has_other_sub', 'content_has_other_sub', 
                                        'len_doc', 'sub_in_doc_count', 'sub_title_sim', 
                                        'sub_content_sim', 'sub_tfidf_top_sim',
                                        'sub_is_in_title', 'count', 'first_offset',
                                        'left_entropy','right_entropy', 
                                        'label'])
df.head(20)

Unnamed: 0,newsId,word,tf,idf,tfidf,title_has_other_sub,content_has_other_sub,len_doc,sub_in_doc_count,sub_title_sim,sub_content_sim,sub_tfidf_top_sim,sub_is_in_title,count,first_offset,left_entropy,right_entropy,label
0,579edaad,美女,0.022388,4.60517,0.103101,1,1,2.68,1.0,0.979447,0.976141,0.936669,1,5,0.444444,1.94591,1.94591,0
1,579edaad,身边,0.003367,4.60517,0.103101,1,1,2.68,1.0,0.981948,0.98892,0.987665,0,2,0.895753,1.242453,1.791759,0
2,579edaad,胖妞,0.022388,4.60517,0.103101,1,1,2.68,1.0,0.956429,0.963069,0.910043,1,5,0.666667,1.791759,1.791759,0
3,579edaad,baby,0.052239,4.60517,0.240569,1,1,2.68,1.0,0.993272,0.972523,0.949212,1,5,0.555556,2.540036,2.40364,0
4,579edaad,孙俪,0.044776,4.60517,0.206202,1,1,2.68,1.0,0.976183,0.990323,0.965495,1,5,0.333333,2.397895,1.893788,1
5,579edaad,明星,0.032609,4.60517,0.103101,1,1,2.68,1.0,0.966122,0.961182,0.969574,1,5,1.0,1.332179,0.950271,0
6,579edaad,说,0.000877,4.60517,0.085917,1,1,2.68,1.0,0.889595,0.869498,0.909796,0,2,0.984556,1.609438,1.332179,0
7,579edaad,经纪人,0.089552,4.60517,0.412403,1,1,2.68,1.0,0.969962,0.987485,0.967986,1,5,0.888889,2.658193,3.004767,1
8,579edaad,之间,0.001754,4.60517,0.05155,1,1,2.68,1.0,0.767872,0.834993,0.870509,0,2,0.957529,0.636514,0.636514,0
9,579edaad,杨幂,0.041045,4.60517,0.189018,1,1,2.68,1.0,0.959815,0.910474,0.885074,1,5,0.777778,1.893788,2.271869,1


In [415]:
df.describe()

Unnamed: 0,tf,idf,tfidf,title_has_other_sub,content_has_other_sub,len_doc,sub_in_doc_count,sub_title_sim,sub_content_sim,sub_tfidf_top_sim,sub_is_in_title,count,first_offset,left_entropy,right_entropy,label
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.015864,4.60517,0.122243,0.75,1.0,3.3628,1.0,0.80556,0.860343,0.858299,0.29,2.858,0.7244,1.474649,1.493685,0.205
std,0.017359,8.886228e-16,0.084277,0.433229,0.0,3.185304,0.0,0.150455,0.112729,0.107206,0.453989,1.351911,0.241587,0.647287,0.660478,0.403904
min,0.000465,4.60517,0.028914,0.0,1.0,0.38,1.0,0.228092,0.367774,0.397838,0.0,1.0,0.071429,0.271189,0.206192,0.0
25%,0.003891,4.60517,0.070535,0.75,1.0,1.84,1.0,0.715863,0.80589,0.805746,0.0,2.0,0.556597,1.039721,1.039721,0.0
50%,0.011384,4.60517,0.096342,1.0,1.0,2.63,1.0,0.848245,0.893474,0.887968,0.0,2.0,0.785262,1.386294,1.466478,0.0
75%,0.021525,4.60517,0.146142,1.0,1.0,3.58,1.0,0.925192,0.94662,0.937325,1.0,5.0,0.93011,1.890316,1.94591,0.0
max,0.169014,4.60517,0.778339,1.0,1.0,21.45,1.0,1.0,0.995696,0.993936,1.0,5.0,1.0,3.661818,3.584545,1.0


In [418]:
df.to_csv("../runs/label_data.csv", index=False, sep='\t')

## fasttext预训练词向量

In [102]:
import codecs
import json
f = codecs.open('../runs/all_docs.json', 'r', 'utf-8')
sentences = json.load(f)

In [229]:
len(sentences[:2000])

2000

In [235]:
with codecs.open('../runs/small_words.json', 'w', encoding='utf-8') as f:
    json.dump(words, f, indent=4, ensure_ascii=False)

In [232]:
words = [word for line in sentences[0:2000] for word in line]
len(words)

719363

In [109]:
from gensim.models import FastText
model = FastText(sentences[:2000], size=200, window=3, min_count=1, iter=10, min_n=3, max_n=6, word_ngrams=0)

In [115]:
model['自学'].shape  # 词向量获得的方式

  """Entry point for launching an IPython kernel.


(200,)

In [113]:
model.save('../runs/fasttext_model')

In [114]:
model = FastText.load('../runs/fasttext_model')

## Xgboost训练

In [419]:
import numpy as np
from sklearn.model_selection import train_test_split

In [453]:
df['word'].unique().shape

(772,)

In [443]:
df.columns[2:17]

Index(['tf', 'idf', 'tfidf', 'title_has_other_sub', 'content_has_other_sub',
       'len_doc', 'sub_in_doc_count', 'sub_title_sim', 'sub_content_sim',
       'sub_tfidf_top_sim', 'sub_is_in_title', 'count', 'first_offset',
       'left_entropy', 'right_entropy'],
      dtype='object')

In [444]:
X, y = df[df.columns[2:17]], df['label']

In [461]:
X.head()

Unnamed: 0,tf,idf,tfidf,title_has_other_sub,content_has_other_sub,len_doc,sub_in_doc_count,sub_title_sim,sub_content_sim,sub_tfidf_top_sim,sub_is_in_title,count,first_offset,left_entropy,right_entropy
0,0.022388,4.60517,0.103101,1,1,2.68,1.0,0.979447,0.976141,0.936669,1,5,0.444444,1.94591,1.94591
1,0.003367,4.60517,0.103101,1,1,2.68,1.0,0.981948,0.98892,0.987665,0,2,0.895753,1.242453,1.791759
2,0.022388,4.60517,0.103101,1,1,2.68,1.0,0.956429,0.963069,0.910043,1,5,0.666667,1.791759,1.791759
3,0.052239,4.60517,0.240569,1,1,2.68,1.0,0.993272,0.972523,0.949212,1,5,0.555556,2.540036,2.40364
4,0.044776,4.60517,0.206202,1,1,2.68,1.0,0.976183,0.990323,0.965495,1,5,0.333333,2.397895,1.893788


In [446]:
X.shape, y.shape

((1000, 15), (1000,))

In [462]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [470]:
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.ranking import roc_auc_score

xgb = XGBClassifier(
         learning_rate =0.002,
         n_estimators=30,
         max_depth=4,
         min_child_weight=1,
         gamma=0.24,
         subsample=0.7,
         colsample_bytree=0.6,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=27)

In [471]:
xgb.fit(X_train,Y_train)
xgb_test_predict=xgb.predict(X_test)
xgb_test_auc = roc_auc_score(Y_test, xgb_test_predict)
print('xgb test auc: %.5f' % xgb_test_auc)

xgb test auc: 0.55535


  if diff:


### n_estimators

In [479]:
cv_params = {'n_estimators': range(40,60,5)}
other_params = {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 
                'reg_alpha': 0, 'reg_lambda': 1, 'objective': 'binary:logistic'}

model = XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, Y_train, eval_metric='auc')
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
参数的最佳取值：{'n_estimators': 50}
最佳模型得分:0.7466510693407246


[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    0.6s finished


### max_depth,min_child_weight

In [480]:
cv_params = {'max_depth':[3,4,5,6],'min_child_weight':[1,2,3,4]}
other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}

model = XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, Y_train, eval_metric='auc')
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
参数的最佳取值：{'max_depth': 3, 'min_child_weight': 4}
最佳模型得分:0.7470405344888104


[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:    1.7s finished


In [483]:
# cv_params = {'gamma':[i/10.0 for i in range(0,5)]} 0.1
cv_params = {'gamma':[0.08,0.09,0.1,0.12,0.14,0.16] }
other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_child_weight': 4, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
model = XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, Y_train, eval_metric='auc')
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
参数的最佳取值：{'gamma': 0.09}
最佳模型得分:0.7512996655410448


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.6s finished


In [484]:
cv_params = {'subsample':[i/10.0 for i in range(6,10)],'colsample_bytree':[i/10.0 for i in range(6,10)]}
other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_child_weight': 4, 'seed': 0,
                'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.09, 'reg_alpha': 0, 'reg_lambda': 1}

model = XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, Y_train, eval_metric='auc')
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
参数的最佳取值：{'colsample_bytree': 0.9, 'subsample': 0.7}
最佳模型得分:0.7545904673146052


[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:    1.6s finished


In [485]:
cv_params = {'reg_alpha':[0.18,0.02,0.22,0.24],'reg_lambda': [1.4,1.8,2,2.4]}
other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_child_weight': 4, 'seed': 0,
                'subsample': 0.7, 'colsample_bytree': 0.9, 'gamma': 0.09}
model = XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, Y_train, eval_metric='auc')
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
参数的最佳取值：{'reg_alpha': 0.18, 'reg_lambda': 2}
最佳模型得分:0.7589526633664565


[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:    1.1s finished


In [490]:
cv_params = {'learning_rate':[0.08,0.09,0.1,0.12,0.14]}
other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_child_weight': 4, 'seed': 0,
                'subsample': 0.7, 'colsample_bytree': 0.9, 'gamma': 0.09, 'reg_alpha': 0.18, 'reg_lambda': 2}

model = XGBClassifier(**other_params)
optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
optimized_GBM.fit(X_train, Y_train, eval_metric='auc')
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
参数的最佳取值：{'learning_rate': 0.1}
最佳模型得分:0.7589526633664565


[Parallel(n_jobs=4)]: Done  18 out of  25 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.5s finished


In [491]:
xgb = XGBClassifier(
         learning_rate =0.1,
         n_estimators=50,
         max_depth=3,
         min_child_weight=4,
         gamma=0.09,
         subsample=0.7,
         colsample_bytree=0.9,
         objective= 'binary:logistic',
         nthread=4,
         scale_pos_weight=1,
         seed=27)
xgb.fit(X_train,Y_train)
xgb_test_predict=xgb.predict(X_test)
xgb_test_auc = roc_auc_score(Y_test, xgb_test_predict)
print('xgb test auc: %.5f' % xgb_test_auc)

xgb test auc: 0.61070


  if diff:


In [495]:
from sklearn.metrics import f1_score
f1_score(Y_test, xgb_test_predict, average='macro')  

0.6303018081602473

In [496]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, xgb_test_predict)

array([[229,  12],
       [ 43,  16]])

In [494]:
xgb_test_predict

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1])

In [None]:
def get_all_feature(self, subject_info_list, info):
    '''
    :param original_subject: 文章中的主体
    :param info: tile_segment, content_segment, keyword
    :return: all_feature
    1、主体 和 title 关键词的相似度
    # 2、主体 和 tfidf的topN词相似度
    3、主体 和 text_rank词[:15]的相似度
    4、主体是否在title中
    5、title中是否有其他主体
    6、主体的TF/IDF/IF-IDF
    7、文章中是否包含其他主体
    8、主体最高出现次数/该主体出现的次数
    9、1-主体首次出现位置占全文的比例
    10、文章的长度
    11、主体个数
    12、主体的左右熵
    '''
    # subject_info_list:[key, subject_name, 1/-1, original_subject_name]
    # info等同于datas
    result = []
    title_clean = [t['word'] for t in info['title_segment'] if t['word'] not in self.stop_word_dict and len(t['word'])>=2]
    title_segment_word = [tw['word'] for tw in info['title_segment']]
    # title_segment_offset = [to['offset'] for to in info['title_segment']]
    content_segment_word = [cw['word'] for cw in info['content_segment']]
    # content_segment_offset = [co['offset'] for co in info['content_segment']]
    keyword = [kw for kw in info['keyword']]
    # 所有主体
    all_subject = [als[1] for als in subject_info_list]
    # 每个主体的出现次数
    all_subject_count = self.subject_freq_count(subject_info_list, info)
    try:
        top_freq = sorted(all_subject_count.items(), key=lambda x: x[1], reverse=True)[0][1]
    except:
        top_freq = 1
    for subject in subject_info_list:
        all_feature = []
        company_short = subject[1]
        all_feature.append(company_short)
        tag = subject[2]
        original_subject_name = subject[3].split("#")
        # 注：n_similarity应用时, 不在model中的词会报错, 所以需要预筛选
        # 在model中的主体
        # tmp_subject = [sub_a for sub_a in original_subject_name if sub_a in model]
        tmp_subject = [sub_a for sub_a in original_subject_name]
        # 在model中的title词
        # tmp_clean_title = [sub_b for sub_b in title_clean if sub_b in model]
        tmp_clean_title = [sub_b for sub_b in title_clean]
        # 在model中的keyword
        # tmp_keyword = [sub_c for sub_c in keyword[:15] if sub_c in model]
        tmp_keyword = [sub_c for sub_c in keyword[:15]]
        if tmp_subject and tmp_clean_title and tmp_keyword:
            # 1、主体 和 title关键词的相似度
            company_title_simlarity = self.simlar_calcute(tmp_subject, tmp_clean_title)
            # company_title_simlarity = round(model.n_similarity(tmp_subject, tmp_clean_title), 8)
            # 3、主体 和 text_rank词的相似度
            company_textRank_similarity = self.simlar_calcute(tmp_subject, tmp_keyword)
            # company_textRank_similarity = round(model.n_similarity(tmp_subject, tmp_keyword), 8)
        else:
            # 不再model中的, 取默认值为0.05
            company_title_simlarity = 0.05
            company_textRank_similarity = 0.05
        all_feature.append(company_title_simlarity)
        all_feature.append(company_textRank_similarity)
        # 2、主体 和 tfidf的topN词相似度
        # 4、主体是否在title中
        if len(set(original_subject_name) & set(title_segment_word)) >=1:
            subject_in_title = 1
        else:
            subject_in_title = 0
        all_feature.append(subject_in_title)
        # 5、title中是否有其他主体
        if len(set(title_segment_word) & set(all_subject)) >=2:
            title_contain_2subject = 1
        else:
            title_contain_2subject = 0
        all_feature.append(title_contain_2subject)
        # 6、主体出现的次数
        tf = all_subject_count[subject[1]]
        all_feature.append(tf)
        # 7、主体的TF / IDF / IF - IDF
        idf = 2.5
        for subject_idf in original_subject_name:
            if subject_idf in info['title_segment']:
                idf = info['title_segment'][subject_idf]
                break
            elif subject_idf in info['content_segment']:
                idf = info['content_segment'][subject_idf]
                break
        # 降低tf-idf影响,无需考虑是否/10,整列除相同数对xgboost无影响
        tfidf = tf*idf
        all_feature.append(idf)
        all_feature.append(tfidf)
        # 8、文章中是否包含其他主体
        if len(set(content_segment_word) & set(all_subject))>=2:
            content_contain_2subject = 1
        else:
            content_contain_2subject = 0
        all_feature.append(content_contain_2subject)
        # 9、主体最高出现次数 / 该主体出现的次数
        subject_freq = tf / top_freq
        all_feature.append(subject_freq)
        # 10、1 - 主体首次出现位置占全文的比例
        if original_subject_name[0] in title_segment_word:
            subject_start = title_segment_word.index(original_subject_name[0])
            first_offset = 1-subject_start/len(title_segment_word)
        else:
            try:
                subject_start = content_segment_word.index(original_subject_name[0])
                first_offset = 1-subject_start/len(content_segment_word)
            except:
                first_offset = 0.5
        all_feature.append(first_offset)
        # 11、文章的长度
        content_len = len(content_segment_word)/1000
        all_feature.append(content_len)
        # 主体个数
        subject_count = len(set(all_subject))
        all_feature.append(subject_count)
        # 左右信息熵
        title_content = "".join(title_segment_word+content_segment_word)
        stop_word = ['【', '】', ')', '(', '、', '，', '“', '”', '。', '>', '<', '\n', '《', '》', ' ', '-', '！', '？', '.',
                     '\'', '[', ']', '：', '/', '.', '"', '\u3000', '’', '．', ',', '…', '?']
        for sw in stop_word:
            title_content = title_content.replace(sw, "")
        lr = re.findall('(.)%s(.)' % original_subject_name[0], title_content)
        if lr:
            left_entropy = self.entropy([w[0] for w in lr])
            right_entropy = self.entropy([w[1] for w in lr])
            if left_entropy == -0.0:
                left_entropy = 0.5
            if right_entropy == -0.0:
                right_entropy = 0.5
        else:
            left_entropy = 0.4
            right_entropy = 0.4
        all_feature.append(left_entropy)
        all_feature.append(right_entropy)
        # 100维信息
        # subject_cbow = self.fasttext_model[original_subject_name[0]]
        # all_feature.extend(subject_cbow)
        all_feature.append(tag)
        result.append(all_feature)
    return result

## 位置特征

In [None]:
## 位置特征： 1. 是否出现在标题 2.是否出现在第一句 3.是否出现在最后一句 4.出现在正文中间部分
occur_in_title = np.zeros(len(new_tags))
occur_in_first_sentence = np.zeros(len(new_tags))
occur_in_last_sentence = np.zeros(len(new_tags))
occur_in_other_sentence = np.zeros(len(new_tags))
for i in range(len(new_tags)):
    if new_tags[i] in x['title_cut']:
        occur_in_title[i] = 1
    if new_tags[i] in x['first_sentence']:
        occur_in_first_sentence[i] = 1
    if new_tags[i] in x['last_sentence']:
        occur_in_last_sentence[i] = 1
    if new_tags[i] in x['other_sentence']:
        occur_in_other_sentence[i] = 1

## 训练TF-IDF词向量

In [626]:
sentences[0:2]

['sia 智慧工厂 誉洋以 第十七届 工业自动化 机器人 智能工厂 展览会 2019 90762 平方米 国内外 1000 100000 工业自动化 传输技术 工业机器人 零部件 机器视觉 agv 智慧工厂 解决方案 工业自动化 解决方案 无论是 制造业 机器视觉 机器人 眼前一亮 机器人 精准定位 杂乱无序 井然有序 人工干预 工程师 kine 机器视觉 机器人 解放了 机器视觉 知名企业 kine 机器视觉 智能制造 有限公司 智能设备 合作伙伴 智能设备 供应商 科研机构 kine 机器视觉 技术水准 制造业 生产线 智能化 不仅如此 智能仓储 智能制造 一站式 解决方案',
 '数据可视化 应用领域 大数据 数据可视化 应用领域 商业智能 公共服务 市场营销 金融行业 电力行业 医疗保健 数据可视化 数据可视化 ceo 数据可视化 数据可视化 应用领域 可视化 可视化 统计分析 可视化 代表性 数据可视化 化繁为简 更有意义 应用领域 可视化 可视化 时间推移 数据可视化 多维度 非专业 可视化 可视化 飞行数据 可视化 可视化 宇宙空间 可视化 一目了然 宇宙空间 应用领域 可视化 三维动画 计算机 实体模型 可视化 管理者 一目了然 大大减少 管理者 劳动强度 管理效率 管理水平 工业4.0 工业设备 可视化 处理软件 零部件 一系列 可视化 时间推移 可视化 可视化 可视化 可视化 数据传输 针对性 可视化 应用领域 统计分析 可视化 商业智能 市场营销 商业智能 可视化 商业价值 管理层 商业智能 可视化 决策者 可视化 交互式 决策依据 数据分析 决策人员 科学决策 商业智能 可视化 判断力 决策人员 工作效率 可视化 数据分析 用户群 用户群体 用户群 营销策略 忠诚度 市场占有率 数据可视化 智能手机 软硬件 智能化 智能化 大数据 可穿戴设备 智能电视 智能家居 医疗健康 机器人 可视化 健康状况 行为习惯 大数据 自主研发 数据可视化 中国企业 大数据 后顾之忧']

In [628]:
len(tfidf.vocabulary_)

861554

In [629]:
tfidf_matrix.toarray()[3]

array([0., 0., 0., ..., 0., 0., 0.])

In [641]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
tfidf = TFIDF(min_df=2,
           strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\w{1,}',
#            ngram_range=(1, 3),  # 二元文法模型
           use_idf=1,
           smooth_idf=1,
           sublinear_tf=1) # 去掉英文停用词
tfidf_matrix = tfidf.fit_transform(sentences)

In [642]:
tfidf_matrix.shape

(40000, 91895)

In [647]:
feature_names = tfidf.get_feature_names()
feature_names[50000:50020]

['恩爱有加',
 '恩爱秀',
 '恩经复',
 '恩菲尔德',
 '恩诺沙星',
 '恩里克',
 '恪尽职守',
 '恬不知耻',
 '恭敬地',
 '恭王府',
 '息事宁人',
 '息息相关',
 '息息相通',
 '恰似你的温柔',
 '恰到好处',
 '恰同学少年',
 '恰好相反',
 '恰如其分',
 '恰巧是',
 '恰恰好']

In [655]:
print(tfidf_matrix.toarray()[20000][100:110])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [658]:
tf_idf_words = []  # 保存所有的word：tfidf

print('start')
with open('../runs/tf_idf_word.txt','w', encoding='utf-8') as wf:
    for i in range(len(sentences)):
#         print('----Document %d----' % (i))
        word0 = [] # 保存每条数据的word：tfidf
        for j in range(len(feature_names)):
            if tfidf_matrix[i,j] > 0:
                word = feature_names[j]
                tfidf = tfidf_matrix[i,j]
                word0.append({word:tfidf})
                if j%10000 == 0:
                    print(word0)
        tf_idf_words.append(word0)
        wf.write(str(word0)+'\n')
        
print('finished')

start
[{'0': 0.03861741506601017}]
[{'0': 0.034950109364719525}]
[{'0': 0.06403549922905982}]
[{'0': 0.07763004829518379}]
[{'0': 0.04261084155235694}]
[{'0': 0.06160400508182375}]
[{'0': 0.07048512012887595}]
[{'0': 0.04181761647826834}]
[{'0': 0.0446222157937248}]
[{'0': 0.03195701322495244}]
[{'0': 0.08220808434951174}]
[{'0': 0.11282423401645245}]
[{'0': 0.0955581763054653}]
[{'0': 0.055260224091229784}]
[{'0': 0.0806471397742805}]
[{'0': 0.09716430538785073}]
[{'0': 0.02047202628611491}]
[{'0': 0.02769783909858152}]
[{'0': 0.11923995234123769}]
[{'一下子': 0.09537981357258803}, {'一天天': 0.13995107112362534}, {'一帆风顺': 0.11585613543214317}, {'个人资源': 0.27903490005554465}, {'亲儿子': 0.24592536018930325}, {'信息量': 0.12044786951100302}, {'偶像练习生': 0.12572777060569215}, {'共同体': 0.1208151235263993}, {'劳苦功高': 0.1792715189899637}, {'另一面': 0.11855807195000961}, {'吃瓜群众': 0.1024298282793724}, {'哥儿俩': 0.1792715189899637}, {'实实在在': 0.1018698878355726}, {'当事人': 0.09963558530143092}, {'影视剧': 0.09143585642

[{'0': 0.03580484734569109}]
[{'0': 0.07295603801430864}]
[{'0': 0.03881683966095052}]
[{'0': 0.10398539469464387}]
[{'0': 0.06789197773879768}]
[{'0': 0.13983300775193894}]
[{'0': 0.06025748366996392}]
[{'0': 0.0801006329761676}]
[{'0': 0.052999457006125864}]
[{'0': 0.056765910896514574}]
[{'0': 0.08326975451357521}]
[{'0': 0.08133795012394987}]
[{'0': 0.05035686180040322}]
[{'0': 0.03712770811923835}]
[{'0': 0.06010328770662565}]
[{'0': 0.05116990012235649}]
[{'0': 0.04873082745349669}]
[{'0': 0.046214136093037504}]
[{'18岁': 0.2571992888719611}, {'2018': 0.05968466563320775}, {'2020': 0.10399952349464396}, {'上个月': 0.25934249330145137}, {'东京奥运会': 0.3774294208949213}, {'亚运会': 0.21487103798407123}, {'全国纪录': 0.2652043087548303}, {'几十倍': 0.19464158713483162}]
[{'0': 0.08852919735640556}]
[{'0': 0.0889991945192216}]
[{'0': 0.03903229844469669}]
[{'0': 0.03785780936067335}]
[{'0': 0.03579271475969068}]
[{'0': 0.035942406754044594}]
[{'0': 0.06023551195420912}]
[{'0': 0.06328812803361503}]
[

[{'0': 0.12130461271169374}]
[{'0': 0.08450454478314345}]
[{'0': 0.05249223775577422}]
[{'0': 0.08970805777151715}]
[{'0': 0.13389872523953594}]
[{'0': 0.04445091017316259}]
[{'0': 0.06805796751442397}]
[{'0': 0.10377594715310919}]
[{'2016': 0.06827032850252063}, {'2020': 0.08268460836606399}, {'8': 0.06543346026844206}, {'97': 0.10094122327473459}, {'james': 0.14938551943897413}, {'wan': 0.16330717324712232}, {'一个家庭': 0.13594020497455853}, {'下半年': 0.09445841622942887}, {'亚特兰大': 0.16908518162948188}, {'亿美元': 0.0769037665632426}, {'公司出品': 0.16671954458605293}, {'制片人': 0.2075611284878107}, {'取材于': 0.18139920159398146}, {'好莱坞': 0.11782810196244996}, {'安娜贝尔': 0.21085014305392638}, {'安娜贝尔2': 0.18300683543763005}, {'性价比': 0.09629137799482364}, {'恐怖事件': 0.18878484381998964}, {'恐怖片': 0.16330717324712232}, {'招魂3': 0.44249270128039886}, {'搜狐娱乐': 0.1321448773837204}, {'最新消息': 0.12217987493260758}, {'来源于': 0.09435682420245602}, {'查维斯': 0.20059036318342827}]
[{'0': 0.06965992060685097}]
[{'0': 0.06

[{'0': 0.09312563138251756}]
[{'0': 0.061507711891430734}]
[{'0': 0.07644798973347862}]
[{'0': 0.03491882231294357}]
[{'0': 0.045182314916841125}]
[{'0': 0.0887715767822596}]
[{'0': 0.09561855183483485}]
[{'0': 0.04852253427779732}]
[{'0': 0.07368506814871642}]
[{'0': 0.05226445348470485}]
[{'0': 0.048838415903519096}]
[{'0': 0.06548969000294266}]
[{'0': 0.040770240387138265}]
[{'0': 0.09250229526005689}]
[{'0': 0.03998790769390209}]
[{'0': 0.09047642015587169}]
[{'0': 0.09039459824238813}]
[{'0': 0.10325776268451721}]
[{'0': 0.07502016915682087}]
[{'0': 0.06853202122039716}]
[{'0': 0.08611827230393743}]
[{'0': 0.051852135266495986}]
[{'0': 0.08662997729926122}]
[{'0': 0.08883545919189695}]
[{'0': 0.04650220756463052}]
[{'0': 0.06465515255288869}]
[{'0': 0.17353085753951453}]
[{'0': 0.0725385133396398}]
[{'0': 0.057398748821513065}]
[{'0': 0.05293890411245688}]
[{'0': 0.0718194371104903}]
[{'0': 0.05344222911555018}]
[{'0': 0.0896124975808061}]
[{'0': 0.04405150158045178}]
[{'0': 0.064

[{'0': 0.035639191994383465}]
[{'0': 0.05653593447961775}]
[{'0': 0.03131956789820088}]
[{'0': 0.062432521138870226}]
[{'0': 0.059944043050629565}]
[{'0': 0.06284776934397733}]
[{'0': 0.04948704391821039}]
[{'0': 0.08355097106290078}]
[{'0': 0.04317996861480582}]
[{'0': 0.06343937930986637}]
[{'0': 0.025597749535851404}]
[{'0': 0.025597749535851404}, {'1': 0.023824495359602176}, {'10g': 0.06430622415348927}, {'10ms': 0.07875097261145239}, {'1ms': 0.07377051211989215}, {'2': 0.041770869220959496}, {'2018': 0.04201193670510094}, {'2019': 0.04106678631109456}, {'2022': 0.04884637076352225}, {'2024': 0.10213701803899734}, {'20g': 0.06944216139477527}, {'480p': 0.08177316684056464}, {'4g网络': 0.053325138073913106}, {'4k电视': 0.06863457487860618}, {'500m': 0.07287776664447107}, {'5g手机': 0.05016081405248343}, {'5g技术': 0.047796070577914185}, {'5g新媒体平台': 0.08177316684056464}, {'5g牌照': 0.06582032348092266}, {'5g直播': 0.07377051211989215}, {'5g移动网络': 0.07964371808687347}, {'5g网络': 0.1393596249896154

[{'0': 0.06641637486001051}]
[{'0': 0.02502617543531863}]
[{'0': 0.03519458494962985}]
[{'0': 0.117422346672429}]
[{'0': 0.040545651930631706}]
[{'0': 0.07197615323177868}]
[{'0': 0.11387245755279411}]
[{'0': 0.08431442139691485}]
[{'0': 0.067972414574997}]
[{'0': 0.138990018149714}]
[{'0': 0.06274289545261805}]
[{'0': 0.03988383168916274}]
[{'0': 0.0633859495605045}]
[{'0': 0.06392779670339375}]
[{'0': 0.05035271755374513}]
[{'0': 0.04588743409514465}]
[{'0': 0.04245205312690494}]
[{'0': 0.03306765243643487}]
[{'0': 0.033679730125800195}]
[{'0': 0.20093796907452932}]
[{'0': 0.05759028004567529}]
[{'0': 0.05372804306941658}]
[{'0': 0.0471845758188785}]
[{'0': 0.02115736117671601}]
[{'0': 0.09865666266175498}]
[{'0': 0.08604986580582699}]
[{'0': 0.04305301968072492}]
[{'0': 0.04404298901323133}]
[{'0': 0.04873052858849526}]
[{'0': 0.10551663099872248}]
[{'0': 0.14944519811487436}]
[{'0': 0.15067275209934838}]
[{'0': 0.042854184374142894}]
[{'3200': 0.13754209373126225}, {'3240': 0.21221

[{'0': 0.05978025108138858}]
[{'0': 0.04182869874221902}]
[{'0': 0.07111195463477782}]
[{'0': 0.06389078895097679}]
[{'0': 0.11952509107137094}]
[{'0': 0.15137166342111197}]
[{'0': 0.0968640481828336}]
[{'0': 0.052533265222265854}]
[{'0': 0.07271758581698878}]
[{'0': 0.05208993565651816}]
[{'0': 0.1335407465129368}]
[{'0': 0.06123723377331019}]
[{'0': 0.20407274976742304}]
[{'0': 0.09662921827613297}]
[{'0': 0.11711711765883646}]
[{'0': 0.04873894700918863}]
[{'0': 0.032766821824700816}]
[{'0': 0.0606400520118569}]
[{'0': 0.03146554302196298}]
[{'0': 0.03635423139606765}]
[{'0': 0.05657590570805455}]
[{'0': 0.04046085074680807}]
[{'0': 0.060528776499811004}]
[{'0': 0.1045201463005776}]
[{'0': 0.02759226126732694}]
[{'0': 0.03262241170560078}]
[{'0': 0.1510211425292144}]
[{'0': 0.06971278394731202}]
[{'0': 0.12083900875555596}]
[{'0': 0.04410824780091428}]
[{'0': 0.0313671428717694}]
[{'0': 0.0673545302736897}]
[{'0': 0.0716274368211228}]
[{'0': 0.0772962637338291}]
[{'0': 0.04168257804

[{'0': 0.061039134373075045}]
[{'0': 0.08194545725327104}]
[{'0': 0.05182890003030445}]
[{'0': 0.11073379530782783}]
[{'0': 0.05400595770510922}]
[{'0': 0.06386812134079374}]
[{'0': 0.07313875197478477}]
[{'0': 0.07433303712896137}]
[{'0': 0.041358393949187}]
[{'0': 0.04806622802507228}]
[{'0': 0.08709004711989328}]
[{'0': 0.03816419663105405}]
[{'0': 0.05650754487994763}]
[{'0': 0.04683141182507225}]
[{'0': 0.04139913456155425}]
[{'0': 0.07094454819857125}]
[{'0': 0.18627958399482952}]
[{'0': 0.02730018123597009}]
[{'0': 0.12821733514075767}]
[{'0': 0.04444661165113142}]
[{'0': 0.0357684086229282}]
[{'0': 0.03596641795030127}]
[{'0': 0.06355622625272658}]
[{'0': 0.06542548753444646}]
[{'0': 0.057917011122926444}]
[{'0': 0.04609416791659243}]
[{'0': 0.06491151629176983}]
[{'0': 0.036283608738558716}]
[{'0': 0.04735286413317448}]
[{'0': 0.0555302007867647}]
[{'0': 0.044404023084800036}]
[{'0': 0.024932080946065336}]
[{'0': 0.1841009068035116}]
[{'0': 0.03616363157549911}]
[{'0': 0.02219

[{'0': 0.04548636812395262}]
[{'0': 0.05058752093086405}]
[{'0': 0.06655156865414728}]
[{'0': 0.10756522292419643}]
[{'0': 0.18266190610830682}]
[{'0': 0.14956758304397416}]
[{'0': 0.16656571395870626}]
[{'0': 0.07951446701435773}]
[{'0': 0.16448887128686268}]
[{'0': 0.06412283484324006}]
[{'0': 0.040391012970573625}]
[{'0': 0.07218804573559437}]
[{'0': 0.19298928341609844}]
[{'0': 0.08651849167196002}]
[{'0': 0.07129129756530657}]
[{'0': 0.0695042580180444}]
[{'0': 0.1260139256110167}]
[{'0': 0.07365743799447227}]
[{'0': 0.05688449083004411}]
[{'0': 0.06840635079791452}]
[{'0': 0.05893417584158743}]
[{'0': 0.046340854358871274}]
[{'0': 0.05531020258938015}]
[{'0': 0.07967371892317257}]
[{'0': 0.03367414529090504}]
[{'0': 0.08297117441951112}]
[{'0': 0.08421300316892481}]
[{'0': 0.08452258630415069}]
[{'0': 0.0407578768911235}]
[{'0': 0.042887714692880156}]
[{'0': 0.06129155724639684}]
[{'0': 0.022085538174771234}]
[{'0': 0.028891358598504984}]
[{'0': 0.03566020077832533}]
[{'0': 0.148

[{'0': 0.0493118443351606}]
[{'0': 0.09141944737144217}]
[{'0': 0.10115675304224939}]
[{'0': 0.05764845668812866}]
[{'0': 0.026459166829203348}]
[{'0': 0.23766164879737028}]
[{'0': 0.03156373865716881}]
[{'0': 0.041622068917660855}]
[{'0': 0.02385340753003502}]
[{'0': 0.024541319930582332}]
[{'0': 0.08075357376525649}]
[{'0': 0.08929975762454592}]
[{'0': 0.0513652443802879}]
[{'0': 0.04405878090527129}]
[{'0': 0.08384971292807049}]
[{'0': 0.06641562804807662}]
[{'0': 0.028094347313889743}]
[{'0': 0.09345314636081602}]
[{'0': 0.021215304614620838}]
[{'0': 0.07071018239490177}]
[{'0': 0.059347847673276534}]
[{'0': 0.1311440596053642}]
[{'0': 0.06456220865494038}]
[{'0': 0.05118524688887318}]
[{'0': 0.03568853617217424}]
[{'0': 0.05893719828316896}]
[{'0': 0.1582493430594695}]
[{'0': 0.07398275823224235}]
[{'0': 0.14692588559477554}]
[{'0': 0.13593624418119843}]
[{'0': 0.05508700489385864}]
[{'0': 0.05305613807760049}]
[{'0': 0.032476086467538516}]
[{'100m': 0.286689731818729}, {'10g': 0.

[{'0': 0.12312673286417011}]
[{'0': 0.16528563585321987}]
[{'0': 0.16077718550305842}]
[{'0': 0.02971977998290325}]
[{'0': 0.058100583875603636}]
[{'0': 0.11063004233483581}]
[{'0': 0.059499137021066885}]
[{'0': 0.045057605191790646}]
[{'0': 0.10814848500171656}]
[{'0': 0.04309374243991017}]
[{'0': 0.0780527079794506}]
[{'0': 0.042387061181975196}]
[{'0': 0.03771574227708831}]
[{'0': 0.04229239605844096}]
[{'0': 0.05595355245941678}]
[{'0': 0.08307853983365687}]
[{'0': 0.05766925696625654}]
[{'0': 0.07370332380449392}]
[{'0': 0.022855946425429794}]
[{'0': 0.04085091459001208}]
[{'0': 0.04567302116377922}]
[{'0': 0.04745785488264606}]
[{'0': 0.0444610632980769}]
[{'0': 0.04156187013345376}]
[{'0': 0.052298726373221846}]
[{'0': 0.06785976123900414}]
[{'0': 0.04539552247614706}]
[{'0': 0.10378666125809165}]
[{'0': 0.047265705674955284}]
[{'0': 0.04863653373889422}]
[{'0': 0.06459746572461669}]
[{'0': 0.0560922461693179}]
[{'0': 0.061631556905759936}]
[{'0': 0.11059274499030299}]
[{'0': 0.

[{'0': 0.04656764150496661}]
[{'0': 0.03269660262606222}]
[{'0': 0.06668106624923664}]
[{'0': 0.051817275443603476}]
[{'0': 0.045153734518097746}]
[{'0': 0.08885562579159105}]
[{'0': 0.11918169865890987}]
[{'0': 0.025798114252771086}]
[{'0': 0.06323953465756206}]
[{'0': 0.053935000680906404}]
[{'0': 0.04253631336571175}]
[{'0': 0.04650864715105455}]
[{'0': 0.029225711322096278}]
[{'0': 0.061925471353240634}]
[{'0': 0.02623316019951192}]
[{'0': 0.04835799289837652}]
[{'0': 0.038166794768820324}]
[{'0': 0.024699259672518174}]
[{'0': 0.026042988661399074}]
[{'0': 0.04080926677850257}]
[{'0': 0.03369499363881396}]
[{'0': 0.09392743538183605}]
[{'0': 0.08324740086471125}]
[{'0': 0.049958180509488324}]
[{'0': 0.05971617912593609}]
[{'0': 0.08678074082625371}]
[{'0': 0.032734705527680645}]
[{'0': 0.10687056313686386}]
[{'0': 0.08587681865419004}]
[{'0': 0.07621260439518487}]
[{'0': 0.03786811712099737}]
[{'0': 0.026238866879884408}]
[{'0': 0.1732716320461946}]
[{'0': 0.08862144559852873}]
[{'

[{'0': 0.12874868963654712}]
[{'0': 0.07413053849669753}]
[{'0': 0.12358517669447239}]
[{'0': 0.08328562536972602}]
[{'0': 0.054874868222840115}]
[{'0': 0.030114758040947834}]
[{'0': 0.02987440354580476}]
[{'0': 0.1034911167679951}]
[{'0': 0.11687983663218102}]
[{'0': 0.04494204063403048}]
[{'0': 0.05008166326344797}]
[{'0': 0.1211578552723853}]
[{'0': 0.05385424849881706}]
[{'0': 0.15419791262812055}]
[{'0': 0.09840273279696843}]
[{'0': 0.06024433483382364}]
[{'0': 0.0927663727229501}]
[{'0': 0.08950682121767452}]
[{'0': 0.21990962224900354}]
[{'0': 0.1041602982108223}]
[{'0': 0.10299838850463987}]
[{'0': 0.050319416438244405}]
[{'0': 0.04532066280072679}]
[{'0': 0.1390317662208771}]
[{'0': 0.022586485224919656}]
[{'0': 0.0591422905444719}]
[{'0': 0.037810208616966}]
[{'0': 0.05167704533863791}]
[{'0': 0.04768392568367219}]
[{'0': 0.03952389236040433}]
[{'0': 0.036156236253228065}]
[{'0': 0.059775219062415286}]
[{'0': 0.053513525593896015}]
[{'0': 0.12571569247999}]
[{'0': 0.050272461

[{'0': 0.10253509807180419}]
[{'0': 0.13863395912661}]
[{'0': 0.1067986770211266}]
[{'0': 0.03931190102131246}]
[{'0': 0.03276196026672343}]
[{'0': 0.05862072102589965}]
[{'0': 0.09829086957320213}]
[{'0': 0.17052006131318367}]
[{'0': 0.04144343797822619}]
[{'0': 0.06316513393625187}]
[{'0': 0.04021993330790151}]
[{'0': 0.03200841025664254}]
[{'0': 0.04699049372119462}]
[{'0': 0.03888341654906539}]
[{'0': 0.0735623573470105}]
[{'0': 0.0974834892682631}]
[{'0': 0.06149148663733336}]
[{'0': 0.1025321934731991}]
[{'183': 0.12635293691762567}, {'200b': 0.10742790007367192}, {'2018': 0.02417682550362403}, {'243': 0.082584433106712}, {'24小时': 0.053915534817300145}, {'3000': 0.04423653214784022}, {'3566': 0.10448401159375652}, {'911': 0.08658421944558306}, {'914': 0.08909258979421783}, {'ani': 0.0916643233886937}, {'chae': 0.0828899224045285}, {'cnn': 0.07154483196970834}, {'com': 0.04272213638016988}, {'era': 0.05973854490581504}, {'f15': 0.08532961376041653}, {'f16战斗机': 0.09739094534063336}

[{'0': 0.17887945074259434}]
[{'110': 0.07100921732225285}, {'16': 0.05446107781362815}, {'20': 0.04756556872339717}, {'2000': 0.04968051494953099}, {'2018': 0.05125265487312426}, {'287': 0.10266767043092763}, {'40': 0.05443630356199725}, {'42': 0.0625876047257889}, {'550': 0.08824074892740523}, {'56': 0.06291918290532479}, {'7': 0.04209527543612094}, {'78': 0.06561527632972786}, {'80': 0.055289074200561596}, {'88': 0.06624237040017679}, {'90': 0.05806458415596011}, {'97': 0.06439232834803343}, {'weekly': 0.11785837066759722}, {'一座座': 0.11228089568997472}, {'一部分': 0.05144760315870591}, {'万美元': 0.10405169494365961}, {'上一年': 0.08535867690139282}, {'不一定': 0.06563503333155041}, {'不太高': 0.11388453758779306}, {'不言而喻': 0.08095580994579946}, {'业内人士': 0.060466437001034615}, {'东南亚': 0.12455399368058906}, {'中产阶级': 0.09037062579519556}, {'中国人': 0.17189548211719366}, {'中国企业': 0.07188029922345411}, {'中英文': 0.10009659457494893}, {'二线城市': 0.07030699155263304}, {'人民币': 0.08095627033462979}, {'人民币汇率': 0

[{'0': 0.0823682435864752}]
[{'0': 0.07432979159419474}]
[{'0': 0.10682039458029656}]
[{'0': 0.06218462865918764}]
[{'0': 0.04519650668581185}]
[{'0': 0.09021609245483778}]
[{'0': 0.03591041175529564}]
[{'0': 0.02312618652002396}]
[{'0': 0.03880542236395159}]
[{'0': 0.1362178860097351}]
[{'0': 0.0551510547879632}]
[{'0': 0.07280790353125897}]
[{'0': 0.08655692294163013}]
[{'0': 0.09744333824859863}]
[{'0': 0.04964946313071164}]
[{'0': 0.11289862985357542}]
[{'0': 0.05941690125332516}]
[{'0': 0.05767474449916683}]
[{'0': 0.07053756482279362}]
[{'0': 0.06605679750243217}]
[{'0': 0.024977907158542645}]
[{'0': 0.09183424595913}]
[{'0': 0.03711293689881518}]
[{'0': 0.048224560419329356}]
[{'0': 0.12332299944691087}]
[{'0': 0.025923271076006203}]
[{'0': 0.032256418303312724}]
[{'0': 0.09244712166341323}]
[{'0': 0.11407316240268128}]
[{'0': 0.03344712386848929}]
[{'0': 0.027380634623640532}]
[{'0': 0.0379806424019704}]
[{'0': 0.06312077321117031}]
[{'0': 0.12267751183927615}]
[{'0': 0.0547857

[{'0': 0.05750943534950823}]
[{'0': 0.07800210124089764}]
[{'0': 0.06959226498434577}]
[{'0': 0.03615247332813299}]
[{'0': 0.20340582780751912}]
[{'0': 0.04492749659537027}]
[{'0': 0.022929984226498622}]
[{'0': 0.03588008958643589}]
[{'0': 0.07287252009878918}]
[{'0': 0.0499646897524893}]
[{'0': 0.03009214543236897}]
[{'0': 0.057540790870538205}]
[{'0': 0.026345739017296727}]
[{'0': 0.08989291709803049}]
[{'0': 0.0670454732213255}]
[{'0': 0.03388060722845715}]
[{'0': 0.1526114203725157}]
[{'0': 0.04897167720171732}]
[{'0': 0.061989447769308266}]
[{'0': 0.12566213342242896}]
[{'0': 0.029149392612319282}]
[{'0': 0.08587172720173815}]
[{'0': 0.07169979299981646}]
[{'0': 0.043999285328382924}]
[{'0': 0.09596455386531054}]
[{'0': 0.07345783303191034}]
[{'0': 0.05918185123395992}]
[{'0': 0.028946304818881385}]
[{'0': 0.039633683677550643}]
[{'0': 0.03517114394781604}]
[{'0': 0.04732745464188194}]
[{'0': 0.11379715196882816}]
[{'0': 0.11283219171508095}]
[{'0': 0.05838265989012598}]
[{'0': 0.

[{'0': 0.06072891769165871}]
[{'0': 0.05524494728780427}]
[{'0': 0.11337126773400738}]
[{'0': 0.03716322761745157}]
[{'0': 0.04526579417056665}]
[{'0': 0.15798023946167708}]
[{'0': 0.05473642848212993}]
[{'0': 0.06111038009295676}]
[{'0': 0.05053165947784093}]
[{'0': 0.07331707026537873}]
[{'0': 0.07002802719909601}]
[{'0': 0.06446382263054273}]
[{'0': 0.20223143128513812}]
[{'0': 0.08815663367969233}]
[{'0': 0.050854577811280506}]
[{'0': 0.04413772122634797}]
[{'0': 0.057426774162197755}]
[{'0': 0.1499255500564919}]
[{'0': 0.04111247199504396}]
[{'0': 0.042341622945376016}]
[{'0': 0.027466065878980463}]
[{'0': 0.07493985171761552}]
[{'0': 0.09390697851273186}]
[{'0': 0.10201987430236031}]
[{'0': 0.07312492565341126}]
[{'0': 0.022073556829764196}]
[{'000': 0.05455191775432529}, {'1100': 0.07749123389595539}, {'16': 0.04788897541358034}, {'1994': 0.06895179733218783}, {'1998': 0.10941441051640662}, {'20': 0.04182558338126991}, {'2000': 0.043685308011967953}, {'2001': 0.10348357663539208

[{'0': 0.04618777452568584}]
[{'0': 0.06424662230285814}]
[{'0': 0.03524434736829204}]
[{'0': 0.13517984746072118}]
[{'0': 0.05072253370279662}]
[{'0': 0.031929350573018773}]
[{'0': 0.04445826768796597}]
[{'0': 0.09093971204576319}]
[{'0': 0.07052455717278644}]
[{'0': 0.08463345345894115}]
[{'0': 0.038895957924079023}]
[{'0': 0.03447714961029275}]
[{'0': 0.055485707385306865}]
[{'0': 0.04196619889811539}]
[{'0': 0.08291959830211026}]
[{'0': 0.12827273969862302}]
[{'0': 0.05285753105778345}]
[{'0': 0.03582678215568575}]
[{'0': 0.03795164039895104}]
[{'0': 0.06029196883458177}]
[{'0': 0.13383202704338465}]
[{'0': 0.04957761453496992}]
[{'0': 0.06271647995734252}]
[{'0': 0.03042876221867226}]
[{'0': 0.06522879300138572}]
[{'0': 0.041588436756018816}]
[{'0': 0.038219507583647006}]
[{'0': 0.047121171862987664}]
[{'0': 0.11367149448621934}]
[{'0': 0.04877365469490872}]
[{'0': 0.08833031888772458}]
[{'0': 0.1095130184792021}]
[{'0': 0.0668080316732098}]
[{'0': 0.05746910298761085}]
[{'0': 0.1

[{'0': 0.08697073599712932}]
[{'0': 0.06235977333937633}]
[{'0': 0.040494232618620216}]
[{'0': 0.1353314956639313}]
[{'0': 0.18893196355147535}]
[{'0': 0.06314387081145621}]
[{'0': 0.044179871295006064}]
[{'0': 0.06165658396593655}]
[{'0': 0.13497142296707312}]
[{'0': 0.027196255212362287}]
[{'0': 0.057698886683158264}]
[{'0': 0.15329097754080162}]
[{'0': 0.033577168762819985}]
[{'0': 0.03997983661831461}]
[{'0': 0.09325816222477115}]
[{'0': 0.049157220949302624}]
[{'0': 0.04914322572225446}]
[{'0': 0.038982506018413576}]
[{'0': 0.10105313107139392}]
[{'0': 0.04331544438751143}]
[{'0': 0.2130516637707518}]
[{'0': 0.05395567511271044}]
[{'0': 0.11251333244082397}]
[{'0': 0.08697727008587439}]
[{'0': 0.08164478140124835}]
[{'0': 0.04468445269874643}]
[{'0': 0.03983370611412962}]
[{'0': 0.07961575987658812}]
[{'0': 0.07887929621308332}]
[{'0': 0.042668059868444276}]
[{'0': 0.07904555422011447}]
[{'0': 0.04776980615178586}]
[{'0': 0.12402510695883685}]
[{'0': 0.06463988146555152}]
[{'0': 0

[{'0': 0.032141459837005476}]
[{'0': 0.04038229831034409}]
[{'0': 0.1136281841744411}]
[{'0': 0.09468241137188911}]
[{'0': 0.05642992536845322}]
[{'0': 0.07981977329114812}]
[{'0': 0.030815126855968697}]
[{'0': 0.10378728486358872}]
[{'0': 0.03884977637831916}]
[{'0': 0.04454528454123371}]
[{'0': 0.06391833898450776}]
[{'0': 0.027762705239848758}]
[{'0': 0.09188466988249844}]
[{'0': 0.04650938600391784}]
[{'0': 0.02629227743874352}]
[{'0': 0.21262376737441946}]
[{'0': 0.08489055041795174}]
[{'0': 0.055936322703024705}]
[{'0': 0.03267045821285568}]
[{'0': 0.03843877998220252}]
[{'0': 0.07638012124656722}]
[{'0': 0.056880692709350485}]
[{'0': 0.05889222022789285}]
[{'0': 0.033738730326696706}]
[{'0': 0.08585310953910336}]
[{'0': 0.02454708637485427}]
[{'0': 0.05951004397891023}]
[{'0': 0.2032082010629065}]
[{'0': 0.11409552765910387}]
[{'0': 0.06682102470994734}]
[{'0': 0.025880013753735612}]
[{'0': 0.09276560534622562}]
[{'0': 0.03330270775576211}]
[{'0': 0.10017519308128586}]
[{'0': 0.

[{'0': 0.05793611574595537}]
[{'0': 0.07253047379333821}]
[{'0': 0.16200496531325811}]
[{'0': 0.08198855228539803}]
[{'0': 0.087233162783027}]
[{'0': 0.07498099881614567}]
[{'0': 0.022845302279381523}]
[{'0': 0.03957457954570523}]
[{'0': 0.03645292593029369}]
[{'0': 0.054548311810894944}]
[{'0': 0.11822477066182457}]
[{'0': 0.04863959669037049}]
[{'0': 0.027755574072411696}]
[{'0': 0.04216184559606947}]
[{'0': 0.03975822919849339}]
[{'0': 0.07065891521020347}]
[{'0': 0.1104541486187266}]
[{'0': 0.15230155037273904}]
[{'0': 0.13436751478617068}]
[{'0': 0.03080543105291153}]
[{'0': 0.09489262527205747}]
[{'0': 0.027579509518774796}]
[{'0': 0.043732868263441055}]
[{'0': 0.13023521608643956}]
[{'0': 0.11644380147121432}]
[{'0': 0.07349156371097139}]
[{'0': 0.08606348785515823}]
[{'0': 0.07253627675125658}]
[{'0': 0.034020858976823026}]
[{'0': 0.049106213208690186}]
[{'0': 0.07516076758061947}]
[{'0': 0.08090686786209111}]
[{'0': 0.0329484987982184}]
[{'0': 0.06560607527066835}]
[{'0': 0.04

[{'0': 0.031876080471795847}]
[{'0': 0.03379056681382663}]
[{'0': 0.13146366619487415}]
[{'0': 0.12012702457021045}]
[{'0': 0.04778604873688253}]
[{'0': 0.04459288992799122}]
[{'0': 0.10291194194889647}]
[{'0': 0.034333966156778584}]
[{'0': 0.11030816910193691}]
[{'0': 0.03793507767665514}]
[{'0': 0.04732599639044607}]
[{'0': 0.10688873125846296}]
[{'0': 0.0435876772107032}]
[{'0': 0.07506746334460787}]
[{'0': 0.07452064302125076}]
[{'0': 0.059236048116220894}]
[{'0': 0.06662652127365587}]
[{'0': 0.10200852811638651}]
[{'0': 0.05786125006891289}]
[{'0': 0.08030501086270608}]
[{'0': 0.061361151700895025}]
[{'0': 0.06878731120906122}]
[{'0': 0.08528992708862054}]
[{'0': 0.0342322547326786}]
[{'0': 0.029278252198303132}]
[{'0': 0.03581305271418357}]
[{'1982': 0.08167408335754646}, {'2006': 0.0608732170517264}, {'2018': 0.028233787665091638}, {'3': 0.03503097592666315}, {'4000': 0.05344386293145831}, {'73': 0.06223828236478148}, {'90': 0.09169666087903053}, {'一席之地': 0.07581725256613615}, {

[{'0': 0.05371396875682146}]
[{'0': 0.03403294819042751}]
[{'0': 0.06114369791227766}]
[{'0': 0.026170706095333125}]
[{'0': 0.060660793330681584}]
[{'0': 0.081159820212066}]
[{'0': 0.030214040928090272}]
[{'0': 0.051405287625239}]
[{'0': 0.04310691065829315}]
[{'0': 0.027699509581663742}]
[{'0': 0.0444963818411984}]
[{'0': 0.14789835142354246}]
[{'0': 0.08140335281041391}]
[{'0': 0.069706566493549}]
[{'0': 0.06753711715156331}]
[{'0': 0.0588236525459611}]
[{'0': 0.12423402230686022}]
[{'0': 0.09208271404706612}]
[{'0': 0.045917491265505046}]
[{'0': 0.06460633791599103}]
[{'0': 0.08622684050189576}]
[{'0': 0.04653256875260663}]
[{'0': 0.08345012883464545}]
[{'0': 0.03705298230159677}]
[{'0': 0.1803708328413856}]
[{'0': 0.07128929238159139}]
[{'0': 0.09827927965802122}]
[{'0': 0.081234483884741}]
[{'0': 0.06442468533164561}]
[{'0': 0.027230538047849612}]
[{'0': 0.04943569611658162}]
[{'0': 0.08863245601328298}]
[{'0': 0.04947601595744914}]
[{'0': 0.047646907501775894}]
[{'0': 0.168473728

[{'0': 0.11648418880471915}]
[{'0': 0.05733922147439844}]
[{'0': 0.07598841837609684}]
[{'0': 0.1122497964739105}]
[{'0': 0.030723939320702148}]
[{'0': 0.028183831378683275}]
[{'0': 0.07159323992804821}]
[{'0': 0.10367694297663593}]
[{'0': 0.06662827649023872}]
[{'0': 0.04273551359635261}]
[{'0': 0.05059994724875915}]
[{'0': 0.03545522366324573}]
[{'0': 0.07644616870712438}]
[{'0': 0.09765992158951124}]
[{'0': 0.13122078171088045}]
[{'0': 0.053050053335758046}]
[{'0': 0.06333294120201646}]
[{'0': 0.06255634501522858}]
[{'0': 0.12612300600883466}]
[{'0': 0.10188176997761568}]
[{'0': 0.12057394518808295}]
[{'0': 0.07989392123781956}]
[{'0': 0.038202812045367136}]
[{'0': 0.1053498234581826}]
[{'0': 0.07522621673917665}]
[{'0': 0.03659322074168222}]
[{'0': 0.0808203804005545}]
[{'0': 0.0503332471289015}]
[{'0': 0.05633662840553183}]
[{'0': 0.03421958389299418}]
[{'0': 0.1253436407391902}]
[{'0': 0.04498041015235509}]
[{'0': 0.06504743581008711}]
[{'0': 0.04901566643374279}]
[{'0': 0.084530

[{'0': 0.05805290164144178}]
[{'0': 0.06087101893255601}]
[{'0': 0.0884314203918073}]
[{'0': 0.048787805719977914}]
[{'0': 0.029824868273178873}]
[{'0': 0.04769739311809187}]
[{'0': 0.022927769400720395}]
[{'0': 0.026327285810477677}]
[{'0': 0.054416777640335846}]
[{'0': 0.06918391329274706}]
[{'0': 0.03275872056029958}]
[{'0': 0.09214674890509451}]
[{'0': 0.038340677566624605}]
[{'0': 0.07899598245378746}]
[{'0': 0.06043996411671202}]
[{'0': 0.06000048738911258}]
[{'0': 0.05539472787896506}]
[{'0': 0.10064002545642148}]
[{'0': 0.050307691229796093}]
[{'0': 0.0518524190053738}]
[{'0': 0.03311474663413924}]
[{'0': 0.10578886076134253}]
[{'0': 0.049010912977873265}]
[{'0': 0.06501081085070043}]
[{'0': 0.08759078092128891}]
[{'0': 0.08988191006957896}]
[{'0': 0.07778932836182495}]
[{'一见倾心': 0.1786276741455587}, {'三十余年': 0.3222335331930738}, {'两人共进': 0.21326292546427233}, {'众所周知': 0.09885471796684962}, {'在一起': 0.08437701906747715}, {'妻子2': 0.16918710407318044}, {'妻子的浪漫旅行2': 0.2677563054068

[{'0': 0.1867170176801605}]
[{'0': 0.04459369134385434}]
[{'0': 0.06624944831158831}]
[{'0': 0.06529878163246562}]
[{'0': 0.06529878163246562}, {'1000': 0.05104220498120999}, {'1400': 0.07867669914384907}, {'2': 0.03716955673705099}, {'2000': 0.04950094680944062}, {'2018': 0.03016123163152942}, {'5': 0.061263664383557084}, {'60': 0.05565511258670477}, {'7': 0.041943123826494205}, {'70': 0.055527176337040816}, {'app': 0.04788876420099625}, {'end': 0.0648388908940582}, {'facebook': 0.07010973457324125}, {'pay': 0.14279288404926954}, {'win': 0.13635522481109655}, {'一时间': 0.0733710712049993}, {'一起来': 0.0699680449587895}, {'一辆车': 0.09598989120758629}, {'上下学': 0.11999415933409487}, {'世界各地': 0.13013700798222114}, {'东南亚': 0.07329770259378754}, {'东南亚市场': 0.09889277340254114}, {'中国人': 0.06563642395242385}, {'主城区': 0.09973479962706115}, {'互联网': 0.11299510085582158}, {'人民的': 0.08196489333987543}, {'众所周知': 0.06042013254358993}, {'使用率': 0.0867155265771983}, {'共享单车': 0.2736447127140819}, {'创始人': 0.08

[{'10': 0.04668429003057497}, {'30': 0.05281373248577205}, {'48': 0.06580802956620144}, {'70': 0.058003757724032705}, {'一个个': 0.0762646380244955}, {'一个人': 0.08953065779669092}, {'一个月': 0.05966021096316209}, {'一瞬间': 0.08827993932588607}, {'一般人': 0.08419301269910925}, {'不可能': 0.057409320046053695}, {'不约而同': 0.09654827374849904}, {'不见得': 0.0954021895627586}, {'主力建仓': 0.1215096680069072}, {'事实上': 0.0532254955101605}, {'交易量': 0.0788992392163969}, {'人性的弱点': 0.10801947684238479}, {'信号灯': 0.10842983190738288}, {'关键因素': 0.08488772640481183}, {'关键在于': 0.08696719297980295}, {'关键性': 0.0886529842760275}, {'几十倍': 0.10274778884217173}, {'出现异常': 0.10480346328354842}, {'分时图': 0.30516774641459904}, {'分析法': 0.10929048453982942}, {'刚刚开始': 0.08248017172352194}, {'十几个': 0.08730481828799691}, {'十字线': 0.20985037980128857}, {'千百万': 0.12534604864592938}, {'另一方': 0.09509204797148389}, {'可能会': 0.04707192889135542}, {'可能性': 0.0575462827770722}, {'同方向': 0.13318447028231367}, {'基本要素': 0.11538158043993618}, {'失败者': 0

[{'0': 0.06419305497704667}]
[{'0': 0.15490143287955407}]
[{'0': 0.038438258777916186}]
[{'0': 0.03797756880341599}]
[{'0': 0.13103726133629032}]
[{'0': 0.025617146509201032}]
[{'0': 0.06610983510842895}]
[{'0': 0.07964903236570137}]
[{'0': 0.10548715398945747}]
[{'0': 0.05412174437640893}]
[{'0': 0.030349084211761572}]
[{'0': 0.03710394161378617}]
[{'0': 0.08811741386978515}]
[{'0': 0.05272351377586847}]
[{'0': 0.0630233493823604}]
[{'0': 0.05180391619274851}]
[{'0': 0.0612127825479008}]
[{'0': 0.07615420876459496}]
[{'0': 0.06486196179056387}]
[{'0': 0.05497461961609579}]
[{'0': 0.06388730841418624}]
[{'0': 0.11706345514452647}]
[{'0': 0.09964384795273491}]
[{'0': 0.053527977688321626}]
[{'0': 0.11090990152072774}]
[{'0': 0.06333571867009963}]
[{'0': 0.07732795158595115}]
[{'0': 0.04646578543870987}]
[{'0': 0.15262390740742873}]
[{'0': 0.09543769634097934}]
[{'0': 0.1368082287938179}]
[{'0': 0.040263964315828946}]
[{'0': 0.07425225349066758}]
[{'0': 0.057889198487379265}]
[{'0': 0.06

[{'0': 0.04935348768642634}]
[{'0': 0.09815539668719857}]
[{'0': 0.051570382904582455}]
[{'0': 0.13011973706536606}]
[{'0': 0.04656750459983809}]
[{'0': 0.09790374217162927}]
[{'0': 0.07849114346679709}]
[{'0': 0.02373515521934466}]
[{'0': 0.07154083841188857}]
[{'0': 0.04747685905189898}]
[{'0': 0.04164871092231335}]
[{'0': 0.058739040231902157}]
[{'0': 0.03583321985560945}]
[{'0': 0.05241367849229217}]
[{'0': 0.11102425087125396}]
[{'0': 0.07004651489605548}]
[{'0': 0.026435449816182836}]
[{'0': 0.03526079134254343}]
[{'0': 0.034739680133591584}]
[{'0': 0.030190379636271372}]
[{'0': 0.10122633054368918}]
[{'0': 0.18055353312084735}]
[{'0': 0.08272670530914696}]
[{'0': 0.037328995790106956}]
[{'0': 0.05785480964271486}]
[{'0': 0.03792505393554996}]
[{'0': 0.07059630326325944}]
[{'0': 0.08591752588379391}]
[{'0': 0.06961978902110376}]
[{'0': 0.04158267527454992}]
[{'0': 0.05231131863320241}]
[{'0': 0.04702153598848288}]
[{'0': 0.027827590653260584}]
[{'0': 0.04126451489695678}]
[{'0': 

[{'0': 0.025629318551877472}]
[{'0': 0.1524791824474088}]
[{'0': 0.09558651934572897}]
[{'0': 0.06763765431094353}]
[{'0': 0.0383565205790395}]
[{'0': 0.09263564229295797}]
[{'0': 0.05598618485542483}]
[{'0': 0.07257572224918711}]
[{'0': 0.040483464422113195}]
[{'0': 0.051644540500933755}]
[{'0': 0.041524238070306946}]
[{'0': 0.025887291243534977}]
[{'0': 0.028554101702134113}]
[{'0': 0.12989750296756353}]
[{'0': 0.030335696325854943}]
[{'0': 0.06380511077105923}]
[{'0': 0.10450684484154324}]
[{'0': 0.04902509521112038}]
[{'0': 0.11740244743934167}]
[{'0': 0.1368568105818649}]
[{'0': 0.03725384385675863}]
[{'0': 0.030719663735454816}]
[{'0': 0.06241133014570186}]
[{'0': 0.055849892332791314}]
[{'0': 0.0563104662349433}]
[{'0': 0.07273716257641373}]
[{'0': 0.07367302508407117}]
[{'0': 0.05995908798241237}]
[{'0': 0.07963660696997293}]
[{'0': 0.05339773882907605}]
[{'0': 0.05052279830004917}]
[{'0': 0.03936797418336764}]
[{'0': 0.061073307466356926}]
[{'0': 0.09957430814204996}]
[{'0': 0

[{'0': 0.05624370016697867}]
[{'0': 0.04202345721524888}]
[{'0': 0.10953087146593127}]
[{'0': 0.05365379511280852}]
[{'0': 0.11157934311047123}]
[{'0': 0.03500088450040488}]
[{'0': 0.057789852642836176}]
[{'0': 0.11671733603297674}]
[{'0': 0.05444635215224747}]
[{'0': 0.0754530057940047}]
[{'0': 0.11302890312194895}]
[{'0': 0.02867859119774338}]
[{'0': 0.054852943555474994}]
[{'0': 0.08710480490036109}]
[{'0': 0.043759698007795576}]
[{'0': 0.07260466448404307}]
[{'0': 0.07240595559526097}]
[{'0': 0.056244795509048554}]
[{'0': 0.06658098968797996}]
[{'0': 0.132280956978386}]
[{'0': 0.058680159621086594}]
[{'0': 0.02469492440571248}]
[{'0': 0.05233300317377598}]
[{'0': 0.06444941559658081}]
[{'0': 0.036239613678998715}]
[{'0': 0.1839074054607794}]
[{'0': 0.07711641434630759}]
[{'0': 0.07516394575769297}]
[{'0': 0.09328159389366292}]
[{'0': 0.03738375585822114}]
[{'0': 0.10554755432688656}]
[{'0': 0.054927217725424256}]
[{'0': 0.07043336699808513}]
[{'0': 0.05078556423324339}]
[{'0': 0.09

[{'0': 0.07051088257904448}]
[{'0': 0.04062499114413826}]
[{'0': 0.07531295746907608}]
[{'0': 0.0712626177439308}]
[{'0': 0.054061593835601}]
[{'0': 0.044039410891795694}]
[{'0': 0.10784499474085829}]
[{'0': 0.08526580448807279}]
[{'0': 0.058926261044647876}]
[{'0': 0.14210294220079672}]
[{'0': 0.04309081665156807}]
[{'0': 0.06883976636278381}]
[{'0': 0.18339166672172655}]
[{'0': 0.04370980355372499}]
[{'0': 0.04534323913799302}]
[{'0': 0.07039857916877425}]
[{'0': 0.09855113374509263}]
[{'0': 0.04472300325051349}]
[{'0': 0.05418267360681998}]
[{'0': 0.08868290915998361}]
[{'0': 0.04219978299571932}]
[{'0': 0.08422661280114559}]
[{'0': 0.029735998299582643}]
[{'0': 0.14187494979017626}]
[{'0': 0.21753730262303045}]
[{'0': 0.08452506703637999}]
finished


In [124]:
def tfidf_keywords():
    # 00、读取文件,一行就是一个文档，将所有文档输出到一个list中
    corpus = []
    for line in open('news.txt', 'r').readlines():
        corpus.append(line)

    # 01、构建词频矩阵，将文本中的词语转换成词频矩阵
    vectorizer = CountVectorizer()
    # a[i][j]:表示j词在第i个文本中的词频
    X = vectorizer.fit_transform(corpus)
    print X  # 词频矩阵

    # 02、构建TFIDF权值
    transformer = TfidfTransformer()
    # 计算tfidf值
    tfidf = transformer.fit_transform(X)

    # 03、获取词袋模型中的关键词
    word = vectorizer.get_feature_names()

    # tfidf矩阵
    weight = tfidf.toarray()

    # 打印特征文本
    print len(word)
    for j in range(len(word)):
        print word[j]

    # 打印权重
    for i in range(len(weight)):
        for j in range(len(word)):
            print weight[i][j]
            # print '\n'

In [129]:
jieba_tags = jieba.analyse.extract_tags(sentence=data[1]['content'], topK=20, allowPOS=('r','m','d', 'p', 'q', 'ad', 'u', 'f'), withWeight=True, withFlag=True)
jieba_tags

[(pair('仿真', 'd'), 1.3352572514828571),
 (pair('1.', 'm'), 0.5692746429952381),
 (pair('2.', 'm'), 0.5692746429952381),
 (pair('3.', 'm'), 0.5692746429952381),
 (pair('我么', 'r'), 0.5692746429952381),
 (pair('我方', 'r'), 0.4335660526847619),
 (pair('三维', 'm'), 0.41879677038476193),
 (pair('此类', 'r'), 0.35080422924),
 (pair('业内', 'f'), 0.3286311992604762),
 (pair('特定', 'd'), 0.32387980516285714),
 (pair('深刻', 'd'), 0.3218400512985714),
 (pair('若干', 'm'), 0.31838920180095237),
 (pair('真实', 'd'), 0.31049627921666667),
 (pair('众多', 'm'), 0.2812081443657143),
 (pair('一系列', 'm'), 0.2764613758414286),
 (pair('很快', 'd'), 0.27028467095285713),
 (pair('内部', 'f'), 0.2684818418557143),
 (pair('多年', 'm'), 0.2658180634033333),
 (pair('最终', 'd'), 0.2541329396461905)]

In [None]:
text = 19*(x['title_cut']+'。')+ 3*(x['first_sentence']+'。') + 1*(x['other_sentence']+'。')+\
        3*(x['last_sentence']+ '。') + 7*first_sentence_reg
jieba_tags = jieba.analyse.extract_tags(sentence=text, topK=20, allowPOS=('r','m','d', 'p', 'q', 'ad', 'u', 'f'), withWeight=True,\
                                  withFlag=True)

In [None]:
def get_train_df(df, train=True):
    res = []
    for index in tqdm(df.index):
        
        x = df.loc[index]
        # TF-IDF
        first_sentence_reg = ' '.join(x['first_sentence_reg'])
        ## 这里主要是提取jieba默认的tf-idf值 我这边jieba自带的idf文件效果比自己提取的要好 所以也顺便用来筛选候选词 
        ## 假如把topK设置为None的话 数据量会增大10倍 但是非常难跑 没有验证过效果
        ## PS：这里我稍微修改了一下jieba的源码 allowpPOS实际是不允许出现的词性 即allowPOS = NotAllowPOS
        
        text = 19*(x['title_cut']+'。')+ 3*(x['first_sentence']+'。') + 1*(x['other_sentence']+'。')+\
                3*(x['last_sentence']+ '。') + 7*first_sentence_reg
        jieba_tags = jieba.analyse.extract_tags(sentence=text, topK=20, allowPOS=('r','m','d', 'p', 'q', 'ad', 'u', 'f'), withWeight=True,\
                                          withFlag=True)

        tags = []
        cixing = []
        weight = []
        for tag in jieba_tags:
            tags.append(tag[0].word)
            cixing.append(tag[0].flag)
            weight.append(tag[1])

        sentence_delimiters = re.compile(u'[。？！；!?]')
        sentences =[i for i in sentence_delimiters.split(text) if i != '']
        num_sen = len(sentences)

        words = []
        num_words = 0
        for sen in sentences:
            cut = jieba.lcut(sen)
            words.append(cut)
            num_words += len(cut)
        
        new_tags = []
        new_cixing = []
        new_weight = []
        len_tags = []
        for i in range(len(tags)):
            if tags[i].isdigit() and tags[i] not in ['985', '211']:
                continue
            if ',' in tags[i]:
                continue
            new_tags.append(tags[i])
            new_weight.append(weight[i])
            new_cixing.append(cixing[i])
            len_tags.append(len(tags[i]))
            
            
        ## 位置特征： 1. 是否出现在标题 2.是否出现在第一句 3.是否出现在最后一句 4.出现在正文中间部分
        occur_in_title = np.zeros(len(new_tags))
        occur_in_first_sentence = np.zeros(len(new_tags))
        occur_in_last_sentence = np.zeros(len(new_tags))
        occur_in_other_sentence = np.zeros(len(new_tags))
        for i in range(len(new_tags)):
            if new_tags[i] in x['title_cut']:
                occur_in_title[i] = 1
            if new_tags[i] in x['first_sentence']:
                occur_in_first_sentence[i] = 1
            if new_tags[i] in x['last_sentence']:
                occur_in_last_sentence[i] = 1
            if new_tags[i] in x['other_sentence']:
                occur_in_other_sentence[i] = 1
        
        
        ## 共现矩阵及相关统计特征 这里我一开始统计了好多 例如均值、方差、偏度等 得到新特征后贪心验证只保留以下三个 下面的统计特征同理
        num_tags = len(new_tags)
        arr = np.zeros((num_tags, num_tags))
        for i in range(num_tags):
            for j in range(i+1, num_tags):
                count = 0
                for word in words:
                    if new_tags[i] in word and new_tags[j] in word:
                        count += 1
                arr[i, j] = count
                arr[j, i] = count
        ske = stats.skew(arr)
        # cols += ['var_gongxian']
        # cols += ['kurt_gongxian']
        # cols += ['diff_min_gongxian']   
        var_gongxian = np.zeros(len(new_tags))
        kurt_gongxian = np.zeros(len(new_tags))
        diff_min_gongxian = np.zeros(len(new_tags))
        for i in range(len(new_tags)):
            var_gongxian[i] = np.var(arr[i])
            kurt_gongxian[i] = stats.kurtosis(arr[i])
            diff_sim = np.diff(arr[i])
            if len(diff_sim) > 0:
                diff_min_gongxian[i] = np.min(diff_sim)

                
        ## textrank特征
        textrank_tags = dict(jieba.analyse.textrank(sentence=text, allowPOS=('r','m','d', 'p', 'q', 'ad', 'u', 'f'), withWeight=True))
        
        textrank = []
        for tag in new_tags:
            if tag in textrank_tags:
                textrank.append(textrank_tags[tag])
            else:
                textrank.append(0)
                
        all_words = np.concatenate(words).tolist()
        
        ## 词频
        tf = []
        for tag in new_tags:
            tf.append(all_words.count(tag))
        tf = np.array(tf)
        
        ## hf: 头词频，文本内容前1/4候选词词频
        hf = []
        head = len(words) // 4 + 1
        head_words = np.concatenate(words[:head]).tolist()
        for tag in new_tags:
            hf.append(head_words.count(tag))
        
        ## has_num：是否包含数字
        ## has_eng: 是否包含字母
        def hasNumbers(inputString):
            return bool(re.search(r'\d', inputString))
        def hasEnglish(inputString):
            return bool(re.search(r'[a-zA-Z]', inputString))
        has_num = []
        has_eng = []
        for tag in new_tags:
            if hasNumbers(tag):
                has_num.append(1)
            else:
                has_num.append(0)
            if hasEnglish(tag):
                has_eng.append(1)
            else:
                has_eng.append(0)
                
        ## is_TV:是否为作品名称
        is_TV = []
        for tag in new_tags:
            if tag in TV:
                is_TV.append(1)
            else:
                is_TV.append(0)
                
        ## idf: 用训练集跑出的逆词频
        v_idf = []
        for tag in new_tags:
            v_idf.append(idf.get(tag, 0))
        
        ## 计算文本相似度，这里直接用doc2vec跟每个单词的word2vec做比较
        ## sim: 余弦相似度
        ## sim_euc：欧氏距离
        default = np.zeros(100)
        doc_vec = doc2vec_model.docvecs.vectors_docs[x['idx']]
        sim = []
        sim_euc = []
        for tag in new_tags:
            if tag in wv:
                sim.append(Cosine(wv[tag], doc_vec))
                sim_euc.append(Euclidean(wv[tag], doc_vec))
            else:
                sim.append(Cosine(default, doc_vec))
                sim_euc.append(Euclidean(default, doc_vec))
                
        ## 关键词所在句子长度 L2，记录为列表，然后算统计特征 
        mean_l2 = np.zeros(len(new_tags))
        max_l2 = np.zeros(len(new_tags))
        min_l2 = np.zeros(len(new_tags))
        for i in range(len(new_tags)):
            tmp = []
            for word in words:
                if new_tags[i] in word:
                    tmp.append(len(word))
            if len(tmp) > 0:
                mean_l2[i] = np.mean(tmp)
                max_l2[i] = np.max(tmp)
                min_l2[i] = np.min(tmp)
                
# cols += ['min_pos']
# cols += ['diff_min_pos_bili']
# cols += ['diff_kurt_pos_bili']  

        ## 关键词所在位置，记录为列表，然后算统计特征 

        min_pos = [np.NaN for _ in range(len(new_tags))]
        diff_min_pos_bili = [np.NaN for _ in range(len(new_tags))]
        diff_kurt_pos_bili = [np.NaN for _ in range(len(new_tags))]
        
        for i in range(len(new_tags)):
            pos = [a for a in range(len(all_words)) if all_words[a] == new_tags[i]]
            pos_bili = np.array(pos) / len(all_words)
            
            if len(pos) > 0:
                min_pos[i] = np.min(pos)
                diff_pos = np.diff(pos)
                diff_pos_bili = np.diff(pos_bili)
                if len(diff_pos) > 0:
                    diff_min_pos_bili[i] = np.min(diff_pos_bili)
                    diff_kurt_pos_bili[i] = stats.kurtosis(diff_pos_bili)
                    
        ## 关键词所在句子位置特征，也是做成列表，做统计特征
        # cols += ['diff_max_min_sen_pos']
        # cols += ['diff_var_sen_pos_bili']

        diff_max_min_sen_pos =  [np.NaN for _ in range(len(new_tags))]
        diff_var_sen_pos_bili =  [np.NaN for _ in range(len(new_tags))]  
        for i in range(len(new_tags)):
            pos = [a for a in range(len(words)) if new_tags[i] in words[a]]
            pos_bili = np.array(pos) / len(all_words)
            
            if len(pos) > 0:
                diff_pos = np.diff(pos)
                diff_pos_bili = np.diff(pos_bili)
                if len(diff_pos) > 0:
                    diff_max_min_sen_pos[i] = np.max(diff_pos) - np.min(diff_pos)
                    diff_var_sen_pos_bili[i] = np.var(diff_pos_bili)
                    
#         ## 左右信息熵 没用
#         left_entropy = []
#         right_entropy = []
#         for tag in new_tags:
#             left = []
#             right = []
#             for word in words:
#                 if len(word) < 3:
#                     continue
#                 for i in range(len(word)):
#                     if word[i] == tag:
#                         if i < 1:
#                             left.append('None')
#                             right.append(word[i+1])
#                         if i == (len(word) - 1):
#                             left.append(word[i-1])
#                             right.append('None')
#             left_entropy.append(calc_ent(np.array(left)))
#             right_entropy.append(calc_ent(np.array(right)))
                
        ## 候选关键词之间的相似度 word2vec gensim 窗口默认 迭代默认 向量长度100
        ## sim_tags_arr：相似度矩阵
        sim_tags_arr = np.zeros((len(new_tags), len(new_tags)))
        for i in range(len(new_tags)):
            for j in range(i+1, len(new_tags)):
                if new_tags[i] in wv and new_tags[j] in wv:
                    sim_tags_arr[i, j] = word2vec_model.similarity(new_tags[i], new_tags[j])
                    sim_tags_arr[j, i] = sim_tags_arr[i, j]
            # cols += ['mean_sim_tags']
# cols += ['diff_mean_sim_tags']        
#         max_sim_tags = np.zeros(len(new_tags))
#         min_sim_tags = np.zeros(len(new_tags))
        mean_sim_tags = np.zeros(len(new_tags))
#         var_sim_tags = np.zeros(len(new_tags))
#         skew_sim_tags = np.zeros(len(new_tags))
#         kurt_sim_tags = np.zeros(len(new_tags))
#         max_min_sim_tags = np.zeros(len(new_tags))
#         diff_max_sim_tags = np.zeros(len(new_tags))
#         diff_min_sim_tags = np.zeros(len(new_tags))
        diff_mean_sim_tags = np.zeros(len(new_tags))
#         diff_var_sim_tags = np.zeros(len(new_tags))
#         diff_skew_sim_tags = np.zeros(len(new_tags))
#         diff_kurt_sim_tags = np.zeros(len(new_tags))
#         diff_max_min_sim_tags = np.zeros(len(new_tags))       
        for i in range(len(new_tags)):
#             max_sim_tags[i] = np.max(sim_tags_arr[i])
#             min_sim_tags[i] = np.min(sim_tags_arr[i])
            mean_sim_tags[i] = np.mean(sim_tags_arr[i])
#             var_sim_tags[i] = np.var(sim_tags_arr[i])
#             skew_sim_tags[i] = stats.skew(sim_tags_arr[i])
#             kurt_sim_tags[i] = stats.kurtosis(sim_tags_arr[i])
#             max_min_sim_tags[i] = np.max(sim_tags_arr[i]) - np.min(sim_tags_arr[i])
            diff_sim = np.diff(sim_tags_arr[i])
            if len(diff_sim) > 0:
#                 diff_max_sim_tags[i] = np.max(diff_sim)
#                 diff_min_sim_tags[i] = np.min(diff_sim)
                diff_mean_sim_tags[i] = np.mean(diff_sim)
#                 diff_var_sim_tags[i] = np.var(diff_sim)
#                 diff_skew_sim_tags[i] = stats.skew(diff_sim)
#                 diff_kurt_sim_tags[i] = stats.kurtosis(diff_sim)
#                 diff_max_min_sim_tags[i] = np.max(diff_sim) - np.min(diff_sim)

        ## 候选关键词之间的相似度 word2vec gensim 窗口10 迭代10 向量长度256 
    
        sim_tags_arr_255 = np.zeros((len(new_tags), len(new_tags)))
        for i in range(len(new_tags)):
            for j in range(i+1, len(new_tags)):
                if new_tags[i] in word2vec_model_256 and new_tags[j] in word2vec_model_256:
                    sim_tags_arr_255[i, j] = word2vec_model_256.similarity(new_tags[i], new_tags[j])
                    sim_tags_arr_255[j, i] = sim_tags_arr_255[i, j]
# cols += ['diff_max_min_sim_tags_256']
# cols += ['kurt_sim_tags_256']

        kurt_sim_tags_256 = np.zeros(len(new_tags))
        diff_max_min_sim_tags_256 = np.zeros(len(new_tags))       
        for i in range(len(new_tags)):
            kurt_sim_tags_256[i] = stats.kurtosis(sim_tags_arr_255[i])
            diff_sim = np.diff(sim_tags_arr_255[i])
            if len(diff_sim) > 0:
                diff_max_min_sim_tags_256[i] = np.max(diff_sim) - np.min(diff_sim)   
        
        
        ## label 训练集打标签
        if train:
            label = []
            for tag in new_tags:
                if tag in x.kw:
                    label.append(1)
                else:
                    label.append(0)
                    
        ## 不同词性的比例
        cixing_counter = Counter(new_cixing)
        
        fea = pd.DataFrame()
        fea['id'] = [x['id'] for _ in range(len(new_tags))]
        fea['tags'] = new_tags
        fea['cixing'] = new_cixing


        fea['tfidf'] = new_weight
        fea['ske'] = ske
        
        fea['occur_in_title'] = occur_in_title
        fea['occur_in_first_sentence'] = occur_in_first_sentence
        fea['occur_in_last_sentence'] = occur_in_last_sentence
        fea['occur_in_other_sentence'] = occur_in_other_sentence
        fea['len_tags'] = len_tags
        fea['num_tags'] = num_tags
        fea['num_words'] = num_words
        fea['num_sen'] = num_sen
        fea['classes'] = x['classes']

        fea['len_text'] = len(x['title_cut'] + x['content_cut'])
        fea['textrank'] = textrank
        fea['word_count'] = tf
        fea['tf'] = tf / num_words
        fea['num_head_words'] = len(head_words)
        fea['head_word_count'] = hf
        fea['hf'] = np.array(hf) / len(head_words)
        fea['pr'] = tf / tf.sum()
        fea['has_num'] = has_num
        fea['has_eng'] = has_eng
        fea['is_TV'] = is_TV
        fea['idf'] = v_idf
        fea['sim'] = sim
        fea['sim_euc'] = sim_euc

        fea['mean_l2'] = mean_l2
        fea['meaxl2'] = max_l2
        fea['min_l2'] = min_l2
        
        fea['min_pos'] = min_pos
        fea['diff_min_pos_bili'] = diff_min_pos_bili
        fea['diff_kurt_pos_bili'] = diff_kurt_pos_bili
    
        fea['diff_max_min_sen_pos'] = diff_max_min_sen_pos
        fea['diff_var_sen_pos_bili'] = diff_var_sen_pos_bili

        fea['mean_sim_tags'] = mean_sim_tags
        fea['diff_mean_sim_tags'] = diff_mean_sim_tags

        fea['kurt_sim_tags_256'] = kurt_sim_tags_256
        fea['diff_max_min_sim_tags_256'] = diff_max_min_sim_tags_256
        fea['var_gongxian'] = var_gongxian
        fea['kurt_gongxian'] = kurt_gongxian
        fea['diff_min_gongxian'] = diff_min_gongxian
        
        ## 当前文本候选关键词词性比例
        for c in ['x', 'nz', 'l', 'n', 'v', 'ns', 'j', 'a', 'vn', 'nr', 'eng', 'nrt',
                  't', 'z', 'i', 'b', 'o', 'nt', 'vd', 'c', 's', 'nrfg', 'mq', 'rz',
                  'e', 'y', 'an', 'rr']:
            fea['cixing_{}_num'.format(c)] = cixing_counter[c]
            fea['cixing_{}_bili'.format(c)] = cixing_counter[c] / len(new_cixing)

        if train:
            fea['label'] = label
        res.append(fea)
    return res

In [None]:
%%time

In [None]:
# -*- coding: utf-8 -*-

import jieba
import codecs
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize
from sklearn.externals import joblib
import re
from scipy.sparse import vstack
from tqdm import tqdm

class Train():
    def __init__(self):
        # load nerDict as named entity recognizer
        self.loadNerDict()

    def trainCoreEntity(self):
        '''
        train model for coreEntity
        Baseline use entityDict for named entity recognition, you can use a more wise method.
        Baseline use tfIdf score as feature and LR as classification model
        :return:
        '''
        # 1. train tfIdf as core entity score model
        trainData = self.loadData('data/coreEntityEmotion_train.txt')

        print("loading all ner corpus from train data...")

        nerCorpus = []
        for news in tqdm(trainData):
            nerCorpus.append(' '.join(self.getEntity(news)))

        print("fitting ner tfIdf model...")
        tfIdf = TfidfVectorizer()
        tfIdf.fit(nerCorpus)
        # 1.1 save tfIdf model
        joblib.dump(tfIdf, 'models/coreEntityTfIdf.joblib')


        # 2. train LR with tfIdf score as features
        isCoreX = []
        isCoreY = []
        for news in trainData:

            tfIdfNameScore = self.getTfIdfScore(news, tfIdf)

            coreEntity_GroundTruth = [x['entity'] for x in news['coreEntityEmotions']]
            for name, score in tfIdfNameScore:
                if(name in coreEntity_GroundTruth):
                    isCoreX.append([score])
                    isCoreY.append(1)
                else:
                    isCoreX.append([score])
                    isCoreY.append(0)

        # 3. train LR model for coreEntity
        print("training LR model for coreEntity...")
        clf = LogisticRegression(random_state=0, solver='lbfgs',
                                 multi_class='multinomial').fit(isCoreX, isCoreY)
        joblib.dump(clf, 'models/CoreEntityCLF.joblib')

    def trainEmotion(self):
        '''
        train emotion model
        Baseline use tfIdf vector as feature, linearSVC as classfication model
        :return:
        '''
        trainData = self.loadData('data/coreEntityEmotion_train.txt')

        emotionX = []
        emotionY = []

        print("loading emotion corpus from train data...")

        # 1. get all related sentences to the entities
        for news in tqdm(trainData):

            text = news['title'] + '\n' + news['content']
            entities = [x['entity'] for x in news['coreEntityEmotions']]
            emotions = [x['emotion'] for x in news['coreEntityEmotions']]
            entityEmotionMap = dict(zip(entities, emotions))
            entitySentsMap = {}
            for entity in entityEmotionMap.keys():
                entitySentsMap[entity] = []

            for sent in re.split(r'[\n\t，。！？“”（）]',text):
                for entity in entityEmotionMap.keys():
                    if(entity in sent):
                        entitySentsMap[entity].append(sent)

            for entity, sents in entitySentsMap.items():
                relatedText = ' '.join(sents)
                emotionX.append([relatedText])
                emotionY.append(entityEmotionMap[entity])

        # 2. train tf-idf model for emotion related words
        emotionWordCorpus = []
        for news in trainData:
            emotionWordCorpus.append(' '.join(self.getWords(news)))

        print("fitting emotion tfIdf model...")

        tfIdf = TfidfVectorizer()
        tfIdf.fit(emotionWordCorpus)
        joblib.dump(tfIdf, 'models/emotionTfIdf.joblib')

        # 3. use naive bayes to train emotion classifiction
        emotionX = vstack([tfIdf.transform(x) for x in emotionX]).toarray()

        print("training emotion clf with linearSVC...")

        print(emotionX.shape)
        clf = MultinomialNB()
        clf.fit(emotionX, emotionY)

        print(clf.score(emotionX, emotionY))

        joblib.dump(clf, 'models/emotionCLF.joblib')

    def getTfIdfScore(self, news, tfIdf):
        featureName = tfIdf.get_feature_names()

        doc = self.getEntity(news)

        tfIdfFeatures = tfIdf.transform([' '.join(doc)])

        tfIdfScores = tfIdfFeatures.data
        # normalize
        tfIdfScoresNorm = normalize([tfIdfScores], norm='max')

        tfIdfNameScore = [(featureName[x[0]], x[1]) for x in zip(tfIdfFeatures.indices, tfIdfScoresNorm[0])]
        tfIdfNameScore = sorted(tfIdfNameScore, key=lambda x: x[1], reverse=True)

        return tfIdfNameScore

    def loadNerDict(self):
        nerDictFile = codecs.open('models/nerDict.txt','r','utf-8')
        self.nerDict = []
        for line in nerDictFile:
            self.nerDict.append(line.strip())

    def getWords(self, news):
        '''
        get all word list from news
        :param news:
        :return:
        '''
        title = news['title']
        content = news['content']

        words = jieba.cut(title + '\t' + content)

        return list(words)

    def getEntity(self, news):
        '''
        get all entity list from news
        :param news:
        :return:
        '''
        ners = []
        words = self.getWords(news)
        for word in words:
            if (word in self.nerDict):
                ners.append(word)
        return ners

    def loadData(self, filePath):
        f = codecs.open(filePath,'r', 'utf-8')
        data = []
        for line in f.readlines():
            news = json.loads(line.strip())
            data.append(news)
        return data

if __name__ == '__main__':
    trainer = Train()
    trainer.trainCoreEntity()
    trainer.trainEmotion()