In [7]:
# -*- coding: utf-8 -*-
from __future__ import division
import pickle
import requests

import jieba
import re
import operator
import time
import json

import eigen_config
from simplex import utils, logger
# from simplex.model import KeyWordClassifier
# from simplex.utils.finance_util import parse_report,edit_distance

class FinanceParagraphDUV2(object):
    '''
    Paragraph retrieval for financial reports, v2.

    Automatically generated articles are not always accurate, so this class
    returns candidate paragraphs that can be recommended to the user instead.
    It is built entirely on FinancePreprocessV2 running in 'full' mode.
    '''
    def __init__(self, app_config):
        '''Create the underlying FinancePreprocessV2 in 'full' mode.

        Args:
            app_config: dict of settings forwarded to FinancePreprocessV2
                (None or an empty dict is treated as "no settings").
        '''
        # Work on a copy: assigning 'mode' into the caller's dict would
        # silently switch every other user of that config object to 'full'.
        config = dict(app_config) if app_config else {}
        config['mode'] = 'full'
        self.model = FinancePreprocessV2(config)

    def feature_process(self, item):
        '''Parse a financial-report pdf into paragraphs tagged with an intent.

        Args:
            item: raw report content, expected to include
                - pdf_id: id
                - year: year
                - quarter: quarter
                - name: stock name
                - code: stock code
                - finance_type: either 'hushen' or 'xsb'
                - pubdate: publication date
                - url: link to the report
                - tables: table data, json string
                - paragraphs: paragraph data, json string
        return:
            docs: a list of paragraph documents, each doc contains:
                - intent: name of the intent
                - seq: position of the paragraph inside its intent
                - id: "<pdf_id>_<running counter over all paragraphs>"
                - other metadata in item
        '''
        article = self.model.feature_process(item)
        content = article['content']
        # Everything except the generated content is copied onto each doc.
        metadata = {k: v for k, v in article.items() if k != 'content'}
        docs = []
        cnt = 0
        for key in ['导语', '业绩变动原因', '主营业务', '业务进展', '未来计划']:
            tmp_docs = self.model.get_features([{"content": p} for p in content[key]])
            for i, doc in enumerate(tmp_docs):
                doc['intent'] = key
                doc['seq'] = i
                doc['id'] = "{0}_{1}".format(article['pdf_id'], cnt)
                cnt += 1
                doc.update(metadata)
            docs.extend(tmp_docs)
        return docs

class FinancePreprocessV2(object):
    '''
    财报文章生成V2
    - 添加规则辅助生成文章
    '''
    def __init__(self, app_config = {}):
        config = app_config
        self.finance_du_host = config.get("finance_du_host", "https://surreal.aidigger.com/api/v1/du/finance")
        self.reason = config.get('reason', ['原因', '因此', '因为', '所致'])
        self.mode = config.get('mode','article')
        self.version = config.get('version', 0)
        self.min_text_length = config.get("min_text_length",5)
        
        self.labels = ['导语', '业绩变动原因', '主营业务$业务进展', '未来计划', 'other']
        self.labelmap = {
                         '导语': ['导语','意图_企业基本信息介绍_企业基本信息介绍', '意图_企业活动_公司公告信息', '意图_企业业绩_现状偏好', '意图_企业业绩_现状偏坏'],
                         '业绩变动原因': ['意图_企业业绩_业绩变动原因'],
                         '主营业务$业务进展': ['意图_企业业务_业务介绍', '意图_企业业务_业务规模', '意图_企业业务_业务规划'],
                         '未来计划': ['意图_企业业务_业务前景', '意图_企业业绩_前景偏好','意图_企业业绩_前景偏坏'],
                         'other': ['其他', '意图_评论_风险提示','意图_评论_投资建议', '意图_评论_盈利预测', '意图_企业活动_人事活动', '意图_企业活动_融资活动','意图_企业活动_投资活动']
                        }
        self.labelmapR = {}
        for key, value in self.labelmap.items():
            for v in value:
                self.labelmapR[v] = key

        # pattern like 一、（一）、1. 第一节
        self.heading_pattern = [re.compile('^(一|二|三|四|五|六|七|八|九|十|\d+)(、| )'), re.compile('^(（|\()(一|二|三|四|五|六|七|八|九|十|\d+)(）|\))'),
                                re.compile('^\d+(、|.)'), re.compile('^第(一|二|三|四|五|六|七|八|九|十|\d+)节(、| )')]
    
    def _get_features_from_du_host(self, docs):
        '''
        Post docs to the classification service and return its JSON answer
        (the predicted classes and probabilities for each doc).

        Returns an empty list when the request times out or the service does
        not answer with HTTP 200, so callers always receive a list-like value.
        '''
        try:
            ret = requests.post(self.finance_du_host, json=docs, timeout=30)
        # requests.Timeout covers both connect and read timeouts; the timeout
        # argument applies to both phases, but only ReadTimeout used to be caught.
        except requests.Timeout:
            logger.warning("Time out when try to get finance docs features")
            return []
        if ret.status_code != 200:
            # Previously this path fell through and returned None implicitly.
            logger.warning("Unexpected status %s when try to get finance docs features" % ret.status_code)
            return []
        return ret.json()
        
    def get_features(self, docs):
        """
        20一组调用分类api
        """
        batch_size = 20
        batch = []
        results = []
        for doc in docs:
            batch.append(doc)
            if len(batch) == batch_size:
                ret = self._get_features_from_du_host(batch)
                if ret:
                    results.extend(ret)
                batch = []
        if len(batch) != 0:
            ret = self._get_features_from_du_host(batch)
            if ret:
                results.extend(ret)
        return results
    
    def _fuzzy_match_ed(self, pattern, lst, min_edit_distance=3):
        '''
        Match pattern by edit distance
        '''
        refine_lst = [self._remove_heading(l) for l in lst]
        for p in pattern:
            for l in refine_lst:
                ed = edit_distance(p,l)
                # in case string length is less than min_edit_distance
                # for example: 'abc' and 'qwe', ed will be 3, but they are not similar at all
                if ed / min(len(p),len(l)) < 0.6 and ed <= min_edit_distance:
                    return True
        return False

    def _remove_heading(self, content):
        for pattern in self.heading_pattern:
            content = pattern.sub('',content)
        return content

    def _merge_paragraphs(self, paras):
        '''
        主要解决pdf解析出现的错误分段，如下：
        ['（2）产品线丰富：报告期间世纪明德加强了国', '内研学、国际游学、社会实践以及教师培训产品的研发，产品可以覆盖更大客户群，可以为存量客户提供', '更多的产品服务内容；']
        主要发生在新三板财报中，基于规则来合并: 
        1. 段落以 。？！” 结尾
        2. 非标题句子合并到上一个段落
        3. 标题句子不合并到上一个段落
        4. 标题句子如果以空格结尾，则单独成段（这个规则只基于个别新三板的解析结果发现，有待进一步考证）
        '''
        end_tokens = tuple('。？！”')
        merged_paras = []
        para = ''
        for p in paras:
            find = False
            for pattern in self.heading_pattern:
                if pattern.match(p):
                    find = True
                    merged_paras.append(para)
                    para = p
                    if para.endswith(" "):
                        merged_paras.append(para)
                        para = ''
                    break
            if not find:
                if para.endswith(end_tokens):
                    merged_paras.append(para)
                    para = p
                else:
                    para += p
        merged_paras.append(para)
        return [p.replace(" ",'') for p in merged_paras if p]

    def _match(self, pattern, lst):
        l = ' '.join(lst)
        for p in pattern:
            if p in l:
                return True
#         if fuzzy_match:
#             return self._fuzzy_match_ed(pattern, lst)
        return False

    def _isReason(self, text):
        for r in self.reason:
            if r in text:
                return True
        return False
    
    def _intentMap(self, predict):
        """
        将现有意图映射到芥末堆所给的意图上去, 并将权重求和
        return:
            {content1: {intent1: prob1, intent2: prob2}, content2: ...}
        """
        intent = {x['content']: x['features']['intents'] for x in predict}
        for key, value in intent.items():
            intent[key] = {x['name']: x['prob'] for x in value}
            result = {}
            for l in self.labels:
                result[l] = 0
            for k in intent[key].keys():
                result[self.labelmapR[k]] += intent[key][k]
            intent[key] = result
        return intent
    
    def _getParabyIntent(self, docs, itt, num, threshold = 0.5):
        """
        返回给定意图概率（>threshold）前num的文本，
        """
        intent = self._intentMap(docs)
        intentL = {}
        for l in self.labels:
            intentL[l] = {}
        for k, v in intent.items():
            for i, p in v.items():
                intentL[i][k] = p
        for i in intentL.keys():
            intentL[i] = sorted(intentL[i].items(), key=operator.itemgetter(1))
        return [p[0] for p in intentL[itt][-num:] if (p[1] > threshold)]

    def _getPara(self, match, style, paragraphs, filter_headline = True, crap = None):
        """
        给定条件，返回符合条件的段落。
        """
        paras_raw = [p for p in paragraphs if p['name'] == style 
                if p['parents'] if self._match(match, p['parents'])
                if not p['content'].startswith('公司是否需要')]
        paras = [p['content'] for p in paras_raw]
        if crap is not None:
            index = len(paras)
            indexcontent = ''
            flag = False
            for i in range(len(paras)):
                for parent in paras_raw[i]['parents']:
                    if crap in parent:
                        indexcontent = paras_raw[i]['content']
                        flag = True
                if flag is True:
                    break
            if indexcontent != '':
                index = paras.index(indexcontent)
            paras = paras[:index]
            paras_raw = paras_raw[:index]
        if not filter_headline:
            for i, p in enumerate(paras_raw):
                if not p['parents'][0] in paras:
                    index = paras.index(p['content'])
                    paras.insert(index, p['parents'][0])
        return paras

    def _filter_by_intent(self, paras):
        docs = [{"content":p} for p in paras]
        docs = self.get_features(docs)
        paras = [p['content'] for p in docs if max(p['features']['intents'],key=lambda item:item['prob'])['name']!='其他']
        return paras

    def _filter_text(self, text):
        if len(text) <= self.min_text_length:
            return True
        if '商业模式' in text and '未发生' in text:
            return True
        if '年度内变化统计' in text:
            return True
        if u'□' in text or u'√' in text:
            return True
        return False
        
    def extractFromRow(self, row, changeIndex = 3):
        try:
            num = float(row[1]['text'].strip('）元 ').replace(',', ''))
            income = ''
            change = ''
            if num >= 100000000:
                income = str(round(num / 100000000, 2)) + '亿元'
            else:
                income = str(round(num / 10000, 2)) + '万元'
            if '-' in row[changeIndex]['text']:
                change = '减少' + row[changeIndex]['text'].strip('%- ') + '%'
            else:
                change = '增长' + row[changeIndex]['text'].strip('% ') + '%'
        except:
            return '__', '__'
        return income, change
        
    def _getLeadLanguage(self, table, finance_type, companyname, year, quarter):
        localtime = time.localtime(time.time()) 
        finance_data = {'营业收入': '', '营业收入变动': '', '净利润': '', '净利润变动': '', '总资产': ''}
        profit_type = {'hushen': '归属于上市公司股东的净利润', 'xsb': '归属于挂牌公司股东的净利润'}
        quarter_map = {'1': '第一季度', '2': '半年度', '3': '第三季度', '4': '年度'}
        if len(table) <= 1:
            return companyname + year + quarter_map[quarter] + '财报: 营业收入__元, 净利润__元', []
        changeIndex = 3
        for t in table:
            for i in range(len(t)):
                if '增减' in t[i]['text']:
                    changeIndex = i
                    break
            if '营业收入' in t[0]['text'] or '收入（元）' in t[0]['text']:
                finance_data['营业收入'], finance_data['营业收入变动'] = self.extractFromRow(t, changeIndex)
            elif '归属于上市公司股东的净利润' in t[0]['text'] or '归属于挂牌公司股东的净利润' == t[0]['text'].strip():
                finance_data['净利润'], finance_data['净利润变动'] = self.extractFromRow(t, changeIndex)
            elif '总资产' in t[0]['text'] or '资产总' in t[0]['text']:
                finance_data['总资产'], _ = self.extractFromRow(t)
        ll = '芥末堆' + str(localtime.tm_mon) + '月' + str(localtime.tm_mday) + '日讯，近日，' + companyname + '发布' + year + '年' + quarter_map[quarter] + '报告， 报告期内，' + companyname +  '营业收入为' + finance_data['营业收入'] + '， 同比' + finance_data['营业收入变动'] + '， ' + profit_type[finance_type] + '为'  +  finance_data['净利润'] + '， 同比' + finance_data['净利润变动'] + '。'
        title = companyname + year + quarter_map[quarter] + '财报: 营收' + finance_data['营业收入'] + '，净利润' + finance_data['净利润']
        if finance_data['总资产'] == '':
            return title, [ll]
        else:
            ll = ll + '截至报告期末，' + companyname + '总资产' + finance_data['总资产'] + '。'
            return title, [ll]    
        
    def _get_reason(self, finance_type, style, paragraphs, mode = None):
        '''根据finance_type({'hushen', 'xinsanban'}), style({'season', 'year'}和mode({'article', 'full'})), 
        返回由规则及模型生成的 *业绩变动原因* 意图段落
        return type: list(str)
        '''
        #A股规则
        if finance_type == 'hushen':
            if style == 'year':
                paras = [p for p in self._getPara(['经营情况讨论与分析'], 'Paragraph', paragraphs)
                     if ('营业收入' in p or '净利润' in p) and self._isReason(p)
                     if not self._filter_text(p)]
                return paras
            else:
                paras = [p['content'] for p in paragraphs if self._match(['变动的情况及原因'], p['parents']) 
                         if '营业收入' in p['content'] or '净利润' in p['content'] or '主营业务收入' in p['content']]
                return paras
        #新三板规则
        else:
            # TODO: 是否需要加入标题过滤？
            paras = [p for p in self._getPara(['经营情况','重大变动原因'],"Paragraph", paragraphs)
                     if ('营业收入' in p or '净利润' in p) and 
                     self._isReason(p)]
            return paras
    
    def _get_mainB(self, finance_type, style, paragraphs, mode = None):
        if finance_type == 'hushen':
            if style == 'year':
                if mode == 'article':
                    paras = self._getPara(['从事的主要业务'], 'Paragraph', paragraphs)
                    if len(paras) > 0:
                        return [paras[0]]
                    else:
                        return []
                else:
                    paras = self._getPara(['从事的主要业务'], 'Paragraph', paragraphs, filter_headline = False)
                    return paras
            else:
                return []
        else:
            paras = self._getPara(['商业模式'], 'Paragraph', paragraphs)
            paras = [p for p in paras if not self._filter_text(p)]
            return paras
    
    def _get_BProgress(self, finance_type, style, paragraphs, mode = None):
        if finance_type == 'hushen':
            if style == 'year':
                if mode == 'article':
                    paras_raw = [p for p in self._getPara(['概述', '经营情况的讨论与分析'], 'Paragraph', paragraphs)]
                    paras = self._filter_by_intent(paras_raw)
                    if len(paras) >= 3:
                        return paras[:3]
                    else:
                        return paras
                else:
                    paras1 = [p for p in self._getPara(['概述', '经营情况的讨论与分析'], 'Paragraph', paragraphs, filter_headline = False, crap = '主营业务分析')]
                    paras2 = [p for p in self._getPara(['核心竞争力分析'], 'Paragraph', paragraphs, filter_headline=False)]
                    return [p for p in paras1 + paras2 if not self._filter_text(p)]
            else:
                return []
        else:
            paras = self._getPara(['经营情况', '总体回顾', '经营计划'], 'Paragraph', paragraphs, filter_headline = False, crap = '主营业务分析')
            paras = [p for p in paras if not self._filter_text(p) if not self._isReason(p)]
            return paras
    
    def _get_future(self, finance_type, style, paragraphs, mode = None):
        if finance_type == 'hushen':
            if style == 'year':
                paras = self._getPara(['经营计划', '发展计划', '经营工作计划', '经营管理计划', '发展战略'], 'Paragraph', paragraphs, filter_headline = False)
                if mode == 'article':
                    return self._filter_by_intent(paras)
                return paras
            else:
                return []
        else:
            paras = self._getPara(['发展战略', '经营计划或目标'], 'Paragraph', paragraphs, filter_headline = False )
            if mode == 'article':
                return self._filter_by_intent(paras)
            return paras
        
    def _split_paragraphs(self, paras):
        ret = []
        for p in paras:
            ps = p.split("\n")
            ret.extend(ps)
        return ret

    def feature_process(self, item):
        '''处理财报pdf，解析为段落，标注意图
        Args:
            item: 原始财报pdf内容，应该包括
                - pdf_id: id
                - year: 年份
                - quarter: 季度
                - name：股票名称
                - code：股票代码
                - finance_type： hushen 和 xsb 两种类型
                - pubdate： 发布日期
                - url：财报链接
                - tables： 表格数据， json string
                - paragraphs： 段落数据，json string
                - meta: 季报才有的key，存放最近年报或半年报数据
        return：
            article：
                根据paragraphs生成的文章
        '''
        if item['paragraphs'] is None:
            paragraphs = []
        elif type(item['paragraphs']) == str:
            paragraphs = json.loads(item['paragraphs'])
        else:
            paragraphs = item['paragraphs']

        if item.get('meta') is None:
            paragraphs_meta = []
        elif item.get('meta').get('paragraphs') is None:
            paragraphs_meta = []
        elif type(item['meta']['paragraphs']) == str:
            paragraphs_meta = json.loads(item['meta']['paragraphs'])
        else:
            paragraphs_meta = item['meta']['paragraphs'] 
        
        if item['tables'] is None:
            tables = []
        elif type(item['tables']) == str:
            tables = json.loads(item['tables'])
        else:
            tables = item['tables']

        finance_type = item['finance_type']
        quarter = item['quarter']
        companyname = item['name']

        table = [t['body'] for t in tables if t['name'] == 'finance-main']
        if len(table) != 0:
            table = table[0]

        year = item['year']
        style = 'year'
        if int(quarter) == 1 or int(quarter) == 3:
            style = 'season'
            
        
        #获取导语
        title, ll = self._getLeadLanguage(table, finance_type, companyname, str(year), str(quarter)) 
        #获取业绩变动原因段落
        reason = self._get_reason(finance_type, style, paragraphs, self.mode)
        reason = self._split_paragraphs(reason)

        #获取主营业务段落
        if style == 'year':
            mainB = self._get_mainB(finance_type, style, paragraphs, self.mode)
        else:
            mainB = self._get_mainB(finance_type, 'year', paragraphs_meta, self.mode)
        mainB = self._split_paragraphs(mainB)

        #获取业务进展段落
        BProgress = self._get_BProgress(finance_type, style, paragraphs, self.mode)
        BProgress = self._split_paragraphs(BProgress)

        #获取未来计划段落
        future = self._get_future(finance_type, style, paragraphs, self.mode)
        future = self._split_paragraphs(future)

        result = {'标题': title, '导语': ll, '业绩变动原因': reason, '主营业务': mainB, '业务进展': BProgress, '未来计划': future}       
        for key in ['业绩变动原因', '主营业务', '业务进展', '未来计划']:
            for i in range(len(result[key])):
                result[key][i] = result[key][i].replace('公司', companyname)
        doc = {'version': self.version, 'quarter': quarter, 'url': item['url'], 'year': year, 'finance_type': finance_type, 'pubdate': item['pubdate'], 'name': companyname, 'code': item['code'], 'pdf_id': item['pdf_id'], 'content': result}
        return doc

In [8]:
# Smoke test: run the preprocessor on the item at index 3 with the item at
# index 0 attached as its 'meta' entry.
# item5 is the same object as items[3], so this also modifies items[3] in place.
item5 = items[3]
item5['meta'] = items[0]
FinancePreprocessV2().feature_process(item5)

{'code': '300010',
 'content': {'业务进展': [],
  '业绩变动原因': ['1、本期主营业务收入为30,593.59万元，上年同期为18,638.03万元，增长64.15%，主要是因为立思辰主'],
  '主营业务': ['立思辰主营业务分教育与信息安全两大业务，教育业务主要产品包括面向 B 端用户的区域教育云平台、智慧校园整体解决方案、K12 领域的学科应用产品等以及面向 C 端用户的高考升学咨询服务、留学咨询服务、线上辅导等；信息安全业务主要为客户提供围绕数据全生命周期的数据安全解决方案、工控安全产品及解决方案、面向多行业的自主可控产品及解决方案等。'],
  '导语': ['芥末堆2月27日讯，近日，立思辰发布2017年第一季度报告， 报告期内，立思辰营业收入为3.06亿元， 同比增长64.15%， 归属于上市公司股东的净利润为2384.87万元， 同比增长94.36%。截至报告期末，立思辰总资产74.05亿元。'],
  '未来计划': [],
  '标题': '立思辰2017第一季度财报: 营收3.06亿元，净利润2384.87万元'},
 'finance_type': 'hushen',
 'name': '立思辰',
 'pdf_id': '300010_2017_1',
 'pubdate': '2017-04-26',
 'quarter': '1',
 'url': 'http://disclosure.szse.cn/finalpage/2017-04-26/1203387778.PDF',
 'version': 0,
 'year': '2017'}

In [286]:
# Scratch check of the selection rule used by _get_reason for quarterly
# 'hushen' reports: paragraphs whose parent headings contain
# '变动的情况及原因' and whose text mentions revenue or net profit.
paras = [p['content'] for p in items[3]['paragraphs'] if fin_A_bnb._match(['变动的情况及原因'], p['parents']) 
         if '营业收入' in p['content'] or '净利润' in p['content'] or '主营业务收入' in p['content']]
# para = [p['content'] for p in items[3]['paragraphs'] if '本期主营业务收入']
paras

['1、本期主营业务收入为30,593.59万元，上年同期为18,638.03万元，增长64.15%，主要是因为公司主']

  chunks = self.iterencode(o, _one_shot=True)


In [116]:
# Build a preprocessor with the default configuration and print what it
# generates for the item at index 0.
fin_A_bnb = FinancePreprocessV2()
print(fin_A_bnb.feature_process(items[0]))

{'业绩变动原因': [], '导语': ['芥末堆2月6日讯，近日，立思辰发布2017年半年度报告， 报告期内，立思辰营业收入为6.81亿元， 同比增长35.71%， 归属于上市公司股东的净利润为3496.11万元， 同比增长15.62%。截至报告期末，立思辰总资产73.77亿元。'], '业务进展': ['2017 年上半年，立思辰管理层紧密围绕年初制定的经营计划，贯彻立思辰的战略部署，坚持积极进取，强化协同整合，各项经营任务稳步推进，为实现全年的经营目标奠定了坚实的基础。报告期内，立思辰实现营业收入 68,077.05 万元，比上年同期增长 35.71%；实现归属于上市立思辰股东的净利润 3,496.11 万元，比上年同期增长 15.62%；归属于上市立思辰股东的扣除非经常性损益后的净利润 3,072.40 万元，比上年同期增长15.49%。', '2017 年度是立思辰发布并实施新教育战略的开局之年，立思辰秉承“以科技和人文改变教育”的历史使命，坚持“做大智慧教育，做强教育服务，做精未来学校，做实教育生态”的发展路径，利用科技手段打通校内校外、线上线下、国内国外的学习场景，让学生们高效学习、快乐成长，最终实现“激发成就亿万青少年”的宏伟愿景。报告期内，立思辰成立智慧教育、教学服务、学习服务三大事业群，充分整合资源并继续深入推动“智慧教育+教育服务”的双轮驱动策略，巩固既有优势并扩大市场，立思辰教育业务上半年实现营业收入31,532.37 万元，同比增长 47.54%，目前立思辰在手的教育业务订单金额超过 7 亿元。', '立思辰扎根教育行业多年，经过坚持不懈的市场推广，业务在全国广泛铺开，尤其在北京地区经过多年来的深耕细作，充分赢得了客户的信任，与部分学校形成了长期稳定的合作关系。在保持原有各地市场领先的同时，立思辰加快渠道建设，积极开拓各地市场，报告期内，立思辰于北京怀柔、张家口、保定、石家庄、沧州、青岛、乌鲁木齐、成都、玉溪、福州、太原、济南、南京等地新建分支机构，极大的促进了立思辰各项业务在更多地市的落地推进。截至报告期末，立思辰智慧教育业务已累计进入 27 个省、16000 余所学校、累计覆盖 2100 多万中小学生。立思辰教育服务业务也在全国各地全面展开，立思辰“教育云规模化应用项目”已完成 9 个区域单独培训，其中包括：嘉兴、上犹、福州、昭通、新

  chunks = self.iterencode(o, _one_shot=True)


In [96]:
# List the id and the finance type of every loaded item.
for i in items:
    print (i['pdf_id'], i['finance_type'])

300010_2017_2 hushen
839264_2016_4 xsb
300010_2016_4 hushen
300010_2017_1 hushen
839264_2017_2 xsb
300010_2017_3 hushen


  chunks = self.iterencode(o, _one_shot=True)


In [110]:
# Concatenate every generated paragraph of the item at index 4, one per line,
# in the order given by ``intents``.
intents = ['导语', '业绩变动原因', '主营业务', '业务进展', '未来计划']
processor = FinancePreprocessV2()
# The report is processed once.  The former outer loop re-ran feature_process
# for every intent, reset the text each time, and had its loop variable
# shadowed by the inner loop.
res = processor.feature_process(items[4])
# feature_process (as defined in this notebook) nests the per-intent
# paragraph lists under 'content'.
A_bnb = ''.join(content + '\n'
                for intent in intents
                for content in res['content'][intent])



  chunks = self.iterencode(o, _one_shot=True)


In [None]:
# Print the business-progress paragraphs generated for the item at index 4.
fin_xsb_bnb = FinancePreprocessV2()
print(items[4]['url'])
# feature_process (as defined in this notebook) nests the per-intent
# paragraph lists under 'content'; indexing the result directly raises KeyError.
print(fin_xsb_bnb.feature_process(items[4])['content']['业务进展'])

In [298]:
# Check whether a known paragraph (copied with its original line breaks)
# shows up in the result for the item at index 1.
# NOTE(review): feature_process as defined in this notebook returns a dict, so
# ``text1 in docs`` tests the dict's keys rather than the generated
# paragraphs -- confirm which object this check was meant to inspect.
fin_xsb_nb = FinancePreprocessV2()
docs = fin_xsb_nb.feature_process(items[1])
text1 = '''1.营业收入：报告期较上年同期增长 56.52%，主要原因系：（1）市场环境向好：国家研学旅行政策
不断落地，政策环境向好，市场需求大增，客户人数增长迅速；（2）产品线丰富：报告期间世纪明德加强了国
内研学、国际游学、社会实践以及教师培训产品的研发，产品可以覆盖更大客户群，可以为存量客户提供
更多的产品服务内容；（3）市场开拓方面，世纪明德制定积极的销售政策，重点突破市场薄弱区域，客户范围
不断扩大。重点突破春秋季研学旅行市场，从客户参营时间上进行延展，从一年中寒暑假的游学旺季拓展
为全年旺季。'''
print(text1 in docs)

True


  chunks = self.iterencode(o, _one_shot=True)


In [None]:
# Show the 'Paragraph' entries of the item at index 1 whose parent headings
# contain '总体回顾'.
# NOTE(review): ``fin`` is not defined in the visible part of this notebook;
# presumably a FinancePreprocessV2 instance -- confirm.
item = items[1]['paragraphs']
[i for i in item if i['name'] == 'Paragraph' if fin._match(['总体回顾'], i['parents'])]

In [279]:
# Show the url, the id and every paragraph text of the item at index 3.
print(items[3]['url'])
print(items[3]['pdf_id'])
# The trailing ``if p[]`` filter was left unfinished and made the whole cell
# a syntax error.  It is dropped here; add a condition to narrow the listing.
[p['content'] for p in items[3]['paragraphs']]

http://disclosure.szse.cn/finalpage/2017-04-26/1203387778.PDF
300010_2017_1


  chunks = self.iterencode(o, _one_shot=True)


In [5]:
import json
# Load the test reports: each file holds one report per line, with the fields
# listed in ``keys`` separated by the \x01 character.
itemlist = ['000001_0', '000004_0', '000006_0']
keys = ['pdf_id', 'year', 'quarter', 'name', 'code', 'finance_type', 'pubdate', 'url', 'tables', 'paragraphs']
items = []
for item in itemlist:
    # ``with`` closes the file even when a malformed line raises.
    # NOTE(review): the file is opened with the platform's default encoding;
    # confirm the dumps are in that encoding.
    with open('/data/xueyou/finance/pdf_content_test/' + item) as f:
        for line in f:
            # Drop the line terminator so it does not end up in the last field.
            data = line.rstrip('\n').split('\x01')
            # A line with fewer fields than keys still raises IndexError.
            items.append({key: data[i] for i, key in enumerate(keys)})

In [277]:
# Save the loaded items to a shared pickle file for later sessions.
with open('/data/share/items.pkl', 'wb') as pickle_file:
    pickle.dump(items, pickle_file)

  chunks = self.iterencode(o, _one_shot=True)


In [278]:
import pickle
# Read the pickled items back and show the keys of the first one.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# ``with`` closes the file, which the former bare open() never did.
with open('/data/share/items.pkl', 'rb') as f:
    items_ = pickle.load(f)
items_[0].keys()

dict_keys(['quarter', 'url', 'year', 'finance_type', 'pubdate', 'name', 'code', 'pdf_id', 'paragraphs', 'tables'])

  chunks = self.iterencode(o, _one_shot=True)


In [37]:
# Coarse article labels, the fine-grained classifier intents that belong to
# each of them, and the reverse lookup from fine-grained intent to label.
labels = ['导语', '业绩变动原因', '主营业务$业务进展', '未来计划', 'other']
labelmap = {
    '导语': ['导语','意图_企业基本信息介绍_企业基本信息介绍', '意图_企业活动_公司公告信息', '意图_企业业绩_现状偏好', '意图_企业业绩_现状偏坏'],
    '业绩变动原因': ['意图_企业业绩_业绩变动原因'],
    '主营业务$业务进展': ['意图_企业业务_业务介绍', '意图_企业业务_业务规模', '意图_企业业务_业务规划'],
    '未来计划': ['意图_企业业务_业务前景', '意图_企业业绩_前景偏好','意图_企业业绩_前景偏坏'],
    'other': ['其他', '意图_评论_风险提示','意图_评论_投资建议', '意图_评论_盈利预测', '意图_企业活动_人事活动', '意图_企业活动_融资活动','意图_企业活动_投资活动'],
}
labelmapR = {fine: coarse for coarse, fines in labelmap.items() for fine in fines}

  chunks = self.iterencode(o, _one_shot=True)


In [389]:
import operator
def intentMap(predict):
    """
    Map the existing intents onto the intent set given by Jiemodui (the
    module-level ``labels``) and sum the weights per label.

    predict maps fine-grained intent names to probabilities.
    """
    totals = dict.fromkeys(labels, 0)
    for name, prob in predict.items():
        totals[labelmapR[name]] += prob
    return totals
    
def getParabyIntent(docs, itt, num, threshold = 0.5):
    """
    Return the texts with the num highest probabilities for intent itt,
    keeping only those whose probability exceeds threshold.

    docs is the classifier output: dicts with 'content' and, under
    ['features']['intents'], a list of {'name', 'prob'} entries.  The result
    is in ascending order of probability (best one last).  Raises KeyError
    when itt is not one of the module-level ``labels``.
    """
    raw = {x['content']: x['features']['intents'] for x in docs}
    scores = {content: intentMap({one['name']: one['prob'] for one in intents})
              for content, intents in raw.items()}
    if itt not in labels:
        raise KeyError(itt)
    # ``ranked[-0:]`` is the whole list, so num == 0 used to return every text
    # above the threshold instead of none.
    if num <= 0:
        return []
    ranked = sorted(((content, probs[itt]) for content, probs in scores.items()),
                    key=operator.itemgetter(1))
    return [content for content, prob in ranked[-num:] if prob > threshold]
import requests
# Send two sample texts to the classification service and look at the most
# probable intent of each.  This performs a live HTTP request.
text = [{'content':'''立思辰教育秉承“以科技和人文改变教育”的历史使命，坚持“做大智慧教育，做强教育服务，做精未来
学校，做实教育生态”的发展路径，利用科技手段打通校内校外、线上线下、国内国外的学习场景，让学生
们高效学习、快乐成长，最终实现“激发成就亿万青少年”的宏伟愿景。'''}, {'content': '老鼠开的房间啊手动阀'}]
query = requests.post("https://surreal.aidigger.com/api/v1/du/finance",json=text).json()
predict = query
# print(predict)
# Step 1: one {content: intents} dict per answer.
paras2_intent = [{x['content']: x['features']['intents']} for x in predict]
# Step 2: turn each intent list into a {name: prob} dict.
paras2_intent = [{x: {z['name']: z['prob'] for z in y} for x, y in p.items()} for p in paras2_intent]
# Step 3: keep only the name with the highest probability.
paras2_intent = [{x: sorted(y.items(), key = operator.itemgetter(1))[-1][0] for x, y in p.items()} for p in paras2_intent]
print(paras2_intent)
# The contents alone, in the order of the answers.
paras = [x for p in paras2_intent for x, y in p.items() ]
print(paras)
# print(sorted({'意图_企业业务_业务介绍': 0.47018682423239766, '意图_企业活动_人事活动': 0.0009334563805220739}, key = operator.itemgetter(1)))

[{'立思辰教育秉承“以科技和人文改变教育”的历史使命，坚持“做大智慧教育，做强教育服务，做精未来\n学校，做实教育生态”的发展路径，利用科技手段打通校内校外、线上线下、国内国外的学习场景，让学生\n们高效学习、快乐成长，最终实现“激发成就亿万青少年”的宏伟愿景。': '意图_企业业务_业务介绍'}, {'老鼠开的房间啊手动阀': '意图_评论_风险提示'}]
['立思辰教育秉承“以科技和人文改变教育”的历史使命，坚持“做大智慧教育，做强教育服务，做精未来\n学校，做实教育生态”的发展路径，利用科技手段打通校内校外、线上线下、国内国外的学习场景，让学生\n们高效学习、快乐成长，最终实现“激发成就亿万青少年”的宏伟愿景。', '老鼠开的房间啊手动阀']


  chunks = self.iterencode(o, _one_shot=True)


In [None]:
# Step-by-step version of the selection and cut-off logic of
# FinancePreprocessV2._getPara, run on the item at index 1 with the heading
# '总体回顾' and the cut-off heading '主营业务分析'.
paragraphs = items[1]['paragraphs']
paras_raw = [p for p in paragraphs if p['name'] == 'Paragraph' 
                if p['parents'] if fin_xsb_bnb._match(['总体回顾'], p['parents'])
                if not p['content'].startswith('公司是否需要')]
paras = [p['content'] for p in paras_raw]

# Find the first selected paragraph that sits under the cut-off heading.
index = len(paras)
indexcontent = ''
flag = False
for i in range(len(paras)):
    for parent in paras_raw[i]['parents']:
        if '主营业务分析' in parent:
            indexcontent = paras_raw[i]['content']
            flag = True
    if flag is True:
        break
# The position is looked up by content, i.e. the first paragraph with that text.
if indexcontent != '':
    index = paras.index(indexcontent)
paras = paras[0:index]
paras_raw = paras_raw[0:index]
print(paras_raw[:5])
for p in paras:
    print(p)
    print ('-')

In [395]:
# Scratch check: operator.itemgetter(1) sorts the pairs by their second element.
a = [[1,2], [4,5], [3,4]]
sorted(a, key = operator.itemgetter(1))

[[1, 2], [3, 4], [4, 5]]

  chunks = self.iterencode(o, _one_shot=True)


In [75]:
# Scratch check: the type of a list literal is exactly ``list``.
type([1,2]) != list

False

  chunks = self.iterencode(o, _one_shot=True)


In [27]:
# Scratch check: split a string into a list of its characters.
ss = 'zhuyuhe'
charlist = list(ss)

  chunks = self.iterencode(o, _one_shot=True)


In [30]:
# Scratch check: sort the characters and display the result.
charlist = sorted(charlist)
charlist

['e', 'h', 'h', 'u', 'u', 'y', 'z']

  chunks = self.iterencode(o, _one_shot=True)
