In [1]:
import jieba
import unicodedata


def stop_words_list(data_path='/data/jupyter/stock/utils/stopwords.txt'):
    """Load the stop words used to filter segmentation output.

    Args:
        data_path: Path of a UTF-8 text file holding one stop word per
            line. The default is the location this function has always
            read, so existing zero-argument calls behave as before.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace and kept in file order, followed by three whitespace
        characters: ideographic space (U+3000), no-break space (U+00A0)
        and horizontal tab.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(data_path, 'r', encoding='utf-8') as data:
        # Iterate the file object directly; readlines() would build a
        # throwaway list of raw lines first.
        stopwords = [line.strip() for line in data]

    # strip() removes these characters from file lines, so they can never
    # come from the file itself and have to be appended explicitly.
    whitespace_tokens = ['\u3000', '\xa0', '\t']
    return stopwords + whitespace_tokens


def stock_code_dict(data_path='/data/jupyter/stock/data/stock_list.csv'):
    """Build a mapping from stock code to stock name.

    Every non-blank line is parsed by fixed offsets counted from its end
    (after the line terminator is removed): the final character is
    ignored, the six characters before it are the stock code, the single
    character before the code is a separator, and everything before that
    separator is the stock name.
    NOTE(review): the concrete separator/closing characters used in the
    data file are not visible from this code - confirm against the file.

    Args:
        data_path: Path of the UTF-8 stock list file. The default is the
            location this function has always read, so existing
            zero-argument calls behave as before.

    Returns:
        A dict keyed by stock code with the stock name as value. When a
        code occurs more than once, the name from the last occurrence
        wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    dict_code = {}

    with open(data_path, 'r', encoding='utf-8') as data:
        for line in data:
            # Remove the terminator before slicing so that a final line
            # without a trailing newline is parsed with the same offsets
            # as every other line.
            record = line.rstrip('\n')
            if not record:
                # Blank lines carry no stock; skipping them avoids an
                # empty-string key in the result.
                continue
            dict_code[record[-7:-1]] = record[:-8]

    return dict_code


def is_number(s):
    """Return True when *s* reads as a number, False otherwise.

    Two checks are tried in order: first ``float(s)``, then
    ``unicodedata.numeric(s)``, which recognizes single numeric
    characters that ``float`` rejects.
    """
    try:
        float(s)
    except ValueError:
        # Not a float literal; fall back to the Unicode numeric lookup.
        try:
            unicodedata.numeric(s)
        except (TypeError, ValueError):
            # TypeError: not a single character; ValueError: the
            # character has no numeric value.
            return False
    return True


class WordSegmentation(object):
    """Segment news text with jieba and filter the resulting tokens.

    Attributes set by ``__init__``:
        dict_code: result of ``stock_code_dict()``; numeric tokens are
            kept only when they are keys of this mapping.
        stopword_list: result of ``stop_words_list()``; tokens found in
            it are dropped.
    """

    def __init__(self):
        self.dict_code = stock_code_dict()
        self.stopword_list = stop_words_list()

        # Register the stock-name user dictionary with jieba once per
        # instance instead of re-reading the file on every segmentation.
        jieba.load_userdict('/data/jupyter/stock/data/user_dict.txt')

    def word_segmentation(self, str_title_content):
        """Split a piece of news text into filtered tokens.

        The input is converted with ``str()``, tabs, newlines, carriage
        returns and ASCII spaces are removed, and the remainder is cut
        with ``jieba.cut_for_search``. A token is dropped when it is
        empty, is a stop word, ends with ``%``, or is a number that is
        not a key of ``self.dict_code``.

        Args:
            str_title_content: Text to segment (any object; ``str()`` is
                applied first).

        Returns:
            A list of the kept tokens in segmentation order.
        """
        str_content = str(str_title_content).replace('\t', '').replace('\n', '').replace('\r', '').replace(' ', '')

        # Joining and re-splitting on ',' also removes ASCII commas from
        # the token stream; it leaves empty strings behind, which are
        # filtered out in the loop below.
        str_words = ','.join(jieba.cut_for_search(str_content)).split(',')

        # Build the lookup set once per call so each membership test is
        # O(1) instead of a scan of the whole stop-word list.
        stopwords = set(self.stopword_list)

        news_list = []
        for word in str_words:
            # Empty tokens come from empty input or from commas in the
            # text; indexing them with word[-1] used to raise IndexError.
            if not word or word in stopwords:
                continue
            # Drop percentage figures.
            if word.endswith('%'):
                continue
            # Plain numbers are noise unless they are a known stock code.
            if is_number(word) and word not in self.dict_code:
                continue
            news_list.append(word)

        return news_list

if __name__ == "__main__":
    # Sample news item used to demonstrate the segmenter; adjacent
    # literals are concatenated into one string.
    str_title_content = (
        "新浪财经讯 3月12日消息，海航基础（600515）3月12日晚间公告，"
        "孙公司海航地产拟与海南融创昌晟签订《股权转让协议》，"
        "出售海航地产所持有的海岛物流100%的股权，转让价款约7.97亿元；"
        "同时，海航地产拟出售所持有的海南高和房地产开发有限公司100%的股权至海南融创昌晟，"
        "转让价款约11.36亿元。责任编辑：张恒"
    )

    # Build the segmenter, tokenize the sample, and print a header line
    # followed by the token list.
    ws = WordSegmentation()
    news_seg = ws.word_segmentation(str_title_content)
    print("分词结果：", news_seg, sep="\n")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.866 seconds.
Prefix dict has been built succesfully.


分词结果：
['新浪', '财经', '讯', '月', '日', '消息', '海航', '基础', '海航基础', '600515', '月', '日', '晚间', '公告', '孙', '公司', '海航', '地产', '拟', '海南', '融创昌晟', '签订', '股权', '转让', '协议', '出售', '海航', '地产', '持有', '海岛', '物流', '股权', '转让', '价款', '约', '亿元', '海航', '地产', '拟', '出售', '持有', '海南', '高', '房地', '地产', '房地产', '开发', '有限', '公司', '有限公司', '股权', '海南', '融创昌晟', '转让', '价款', '约', '亿元', '责任', '编辑', '责任编辑', '张恒']
