In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)
PYSPARK_PYTHON = "/miniconda2/envs/py365/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON
from offline import SparkSessionBase

/root/toutiao_project/reco_sys


In [2]:
class OriginArticleData(SparkSessionBase):
    
    SPARK_APP_NAME = "mergeArticle"
    SPARK_URL = "yarn"

    ENABLE_HIVE_SUPPORT = True
    
    def __init__(self):
        self.spark = self._create_spark_session()

In [3]:
oa = OriginArticleData()

In [4]:
# 进行文章 前两个表 的合并
oa.spark.sql("use toutiao")
# news_article_basic 与news_article_content, article_id
titlce_content = oa.spark.sql("select a.article_id, a.channel_id, a.title, b.content from news_article_basic a inner join news_article_content b on a.article_id=b.article_id where a.article_id=116636")


In [5]:
titlce_content.show()

+----------+----------+---------------+--------------------+
|article_id|channel_id|          title|             content|
+----------+----------+---------------+--------------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|
+----------+----------+---------------+--------------------+



In [7]:
# 进行title_content 与 文章频道名称合并
titlce_content.registerTempTable('temptable')

channel_title_content = oa.spark.sql("select t.*, n.channel_name from temptable t left join news_channel n on t.channel_id=n.channel_id")




In [8]:
channel_title_content.show()

+----------+----------+---------------+--------------------+------------+
|article_id|channel_id|          title|             content|channel_name|
+----------+----------+---------------+--------------------+------------+
|    116636|        18|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|      python|
+----------+----------+---------------+--------------------+------------+



In [9]:
# 合并三个内容到一个字符串
import pyspark.sql.functions as F

sentence_df = channel_title_content.select("article_id", "channel_id", "channel_name", "title", "content", 
                            F.concat_ws(',', 
                                       channel_title_content.channel_name,
                                       channel_title_content.title,
                                       channel_title_content.content).alias('sentence'))


In [10]:
sentence_df.show()

+----------+----------+------------+---------------+--------------------+--------------------+
|article_id|channel_id|channel_name|          title|             content|            sentence|
+----------+----------+------------+---------------+--------------------+--------------------+
|    116636|        18|      python|动态再平衡投资策略历史数据回测|<p>赚钱是个俗气的话题，但又是人...|python,动态再平衡投资策略历...|
+----------+----------+------------+---------------+--------------------+--------------------+



In [11]:
# 读取文章，进行每篇张分词
oa.spark.sql("use article")
article_data = oa.spark.sql("select * from article_data limit 10")
article_data.show()


+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|         1|        17|          前端|     Vue props用法小结原荐|<p><strong>Vue pr...|前端,Vue props用法小结原...|
|         2|        17|          前端|vue.js响应式原理解析与实现—...|<p>上次我们已经分析了vue.j...|前端,vue.js响应式原理解析与...|
|         3|        17|          前端|JavaScript中浅拷贝和深拷...|<p>要理解 JavaScript...|前端,JavaScript中浅拷贝...|
|         4|        17|          前端|基于vue2.0 +vuex+ e...|<p>效果演示地址,</p><p>...|前端,基于vue2.0 +vuex...|
|         5|        17|          前端|immutability因Reac...|<p><img src="http...|前端,immutability因R...|
|         6|        17|          前端|简单了解 node npm cnp...|<span id="OSC_h1_...|前端,简单了解 node npm ...|
|         7|        17|          前端|       Web工程师以太坊入门原荐|<p>我经常构建使用以太坊的Web...|前端,Web工程师以太坊入门原荐,...|


In [12]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


In [13]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [14]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|      4273|        15|[javascript, reac...|
|      4274|        19|[java, java, 笔记, ...|
|      4275|        19|[java, 传统, 方式, 类继...|
|      4276|        15|[javascript, Vue,...|
|      4278|        15|[javascript, 作用域链...|
|      4279|        19|[java, springboot...|
|      4280|        19|[java, Jedis, 工具类...|
|      4281|        19|[java, java, 记录, ...|
|      4282|        15|[javascript, VueS...|
|      4283|        15|[javascript, 体积, ...|
+----------+----------+--------------------+



In [16]:
# 先计算分词之后的每篇文章的词频，得到CV模型
# 统计所有文章不同的词，组成一个词列表 words_list = [1,2,3,,34,4,45,56,67,78,8.......,,,,.]
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=2000, minDF=1.0)
cv_model = cv.fit(words_df)
cv_model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.model")
# 然后根据词频计算IDF以及词，得到IDF模型

In [20]:
from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/test.model")
cv_result = cv_model.transform(words_df)
cv_result.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|
+----------+----------+--------------------+--------------------+
|      4273|        15|[javascript, reac...|(1265,[0,1,3,8,10...|
|      4274|        19|[java, java, 笔记, ...|(1265,[24,29,32,3...|
|      4275|        19|[java, 传统, 方式, 类继...|(1265,[4,29,30,34...|
|      4276|        15|[javascript, Vue,...|(1265,[0,1,8,12,1...|
|      4278|        15|[javascript, 作用域链...|(1265,[5,11,12,15...|
|      4279|        19|[java, springboot...|(1265,[3,4,21,31,...|
|      4280|        19|[java, Jedis, 工具类...|(1265,[1,14,16,18...|
|      4281|        19|[java, java, 记录, ...|(1265,[29,30,57,9...|
|      4282|        15|[javascript, VueS...|(1265,[1,2,4,9,19...|
|      4283|        15|[javascript, 体积, ...|(1265,[0,1,11,17,...|
+----------+----------+--------------------+--------------------+



In [21]:
# IDF 模型
from pyspark.ml.feature import IDF
idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idfModel = idf.fit(cv_result)
idfModel.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/testIDF.model")

In [22]:
# cv_model的词列表
cv_model.vocabulary

['this',
 'pa',
 'node',
 'data',
 '数据',
 'let',
 'keys',
 'obj',
 '组件',
 'npm',
 'child',
 '节点',
 'log',
 '属性',
 'key',
 'console',
 'value',
 'var',
 'return',
 'div',
 'name',
 '文本',
 'const',
 'fragment',
 '&#',
 'WebSocket',
 'props',
 'msg',
 '合约',
 '对象',
 '代码',
 'ul',
 '.a',
 'for',
 '程序',
 'model',
 'web3',
 '时候',
 '以太坊',
 'forChildMsg',
 'obj2',
 '交易',
 'class',
 'update',
 '函数',
 'mongodb',
 'config',
 '数组',
 '版本',
 '内容',
 'amp',
 'web',
 '语法',
 'test',
 'vue',
 'ownChildMsg',
 '项目',
 '方法',
 'childNode',
 'val',
 'immutable',
 'new',
 'function',
 'matchs',
 'match',
 'array',
 'template',
 'children',
 '文章',
 'DOM',
 'Watcher',
 '域名',
 'set',
 '信息',
 'Object',
 '区块链',
 '元素节点',
 'input',
 'attr',
 '官方',
 '文件',
 'Array',
 '方式',
 '命令',
 '功能',
 'install',
 'ckage',
 'truffle',
 '服务器',
 'newVal',
 'textReg',
 '定义',
 'rawTextContent',
 'Web',
 'item',
 'registry',
 'JSON',
 'forEach',
 'textContent',
 '智能',
 'arr',
 'gas',
 '部署',
 'r2',
 '类型',
 '源码',
 'the',
 'get',
 'init',
 'rse

In [23]:
# idf值列表 且和 词 一一对应
idfModel.idf.toArray()[:20]

array([0.6061358 , 0.        , 0.6061358 , 0.6061358 , 0.45198512,
       0.78845736, 1.01160091, 1.01160091, 1.01160091, 0.78845736,
       1.29928298, 1.70474809, 0.31845373, 1.01160091, 0.78845736,
       0.45198512, 0.78845736, 0.78845736, 0.45198512, 1.70474809])

In [None]:
# 使用 cv_model（包含词） 和 idf_model（包含tf-idf值）组合得到：词 → tf-idf值，并存储 idf_keywords_values 的 Hive表。
from pyspark.ml.feature import CountVectorizerModel
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/countVectorizerOfArticleWords.model") # 这2个模型是已经训练好的

from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDFOfArticleWords.model") # 这2个模型是已经训练好的

# 上面已经展示出样例
keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

def func(data):
    # 这个循环是不是这样？
    for index in range(len(data)):
       data[index] = list(data[index])
       data[index].append(index)
       data[index][1] = float(data[index][1])

func(keywords_list_with_idf)
sc = spark.sparkContext
rdd = sc.parallelize(keywords_list_with_idf)
df = rdd.toDF(["keywords", "idf", "index"]) # 其实可以忽略 idf值这一列

df.write.insertInto('idf_keywords_values')

In [24]:
# IDF 对 CV结果 进行计算TFIDF
from pyspark.ml.feature import IDFModel
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/testIDF.model")
tfidf_res = idf_model.transform(cv_result)

In [25]:
tfidf_res.show()

+----------+----------+--------------------+--------------------+--------------------+
|article_id|channel_id|               words|       countFeatures|         idfFeatures|
+----------+----------+--------------------+--------------------+--------------------+
|         1|        17|[Vue, props, 用法, ...|(1265,[0,1,3,4,5,...|(1265,[0,1,3,4,5,...|
|         2|        17|[vue, 响应式, 原理, mo...|(1265,[0,1,2,3,4,...|(1265,[0,1,2,3,4,...|
|         3|        17|[JavaScript, 浅拷贝,...|(1265,[0,1,5,7,12...|(1265,[0,1,5,7,12...|
|         4|        17|[vue2, vuex, elem...|(1265,[1,2,4,9,12...|(1265,[1,2,4,9,12...|
|         5|        17|[immutability, Re...|(1265,[1,3,4,5,6,...|(1265,[1,3,4,5,6,...|
|         6|        17|[node, npm, cnpm,...|(1265,[1,2,9,12,1...|(1265,[1,2,9,12,1...|
|         7|        17|[Web, 工程师, 以太坊, 入...|(1265,[1,2,3,4,6,...|(1265,[1,2,3,4,6,...|
|         8|        17|[Web, pa, api, we...|(1265,[1,2,9,30,3...|(1265,[1,2,9,30,3...|
|         9|        17|[vue, 中用, 数据驱动, 视...

In [26]:
# 1265个词的 {索引 以及 权重}
def func(partition):
    TOPK = 20
    for row in partition:
        # 找到索引与IDF值并进行排序
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
        result = _[:TOPK]
        for word_index, tfidf in result:
            yield row.article_id, row.channel_id, int(word_index), round(float(tfidf), 4)

# mapPartitions算子执行后，数据就已经展开了（注意细看）
kewords_tfidf = tfidf_res.rdd.mapPartitions(func).toDF(['article_id', 'channel_id', 'index', 'weights'])

In [27]:
kewords_tfidf.show()

+----------+----------+-----+-------+
|article_id|channel_id|index|weights|
+----------+----------+-----+-------+
|         1|        17|   19|68.1899|
|         1|        17|   10|58.4677|
|         1|        17|    8|55.6381|
|         1|        17|   26|52.8472|
|         1|        17|   39|39.2092|
|         1|        17|   27|31.1828|
|         1|        17|   55|30.6855|
|         1|        17|   58|30.6855|
|         1|        17|   66| 27.276|
|         1|        17|  138|15.3427|
|         1|        17|  139|15.3427|
|         1|        17|  150| 13.638|
|         1|        17|  171| 13.638|
|         1|        17|  175| 13.638|
|         1|        17|    4|11.2996|
|         1|        17|  206|10.2285|
|         1|        17|  239| 8.5237|
|         1|        17|  267| 8.5237|
|         1|        17|   33| 8.1357|
|         1|        17|  295|  6.819|
+----------+----------+-----+-------+
only showing top 20 rows



In [None]:
# 计算最终TF-IDF表，并存储为Hive表
# idf_keywords_values = oa.spark.sql("select keyword, index idx from idf_keywords_values")

# 使用包含 article_id|channel_id|index为索引|weights为TF-IDF值 的 kewords_tfidf   join   idf_keywords_values（包含：词、IDF值、索引） join目的：得到 词
keyword_str_tfidf = kewords_tfidf.join(idf_keywords_values, idf_keywords_values.idx==kewords_tfidf.index).select(["article_id", "channel_id", "keyword", "weights"])
keyword_str_tfidf.show()

# keywordsByTFIDF.write.insertInto("tfidf_keywords_values") # 保存为 tfidf_keywords_values 的 Hive表

In [None]:
# texrank
# 分词
def textrank(partition):
    import os

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    class TextRank(jieba.analyse.TextRank):
        def __init__(self, window=20, word_min_len=2): # window窗口大小； word_min_len
            super(TextRank, self).__init__()
            self.span = window  # 窗口大小
            self.word_min_len = word_min_len  # 单词的最小长度
            # 要保留的词性，根据jieba github ，具体参见https://github.com/baidu/lac
            self.pos_filt = frozenset(
                ('n', 'x', 'eng', 'f', 's', 't', 'nr', 'ns', 'nt', "nw", "nz", "PER", "LOC", "ORG"))

        def pairfilter(self, wp):
            """过滤条件，返回True或者False"""

            if wp.flag == "eng":
                if len(wp.word) <= 2:
                    return False

            if wp.flag in self.pos_filt and len(wp.word.strip()) >= self.word_min_len \
                    and wp.word.lower() not in stopwords_list:
                return True
    # TextRank过滤窗口大小为5，单词最小为2
    textrank_model = TextRank(window=5, word_min_len=2)
    allowPOS = ('n', "x", 'eng', 'nr', 'ns', 'nt', "nw", "nz", "c")

    for row in partition:
        tags = textrank_model.textrank(row.sentence, topK=20, withWeight=True, allowPOS=allowPOS, withFlag=False)
        for tag in tags:
            yield row.article_id, row.channel_id, tag[0], tag[1]

In [None]:
# 计算TextRank并保存为Hive表：textrank_keywords_values
textrank_keywords_df = article_data.rdd.mapPartitions(textrank).toDF(["article_id", "channel_id", "keyword", "textrank"])
textrank_keywords_df.write.insertInto("textrank_keywords_values")

In [None]:
# TF-IDF 和 TextRank 都是训练直接按 文章维度 统计出相应权重的集合



In [None]:
# 文章画像结果
# 1.1、关键字：计算(TextRank * IDF)作为词的权重weights；（也可以使用 (TextRank + TF-IDF) / 2 的计算方式）
idf = oa.spark.sql("select * from idf_keywords_values")
idf = idf.withColumnRenamed("keyword", "keyword1")
result = textrank_keywords_df.join(idf, textrank_keywords_df.keyword==idf.keyword1)
keywords_res = result.withColumn("weights", result.textrank * result.idf).select(["article_id", "channel_id", "keyword", "weights"])

In [None]:
# 1.2、关键字：合并关键词权重到字典结果
keywords_res.registerTempTable("temptable")
merge_keywords = oa.spark.sql("select article_id, min(channel_id) channel_id, collect_list(keyword) keywords, collect_list(weights) weights from temptable group by article_id")

# 合并关键词权重合并成字典
def _func(row):
    return row.article_id, row.channel_id, dict(zip(row.keywords, row.weights))

keywords_info = merge_keywords.rdd.map(_func).toDF(["article_id", "channel_id", "keywords"])

In [None]:
# 2、主题词：将 TF-IDF 和 TextRank 共现的词作为主题词
topic_sql = """
            select t.article_id article_id2, collect_set(t.keyword) topics from tfidf_keywords_values t
            inner join 
            textrank_keywords_values r
            where t.keyword = r.keyword
            group by article_id2
            """
article_topics = oa.spark.sql(topic_sql)

In [None]:
# 3、将 主题词表 和 关键词表 进行合并得到 article_profile 的Hive表
article_profile = keywords_info.join(article_topics, keywords_info.article_id==article_topics.article_id2).select(["article_id", "channel_id", "keywords", "topics"])

# articleProfile.write.insertInto("article_profile")