In [1]:
import os
import sys


# 2.7.4、增量更新-文章向量计算
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, os.path.join(BASE_DIR))

PYSPARK_PYTHON = "/miniconda2/envs/py365/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from offline import SparkSessionBase
from setting.default import CHANNEL_INFO
from pyspark.ml.feature import Word2Vec



class TrainWord2VecModel(SparkSessionBase):

    SPARK_APP_NAME = "Word2Vec"
    SPARK_URL = "local"
    
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark = self._create_spark_session()


w2v = TrainWord2VecModel()

In [3]:
# 训练一个频道的模型
w2v.spark.sql("use article")
article_data = w2v.spark.sql("select * from article_data where channel_id=18 limit 5")
article_data.show()

+----------+----------+------------+--------------------+--------------------+--------------------+
|article_id|channel_id|channel_name|               title|             content|            sentence|
+----------+----------+------------+--------------------+--------------------+--------------------+
|     12237|        18|      python|想学习区块链？那就用 Python...|<div id="article_...|python,想学习区块链？那就用...|
|     12238|        18|      python|鲜为人知的 Python 语法 使...|<p>所有人（好吧，不是所有人）都...|python,鲜为人知的 Pyth...|
|     12243|        18|      python|手把手教你写网络爬虫（4）：Scr...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12245|        18|      python|手把手教你写网络爬虫（5）：Pha...|<div id="cnblogs_...|python,手把手教你写网络爬虫...|
|     12247|        18|      python|用 Plumbum 开发 Pyth...|<div id="article_...|python,用 Plumbum ...|
+----------+----------+------------+--------------------+--------------------+--------------------+



In [4]:
# 文章数据进行分词处理,得到分词结果
# 分词
def segmentation(partition):
    import os
    import re

    import jieba
    import jieba.analyse
    import jieba.posseg as pseg
    import codecs

    abspath = "/root/backup/words"

    # 结巴加载用户词典
    userDict_path = os.path.join(abspath, "ITKeywords.txt")
    jieba.load_userdict(userDict_path)

    # 停用词文本
    stopwords_path = os.path.join(abspath, "stopwords.txt")

    def get_stopwords_list():
        """返回stopwords列表"""
        stopwords_list = [i.strip()
                          for i in codecs.open(stopwords_path).readlines()]
        return stopwords_list

    # 所有的停用词列表
    stopwords_list = get_stopwords_list()

    # 分词
    def cut_sentence(sentence):
        """对切割之后的词语进行过滤，去除停用词，保留名词，英文和自定义词库中的词，长度大于2的词"""
        # print(sentence,"*"*100)
        # eg:[pair('今天', 't'), pair('有', 'd'), pair('雾', 'n'), pair('霾', 'g')]
        seg_list = pseg.lcut(sentence)
        seg_list = [i for i in seg_list if i.flag not in stopwords_list]
        filtered_words_list = []
        for seg in seg_list:
            # print(seg)
            if len(seg.word) <= 1:
                continue
            elif seg.flag == "eng":
                if len(seg.word) <= 2:
                    continue
                else:
                    filtered_words_list.append(seg.word)
            elif seg.flag.startswith("n"):
                filtered_words_list.append(seg.word)
            elif seg.flag in ["x", "eng"]:  # 是自定一个词语或者是英文单词
                filtered_words_list.append(seg.word)
        return filtered_words_list

    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)    # 替换掉标签数据
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words

In [5]:
words_df = article_data.rdd.mapPartitions(segmentation).toDF(['article_id', 'channel_id', 'words'])

In [6]:
words_df.show()

+----------+----------+--------------------+
|article_id|channel_id|               words|
+----------+----------+--------------------+
|     12237|        18|[python, 区块链, Pyt...|
|     12238|        18|[python, Python, ...|
|     12243|        18|[python, 手把手, 网络,...|
|     12245|        18|[python, 手把手, 网络,...|
|     12247|        18|[python, Plumbum,...|
+----------+----------+--------------------+



In [7]:
# 直接调用word2vec训练
w2v_model = Word2Vec(vectorSize=100, inputCol='words', outputCol='model', minCount=3)

In [8]:
model = w2v_model.fit(words_df)
model.write().overwrite().save("hdfs://hadoop-master:9000/headlines/models/test.word2vec")

In [30]:
# 1、加载某个频道模型，得到每个词的向量
from pyspark.ml.feature import Word2VecModel

wv = Word2VecModel.load("hdfs://hadoop-master:9000/headlines/models/test.word2vec")
vectors = wv.getVectors()

# 它的Spark版本是2.2.2，我的是2.1.0计算出来的模型不能通用。
# channel_id = 18
# channel = "python"
# wv_model = Word2VecModel.load(
#                 "hdfs://hadoop-master:9000/headlines/models/word2vec_model/channel_%d_%s.word2vec" % (channel_id, channel))
# vectors = wv_model.getVectors()

vectors.show()

+------------+--------------------+
|        word|              vector|
+------------+--------------------+
|        函数参数|[0.03575931116938...|
|          流程|[0.08695186674594...|
|        配置文件|[0.10811857879161...|
|   recipient|[-0.1994543075561...|
|      enable|[0.05931602790951...|
|    register|[-0.0666727498173...|
|         fib|[0.01022425200790...|
|          函数|[0.15153197944164...|
|        read|[0.04327561706304...|
|          装饰|[0.10575848817825...|
|         数据流|[0.09737470746040...|
|QuotesSpider|[0.00672959722578...|
|     Request|[0.15434546768665...|
|      format|[0.03352830186486...|
|         for|[0.16336914896965...|
|          对象|[0.16593627631664...|
|         Set|[0.06395456194877...|
|      encode|[-0.0407770648598...|
|     program|[-0.0036022611893...|
|     network|[-0.0405408889055...|
+------------+--------------------+
only showing top 20 rows



In [7]:
#2、获取频道的文章画像，得到文章画像的关键词(接着之前增量更新的文章article_profile)
# 获取这些文章20个关键词名称，对应 关键词名称 找到 词向量
w2v.spark.sql("use article")
article_profile = w2v.spark.sql("select * from article_profile where channel_id=18 limit 10")
article_profile.show()

+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|     13098|        18|Map(pre -> 0.6040...|[__, object, 属性, ...|
|     13248|        18|Map(有限元 -> 5.2929...|[有限元, 代码分析, 案例, z...|
|     13401|        18|Map(pre -> 0.2100...|[补码, 字符串, 李白, typ...|
|     13723|        18|Map(pre -> 2.1094...|[acc, bstr, 原地, l...|
|     14719|        18|Map(pre -> 0.8814...|[__, ctime, cons,...|
|     14846|        18|Map(__ -> 2.54674...|[files, __, folde...|
|     15173|        18|Map(人人 -> 0.74986...|[cookie, Python爬虫...|
|     15194|        18|Map(dif -> 0.7567...|[display, 课程, lis...|
|     15237|        18|Map(pre -> 0.5349...|[__, send, sel, c...|
|     15322|        18|Map(pre -> 0.5762...|[Pclass, replace,...|
+----------+----------+--------------------+--------------------+



In [8]:
#3、文章画像表 的 关键词 进行 爆炸/展开
article_profile.registerTempTable('incremental')
keyword_weight = w2v.spark.sql("select article_id, channel_id, keyword, weight from incremental LATERAL VIEW explode(keywords) AS keyword, weight")
keyword_weight.show()

+----------+----------+--------+-------------------+
|article_id|channel_id| keyword|             weight|
+----------+----------+--------+-------------------+
|     13098|        18|    repr| 0.6326590117716192|
|     13098|        18|      __| 2.5401122038114203|
|     13098|        18|      属性|0.23645924932468856|
|     13098|        18|     pre| 0.6040062287555379|
|     13098|        18|    code| 0.9531379029975557|
|     13098|        18|     def| 0.5063435861497416|
|     13098|        18|   color| 1.1337936117177925|
|     13098|        18|      定义| 0.1554380122061322|
|     13098|        18| Student| 0.5033771372284416|
|     13098|        18|getPrice| 0.7404427038950527|
|     13098|        18|      方法|0.08080845613717194|
|     13098|        18|     div| 0.3434819820586186|
|     13098|        18|     str|0.35999033790156054|
|     13098|        18|      pa| 0.6651385256756351|
|     13098|        18|   slots| 0.6992789472129189|
|     13098|        18| cnblogs|0.339265861020

In [10]:
# 合并文章关键词与词向量
_keywords_vector = keyword_weight.join(vectors, vectors.word==keyword_weight.keyword, 'inner')
_keywords_vector.show()

+----------+----------+-------+-------------------+------+--------------------+
|article_id|channel_id|keyword|             weight|  word|              vector|
+----------+----------+-------+-------------------+------+--------------------+
|     13098|        18|     __| 2.5401122038114203|    __|[-0.0841412693262...|
|     13098|        18|     属性|0.23645924932468856|    属性|[0.15526983141899...|
|     13098|        18|   code| 0.9531379029975557|  code|[0.08397469669580...|
|     13098|        18|    def| 0.5063435861497416|   def|[0.00656610028818...|
|     13098|        18|     定义| 0.1554380122061322|    定义|[0.08375884592533...|
|     13098|        18|     方法|0.08080845613717194|    方法|[0.15521775186061...|
|     13098|        18|    div| 0.3434819820586186|   div|[0.04872748255729...|
|     13098|        18|    str|0.35999033790156054|   str|[0.06551016867160...|
|     13098|        18|     pa| 0.6651385256756351|    pa|[0.07491271197795...|
|     13098|        18|     函数|0.1501557

In [11]:
#4、计算得到文章每个词的向量：这里用词的权重 * 词的向量 = weights x vector=new_vector
def compute_vector(row):
    return row.article_id, row.channel_id, row.keyword, row.weight * row.vector

articleKeywordVectors = _keywords_vector.rdd.map(compute_vector).toDF(["article_id", "channel_id", "keyword", "weightingVector"])
articleKeywordVectors.show()

+----------+----------+-------+--------------------+
|article_id|channel_id|keyword|     weightingVector|
+----------+----------+-------+--------------------+
|     13098|        18|     __|[-0.2137282650596...|
|     13098|        18|     属性|[0.03671498778010...|
|     13098|        18|   code|[0.08003946631349...|
|     13098|        18|    def|[0.00332470276693...|
|     13098|        18|     定义|[0.01301930851531...|
|     13098|        18|     方法|[0.01254290689293...|
|     13098|        18|    div|[0.01673701228950...|
|     13098|        18|    str|[0.02358302775608...|
|     13098|        18|     pa|[0.04982733079938...|
|     13098|        18|     函数|[0.02275340318306...|
|     13098|        18|     &#|[0.00233461616804...|
|     13098|        18|  class|[-0.0195278108946...|
|     13248|        18|   code|[0.19872827625353...|
|     13248|        18|     参数|[0.12968818760840...|
|     13248|        18|     系统|[0.06216843054846...|
|     13248|        18|     .a|[0.050956582797

In [12]:
#5、计算得到文章的平均词向量 即：文章的向量
articleKeywordVectors.registerTempTable('temptable')
articleKeywordVectors = w2v.spark.sql("select article_id, min(channel_id) channel_id, collect_set(weightingVector) vectors from temptable group by article_id")
articleKeywordVectors.show()

+----------+----------+--------------------+
|article_id|channel_id|             vectors|
+----------+----------+--------------------+
|     13098|        18|[[0.0033247027669...|
|     13248|        18|[[0.0621684305484...|
|     13401|        18|[[0.0137473878736...|
|     13723|        18|[[0.0387393960900...|
|     14719|        18|[[0.0215666713566...|
|     14846|        18|[[0.0224016568366...|
|     15173|        18|[[0.0327754618004...|
|     15194|        18|[[0.0550583468621...|
|     15237|        18|[[-0.027733238125...|
|     15322|        18|[[0.0096505359203...|
+----------+----------+--------------------+



In [33]:
# 求平均值 得到 文章向量
def compute_avg_vectors(row):
    x = 0
    for i in row.vectors:
        x += i
    
    # 求平均值
    return row.article_id, row.channel_id, x / len(row.vectors)

article_vector = articleKeywordVectors.rdd.map(compute_avg_vectors).toDF(['article_id', 'channel_id', 'vectors'])
article_vector.show()

+----------+----------+--------------------+
|article_id|channel_id|             vectors|
+----------+----------+--------------------+
|     13098|        18|[0.00230172387587...|
|     13248|        18|[0.06297692516839...|
|     13401|        18|[0.00371605895856...|
|     13723|        18|[0.04396727569275...|
|     14719|        18|[0.03041110678417...|
|     14846|        18|[0.00779903058473...|
|     15173|        18|[0.03051927367831...|
|     15194|        18|[0.01959675463890...|
|     15237|        18|[0.00435888468677...|
|     15322|        18|[0.01299606486049...|
+----------+----------+--------------------+



In [35]:
# 查看 article_vector 的变量类型：vectors字段为 vector
article_vector

DataFrame[article_id: bigint, channel_id: bigint, vectors: vector]

In [36]:
# 对计算出的“vectors”列进行处理，该列为Vector类型，不能直接存入HIVE，HIVE不支持Vector数据类型
# vectors字段名 代表 Vector数据类型，而 articlevector字段名 代表 数组数据类型。
def toArray(row):
    return row.article_id, row.channel_id, [float(i) for i in row.vectors.toArray()]

article_vector = article_vector.rdd.map(toArray).toDF(['article_id', 'channel_id', 'articlevector'])
article_vector.show()

+----------+----------+--------------------+
|article_id|channel_id|       articlevector|
+----------+----------+--------------------+
|     13098|        18|[0.00230172387587...|
|     13248|        18|[0.06297692516839...|
|     13401|        18|[0.00371605895856...|
|     13723|        18|[0.04396727569275...|
|     14719|        18|[0.03041110678417...|
|     14846|        18|[0.00779903058473...|
|     15173|        18|[0.03051927367831...|
|     15194|        18|[0.01959675463890...|
|     15237|        18|[0.00435888468677...|
|     15322|        18|[0.01299606486049...|
+----------+----------+--------------------+



In [37]:
# 查看 article_vector 的变量类型：vectors字段为 array<double>
article_vector

DataFrame[article_id: bigint, channel_id: bigint, articlevector: array<double>]

In [None]:
'''
# 最终计算出 18号Python频道 的所有文章向量，保存到固定的表当中
# vectors字段名 代表 Vector数据类型，而 articlevector字段名 代表 数组数据类型。

# 创建文章向量表（DataFrame的字段名 和 表的字段名 不相同，也可以新增数据）
CREATE TABLE article_vector(
article_id BIGINT comment "article_id",
channel_id INT comment "channel_id",
articlevector ARRAY<DOUBLE> comment "articlevector"); 

# 保存数据到HIVE
# article_vector.write.insertInto("article_vector")

# 上传计算好的历史文章向量
./hadoop dfs -put  /root/backup/backup/article.db/article_vector/ /user/hive/warehouse/article.db/
'''

In [31]:



# 2.7.5、文章相似度计算

In [42]:
# 1、拿到18号-Python频道的所有文章数据，10篇测试
from pyspark.ml.linalg import Vectors

# 选取部分数据做测试
article_vector = w2v.spark.sql("select article_id, articlevector from article_vector where channel_id=18 limit 10")
train = article_vector.select(['article_id', 'articleVector'])
train.show()

+----------+--------------------+
|article_id|       articleVector|
+----------+--------------------+
|     13098|[0.10339950907039...|
|     13248|[0.84907054580879...|
|     13401|[0.06157120217893...|
|     13723|[0.20708073724961...|
|     14719|[-0.0405607722081...|
|     14846|[0.17945355257543...|
|     15173|[-0.2399774663757...|
|     15194|[0.08605245220126...|
|     15237|[0.02019666206037...|
|     15322|[0.11985676790665...|
+----------+--------------------+



In [43]:
def toVector(row):
    return row.article_id, Vectors.dense(row.articleVector)

train = train.rdd.map(toVector).toDF(['article_id', 'vectors'])
train.show()

+----------+--------------------+
|article_id|             vectors|
+----------+--------------------+
|     13098|[0.10339950907039...|
|     13248|[0.84907054580879...|
|     13401|[0.06157120217893...|
|     13723|[0.20708073724961...|
|     14719|[-0.0405607722081...|
|     14846|[0.17945355257543...|
|     15173|[-0.2399774663757...|
|     15194|[0.08605245220126...|
|     15237|[0.02019666206037...|
|     15322|[0.11985676790665...|
+----------+--------------------+



In [44]:
# 查看 train 的变量类型：vectors字段为 vector
train

DataFrame[article_id: bigint, vectors: vector]

In [None]:
# 计算相似的文章（2.1.0版本还没有这两个类，所以后续没有做了）
from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH

brp = BucketedRandomProjectionLSH(inputCol='vectors', outputCol='hashes', numHashTables=4.0, bucketLength=10.0)
model = brp.fit(train)

In [None]:
# 第一个参数为：新增文章； 第二个参数为：历史库文章； 第三个参数为：欧氏距离名称（并不是指定使用欧氏距离公式，默认欧氏距离公式）
similar = model.approxSimilarityJoin(train, train, 2.0, distCol='EuclideanDistance')

In [None]:
similar.sort(['EuclideanDistance']).show()

In [None]:
# 因暂时无法计算，所以不能确定是什么数据类型
similar

In [None]:
# 将 相似文章 保存到 HBase 中：
# 创建HBASE的article_similar表：create 'article_similar', 'similar'
def save_hbase(partitions):
    import happybase
    pool = happybase.ConnectionPool(size=3, host='hadoop-master')
    
    with pool.connection() as conn:
        article_similar = conn.table('article_similar')
        for row in partitions:
            if row.datasetA.article_id == row.datasetB.article_id:
                pass
            else:
                article_similar.put(str(row.datasetA.article_id).encode(),
                                   {'similar:{}'.format(row.datasetB.article_id).encode(): b'%0.4f' % (row.EuclideanDistance)})

similar.foreachPartition(save_hbase)