In [3]:
import os
import sys
# When this file is run directly for testing, the import path has to be
# extended first, otherwise the package imports further down may fail.
# BASE_DIR is the directory two levels above the current working directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.insert(0, BASE_DIR)

# With several Python versions installed, not pinning the interpreter is
# likely to cause errors, so the same one is set for both variables.
PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
os.environ.update({
    "PYSPARK_PYTHON": PYSPARK_PYTHON,
    "PYSPARK_DRIVER_PYTHON": PYSPARK_PYTHON,
})

from offline import SparkSessionBase
import pyhdfs
import time


class UpdateUserProfile(SparkSessionBase):
    """Offline processing job.

    The class only declares Spark settings and opens a session on
    construction. The settings are presumably read by
    ``SparkSessionBase._create_spark_session`` (defined outside this file —
    TODO confirm which attributes it actually consumes).
    """

    # Spark application identity and master URL.
    SPARK_APP_NAME = "updateUser"
    SPARK_URL = "local"

    # Resource and feature switches.
    SPARK_EXECUTOR_MEMORY = "1g"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        # The session is exposed as ``self.spark`` for the cells below.
        self.spark = self._create_spark_session()


# Single shared job instance; later cells run their SQL through ``uup.spark``.
uup = UpdateUserProfile()

In [6]:
# - 2. Read the user behaviour logs for a fixed time window
# Switch to the `profile` database, then pull the top-level log fields plus
# the nested `param.*` fields for every partition with dt >= '2019-04-01'.
uup.spark.sql("use profile")
user_action = uup.spark.sql("select actionTime, readTime, channelId, param.articleId, param.algorithmCombine, param.action, param.userId from user_action where dt>='2019-04-01'")
# Preview the loaded rows.
user_action.show()

+-------------------+--------+---------+--------------------+----------------+--------+-------------------+
|         actionTime|readTime|channelId|           articleId|algorithmCombine|  action|             userId|
+-------------------+--------+---------+--------------------+----------------+--------+-------------------+
|2019-04-02 12:21:55|        |        0|[44737, 44739, 14...|              C2|exposure|1112727762809913344|
|2019-04-02 12:21:57|        |       18|              140357|              C2|   click|1112727762809913344|
|2019-04-02 12:22:20|        |       18|              140357|              C2| collect|1112727762809913344|
|2019-04-02 12:22:36|   38000|       18|              140357|              C2|    read|1112727762809913344|
|2019-04-02 12:22:43|        |       18|               13476|              C2|   click|1112727762809913344|
|2019-04-02 12:23:08|   23306|       18|               13476|              C2|    read|1112727762809913344|
|2019-04-02 12:23:13|       

In [9]:
# - 3、进行用户日志数据处理
def _compute(row):
    
    _list = []
    if row.action == 'exposure':
        for article_id in eval(row.articleId):
            # 用户ID跟文章ID拼接一个样本
            # ["user_id", "action_time","article_id", "channel_id", "shared", "clicked", "collected", "exposure", "read_time"]
            _list.append([row.userId, row.actionTime, article_id, row.channelId, False, False, False, True, row.readTime])
        return _list
    else:
        class Temp(object):
            shared = False
            clicked = False
            collected = False
            read_time = ""
        
        _tp = Temp()
        if row.action == 'click':
            _tp.clicked = True
        elif row.action == 'share':
            _tp.shared = True
        elif row.action == 'collect':
            _tp.collected = True
        elif row.action == 'read':
            _tp.clicked = True
        else:
            pass
        
        _list.append([row.userId, row.actionTime, int(row.articleId), row.channelId, _tp.shared, _tp.clicked, _tp.collected, True, row.readTime])
        
        return _list
        

# _compute returns a list of sample rows per log row; flatMap flattens those
# lists into a single RDD of samples.
_res = user_action.rdd.flatMap(_compute)
# Name the columns in the same order in which _compute builds each sample.
user_action_basic = _res.toDF(["user_id", "action_time","article_id", "channel_id", "shared", "clicked", "collected", "exposure", "read_time"])
# Preview the processed samples.
user_action_basic.show()

+-------------------+-------------------+----------+----------+------+-------+---------+--------+---------+
|            user_id|        action_time|article_id|channel_id|shared|clicked|collected|exposure|read_time|
+-------------------+-------------------+----------+----------+------+-------+---------+--------+---------+
|1103195673450250240|2019-04-10 06:00:20|    141440|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     44161|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     17283|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     43907|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     16005|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     15750|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04

In [35]:
'''
1、行为表：行为需要更新
2、将新的行为数据与历史行为数据进行合并
3、按照用户ID与文章ID分组合并，合并成一行数据
4、HIVE目前支持hive终端操作ACID, update, delete
5、不支持python的pyspark原子性操作，并且开启配置中开启原子性相关配置也不行，只能使用 insert overwrite
6、删除原来的表数据，将所有数据重新插入
'''
# Merge with the historical data kept in the user_article_basic table:
# this statement loads the existing rows of that table.
old = uup.spark.sql("select * from user_article_basic")

In [36]:
# (user_id, article_id) is not unique in the merged result, because one user
# can perform several different operations on the same article.
# union() appends rows by column position and keeps duplicates.
new = old.union(user_action_basic)
new.show()

+-------------------+-------------------+----------+----------+------+-------+---------+--------+---------+
|            user_id|        action_time|article_id|channel_id|shared|clicked|collected|exposure|read_time|
+-------------------+-------------------+----------+----------+------+-------+---------+--------+---------+
|1103195673450250240|2019-04-10 06:00:20|    141440|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     44161|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     17283|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     43907|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     16005|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04-10 06:00:20|     15750|         0| false|  false|    false|    true|         |
|1103195673450250240|2019-04

In [None]:
# HIVE目前支持hive终端操作ACID，不支持python的pyspark原子性操作，并且开启配置中开启原子性相关配置也不行。所以只能通过如下方式更新：
# insert overwrite table... 删除原来的表数据，将所有数据重新插入

# new.registerTempTable('temptable')

# max(Boolean)：true默认为1；false默认为0。所以max(Boolean)时，如果有true，则取true。
# uup.spark.sql("insert overwrite table user_article_basic select user_id, max(action_time) as action_time, "
#         "article_id, max(channel_id) as channel_id, max(shared) as shared, max(clicked) as clicked, "
#         "max(collected) as collected, max(exposure) as exposure, max(read_time) as read_time from temptable "
#         "group by user_id, article_id")

In [40]:
# Obtain the user-profile keywords and compute their weights
# 1. Read the user_article_basic table; the behaviour table is later merged
#    with the topic words from the article profiles.
uup.spark.sql("use profile")
# channel_id is dropped here — presumably because the article profile joined
# in below supplies its own channel_id column (TODO confirm).
user_basic = uup.spark.sql("select * from user_article_basic").drop('channel_id')
# Preview the behaviour rows.
user_basic.show()

+-------------------+-------------------+----------+------+-------+---------+--------+---------+
|            user_id|        action_time|article_id|shared|clicked|collected|exposure|read_time|
+-------------------+-------------------+----------+------+-------+---------+--------+---------+
|1105045287866466304|2019-03-11 18:13:45|     14225| false|  false|    false|    true|         |
|1106476833370537984|2019-03-15 16:46:50|     14208| false|  false|    false|    true|         |
|1111189494544990208|2019-03-28 17:02:35|     19322| false|  false|    false|    true|         |
|1111524501104885760|2019-03-29 15:04:27|     44161| false|  false|    false|    true|         |
|1112727762809913344|2019-04-03 12:51:57|     18172| false|   true|     true|    true|    19413|
|                  1|2019-03-07 16:57:34|     44386| false|   true|    false|    true|    17850|
|                  1|2019-03-11 18:13:11|     44696| false|  false|    false|    true|         |
|                 10|2019-03-0

In [41]:
# Read the article profiles
# Switch to the `article` database and keep only the article id, its channel
# and its topics column.
uup.spark.sql("use article")
article_topic = uup.spark.sql("select  article_id, channel_id, topics from article_profile")
# Preview the article profiles.
article_topic.show()

+----------+----------+--------------------+
|article_id|channel_id|              topics|
+----------+----------+--------------------+
|        26|        17|[Electron, 全自动, 产...|
|        29|        17|[WebAssembly, 影音,...|
|       474|        17|[textAlign, borde...|
|       964|        11|[protocol, RMI, d...|
|      1677|        17|[spritesmith, ico...|
|      1697|         6|[nav, 样式, width, ...|
|      1806|        17|[声明, word, 容器, Ex...|
|      1950|        17|[app, scss, koala...|
|      2040|        17|[宽度, 媒体, width, r...|
|      2214|        11|[Cyber, 语言, 黑客, 知...|
|      2250|         6|[宽度, cal, 阶梯, 页面,...|
|      2453|        13|[__, CNN, logisti...|
|      2509|        13|[池化, CNN, 卷积神经网络,...|
|      2529|        17|[标题栏, 定义, 嵌套, hea...|
|      2927|         6|[季风, 圆角, bezier, ...|
|      3091|         6|[Chrome, react, 工...|
|      3506|        17|[cond, AJAX, 实心, ...|
|      3764|        15|[__, 语言, 原型链, obj...|
|      4590|        19|[println, Class, ...|
|      482

In [14]:
# Left join on article_id: every behaviour row is kept and gains the
# channel_id and topics columns of its article.
user_topic = user_basic.join(article_topic, on=['article_id'], how='left')

In [15]:
user_topic.show()

+----------+-------------------+-------------------+------+-------+---------+--------+---------+----------+--------------------+
|article_id|            user_id|        action_time|shared|clicked|collected|exposure|read_time|channel_id|              topics|
+----------+-------------------+-------------------+------+-------+---------+--------+---------+----------+--------------------+
|     13401|                 10|2019-03-06 10:06:12| false|  false|    false|    true|         |        18|[补码, 字符串, 李白, typ...|
|     13401|1114864237131333632|2019-04-09 16:39:51| false|  false|    false|    true|         |        18|[补码, 字符串, 李白, typ...|
|     13401|1106396183141548032|2019-03-28 10:58:20| false|  false|    false|    true|         |        18|[补码, 字符串, 李白, typ...|
|     13401|1109994594201763840|2019-03-26 15:03:58| false|  false|    false|    true|         |        18|[补码, 字符串, 李白, typ...|
|     14805|1105045287866466304|2019-03-11 18:15:48| false|  false|    false|    true|         | 

In [16]:
import pyspark.sql.functions as F

# Explode the `topics` column into a `topic` column (one output row per
# element), then drop the original `topics` column.
# NOTE(review): this rebinds user_topic, so re-running the cell on its own
# will likely fail because `topics` no longer exists — confirm.
user_topic = user_topic.withColumn('topic', F.explode('topics')).drop('topics')


In [17]:
user_topic.show()

+----------+-------------------+-------------------+------+-------+---------+--------+---------+----------+--------+
|article_id|            user_id|        action_time|shared|clicked|collected|exposure|read_time|channel_id|   topic|
+----------+-------------------+-------------------+------+-------+---------+--------+---------+----------+--------+
|     13401|                 10|2019-03-06 10:06:12| false|  false|    false|    true|         |        18|      补码|
|     13401|                 10|2019-03-06 10:06:12| false|  false|    false|    true|         |        18|     字符串|
|     13401|                 10|2019-03-06 10:06:12| false|  false|    false|    true|         |        18|      李白|
|     13401|                 10|2019-03-06 10:06:12| false|  false|    false|    true|         |        18|    type|
|     13401|                 10|2019-03-06 10:06:12| false|  false|    false|    true|         |        18|      元素|
|     13401|                 10|2019-03-06 10:06:12| false|  fal

In [21]:
def compute_user_label_weights(partitions, now=None):
    """Compute a time-decayed weight for every (user, topic) behaviour row.

    Written to be passed to ``DataFrame.foreachPartition``. The weight of a
    row is ``alpha * (behaviour score + read score)``, where
    ``alpha = 1 / (ln(days since the action + 1) + 1)``.

    Args:
        partitions: Iterable of rows exposing ``action_time`` (a string in
            ``'%Y-%m-%d %H:%M:%S'`` format), ``read_time`` (``''``, ``None``
            or an integer string) and the flags ``shared``, ``clicked`` and
            ``collected``.
        now: Optional ``datetime`` used as the reference point for the time
            decay. Defaults to ``datetime.now()``, evaluated once per call so
            that every row of the partition uses the same reference time.

    Returns:
        None. The weight is computed for each row, but persisting it (the
        HBase write at the end of the loop) is still commented out.

    Raises:
        ValueError: If ``action_time`` does not match the expected format or
            ``read_time`` is a non-empty, non-integer string.
    """
    # Imported inside the function so they are available wherever the
    # function body runs.
    from datetime import datetime
    import numpy as np

    # Score contributed by each kind of behaviour.
    weightsOfaction = {
        "read_min": 1,
        "read_middle": 2,
        "collect": 2,
        "share": 3,
        "click": 5
    }

    if now is None:
        now = datetime.now()

    # Handle each (user, topic) row in turn.
    for row in partitions:

        # Time-decay coefficient. An action_time later than `now` (clock skew
        # or bad data) is clamped to zero days, which keeps np.log away from
        # zero and negative arguments.
        elapsed = now - datetime.strptime(row.action_time, '%Y-%m-%d %H:%M:%S')
        days = max(elapsed.days, 0)
        alpha = 1 / (np.log(days + 1) + 1)

        # A missing read time ('' or None) counts as 0.
        read_t = int(row.read_time) if row.read_time else 0

        # Reads longer than 1000 (presumably milliseconds — TODO confirm)
        # get the higher read score.
        read_score = weightsOfaction['read_middle'] if read_t > 1000 else weightsOfaction['read_min']

        # Weight of row.topic for this user; the boolean flags act as 0/1
        # multipliers.
        weights = alpha * (row.shared * weightsOfaction['share'] + row.clicked * weightsOfaction['click'] +
                           row.collected * weightsOfaction['collect'] + read_score)

        # TODO: persist `weights` to the HBase `user_profile` table. This
        # needs `pool` and `json`, neither of which is defined in this
        # notebook, so the write stays disabled:
        # with pool.connection() as conn:
        #     table = conn.table('user_profile')
        #     table.put('user:{}'.format(row.user_id).encode(),
        #               {'partial:{}:{}'.format(row.channel_id, row.topic).encode(): json.dumps(
        #                   weights).encode()})
        


# Run the weight computation over every partition of the exploded
# (user, topic) rows.
user_topic.foreachPartition(compute_user_label_weights)