In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.insert(0, os.path.join(BASE_DIR))
sys.path.insert(0, os.path.join(BASE_DIR, "reco_sys"))
print(sys.path)

PYSPARK_PYTHON = "/miniconda2/envs/py365/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from offline import SparkSessionBase

class CtrLogisticRegression(SparkSessionBase):

    SPARK_APP_NAME = "ctrLogisticRegression"
    SPARK_URL = "local"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):

        self.spark = self._create_spark_hbase()

ctr = CtrLogisticRegression()

['/root/toutiao_project/reco_sys', '/root/toutiao_project', '/miniconda2/envs/py365/lib/python36.zip', '/miniconda2/envs/py365/lib/python3.6', '/miniconda2/envs/py365/lib/python3.6/lib-dynload', '', '/miniconda2/envs/py365/lib/python3.6/site-packages', '/miniconda2/envs/py365/lib/python3.6/site-packages/pyspark-2.2.2-py3.6.egg', '/miniconda2/envs/py365/lib/python3.6/site-packages/py4j-0.10.7-py3.6.egg', '/miniconda2/envs/py365/lib/python3.6/site-packages/IPython/extensions', '/root/.ipython']


In [None]:
'''
一、读取 用户行为表user_article_basic 与 用户画像表user_profile_hbase（HIVE关联HBASE的user_profile表） 和 文章画像表article_profile，构造训练样本
目标值Y：clicked
特征值X：
    用户画像关键词权重：权重值排序TOPK，这里取10个
    文章频道号：channel_id, ID类型通常要做one_hot编码，变成25维度(25个频道)
    这里由于我们的历史点击日志测试时候是只有18号频道，所以不进行转换
    文章向量：articlevector
    总共：10 + 1+ 100 = 110
'''

In [2]:
# (1)、用户行为日志表 user_article_basic（user_id、article_id 确定唯一行）
ctr.spark.sql('use profile')
user_article_basic = ctr.spark.sql("select user_id, article_id, clicked from user_article_basic")
print(user_article_basic.count())
user_article_basic.show()

995
+-------------------+----------+-------+
|            user_id|article_id|clicked|
+-------------------+----------+-------+
|1105045287866466304|     14225|  false|
|1106476833370537984|     14208|  false|
|1111189494544990208|     19322|  false|
|1111524501104885760|     44161|  false|
|1112727762809913344|     18172|   true|
|                  1|     44386|   true|
|                  1|     44696|  false|
|                 10|     43907|  false|
|1106473203766657024|     16005|  false|
|1108264901190615040|     15196|  false|
|                 23|     44739|   true|
|                 33|     13570|  false|
|                  1|     17632|  false|
|1106473203766657024|     17665|  false|
|1111189494544990208|     44368|  false|
|                 10|     44368|  false|
|1105093883106164736|     15750|  false|
|1106396183141548032|     19476|  false|
|1111524501104885760|     19233|  false|
|                  2|     44371|   true|
+-------------------+----------+-------+
only showing

In [34]:
# (2)、HBASE的用户画像表user_profile  对应的  HIVE关联HBASE 的 HIVE表user_profile_hbase（user_id确定唯一行）
user_profile_hbase = ctr.spark.sql("select user_id, information.gender, information.birthday, article_partial as user_article_partial from user_profile_hbase")
print(user_profile_hbase.count())
user_profile_hbase.show()

89
+--------------------+------+--------+--------------------+
|             user_id|gender|birthday|user_article_partial|
+--------------------+------+--------+--------------------+
|              user:1|  null|     0.0|Map(18:Animal -> ...|
|             user:10|  null|     0.0|Map(18:tp2 -> 0.1...|
|             user:11|  null|     0.0|               Map()|
|user:110249052282...|  null|     0.0|               Map()|
|user:110256196274...|  null|     0.0|               Map()|
|user:110319567345...|  null|     0.0|Map(18:Animal -> ...|
|user:110504528786...|  null|     0.0|Map(18:text -> 0....|
|user:110509388310...|  null|     0.0|Map(18:赋值 -> 0.14...|
|user:110510518565...|  null|     0.0|Map(18:SHOldboySt...|
|user:110639618314...|  null|     0.0|Map(18:tp2 -> 0.1...|
|user:110647320376...|  null|     0.0|Map(18:text -> 0....|
|user:110647683337...|  null|     0.0|Map(18:text -> 1....|
|user:110826490119...|  null|    null|Map(18:text -> 0....|
|user:111118949454...|  null|    null

In [35]:
# 对于用户ID做一个处理，取出前面的user字符串
def deal_with_user_id(row):
    return int(row.user_id.split(':')[1]), row.gender, row.birthday, row.user_article_partial

# 错误：不能直接 .toDF，如果DF存在连续大量的缺失值，toDF方法无法确定该列的类型。只能手动指定该列类型。
# user_profile_hbase = user_profile_hbase.rdd.map(deal_with_user_id).toDF(['user_id', 'gender', 'birthday', 'user_article_partial'])
# 解决：先转换为RDD（RDD不需要知道该列具体类型），然后再通过StructType手动指定。
user_profile = user_profile_hbase.rdd.map(deal_with_user_id)

In [36]:
_schema = StructType([
    StructField('user_id', LongType()),
    StructField('gender', BooleanType()),
    StructField('birthday', DoubleType()),
    StructField('user_article_partial', MapType(StringType(), DoubleType()))
])

In [37]:
user_profile_hbase = ctr.spark.createDataFrame(user_profile, schema=_schema).drop('gender').drop('birthday')
print(user_profile_hbase)
print(user_profile_hbase.count())

DataFrame[user_id: bigint, user_article_partial: map<string,double>]
89


In [25]:
# 用户行为表user_article_basic  left join  用户画像表user_profile_hbase  on user_id  得到 用户行为画像字段user_article_partial（用户基础信息画像字段 缺失值多 已被删除）
# 另：用户行为表user_article_basic（user_id、article_id确定伟一行），user_profile_hbase（user_id确定唯一行），所以train中：user_id和user_article_partial一一对应相同，又对应多行article_id，user_id、article_id确定唯一行 
train = user_article_basic.join(user_profile_hbase, on=['user_id'], how='left')
print(train.count())
train.show()

995
+-------------------+----------+-------+--------------------+
|            user_id|article_id|clicked|user_article_partial|
+-------------------+----------+-------+--------------------+
|1106473203766657024|     16005|  false|Map(18:text -> 0....|
|1106473203766657024|     17665|  false|Map(18:text -> 0....|
|1106473203766657024|     44664|  false|Map(18:text -> 0....|
|1106473203766657024|     44386|  false|Map(18:text -> 0....|
|1106473203766657024|     13778|  false|Map(18:text -> 0....|
|1106473203766657024|     13039|  false|Map(18:text -> 0....|
|1106473203766657024|     13648|  false|Map(18:text -> 0....|
|1106473203766657024|     17304|  false|Map(18:text -> 0....|
|1106473203766657024|     19233|  false|Map(18:text -> 0....|
|1106473203766657024|     44466|  false|Map(18:text -> 0....|
|1106473203766657024|     18795|  false|Map(18:text -> 0....|
|1106473203766657024|    134812|  false|Map(18:text -> 0....|
|1106473203766657024|     13357|  false|Map(18:text -> 0....|
|110

In [26]:
# (3)、读取 word2vec 的 文章画像向量表article_vector（article_id确定唯一行）
ctr.spark.sql("use article")
article_vector = ctr.spark.sql("select article_id, channel_id, articlevector as article_vector from article_vector")
print(article_vector.count())
article_vector.show()

11881
+----------+----------+--------------------+
|article_id|channel_id|      article_vector|
+----------+----------+--------------------+
|        26|        17|[0.02069368539384...|
|        29|        17|[-0.1446092289546...|
|       474|        17|[0.17293323921293...|
|      1677|        17|[-0.1303829028565...|
|      1806|        17|[0.02166337053188...|
|      1950|        17|[-0.3318378543653...|
|      2040|        17|[-0.0164312324191...|
|      2529|        17|[0.02575729180313...|
|      3506|        17|[0.08157531127196...|
|     38543|        17|[-0.3340523649251...|
|     39104|        17|[-0.1363798526910...|
|     40557|        17|[-0.1039882155372...|
|     41895|        17|[-0.0438782209959...|
|     74783|        17|[-0.0667113812378...|
|     75264|        17|[-0.0319393678308...|
|     75465|        17|[-0.0328539103164...|
|     76584|        17|[0.19926537834339...|
|     77605|        17|[0.12450708812808...|
|     78365|        17|[0.09693023461912...|
|   

In [27]:
# 文章画像向量表article_vector  与  train  合并
train = train.join(article_vector, on=['article_id'], how='left')
print(train.count())
train.show()

995
+-------------------+-------------------+-------+--------------------+----------+--------------------+
|         article_id|            user_id|clicked|user_article_partial|channel_id|      article_vector|
+-------------------+-------------------+-------+--------------------+----------+--------------------+
|              13401|                 10|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|
|              13401|1106396183141548032|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|
|              14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1103195673450250240|  false|Map(18:Animal -> ...|        18|[0.11028526511434...|
|              14805|1105045287866466304|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1111524501104885760|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|              14805|1105105185656537088|  false|Map(18:SHOldboySt...

In [28]:
# (4)、读取 文章关键词、主题词画像表article_profile（article_id确定唯一行） 取出每篇文章前10关键词权重值
ctr.spark.sql("use article")
article_profile = ctr.spark.sql("select article_id, keywords from article_profile")
# 处理 文章关键词 权重
def get_article_weights(row):
    try:
        weights = sorted(row.keywords.values())[:10]
    except Exception as e:
        # 给定异常默认值：没有关键词 或 不足10个的会报异常
        weights = [0.0] * 10
    
    return row.article_id, weights

article_profile = article_profile.rdd.map(get_article_weights).toDF(['article_id', 'article_keywords_weights'])
print(article_profile.count())
article_profile.show()

138708
+----------+------------------------+
|article_id|article_keywords_weights|
+----------+------------------------+
|        26|    [0.19827163395829...|
|        29|    [0.26031398249056...|
|       474|    [0.49818598558926...|
|       964|    [0.42194661121527...|
|      1677|    [0.19827339246090...|
|      1697|    [0.25105539265038...|
|      1806|    [0.18449119772340...|
|      1950|    [0.33331407122173...|
|      2040|    [0.38583431341698...|
|      2214|    [0.43761156267670...|
|      2250|    [0.46477621366740...|
|      2453|    [0.50514620188273...|
|      2509|    [0.15138306650944...|
|      2529|    [0.11634963900866...|
|      2927|    [0.28513034617795...|
|      3091|    [0.23478830492918...|
|      3506|    [0.22844780420769...|
|      3764|    [0.27265314149033...|
|      4590|    [0.40296288036812...|
|      4823|    [0.21729897161021...|
+----------+------------------------+
only showing top 20 rows



In [29]:
# 结果  与  train  合并
train = train.join(article_profile, on=['article_id'], how='left')
train = train.dropna()
print(train.count())
train.show()

72
+----------+-------------------+-------+--------------------+----------+--------------------+------------------------+
|article_id|            user_id|clicked|user_article_partial|channel_id|      article_vector|article_keywords_weights|
+----------+-------------------+-------+--------------------+----------+--------------------+------------------------+
|     13401|                 10|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|    [0.08196639249252...|
|     13401|1106396183141548032|  false|Map(18:tp2 -> 0.1...|        18|[0.06157120217893...|    [0.08196639249252...|
|     14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|    [0.15069781969741...|
|     14805|1103195673450250240|  false|Map(18:Animal -> ...|        18|[0.11028526511434...|    [0.15069781969741...|
|     14805|1105045287866466304|  false|Map(18:text -> 0....|        18|[0.11028526511434...|    [0.15069781969741...|
|     14805|1111524501104885760|  false|Map(1

In [30]:
train

DataFrame[article_id: bigint, user_id: bigint, clicked: boolean, user_article_partial: map<string,double>, channel_id: int, article_vector: array<double>, article_keywords_weights: array<double>]

In [31]:
# (5)、进行 用户画像字段user_article_partial的特征筛选（只保留与该行数据channel_id相同的 用户画像数据）
# 经过筛选的user_article_partial字段、文章画像向量字段article_vector、文章画像关键词权重字段article_keywords_weights 都为Array类型 转换为 Vectors类型 进模型
columns = ['article_id', 'user_id', 'channel_id', 'article_vector', 'article_keywords_weights', 'user_article_partial', 'clicked']
# array ---> vecoter
def get_user_weights(row):
    # 取出所有对应particle平道的关键词权重（用户）
    from pyspark.ml.linalg import Vectors
    try:
        weights = sorted([row.user_article_partial[key] for key in row.user_article_partial.keys() if key.split(':')[0] == str(row.channel_id)])[:10]
    except Exception as e:
        # 给定异常默认值：没有用户画像数据 或 不足10个的会报异常
        weights = [0.0] * 10
    
    return row.article_id, row.user_id, row.channel_id, Vectors.dense(row.article_vector), Vectors.dense(row.article_keywords_weights), Vectors.dense(weights), int(row.clicked) 

train_1 = train.rdd.map(get_user_weights).toDF(columns)

In [32]:
train_1.show()

+----------+-------------------+----------+--------------------+------------------------+--------------------+-------+
|article_id|            user_id|channel_id|      article_vector|article_keywords_weights|user_article_partial|clicked|
+----------+-------------------+----------+--------------------+------------------------+--------------------+-------+
|     13401|                 10|        18|[0.06157120217893...|    [0.08196639249252...|[0.14721006870437...|      0|
|     13401|1106396183141548032|        18|[0.06157120217893...|    [0.08196639249252...|[0.14781548441231...|      0|
|     14805|1106473203766657024|        18|[0.11028526511434...|    [0.15069781969741...|[0.14781548441231...|      0|
|     14805|1103195673450250240|        18|[0.11028526511434...|    [0.15069781969741...|[0.14727626890745...|      0|
|     14805|1105045287866466304|        18|[0.11028526511434...|    [0.15069781969741...|[0.14754371459474...|      0|
|     14805|1111524501104885760|        18|[0.11

In [33]:
train_1

DataFrame[article_id: bigint, user_id: bigint, channel_id: bigint, article_vector: vector, article_keywords_weights: vector, user_article_partial: vector, clicked: bigint]

In [16]:
# 使用 模型特征收集器VectorAssembler：收集特征从 channel_id → user_article_partial 共4个特征。columns[2:6]左闭右开
train_vecrsion_two = VectorAssembler().setInputCols(columns[2:6]).setOutputCol('features').transform(train_1) # API必须连串书写

In [17]:
# 合并特征向量：features = 1个channel_id + 100个文章向量article_vector + 10个文章关键词权重article_keywords_weights + 10个用户特征权重user_article_partial = 121个特征
# 这里的测试数据只有18号频道或13号频道，所以对channel_id没有做One-Hot编码，正式情况下应该对channel_id字段做One-Hot编码：
# 共25个频道One-Hot编码后得到25个特征（每个特征25维向量），所以总特征features = 25 + 100 + 10 + 10 = 145个特征
train_vecrsion_two.show()

+----------+-------------------+----------+--------------------+------------------------+--------------------+-------+--------------------+
|article_id|            user_id|channel_id|      article_vector|article_keywords_weights|user_article_partial|clicked|            features|
+----------+-------------------+----------+--------------------+------------------------+--------------------+-------+--------------------+
|     13401|                 10|        18|[0.06157120217893...|    [0.08196639249252...|[0.14721006870437...|      0|[18.0,0.061571202...|
|     13401|1106396183141548032|        18|[0.06157120217893...|    [0.08196639249252...|[0.14781548441231...|      0|[18.0,0.061571202...|
|     14805|1106473203766657024|        18|[0.11028526511434...|    [0.15069781969741...|[0.14781548441231...|      0|[18.0,0.110285265...|
|     14805|1103195673450250240|        18|[0.11028526511434...|    [0.15069781969741...|[0.14727626890745...|      0|[18.0,0.110285265...|
|     14805|11050452

In [18]:
# lr = LogisticRegression()
# model = lr.setLabelCol("clicked").setFeaturesCol("features").fit(train_vecrsion_two) # API必须连串书写
# model.save("hdfs://hadoop-master:9000/headlines/models/test_ctr.obj")

In [2]:
# 加载之前已经训练好的 逻辑回归模型：
online_model = LogisticRegressionModel.load("hdfs://hadoop-master:9000/headlines/models/logistic_ctr_model.obj")
res_transfrom = online_model.transform(train_version_two) # 对训练集进行预测

In [None]:
def vector_to_double(row):
    return float(row.clicked), float(row.probability[1]) 

# clicked 目标值（真实）， probability:[不点击的概率，点击的概率]
score_label = res_transfrom.select(["clicked", "probability"]).rdd.map(vector_to_double)

In [None]:
# 模型评估-Accuracy与AUC
import matplotlib.pyplot as plt
plt.figure(figsize=(5,5))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(model.summary.roc.select('FPR').collect(),
         model.summary.roc.select('TPR').collect())
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.show()

In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
metrics = BinaryClassificationMetrics(score_label)
metrics.areaUnderROC

In [None]:
# 其它方法：使用SKLearn库
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np

arr = np.array(score_label.collect())
accuracy_score(arr[:, 0], arr[:, 1].round())
roc_auc_score(arr[:, 0], arr[:, 1])

In [69]:
#  二、用户特征中心
'''
每个用户会有25个频道的画像结果
对25个频道都进行特征抽取：10个关键词权重⼤的值
'''
ctr.spark.sql("use profile")
user_profile_hbase = ctr.spark.sql("select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase")
# 特征工程处理
# 抛弃获取值少的特征：用户行为画像字段（用户基础信息画像字段 缺失值多 已被删除）
# user_profile_hbase = user_profile_hbase.drop('env', 'birthday', 'gender') # 不能在这删除，删除之后 DataFrame.rdd 只有20行数据
print(user_profile_hbase.count())
user_profile_hbase.show()

89
+--------------------+--------+------+--------------------+-----+
|             user_id|birthday|gender|     article_partial|  env|
+--------------------+--------+------+--------------------+-----+
|              user:1|     0.0|  null|Map(18:Animal -> ...|Map()|
|             user:10|     0.0|  null|Map(18:tp2 -> 0.1...|Map()|
|             user:11|     0.0|  null|               Map()|Map()|
|user:110249052282...|     0.0|  null|               Map()|Map()|
|user:110256196274...|     0.0|  null|               Map()|Map()|
|user:110319567345...|     0.0|  null|Map(18:Animal -> ...|Map()|
|user:110504528786...|     0.0|  null|Map(18:text -> 0....|Map()|
|user:110509388310...|     0.0|  null|Map(18:赋值 -> 0.14...|Map()|
|user:110510518565...|     0.0|  null|Map(18:SHOldboySt...|Map()|
|user:110639618314...|     0.0|  null|Map(18:tp2 -> 0.1...|Map()|
|user:110647320376...|     0.0|  null|Map(18:text -> 0....|Map()|
|user:110647683337...|     0.0|  null|Map(18:text -> 1....|Map()|
|user:1

In [70]:
def get_user_id(row):
    # return int(row.user_id.split(":")[1]), row.article_partial # 直接返回2个字段即可，后续就无需删除了。
    return int(row.user_id.split(":")[1]), row.gender, row.birthday, row.article_partial # 只是为了做删除演示

user_profile_hbase_temp = user_profile_hbase.rdd.map(get_user_id)
print(user_profile_hbase.count(), user_profile_hbase_temp.count())

89 89


In [72]:
from pyspark.sql.types import *
# 大量缺失值导致.toDF方法不能自动确定列数据类型而报错，改为手动指定字段数据类型
_schema = StructType([
    StructField("user_id", LongType()),
    StructField('gender', BooleanType()),
    StructField('birthday', DoubleType()),
    StructField("weights", MapType(StringType(), DoubleType()))
])

# 只能在RDD 重建 DataFrame 后 删除字段，数据不会丢失。
user_profile_hbase_schema = ctr.spark.createDataFrame(user_profile_hbase_temp, schema=_schema).drop('birthday', 'gender')
print(user_profile_hbase_schema.count())
user_profile_hbase_schema.show()

89
+-------------------+--------------------+
|            user_id|             weights|
+-------------------+--------------------+
|                  1|Map(18:Animal -> ...|
|                 10|Map(18:tp2 -> 0.1...|
|                 11|               Map()|
|1102490522829717504|               Map()|
|1102561962748805120|               Map()|
|1103195673450250240|Map(18:Animal -> ...|
|1105045287866466304|Map(18:text -> 0....|
|1105093883106164736|Map(18:赋值 -> 0.14...|
|1105105185656537088|Map(18:SHOldboySt...|
|1106396183141548032|Map(18:tp2 -> 0.1...|
|1106473203766657024|Map(18:text -> 0....|
|1106476833370537984|Map(18:text -> 1....|
|1108264901190615040|Map(18:text -> 0....|
|1111189494544990208|Map(18:text -> 0....|
|1111524501104885760|Map(18:text -> 0....|
|1112727762809913344|Map(18:text -> 0....|
|1113053603926376448|Map(18:ssword -> ...|
|1113244157343694848|Map(18:text -> 0....|
|1114863846486441984|               Map()|
|1114863941936218112|               Map()|
+-------

In [73]:
def frature_preprocess(row):

    from pyspark.ml.linalg import Vectors

    channel_weights = [] # 长度为25
    for i in range(1, 26): # 1 到 25 频道
        try:
            # 每个用户 → 每个频道 选取 主题词权重值前10 
            _res = sorted([row.weights[key] for key in row.weights.keys() if key.split(':')[0] == str(i)])[:10]
            channel_weights.append(_res) # _res是数组类型，长度为10
        except:
            channel_weights.append([0.0] * 10) # [0.0] * 10是数组类型

    return row.user_id, channel_weights

res = user_profile_hbase_schema.rdd.map(frature_preprocess).collect()

In [74]:
print(len(res))
res

89


[(1,
  [[],
   [],
   [],
   [],
   [],
   [],
   [1.0371303041051214],
   [],
   [],
   [],
   [],
   [],
   [1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214,
    1.0371303041051214],
   [],
   [],
   [],
   [],
   [0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466,
    0.14714412923983466],
   [],
   [],
   [],
   [],
   [],
   [],
   []]),
 (10,
  [[],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [],
   [0.14721006870437733,
    0.14721006870437733,
    0.14721006870437733,
    0.14721006870437733,
    0.14721006870437733,
    0.14721006870437733,
    0.14721006870437733,
    0.14721006870437733,
    0.147210068704377

In [75]:
res[0] # 第一个用户数据：1为user_id，[[],[]...[]]为用户在25个频道中每个频道的主题词对应的前10个权重值

(1,
 [[],
  [],
  [],
  [],
  [],
  [],
  [1.0371303041051214],
  [],
  [],
  [],
  [],
  [],
  [1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214,
   1.0371303041051214],
  [],
  [],
  [],
  [],
  [0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466,
   0.14714412923983466],
  [],
  [],
  [],
  [],
  [],
  [],
  []])

In [76]:
# 用户特征中心 存储进HBASE表：ctr_feature_user
import happybase
# 批量插入Hbase数据库中
pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
with pool.connection() as conn:
    ctr_feature_user = conn.table('ctr_feature_user')
    with ctr_feature_user.batch(transaction=True) as b:
        for i in range(len(res)): # len(res)个用户
            for j in range(25): # 每个用户有25个频道 → 每个频道前10的主题词权重值
                b.put('{}'.format(res[i][0]).encode(), {'channel:{}'.format(j + 1).encode(): str(res[i][1][j]).encode()}) # res[i][1]是二维数组
    conn.close()

In [3]:
#  三、文章特征中心
ctr.spark.sql("use article")
article_profile = ctr.spark.sql("select * from article_profile")
print(article_profile.count(), article_profile.rdd.count())
article_profile.show()

138708 138708
+----------+----------+--------------------+--------------------+
|article_id|channel_id|            keywords|              topics|
+----------+----------+--------------------+--------------------+
|        26|        17|Map(jpg -> 0.9806...|[Electron, 全自动, 产...|
|        29|        17|Map(github -> 0.8...|[WebAssembly, 影音,...|
|       474|        17|Map(png -> 1.7596...|[textAlign, borde...|
|       964|        11|Map(DemoService -...|[protocol, RMI, d...|
|      1677|        17|Map(require -> 0....|[spritesmith, ico...|
|      1697|         6|Map(函数 -> 0.27181...|[nav, 样式, width, ...|
|      1806|        17|Map(auto -> 0.585...|[声明, word, 容器, Ex...|
|      1950|        17|Map(koala -> 3.97...|[app, scss, koala...|
|      2040|        17|Map(style -> 2.68...|[宽度, 媒体, width, r...|
|      2214|        11|Map(国家 -> 1.38065...|[Cyber, 语言, 黑客, 知...|
|      2250|         6|Map(imageView2 ->...|[宽度, cal, 阶梯, 页面,...|
|      2453|        13|Map(__ -> 2.33283...|[__, CNN, logisti.

In [4]:
def article_profile_to_feature(row):
    try:
        weights = sorted(row.keywords.values())[:10]
    except Exception as e:
        weights = [0.0] * 10
    return row.article_id, row.channel_id, weights

article_profile = article_profile.rdd.map(article_profile_to_feature).toDF(['article_id', 'channel_id', 'article_keywords_weights'])
print(article_profile.count())
article_profile.show()

138708
+----------+----------+------------------------+
|article_id|channel_id|article_keywords_weights|
+----------+----------+------------------------+
|        26|        17|    [0.19827163395829...|
|        29|        17|    [0.26031398249056...|
|       474|        17|    [0.49818598558926...|
|       964|        11|    [0.42194661121527...|
|      1677|        17|    [0.19827339246090...|
|      1697|         6|    [0.25105539265038...|
|      1806|        17|    [0.18449119772340...|
|      1950|        17|    [0.33331407122173...|
|      2040|        17|    [0.38583431341698...|
|      2214|        11|    [0.43761156267670...|
|      2250|         6|    [0.46477621366740...|
|      2453|        13|    [0.50514620188273...|
|      2509|        13|    [0.15138306650944...|
|      2529|        17|    [0.11634963900866...|
|      2927|         6|    [0.28513034617795...|
|      3091|         6|    [0.23478830492918...|
|      3506|        17|    [0.22844780420769...|
|      3764| 

In [5]:
article_vector = ctr.spark.sql("select * from article_vector")
article_feature = article_profile.join(article_vector, on=['article_id'], how='inner')

def feature_to_vector(row):
    from pyspark.ml.linalg import Vectors
    return row.article_id, row.channel_id, Vectors.dense(row.articlevector), Vectors.dense(row.article_keywords_weights)

article_feature = article_feature.rdd.map(feature_to_vector).toDF(['article_id', 'channel_id', 'article_vector', 'article_keywords_weights'])
print(article_feature.count())
article_feature.show()

11881
+----------+----------+--------------------+------------------------+
|article_id|channel_id|      article_vector|article_keywords_weights|
+----------+----------+--------------------+------------------------+
|        26|        17|[0.02069368539384...|    [0.19827163395829...|
|        29|        17|[-0.1446092289546...|    [0.26031398249056...|
|       474|        17|[0.17293323921293...|    [0.49818598558926...|
|      1677|        17|[-0.1303829028565...|    [0.19827339246090...|
|      1697|         6|[0.05229978313861...|    [0.25105539265038...|
|      1806|        17|[0.02166337053188...|    [0.18449119772340...|
|      1950|        17|[-0.3318378543653...|    [0.33331407122173...|
|      2040|        17|[-0.0164312324191...|    [0.38583431341698...|
|      2250|         6|[-0.0597617824653...|    [0.46477621366740...|
|      2453|        13|[-0.1038588426578...|    [0.50514620188273...|
|      2509|        13|[0.04533940468085...|    [0.15138306650944...|
|      2529|  

In [6]:
# 保存特征数据
cols2 = ['article_id', 'channel_id', 'article_vector', 'article_keywords_weights']
# 做特征的指定指定合并
article_feature_two = VectorAssembler().setInputCols(cols2[1:4]).setOutputCol("features").transform(article_feature)
print(article_feature_two.count())
article_feature_two.show()

11881
+----------+----------+--------------------+------------------------+--------------------+
|article_id|channel_id|      article_vector|article_keywords_weights|            features|
+----------+----------+--------------------+------------------------+--------------------+
|        26|        17|[0.02069368539384...|    [0.19827163395829...|[17.0,0.020693685...|
|        29|        17|[-0.1446092289546...|    [0.26031398249056...|[17.0,-0.14460922...|
|       474|        17|[0.17293323921293...|    [0.49818598558926...|[17.0,0.172933239...|
|      1677|        17|[-0.1303829028565...|    [0.19827339246090...|[17.0,-0.13038290...|
|      1697|         6|[0.05229978313861...|    [0.25105539265038...|[6.0,0.0522997831...|
|      1806|        17|[0.02166337053188...|    [0.18449119772340...|[17.0,0.021663370...|
|      1950|        17|[-0.3318378543653...|    [0.33331407122173...|[17.0,-0.33183785...|
|      2040|        17|[-0.0164312324191...|    [0.38583431341698...|[17.0,-0.016431

In [7]:
# 文章特征中心 存储进HBASE表：ctr_feature_article
def save_article_feature_to_hbase(partition):
    import happybase
    pool = happybase.ConnectionPool(size=10, host='hadoop-master')
    with pool.connection() as conn:
        table = conn.table('ctr_feature_article')
        for row in partition:
            table.put('{}'.format(row.article_id).encode(),
                     {'article:{}'.format(row.article_id).encode(): str(row.features).encode()})

article_feature_two.foreachPartition(save_article_feature_to_hbase)