In [1]:
import os
import sys
# 如果当前代码文件运行测试需要加入修改路径，避免出现后导包问题
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.insert(0, os.path.join(BASE_DIR))
sys.path.insert(0, os.path.join(BASE_DIR, "reco_sys"))
print(sys.path)

PYSPARK_PYTHON = "/miniconda2/envs/reco_sys/bin/python"
# 当存在多个版本时，不指定很可能会导致出错
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
os.environ["PYSPARK_DRIVER_PYTHON"] = PYSPARK_PYTHON

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from offline import SparkSessionBase

class CtrLogisticRegression(SparkSessionBase):

    SPARK_APP_NAME = "ctrLogisticRegression"
    SPARK_URL = "local"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):

        self.spark = self._create_spark_hbase()

ctr = CtrLogisticRegression()

['/root/toutiao_project/reco_sys', '/root/toutiao_project', '/miniconda2/envs/py365/lib/python36.zip', '/miniconda2/envs/py365/lib/python3.6', '/miniconda2/envs/py365/lib/python3.6/lib-dynload', '', '/miniconda2/envs/py365/lib/python3.6/site-packages', '/miniconda2/envs/py365/lib/python3.6/site-packages/pyspark-2.2.2-py3.6.egg', '/miniconda2/envs/py365/lib/python3.6/site-packages/py4j-0.10.7-py3.6.egg', '/miniconda2/envs/py365/lib/python3.6/site-packages/IPython/extensions', '/root/.ipython']


In [9]:
# (1)、用户行为日志表 user_article_basic（user_id、article_id 确定唯一行）
ctr.spark.sql('use profile')
user_article_basic = ctr.spark.sql("select user_id, article_id, clicked from user_article_basic")
user_article_basic.show()

+-------------------+----------+-------+
|            user_id|article_id|clicked|
+-------------------+----------+-------+
|1105045287866466304|     14225|  false|
|1106476833370537984|     14208|  false|
|1111189494544990208|     19322|  false|
|1111524501104885760|     44161|  false|
|1112727762809913344|     18172|   true|
|                  1|     44386|   true|
|                  1|     44696|  false|
|                 10|     43907|  false|
|1106473203766657024|     16005|  false|
|1108264901190615040|     15196|  false|
|                 23|     44739|   true|
|                 33|     13570|  false|
|                  1|     17632|  false|
|1106473203766657024|     17665|  false|
|1111189494544990208|     44368|  false|
|                 10|     44368|  false|
|1105093883106164736|     15750|  false|
|1106396183141548032|     19476|  false|
|1111524501104885760|     19233|  false|
|                  2|     44371|   true|
+-------------------+----------+-------+
only showing top

In [3]:
# (2)、HBASE的用户画像表user_profile  对应的  HIVE关联HBASE的 HIVE表user_profile_hbase（user_id确定唯一行）
user_profile_hbase = ctr.spark.sql("select user_id, information.gender, information.birthday, article_partial from user_profile_hbase")
user_profile_hbase.show()

+--------------------+------+--------+--------------------+
|             user_id|gender|birthday|     article_partial|
+--------------------+------+--------+--------------------+
|              user:1|  null|     0.0|Map(18:Animal -> ...|
|             user:10|  null|     0.0|Map(18:tp2 -> 0.1...|
|             user:11|  null|     0.0|               Map()|
|user:110249052282...|  null|     0.0|               Map()|
|user:110256196274...|  null|     0.0|               Map()|
|user:110319567345...|  null|     0.0|Map(18:Animal -> ...|
|user:110504528786...|  null|     0.0|Map(18:text -> 0....|
|user:110509388310...|  null|     0.0|Map(18:赋值 -> 0.14...|
|user:110510518565...|  null|     0.0|Map(18:SHOldboySt...|
|user:110639618314...|  null|     0.0|Map(18:tp2 -> 0.1...|
|user:110647320376...|  null|     0.0|Map(18:text -> 0....|
|user:110647683337...|  null|     0.0|Map(18:text -> 1....|
|user:110826490119...|  null|    null|Map(18:text -> 0....|
|user:111118949454...|  null|    null|Ma

In [7]:
# 对于用户ID做一个处理，取出前面的user字符串
def deal_with_user_id(row):
    return int(row.user_id.split(':')[1]), row.gender, row.birthday, row.article_partial

# 错误
# user_profile_hbase = user_profile_hbase.rdd.map(deal_with_user_id).toDF(['user_id', 'gender', 'birthday', 'article_partial'])
user_profile = user_profile_hbase.rdd.map(deal_with_user_id)

In [8]:
_schema = StructType([
    StructField('user_id', LongType()),
    StructField('gender', BooleanType()),
    StructField('birthday', DoubleType()),
    StructField('article_partial', MapType(StringType(), DoubleType()))
])

In [11]:
user_profile_hbase = ctr.spark.createDataFrame(user_profile, schema=_schema).drop('gender').drop('birthday')

In [12]:
user_profile_hbase

DataFrame[user_id: bigint, article_partial: map<string,double>]

In [35]:
train = user_article_basic.join(user_profile_hbase, on=['user_id'], how='left')

In [36]:
train.show()

+-------------------+----------+-------+--------------------+
|            user_id|article_id|clicked|     article_partial|
+-------------------+----------+-------+--------------------+
|1106473203766657024|     16005|  false|Map(18:text -> 0....|
|1106473203766657024|     17665|  false|Map(18:text -> 0....|
|1106473203766657024|     44664|  false|Map(18:text -> 0....|
|1106473203766657024|     44386|  false|Map(18:text -> 0....|
|1106473203766657024|     14335|  false|Map(18:text -> 0....|
|1106473203766657024|     13778|  false|Map(18:text -> 0....|
|1106473203766657024|     13039|  false|Map(18:text -> 0....|
|1106473203766657024|     13648|  false|Map(18:text -> 0....|
|1106473203766657024|     17304|  false|Map(18:text -> 0....|
|1106473203766657024|     19233|  false|Map(18:text -> 0....|
|1106473203766657024|     44466|  false|Map(18:text -> 0....|
|1106473203766657024|     18795|  false|Map(18:text -> 0....|
|1106473203766657024|    134812|  false|Map(18:text -> 0....|
|1106473

In [37]:
# (3)、文章频道与向量读取合并，删除无用的特征，合并文章画像的权重特征
ctr.spark.sql("use article")
article_vector = ctr.spark.sql("select * from article_vector")
article_vector.show()

In [38]:
train = train.join(article_vector, on=['article_id'], how='left')
train.show()

+----------+-------------------+-------+--------------------+----------+--------------------+
|article_id|            user_id|clicked|     article_partial|channel_id|       articlevector|
+----------+-------------------+-------+--------------------+----------+--------------------+
|     13401|1114864237131333632|  false|Map(18:vars -> 0....|        18|[0.06157120217893...|
|     13401|                 10|  false|Map(18:tp2 -> 0.2...|        18|[0.06157120217893...|
|     13401|1106396183141548032|  false|Map(18:tp2 -> 0.2...|        18|[0.06157120217893...|
|     13401|1109994594201763840|  false|Map(18:tp2 -> 0.2...|        18|[0.06157120217893...|
|     14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|     14805|1113049054452908032|  false|Map(18:text -> 0....|        18|[0.11028526511434...|
|     14805|1114863751909081088|   true|Map(18:text -> 2....|        18|[0.11028526511434...|
|     14805|1115534909935452160|  false|Map(18:text -> 2....

In [40]:
# 读取文章画像
ctr.spark.sql("use article")
article_profile = ctr.spark.sql("select article_id, keywords from article_profile")
# 处理文章权重
def get_article_weights(row):
    
    try:
        weights = sorted(row.keywords.values())[:10]
    except Exception as e:
        # 给定异常默认值
        weights = [0.0] * 10
    
    return row.article_id, weights

article_profile = article_profile.rdd.map(get_article_weights).toDF(['article_id', 'article_weights'])

In [41]:
# article_profile
train = train.join(article_profile, on=['article_id'], how='left')



In [42]:
train.show()



+----------+-------------------+-------+--------------------+----------+--------------------+--------------------+
|article_id|            user_id|clicked|     article_partial|channel_id|       articlevector|     article_weights|
+----------+-------------------+-------+--------------------+----------+--------------------+--------------------+
|     13401|1114864237131333632|  false|Map(18:vars -> 0....|        18|[0.06157120217893...|[0.08196639249252...|
|     13401|                 10|  false|Map(18:tp2 -> 0.2...|        18|[0.06157120217893...|[0.08196639249252...|
|     13401|1106396183141548032|  false|Map(18:tp2 -> 0.2...|        18|[0.06157120217893...|[0.08196639249252...|
|     13401|1109994594201763840|  false|Map(18:tp2 -> 0.2...|        18|[0.06157120217893...|[0.08196639249252...|
|     14805|1106473203766657024|  false|Map(18:text -> 0....|        18|[0.11028526511434...|[0.15069781969741...|
|     14805|1113049054452908032|  false|Map(18:text -> 0....|        18|[0.11028

In [44]:
train

DataFrame[article_id: bigint, user_id: bigint, clicked: boolean, article_partial: map<string,double>, channel_id: int, articlevector: array<double>, article_weights: array<double>]

In [45]:
# - (4)进行用户的权重特征筛选处理，类型处理
train = train.dropna()

columns = ['article_id', 'user_id', 'channel_id', 'articlevector', 'user_weights', 'article_weights', 'clicked']
# array --->vecoter
def get_user_weights(row):
    
    # 取出所有对应particle平道的关键词权重（用户）
    from pyspark.ml.linalg import Vectors
    try:
        weights = sorted([row.article_partial[key] for key in 
                          row.article_partial.keys() if key.split(':')[0] == str(row.channel_id)])[:10]
    except Exception as e:
        weights = [0.0] * 10
    
    return row.article_id, row.user_id, row.channel_id, Vectors.dense(row.articlevector), Vectors.dense(weights), Vectors.dense(row.article_weights),int(row.clicked) 

train_1 = train.rdd.map(get_user_weights).toDF(columns)


In [46]:
train_1

DataFrame[article_id: bigint, user_id: bigint, channel_id: bigint, articlevector: vector, user_weights: vector, article_weights: vector, clicked: bigint]

In [47]:
# 使用收集特征dao features
train_vecrsion_two = VectorAssembler().setInputCols(columns[2:6]).setOutputCol('features').transform(train_1)

In [48]:
# features 121值, 13, 18,       1,2,3,4,5,6....25
# 25 + 100 + 10 + 10 = 145个特征
train_vecrsion_two.show()

+----------+-------------------+----------+--------------------+--------------------+--------------------+-------+--------------------+
|article_id|            user_id|channel_id|       articlevector|        user_weights|     article_weights|clicked|            features|
+----------+-------------------+----------+--------------------+--------------------+--------------------+-------+--------------------+
|     13401|1114864237131333632|        18|[0.06157120217893...|[0.32473420471378...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     13401|                 10|        18|[0.06157120217893...|[0.21215332784742...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     13401|1106396183141548032|        18|[0.06157120217893...|[0.22553064631951...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     13401|1109994594201763840|        18|[0.06157120217893...|[0.24443647588626...|[0.08196639249252...|      0|[18.0,0.061571202...|
|     14805|1106473203766657024|        18|[0.11

In [3]:
# lr = LogisticRegression()
# model = lr.setLabelCol("clicked").setFeaturesCol("features").fit(train_vecrsion_two)
# model.save("hdfs://hadoop-master:9000/headlines/models/test_ctr.obj")

In [2]:
# online_model = LogisticRegressionModel.load("hdfs://hadoop-master:9000/headlines/models/logistic_ctr_model.obj")

# res_transfrom = online_model.transform(train_version_two)


In [None]:
def vector_to_double(row):
    return float(row.clicked), float(row.probability[1]) 

score_label = res_transfrom.select(["clicked", "probability"]).rdd.map(vector_to_double)

In [None]:
# clicked 目标值（真实）
# probability: [不点击的概率， 点击的概率]

In [4]:
# 构造样本
ctr.spark.sql("use profile")

user_profile_hbase = ctr.spark.sql(
    "select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase")

# 特征工程处理
# 抛弃获取值少的特征
user_profile_hbase = user_profile_hbase.drop('env', 'birthday', 'gender')

def get_user_id(row):
    return int(row.user_id.split(":")[1]), row.article_partial

user_profile_hbase_temp = user_profile_hbase.rdd.map(get_user_id)

from pyspark.sql.types import *

_schema = StructType([
    StructField("user_id", LongType()),
    StructField("weights", MapType(StringType(), DoubleType()))
])

user_profile_hbase_schema = ctr.spark.createDataFrame(user_profile_hbase_temp, schema=_schema)

def frature_preprocess(row):

    from pyspark.ml.linalg import Vectors

    channel_weights = []
    for i in range(1, 26):
        try:
            _res = sorted([row.weights[key] for key
                           in row.weights.keys() if key.split(':')[0] == str(i)])[:10]
            channel_weights.append(_res)
        except:
            channel_weights.append([0.0] * 10)

    return row.user_id, channel_weights

res = user_profile_hbase_schema.rdd.map(frature_preprocess).collect()

In [9]:
# res

In [8]:
print(len(res))

72


In [12]:
# (10,
#   [[],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846,
#     0.21215332784742846],
#    [],
#    [],
#    [],
#    [],
#    [],
#    [],
#    []])
import happybase
# 批量插入Hbase数据库中
pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
with pool.connection() as conn:
    ctr_feature_user = conn.table('ctr_feature_user')
    with ctr_feature_user.batch(transaction=True) as b:
        for i in range(len(res)):
            for j in range(25):
                # j 0~~~24
                b.put('{}'.format(res[i][0]).encode(), {'channel:{}'.format(j + 1).encode(): str(res[i][1][j]).encode()})
    conn.close()