## 基于LR的点击率预测模型训练

本小节主要根据广告点击样本数据集(raw_sample)、广告基本特征数据集(ad_feature)、用户基本信息数据集(user_profile)构建出了一个完整的样本数据集，并按日期划分为了训练集(前七天)和测试集(最后一天)，利用逻辑回归进行训练。

训练模型时，通过对类别特征数据进行处理，在一定程度上提高了模型的效果。

In [2]:
# Spark session configuration for this notebook.
from pyspark import SparkConf
from pyspark.sql import SparkSession

SPARK_APP_NAME = "createCTRModelByLR"
SPARK_URL = "yarn"

# (key, value) pairs applied to the SparkConf below.
# Full option reference: https://spark.apache.org/docs/latest/configuration.html
config = (
    # Application name (original note: a random name is generated if omitted).
    ("spark.app.name", SPARK_APP_NAME),
    # Memory requested for the executor (original note: the default is 1g).
    ("spark.executor.memory", "2g"),
    # Address of the Spark master; "yarn" here.
    ("spark.master", SPARK_URL),
    # Number of CPU cores each executor uses.
    ("spark.executor.cores", "2"),
    # Number of executors (original note: only takes effect on YARN).
    # The value is the int 1, exactly as originally passed.
    ("spark.executor.instances", 1),
)

# Create the config object and apply every pair in one call.
conf = SparkConf().setAll(config)

# Build the SparkSession from the config object.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
!hadoop fs -ls /workspace/3.rs_project/project1/dataset

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/root/bigdata/hadoop-2.9.1/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/root/bigdata/apache-hive-2.3.4-bin/lib/log4j-slf4j-impl-2.6.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 4 items
-rw-r--r--   1 root supergroup    31286431 2018-11-30 02:52 /project1-ad-rs/datasets/ad_feature.csv
-rw-r--r--   1 root supergroup 23728773580 2018-11-30 02:50 /project1-ad-rs/datasets/behavior_log.csv
-rw-r--r--   1 root supergroup  1088060964 2018-11-30 02:53 /project1-ad-rs/datasets/raw_sample.csv
-rw-r--r--   1 root supergroup    24056588 2018-11-30 02:52 /project1-ad-rs/datasets/user_profile.csv


In [4]:
'''从HDFS中加载样本数据信息'''
# Read the raw click-sample CSV from HDFS; the first line is the header.
# No schema is supplied here, so every column is cast explicitly below.
_raw_sample_df1 = spark.read.csv(
    "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/dataset/raw_sample.csv",
    header=True,
)
# _raw_sample_df1.show()    # preview the data (first 20 rows by default)
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType

# Reshape the table: cast each column to its target type and rename
# user -> userId, time_stamp -> timestamp, adgroup_id -> adgroupId.
# Each cast still reads the column from _raw_sample_df1, as in the original chain.
_raw_sample_df2 = (
    _raw_sample_df1
    .withColumn("user", _raw_sample_df1["user"].cast(IntegerType()))
    .withColumnRenamed("user", "userId")
    .withColumn("time_stamp", _raw_sample_df1["time_stamp"].cast(LongType()))
    .withColumnRenamed("time_stamp", "timestamp")
    .withColumn("adgroup_id", _raw_sample_df1["adgroup_id"].cast(IntegerType()))
    .withColumnRenamed("adgroup_id", "adgroupId")
    .withColumn("pid", _raw_sample_df1["pid"].cast(StringType()))
    .withColumn("nonclk", _raw_sample_df1["nonclk"].cast(IntegerType()))
    .withColumn("clk", _raw_sample_df1["clk"].cast(IntegerType()))
)
_raw_sample_df2.printSchema()
_raw_sample_df2.show()

# 样本数据pid特征处理
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Encode the pid column in two stages:
#   1. StringIndexer maps each pid string to a numeric index (pid_feature);
#   2. OneHotEncoder turns that index into a vector (pid_value).
# dropLast=False keeps one vector slot per category.
stringindexer = StringIndexer(
    inputCol="pid",
    outputCol="pid_feature",
)
encoder = OneHotEncoder(
    inputCol="pid_feature",
    outputCol="pid_value",
    dropLast=False,
)
pipeline = Pipeline(stages=[stringindexer, encoder])

# Fit on the typed sample table and apply the fitted pipeline to the same table.
pipeline_fit = pipeline.fit(_raw_sample_df2)
raw_sample_df = pipeline_fit.transform(_raw_sample_df2)
raw_sample_df.show()

'''pid和特征的对应关系
430548_1007：0
430549_1007：1
'''

root
 |-- userId: integer (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- adgroupId: integer (nullable = true)
 |-- pid: string (nullable = true)
 |-- nonclk: integer (nullable = true)
 |-- clk: integer (nullable = true)

+------+----------+---------+-----------+------+---+
|userId| timestamp|adgroupId|        pid|nonclk|clk|
+------+----------+---------+-----------+------+---+
|581738|1494137644|        1|430548_1007|     1|  0|
|449818|1494638778|        3|430548_1007|     1|  0|
|914836|1494650879|        4|430548_1007|     1|  0|
|914836|1494651029|        5|430548_1007|     1|  0|
|399907|1494302958|        8|430548_1007|     1|  0|
|628137|1494524935|        9|430548_1007|     1|  0|
|298139|1494462593|        9|430539_1007|     1|  0|
|775475|1494561036|        9|430548_1007|     1|  0|
|555266|1494307136|       11|430539_1007|     1|  0|
|117840|1494036743|       11|430548_1007|     1|  0|
|739815|1494115387|       11|430539_1007|     1|  0|
|623911|1494625301|   

'pid和特征的对应关系\n430548_1007：0\n430549_1007：1\n'

In [5]:
'''从HDFS中加载广告基本信息数据'''
# Read the ad-feature CSV from HDFS; the first line is the header.
_ad_feature_df = spark.read.csv(
    "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/dataset/ad_feature.csv",
    header=True,
)

# Types needed for the casts below.
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# Replace the literal string "NULL" with "-1" so the numeric casts below
# produce -1 for those cells (see brandId = -1 in the output).
_ad_feature_df = _ad_feature_df.replace("NULL", "-1")

# Reshape the table: cast every column to its target type and rename the ids
# to camelCase (adgroupId, cateId, campaignId, customerId, brandId).
ad_feature_df = (
    _ad_feature_df
    .withColumn("adgroup_id", _ad_feature_df["adgroup_id"].cast(IntegerType()))
    .withColumnRenamed("adgroup_id", "adgroupId")
    .withColumn("cate_id", _ad_feature_df["cate_id"].cast(IntegerType()))
    .withColumnRenamed("cate_id", "cateId")
    .withColumn("campaign_id", _ad_feature_df["campaign_id"].cast(IntegerType()))
    .withColumnRenamed("campaign_id", "campaignId")
    .withColumn("customer", _ad_feature_df["customer"].cast(IntegerType()))
    .withColumnRenamed("customer", "customerId")
    .withColumn("brand", _ad_feature_df["brand"].cast(IntegerType()))
    .withColumnRenamed("brand", "brandId")
    .withColumn("price", _ad_feature_df["price"].cast(FloatType()))
)
ad_feature_df.printSchema()
ad_feature_df.show()

root
 |-- adgroupId: integer (nullable = true)
 |-- cateId: integer (nullable = true)
 |-- campaignId: integer (nullable = true)
 |-- customerId: integer (nullable = true)
 |-- brandId: integer (nullable = true)
 |-- price: float (nullable = true)

+---------+------+----------+----------+-------+-----+
|adgroupId|cateId|campaignId|customerId|brandId|price|
+---------+------+----------+----------+-------+-----+
|    63133|  6406|     83237|         1|  95471|170.0|
|   313401|  6406|     83237|         1|  87331|199.0|
|   248909|   392|     83237|         1|  32233| 38.0|
|   208458|   392|     83237|         1| 174374|139.0|
|   110847|  7211|    135256|         2| 145952|32.99|
|   607788|  6261|    387991|         6| 207800|199.0|
|   375706|  4520|    387991|         6|     -1| 99.0|
|    11115|  7213|    139747|         9| 186847| 33.0|
|    24484|  7207|    139744|         9| 186847| 19.0|
|    28589|  5953|    395195|        13|     -1|428.0|
|    23236|  5953|    395195|       

In [6]:
'''从HDFS加载用户基本信息数据'''
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, FloatType

# Build the table schema: all nine user-profile columns are integers.
schema = StructType([
    StructField(field_name, IntegerType())
    for field_name in (
        "userId",
        "cms_segid",
        "cms_group_id",
        "final_gender_code",
        "age_level",
        "pvalue_level",
        "shopping_level",
        "occupation",
        "new_user_class_level",
    )
])

# Load the CSV from HDFS using the explicit schema; the first line is the header.
_user_profile_df1 = spark.read.csv(
    "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/dataset/user_profile.csv",
    header=True,
    schema=schema,
)
# _user_profile_df1.printSchema()
# _user_profile_df1.show()

'''对缺失数据进行特征热编码'''
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# One-hot encode pvalue_level and new_user_class_level, turning each
# one-dimensional column into a multi-dimensional vector. Because the encoders
# use dropLast=False, the vector has one slot per distinct value (n slots,
# not n-1): the output shows size 4 for pvalue_level and 5 for
# new_user_class_level.
from pyspark.sql.types import StringType

# Missing values are replaced with -1 first so they can be processed
# (original note: otherwise an exception is raised).
_user_profile_df2 = _user_profile_df1.fillna(-1)
# _user_profile_df2.show()

# The two columns to encode are cast to string before indexing
# (original note: the encoding step requires string-typed input).
_user_profile_df3 = (
    _user_profile_df2
    .withColumn("pvalue_level", _user_profile_df2["pvalue_level"].cast(StringType()))
    .withColumn(
        "new_user_class_level",
        _user_profile_df2["new_user_class_level"].cast(StringType()),
    )
)
# _user_profile_df3.printSchema()

# pvalue_level: StringIndexer writes a numeric index into pl_onehot_feature,
# then OneHotEncoder writes the encoded (sparse) vector into pl_onehot_value.
stringindexer = StringIndexer(
    inputCol="pvalue_level",
    outputCol="pl_onehot_feature",
)
encoder = OneHotEncoder(
    inputCol="pl_onehot_feature",
    outputCol="pl_onehot_value",
    dropLast=False,
)
pipeline = Pipeline(stages=[stringindexer, encoder])
pipeline_fit = pipeline.fit(_user_profile_df3)
_user_profile_df4 = pipeline_fit.transform(_user_profile_df3)
# _user_profile_df4.printSchema()
# _user_profile_df4.show()

# new_user_class_level: same two stages, writing nucl_onehot_feature and
# nucl_onehot_value.
stringindexer = StringIndexer(
    inputCol="new_user_class_level",
    outputCol="nucl_onehot_feature",
)
encoder = OneHotEncoder(
    inputCol="nucl_onehot_feature",
    outputCol="nucl_onehot_value",
    dropLast=False,
)
pipeline = Pipeline(stages=[stringindexer, encoder])
pipeline_fit = pipeline.fit(_user_profile_df4)
user_profile_df = pipeline_fit.transform(_user_profile_df4)
user_profile_df.show()

+------+---------+------------+-----------------+---------+------------+--------------+----------+--------------------+-----------------+---------------+-------------------+-----------------+
|userId|cms_segid|cms_group_id|final_gender_code|age_level|pvalue_level|shopping_level|occupation|new_user_class_level|pl_onehot_feature|pl_onehot_value|nucl_onehot_feature|nucl_onehot_value|
+------+---------+------------+-----------------+---------+------------+--------------+----------+--------------------+-----------------+---------------+-------------------+-----------------+
|   234|        0|           5|                2|        5|          -1|             3|         0|                   3|              0.0|  (4,[0],[1.0])|                2.0|    (5,[2],[1.0])|
|   523|        5|           2|                2|        2|           1|             3|         1|                   2|              2.0|  (4,[2],[1.0])|                1.0|    (5,[1],[1.0])|
|   612|        0|           8|         

In [7]:
'''热编码中：
"pvalue_level"特征对应关系:
+------------+----------------------+
|pvalue_level|pl_onehot_feature     |
+------------+----------------------+
|          -1|                   0.0|
|           3|                   3.0|
|           1|                   2.0|
|           2|                   1.0|
+------------+----------------------+

“new_user_class_level”的特征对应关系：
+--------------------+------------------------+
|new_user_class_level|nucl_onehot_feature     |
+--------------------+------------------------+
|                  -1|                     0.0|
|                   3|                     2.0|
|                   1|                     4.0|
|                   4|                     3.0|
|                   2|                     1.0|
+--------------------+------------------------+
'''
# Print, for each encoded column, which index the StringIndexer assigned to
# each original value (min() just picks the single index within each group).
for level_col, index_col in (
    ("pvalue_level", "pl_onehot_feature"),
    ("new_user_class_level", "nucl_onehot_feature"),
):
    user_profile_df.groupBy(level_col).min(index_col).show()

+------------+----------------------+
|pvalue_level|min(pl_onehot_feature)|
+------------+----------------------+
|          -1|                   0.0|
|           3|                   3.0|
|           1|                   2.0|
|           2|                   1.0|
+------------+----------------------+

+--------------------+------------------------+
|new_user_class_level|min(nucl_onehot_feature)|
+--------------------+------------------------+
|                  -1|                     0.0|
|                   3|                     2.0|
|                   1|                     4.0|
|                   4|                     3.0|
|                   2|                     1.0|
+--------------------+------------------------+



#### Dataframe数据合并：[pyspark.sql.DataFrame.join](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=join#pyspark.sql.DataFrame.join)

#### [不同合并方式介绍](https://stackoverflow.com/questions/38549/what-is-the-difference-between-inner-join-and-outer-join)

In [8]:
# Join the click samples with the ad features on adgroupId.
# "outer" keeps unmatched rows from both sides; the nulls this introduces are
# removed later with dropna() before training.
# The intermediate result has a descriptive name instead of `_`: in
# IPython/Jupyter `_` holds the previous cell output, and assigning to it
# shadows that variable.
condition = [raw_sample_df.adgroupId == ad_feature_df.adgroupId]
sample_ad_df = raw_sample_df.join(ad_feature_df, condition, "outer")

# Join the result with the user profiles on userId, again as an outer join.
# Both join keys are kept from each side, so `datasets` contains two adgroupId
# and two userId columns (visible in the printed schema).
condition2 = [sample_ad_df.userId == user_profile_df.userId]
datasets = sample_ad_df.join(user_profile_df, condition2, "outer")

In [9]:
# Print the schema of the joined dataset.
datasets.printSchema()

# Count the rows of the joined dataset and print the total.
total_rows = datasets.count()
print(total_rows)

root
 |-- userId: integer (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- adgroupId: integer (nullable = true)
 |-- pid: string (nullable = true)
 |-- nonclk: integer (nullable = true)
 |-- clk: integer (nullable = true)
 |-- pid_feature: double (nullable = true)
 |-- pid_value: vector (nullable = true)
 |-- adgroupId: integer (nullable = true)
 |-- cateId: integer (nullable = true)
 |-- campaignId: integer (nullable = true)
 |-- customerId: integer (nullable = true)
 |-- brandId: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- userId: integer (nullable = true)
 |-- cms_segid: integer (nullable = true)
 |-- cms_group_id: integer (nullable = true)
 |-- final_gender_code: integer (nullable = true)
 |-- age_level: integer (nullable = true)
 |-- pvalue_level: string (nullable = true)
 |-- shopping_level: integer (nullable = true)
 |-- occupation: integer (nullable = true)
 |-- new_user_class_level: string (nullable = true)
 |-- pl_onehot_feature: double (nu

## 1. 训练CTRModel_Normal：直接将对应的特征的特征值组合成对应的特征向量进行训练

In [10]:
# Columns kept for the first model; redundant and unneeded fields are left out.
# Order matters: index 0 is the time column and index 1 the label, so the
# feature columns start at index 2 (a later cell slices useful_cols[2:]).
useful_cols = [
    "timestamp",  # time field, used to split training and test sets
    "clk",  # label (target) field
] + [
    "pid_value",  # one-hot vector of the ad slot (pid)
    "price",  # ad price
    "cms_segid",  # user micro-group id
    "cms_group_id",  # user group id
    "final_gender_code",  # user gender, values [1, 2]
    "age_level",  # age level
    "shopping_level",
    "occupation",
    "pl_onehot_value",  # one-hot vector of pvalue_level
    "nucl_onehot_value",  # one-hot vector of new_user_class_level
]
# Select only the listed columns from the joined data to build a new dataset.
datasets_1 = datasets.select(*useful_cols)

In [11]:
# The data was merged with outer joins, which left some rows with null values;
# those rows must be removed before going further.
datasets_1 = datasets_1.na.drop()

# Report how many rows remain after dropping the null rows.
remaining_rows = datasets_1.count()
print("剔除空值数据后，还剩：", remaining_rows)

剔除空值数据后，还剩： 25029435


#### 根据特征字段计算出特征向量，并划分出训练数据集和测试数据集

In [12]:
from pyspark.ml.feature import VectorAssembler

# Combine every feature column (everything after timestamp and clk) into a
# single vector column named "features".
assembler_1 = VectorAssembler(inputCols=useful_cols[2:], outputCol="features")
datasets_1 = assembler_1.transform(datasets_1)

# Split point: 24 hours before 1494691186.
# NOTE(review): 1494691186 is presumably the latest timestamp in the data - confirm.
split_timestamp_1 = 1494691186 - 24 * 60 * 60

# Training set: rows at or before the split point (original note: about 7 days of data).
train_datasets_1 = datasets_1.where(datasets_1["timestamp"] <= split_timestamp_1)
# Test set: rows after the split point (original note: about 1 day of data).
test_datasets_1 = datasets_1.where(datasets_1["timestamp"] > split_timestamp_1)

In [13]:
# All feature values are now gathered in the "features" column;
# show the first 5 rows of the training split, then of the test split.
for split_df in (train_datasets_1, test_datasets_1):
    split_df.show(5)

+----------+---+-------------+------+---------+------------+-----------------+---------+--------------+----------+---------------+-----------------+--------------------+
| timestamp|clk|    pid_value| price|cms_segid|cms_group_id|final_gender_code|age_level|shopping_level|occupation|pl_onehot_value|nucl_onehot_value|            features|
+----------+---+-------------+------+---------+------------+-----------------+---------+--------------+----------+---------------+-----------------+--------------------+
|1494261938|  0|(2,[1],[1.0])| 108.0|        0|          11|                1|        5|             3|         0|  (4,[0],[1.0])|    (5,[1],[1.0])|(18,[1,2,4,5,6,7,...|
|1494261938|  0|(2,[1],[1.0])|1880.0|        0|          11|                1|        5|             3|         0|  (4,[0],[1.0])|    (5,[1],[1.0])|(18,[1,2,4,5,6,7,...|
|1494416583|  0|(2,[1],[1.0])|2760.0|       19|           3|                2|        3|             3|         0|  (4,[1],[1.0])|    (5,[1],[1.0])|(1

#### 创建逻辑回归训练器，并训练模型：[LogisticRegression](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html?highlight=logisticregression#pyspark.ml.classification.LogisticRegression)、 [LogisticRegressionModel](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html?highlight=logisticregression#pyspark.ml.classification.LogisticRegressionModel)

In [14]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator with "clk" as the label column and the
# assembled "features" vector as input.
lr = LogisticRegression(labelCol="clk", featuresCol="features")

# Fit on the training split.
model = lr.fit(train_datasets_1)

In [15]:
# Persist the trained model to HDFS.
ctr_model_normal_path = "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/trained_result/models/CTRModel_Normal.obj"
model.save(ctr_model_normal_path)

In [20]:
# Load the previously trained model back from HDFS.
from pyspark.ml.classification import LogisticRegressionModel

ctr_model_normal_path = "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/trained_result/models/CTRModel_Normal.obj"
model = LogisticRegressionModel.load(ctr_model_normal_path)

In [21]:
# Run the model on the test split to produce predictions.
result_1 = model.transform(test_datasets_1)

In [22]:
# Sort the rows in ascending order of the "probability" column and show the
# first 100.
# Original author's reading of that column: when the prediction is 0 with
# probability 0.9248, the probability of a click is 1 - 0.9248 = 0.0752,
# i.e. roughly 7.52%. Because ad click-through rates are generally low, the
# prediction is usually 0, so the click probability is obtained by
# subtracting from 1.
ranked_1 = result_1.select("clk", "price", "probability", "prediction")
ranked_1 = ranked_1.orderBy("probability")
ranked_1.show(100)

+---+-----------+--------------------+----------+
|clk|      price|         probability|prediction|
+---+-----------+--------------------+----------+
|  0|      1.0E8|[0.86822033939259...|       0.0|
|  0|      1.0E8|[0.88410457194969...|       0.0|
|  0|      1.0E8|[0.89175497837562...|       0.0|
|  1|5.5555556E7|[0.92481456486873...|       0.0|
|  0|      1.5E7|[0.93741450446939...|       0.0|
|  0|      1.5E7|[0.93757135079959...|       0.0|
|  0|      1.5E7|[0.93834723093801...|       0.0|
|  0|     1099.0|[0.93972095713786...|       0.0|
|  0|      338.0|[0.93972134993018...|       0.0|
|  0|      311.0|[0.93972136386626...|       0.0|
|  0|      300.0|[0.93972136954393...|       0.0|
|  0|      278.0|[0.93972138089925...|       0.0|
|  0|      188.0|[0.93972142735283...|       0.0|
|  0|      176.0|[0.93972143354663...|       0.0|
|  0|      168.0|[0.93972143767584...|       0.0|
|  0|      158.0|[0.93972144283734...|       0.0|
|  1|      138.0|[0.93972145316035...|       0.0|


In [23]:
# Look only at the samples that were actually clicked (clk == 1) and show how
# the model scored them, again sorted ascending by "probability".
clicked_1 = result_1.where(result_1["clk"] == 1)
clicked_1 = clicked_1.select("clk", "price", "probability", "prediction")
clicked_1.orderBy("probability").show(100)

+---+-----------+--------------------+----------+
|clk|      price|         probability|prediction|
+---+-----------+--------------------+----------+
|  1|5.5555556E7|[0.92481456486873...|       0.0|
|  1|      138.0|[0.93972145316035...|       0.0|
|  1|       35.0|[0.93972150632383...|       0.0|
|  1|      149.0|[0.93999389726180...|       0.0|
|  1|     5608.0|[0.94001892245145...|       0.0|
|  1|      275.0|[0.94002166230631...|       0.0|
|  1|       35.0|[0.94002178560473...|       0.0|
|  1|       49.0|[0.94004219516957...|       0.0|
|  1|      915.0|[0.94021082858784...|       0.0|
|  1|      598.0|[0.94021099096349...|       0.0|
|  1|      568.0|[0.94021100633025...|       0.0|
|  1|      398.0|[0.94021109340848...|       0.0|
|  1|      368.0|[0.94021110877521...|       0.0|
|  1|      299.0|[0.94021114411869...|       0.0|
|  1|      278.0|[0.94021115487539...|       0.0|
|  1|      259.0|[0.94021116460765...|       0.0|
|  1|      258.0|[0.94021116511987...|       0.0|


In [16]:
!hadoop fs -ls /project1-ad-rs/models

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/root/bigdata/hadoop-2.9.1/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/root/bigdata/apache-hive-2.3.4-bin/lib/log4j-slf4j-impl-2.6.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 2 items
drwxr-xr-x   - root supergroup          0 2018-12-05 01:34 /project1-ad-rs/models/CTRModel_Normal.obj
drwxr-xr-x   - root supergroup          0 2018-12-04 18:13 /project1-ad-rs/models/userCateRatingALSModel.obj


## 2. 训练CTRModel_AllOneHot
- "pid_value",   类别型特征，已被转换为多维特征==> 2维
- "price",    统计型特征 ===> 1维
- "cms_segid",   类别型特征，约97个分类 ===> 1维
- "cms_group_id",   类别型特征，约13个分类 ==> 1维
- "final_gender_code", 类别型特征，2个分类 ==> 1维
- "age_level",    类别型特征，7个分类 ==> 1维
- "shopping_level",    类别型特征，3个分类 ==> 1维
- "occupation",    类别型特征，2个分类 ==> 1维
- "pl_onehot_value",   类别型特征，已被转换为多维特征 ==> 4维
- "nucl_onehot_value"   类别型特征，已被转换为多维特征 ==> 5维

类别型特征都可以考虑进行独热编码（one-hot encoding），将单一变量变为多变量，相当于增加了相关特征的数量

- "cms_segid",   类别型特征，约97个分类 ===> 97维   舍弃
- "cms_group_id",   类别型特征，约13个分类 ==> 13维
- "final_gender_code", 类别型特征，2个分类 ==> 2维
- "age_level",    类别型特征，7个分类 ==>7维
- "shopping_level",    类别型特征，3个分类 ==> 3维
- "occupation",    类别型特征，2个分类 ==> 2维

但由于cms_segid分类过多，这里考虑舍弃，避免数据过于稀疏

In [24]:
# Display the first row of datasets_1, including its assembled "features" vector.
datasets_1.first()

Row(timestamp=1494261938, clk=0, pid_value=SparseVector(2, {1: 1.0}), price=1880.0, cms_segid=0, cms_group_id=11, final_gender_code=1, age_level=5, shopping_level=3, occupation=0, pl_onehot_value=SparseVector(4, {0: 1.0}), nucl_onehot_value=SparseVector(5, {1: 1.0}), features=SparseVector(18, {1: 1.0, 2: 1880.0, 4: 11.0, 5: 1.0, 6: 5.0, 7: 3.0, 9: 1.0, 14: 1.0}))

In [17]:
# Cast the five categorical columns below to string so they can be one-hot
# encoded (category counts are the original author's notes):
# - "cms_group_id":      about 13 categories ==> 13
# - "final_gender_code": 2 categories        ==> 2
# - "age_level":         7 categories        ==> 7
# - "shopping_level":    3 categories        ==> 3
# - "occupation":        2 categories        ==> 2
# Each cast reads the column from the original `datasets` frame.
datasets_2 = datasets
for categorical_col in (
    "cms_group_id",
    "final_gender_code",
    "age_level",
    "shopping_level",
    "occupation",
):
    datasets_2 = datasets_2.withColumn(
        categorical_col,
        datasets[categorical_col].cast(StringType()),
    )

In [18]:
# Columns kept for the second (all-one-hot) model.
useful_cols_2 = [
    "timestamp",  # time value, used to split training and test sets
    "clk",  # label (target) value
] + [
    # Feature values.
    "price",
    "cms_group_id",
    "final_gender_code",
    "age_level",
    "shopping_level",
    "occupation",
    "pid_value",
    "pl_onehot_value",
    "nucl_onehot_value",
]
# Keep only the listed columns, then drop every row that contains a null:
# the data was merged with outer joins, which left some null values that must
# be removed first.
datasets_2 = datasets_2.select(*useful_cols_2).dropna()


from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
# 热编码处理函数封装
def oneHotEncoder(col1, col2, col3, data):
    """Index a column and one-hot encode the resulting index.

    A two-stage pipeline is fitted on ``data`` and applied to the same
    ``data``: a StringIndexer followed by a OneHotEncoder created with
    ``dropLast=False``.

    Args:
        col1: Name of the input column to index.
        col2: Name of the column that receives the StringIndexer output.
        col3: Name of the column that receives the one-hot vector.
        data: DataFrame to fit the pipeline on and to transform.

    Returns:
        The result of transforming ``data`` with the fitted pipeline.
    """
    stages = [
        StringIndexer(inputCol=col1, outputCol=col2),
        OneHotEncoder(dropLast=False, inputCol=col2, outputCol=col3),
    ]
    fitted = Pipeline(stages=stages).fit(data)
    return fitted.transform(data)

# One-hot encode the five categorical columns, one after another. For a column
# named X the index goes to "X_feature" and the one-hot vector to "X_value".
for categorical_col in (
    "cms_group_id",
    "final_gender_code",
    "age_level",
    "shopping_level",
    "occupation",
):
    datasets_2 = oneHotEncoder(
        categorical_col,
        categorical_col + "_feature",
        categorical_col + "_value",
        datasets_2,
    )

In [19]:
'''
"cms_group_id"特征对应关系：
+------------+-------------------------+
|cms_group_id|min(cms_group_id_feature)|
+------------+-------------------------+
|           7|                      9.0|
|          11|                      6.0|
|           3|                      0.0|
|           8|                      8.0|
|           0|                     12.0|
|           5|                      3.0|
|           6|                     10.0|
|           9|                      5.0|
|           1|                      7.0|
|          10|                      4.0|
|           4|                      1.0|
|          12|                     11.0|
|           2|                      2.0|
+------------+-------------------------+

"final_gender_code"特征对应关系：
+-----------------+------------------------------+
|final_gender_code|min(final_gender_code_feature)|
+-----------------+------------------------------+
|                1|                           1.0|
|                2|                           0.0|
+-----------------+------------------------------+

"age_level"特征对应关系：
+---------+----------------------+
|age_level|min(age_level_feature)|
+---------+----------------------+
|        3|                   0.0|
|        0|                   6.0|
|        5|                   2.0|
|        6|                   5.0|
|        1|                   4.0|
|        4|                   1.0|
|        2|                   3.0|
+---------+----------------------+

"shopping_level"特征对应关系：
|shopping_level|min(shopping_level_feature)|
+--------------+---------------------------+
|             3|                        0.0|
|             1|                        2.0|
|             2|                        1.0|
+--------------+---------------------------+

"occupation"特征对应关系：
+----------+-----------------------+
|occupation|min(occupation_feature)|
+----------+-----------------------+
|         0|                    0.0|
|         1|                    1.0|
+----------+-----------------------+
'''
# Print, for each encoded column, the index assigned to every original value.
for categorical_col in (
    "cms_group_id",
    "final_gender_code",
    "age_level",
    "shopping_level",
    "occupation",
):
    datasets_2.groupBy(categorical_col).min(categorical_col + "_feature").show()

+------------+-------------------------+
|cms_group_id|min(cms_group_id_feature)|
+------------+-------------------------+
|           7|                      9.0|
|          11|                      6.0|
|           3|                      0.0|
|           8|                      8.0|
|           0|                     12.0|
|           5|                      3.0|
|           6|                     10.0|
|           9|                      5.0|
|           1|                      7.0|
|          10|                      4.0|
|           4|                      1.0|
|          12|                     11.0|
|           2|                      2.0|
+------------+-------------------------+

+-----------------+------------------------------+
|final_gender_code|min(final_gender_code_feature)|
+-----------------+------------------------------+
|                1|                           1.0|
|                2|                           0.0|
+-----------------+----------------------------

In [20]:
# After one-hot encoding, the feature columns are no longer the original ones,
# so the list of feature columns is defined again: price, the "_value" vector
# of each encoded categorical column, then the vectors encoded earlier.
feature_cols = (
    ["price"]
    + [
        categorical_col + "_value"
        for categorical_col in (
            "cms_group_id",
            "final_gender_code",
            "age_level",
            "shopping_level",
            "occupation",
        )
    ]
    + ["pid_value", "pl_onehot_value", "nucl_onehot_value"]
)

In [21]:
# Build the "features" vector from the feature columns, then split the data
# into a training set and a test set.
from pyspark.ml.feature import VectorAssembler

assembler_2 = VectorAssembler(inputCols=feature_cols, outputCol="features")
datasets_2 = assembler_2.transform(datasets_2)

# Same split point as for the first model: 24 hours before 1494691186.
split_timestamp_2 = 1494691186 - 24 * 60 * 60
train_datasets_2 = datasets_2.where(datasets_2["timestamp"] <= split_timestamp_2)
test_datasets_2 = datasets_2.where(datasets_2["timestamp"] > split_timestamp_2)

In [22]:
# Print the schema of the training split, then display its first row.
train_datasets_2.printSchema()
train_datasets_2.first()

root
 |-- timestamp: long (nullable = true)
 |-- clk: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- cms_group_id: string (nullable = true)
 |-- final_gender_code: string (nullable = true)
 |-- age_level: string (nullable = true)
 |-- shopping_level: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- pid_value: vector (nullable = true)
 |-- pl_onehot_value: vector (nullable = true)
 |-- nucl_onehot_value: vector (nullable = true)
 |-- cms_group_id_feature: double (nullable = true)
 |-- cms_group_id_value: vector (nullable = true)
 |-- final_gender_code_feature: double (nullable = true)
 |-- final_gender_code_value: vector (nullable = true)
 |-- age_level_feature: double (nullable = true)
 |-- age_level_value: vector (nullable = true)
 |-- shopping_level_feature: double (nullable = true)
 |-- shopping_level_value: vector (nullable = true)
 |-- occupation_feature: double (nullable = true)
 |-- occupation_value: vector (nullable = true)
 |-- featur

Row(timestamp=1494261938, clk=0, price=1880.0, cms_group_id='11', final_gender_code='1', age_level='5', shopping_level='3', occupation='0', pid_value=SparseVector(2, {1: 1.0}), pl_onehot_value=SparseVector(4, {0: 1.0}), nucl_onehot_value=SparseVector(5, {1: 1.0}), cms_group_id_feature=6.0, cms_group_id_value=SparseVector(13, {6: 1.0}), final_gender_code_feature=1.0, final_gender_code_value=SparseVector(2, {1: 1.0}), age_level_feature=2.0, age_level_value=SparseVector(7, {2: 1.0}), shopping_level_feature=0.0, shopping_level_value=SparseVector(3, {0: 1.0}), occupation_feature=0.0, occupation_value=SparseVector(2, {0: 1.0}), features=SparseVector(39, {0: 1880.0, 7: 1.0, 15: 1.0, 18: 1.0, 23: 1.0, 26: 1.0, 29: 1.0, 30: 1.0, 35: 1.0}))

In [24]:
# Create the logistic-regression estimator and train the second model.
from pyspark.ml.classification import LogisticRegression

# "clk" is the label column; "features" is the assembled one-hot feature vector.
lr2 = LogisticRegression(labelCol="clk", featuresCol="features")
model2 = lr2.fit(train_datasets_2)

In [25]:
# Persist the second model to HDFS.
ctr_model_all_onehot_path = "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/trained_result/models/CTRModel_AllOneHot.obj"
model2.save(ctr_model_all_onehot_path)

In [26]:
!hadoop fs -ls /project1-ad-rs/models

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/root/bigdata/hadoop-2.9.1/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/root/bigdata/apache-hive-2.3.4-bin/lib/log4j-slf4j-impl-2.6.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Found 3 items
drwxr-xr-x   - root supergroup          0 2018-12-05 02:03 /project1-ad-rs/models/CTRModel_AllOneHot.obj
drwxr-xr-x   - root supergroup          0 2018-12-05 01:34 /project1-ad-rs/models/CTRModel_Normal.obj
drwxr-xr-x   - root supergroup          0 2018-12-04 18:13 /project1-ad-rs/models/userCateRatingALSModel.obj


In [25]:
from pyspark.ml.classification import LogisticRegressionModel

# Load the previously trained all-one-hot model back from HDFS.
ctr_model_all_onehot_path = "hdfs://hadoop-master:9000/workspace/3.rs_project/project1/trained_result/models/CTRModel_AllOneHot.obj"
model2 = LogisticRegressionModel.load(ctr_model_all_onehot_path)

In [32]:
# Run the second model on its test split to produce predictions.
result_2 = model2.transform(test_datasets_2)

In [33]:
# Sort the rows in ascending order of the "probability" column and show the
# first 100.
ranked_2 = result_2.select("clk", "price", "probability", "prediction")
ranked_2 = ranked_2.orderBy("probability")
ranked_2.show(100)

# Original author's observation: compared with result_1, the predictions here
# are slightly more accurate - 3 clicked rows appear in the top 20 here versus
# only 1 before - so finer-grained feature processing has helped the model.
# NOTE(review): this compares a handful of top rows by eye; the exact counts
# should be confirmed against the full output.

+---+-----------+--------------------+----------+
|clk|      price|         probability|prediction|
+---+-----------+--------------------+----------+
|  0|      1.0E8|[0.85524418892857...|       0.0|
|  0|      1.0E8|[0.88353143762124...|       0.0|
|  0|      1.0E8|[0.89169808985616...|       0.0|
|  1|5.5555556E7|[0.92511743960350...|       0.0|
|  0|     179.01|[0.93239951738307...|       0.0|
|  1|      159.0|[0.93239952905659...|       0.0|
|  0|      118.0|[0.93239955297535...|       0.0|
|  0|      688.0|[0.93451506165953...|       0.0|
|  0|      339.0|[0.93451525933626...|       0.0|
|  0|      335.0|[0.93451526160190...|       0.0|
|  0|      220.0|[0.93451532673881...|       0.0|
|  0|      176.0|[0.93451535166074...|       0.0|
|  0|      158.0|[0.93451536185607...|       0.0|
|  0|      158.0|[0.93451536185607...|       0.0|
|  1|      149.0|[0.93451536695374...|       0.0|
|  0|      122.5|[0.93451538196353...|       0.0|
|  0|       99.0|[0.93451539527410...|       0.0|


In [34]:
# Look only at the samples that were actually clicked (clk == 1), sorted
# ascending by "probability".
clicked_2 = result_2.where(result_2["clk"] == 1)
clicked_2 = clicked_2.select("clk", "price", "probability", "prediction")
clicked_2.orderBy("probability").show(100)
# Original author's observation: these results also suggest that the click
# probabilities predicted in result_2 are generally a little higher than in
# result_1.

+---+-----------+--------------------+----------+
|clk|      price|         probability|prediction|
+---+-----------+--------------------+----------+
|  1|5.5555556E7|[0.92511743960350...|       0.0|
|  1|      159.0|[0.93239952905659...|       0.0|
|  1|      149.0|[0.93451536695374...|       0.0|
|  1|     8888.0|[0.93494392746484...|       0.0|
|  1|      138.0|[0.93494414770804...|       0.0|
|  1|       35.0|[0.93494420569256...|       0.0|
|  1|      519.0|[0.93494863870621...|       0.0|
|  1|      478.0|[0.93494866178596...|       0.0|
|  1|      349.0|[0.93494873440265...|       0.0|
|  1|      348.0|[0.93494873496557...|       0.0|
|  1|      316.0|[0.93494875297901...|       0.0|
|  1|      298.0|[0.93494876311156...|       0.0|
|  1|      298.0|[0.93494876311156...|       0.0|
|  1|      199.0|[0.93494881884058...|       0.0|
|  1|      199.0|[0.93494881884058...|       0.0|
|  1|      198.0|[0.93494881940350...|       0.0|
|  1|      187.1|[0.93494882553931...|       0.0|
