## Spark逻辑回归(LR)模型使用介绍

In [2]:
# spark配置信息
from pyspark import SparkConf
from pyspark.sql import SparkSession

SPARK_APP_NAME = "preprocessingUserProfile"
SPARK_URL = "yarn"

# Spark settings for this notebook, as (key, value) pairs.
# Every value is written as a string (SparkConf stores values as strings) and
# the entries use space-only indentation, so the tuple reads consistently.
# Full list of options: https://spark.apache.org/docs/latest/configuration.html
config = (
    ("spark.app.name", SPARK_APP_NAME),   # app name; Spark generates one if omitted
    ("spark.executor.memory", "2g"),      # memory per executor (Spark default: 1g)
    ("spark.master", SPARK_URL),          # address of the Spark master
    ("spark.executor.cores", "2"),        # CPU cores used by each executor
    ("spark.executor.instances", "1"),    # number of executors; takes effect on YARN
)

# Create the SparkConf object and load all settings into it.
conf = SparkConf()
conf.setAll(config)

# Create the Spark session from the config object (reuses an existing session
# if one is already running in this kernel).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
from pyspark.ml.feature import VectorAssembler
import pandas as pd

# Sample dataset: one tuple per respondent, fields in the order given by
# `columns` below. The values appear to come from the classic "Affairs"
# survey data — TODO confirm the source.
sample_dataset = [
    (0, "male", 37, 10, "no", 3, 18, 7, 4),
    (0, "female", 27, 4, "no", 4, 14, 6, 4),
    (0, "female", 32, 15, "yes", 1, 12, 1, 4),
    (0, "male", 57, 15, "yes", 5, 18, 6, 5),
    (0, "male", 22, 0.75, "no", 2, 17, 6, 3),
    (0, "female", 32, 1.5, "no", 2, 17, 5, 5),
    (0, "female", 22, 0.75, "no", 2, 12, 1, 3),
    (0, "male", 57, 15, "yes", 2, 14, 4, 4),
    (0, "female", 32, 15, "yes", 4, 16, 1, 2),
    (0, "male", 22, 1.5, "no", 4, 14, 4, 5),
    (0, "male", 37, 15, "yes", 2, 20, 7, 2),
    (0, "male", 27, 4, "yes", 4, 18, 6, 4),
    (0, "male", 47, 15, "yes", 5, 17, 6, 4),
    (0, "female", 22, 1.5, "no", 2, 17, 5, 4),
    (0, "female", 27, 4, "no", 4, 14, 5, 4),
    (0, "female", 37, 15, "yes", 1, 17, 5, 5),
    (0, "female", 37, 15, "yes", 2, 18, 4, 3),
    (0, "female", 22, 0.75, "no", 3, 16, 5, 4),
    (0, "female", 22, 1.5, "no", 2, 16, 5, 5),
    (0, "female", 27, 10, "yes", 2, 14, 1, 5),
    (1, "female", 32, 15, "yes", 3, 14, 3, 2),
    (1, "female", 27, 7, "yes", 4, 16, 1, 2),
    (1, "male", 42, 15, "yes", 3, 18, 6, 2),
    (1, "female", 42, 15, "yes", 2, 14, 3, 2),
    (1, "male", 27, 7, "yes", 2, 17, 5, 4),
    (1, "male", 32, 10, "yes", 4, 14, 4, 3),
    (1, "male", 47, 15, "yes", 3, 16, 4, 2),
    (0, "male", 37, 4, "yes", 2, 20, 6, 4)
]

# Column names, in tuple-field order.
# The fourth field holds years married (note the fractional values 0.75 / 1.5).
# It is named "yearsmarried" rather than "label" because "label" is the default
# label column of Spark ML estimators, whereas the target here is "affairs".
columns = ["affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating"]

# Build a pandas DataFrame first; it is a convenient way to attach column names.
pdf = pd.DataFrame(sample_dataset, columns=columns)

# Convert the pandas DataFrame into a Spark DataFrame.
df = spark.createDataFrame(pdf)

# Columns that are packed into the feature vector.
colArray2 = ["age", "religiousness", "education", "occupation", "rating"]

# Feature selection: keep the target ("affairs") followed by the feature columns.
df2 = df.select("affairs", *colArray2)

# Assemble the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=colArray2, outputCol="features")
df3 = assembler.transform(df2)

print("数据集：")
df3.show()

# Randomly split the data into a training set (~80%) and a test set (~20%).
# The seed is fixed so that the same rows land in each split on every run;
# without it the split (and every result computed from it) changes per run.
SPLIT_SEED = 42
trainDF, testDF = df3.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
print("训练集：")
trainDF.show(10)
print("测试集：")
testDF.show(10)

数据集：
+-------+---+-------------+---------+----------+------+--------------------+
|affairs|age|religiousness|education|occupation|rating|            features|
+-------+---+-------------+---------+----------+------+--------------------+
|      0| 37|            3|       18|         7|     4|[37.0,3.0,18.0,7....|
|      0| 27|            4|       14|         6|     4|[27.0,4.0,14.0,6....|
|      0| 32|            1|       12|         1|     4|[32.0,1.0,12.0,1....|
|      0| 57|            5|       18|         6|     5|[57.0,5.0,18.0,6....|
|      0| 22|            2|       17|         6|     3|[22.0,2.0,17.0,6....|
|      0| 32|            2|       17|         5|     5|[32.0,2.0,17.0,5....|
|      0| 22|            2|       12|         1|     3|[22.0,2.0,12.0,1....|
|      0| 57|            2|       14|         4|     4|[57.0,2.0,14.0,4....|
|      0| 32|            4|       16|         1|     2|[32.0,4.0,16.0,1....|
|      0| 22|            4|       14|         4|     5|[22.0,4.0,14.0,4

#### [逻辑回归分类器-pyspark.ml.classification.LogisticRegression](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LogisticRegression.html)
#### [逻辑回归分类模型-pyspark.ml.classification.LogisticRegressionModel](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.classification.LogisticRegressionModel.html)

In [5]:
from pyspark.ml.classification import LogisticRegression
# Create the logistic regression estimator: "affairs" is the label column and
# "features" is the assembled feature-vector column.
lr = LogisticRegression(labelCol="affairs", featuresCol="features")

# Train the model on the training split.
model = lr.fit(trainDF)

# Predict on the test split and display the resulting columns.
predictions = model.transform(testDF)
predictions.show()

+-------+---+-------------+---------+----------+------+--------------------+--------------------+--------------------+----------+
|affairs|age|religiousness|education|occupation|rating|            features|       rawPrediction|         probability|prediction|
+-------+---+-------------+---------+----------+------+--------------------+--------------------+--------------------+----------+
|      0| 27|            4|       14|         6|     4|[27.0,4.0,14.0,6....|[0.39067871041193...|[0.59644607432863...|       0.0|
|      0| 22|            2|       12|         1|     3|[22.0,2.0,12.0,1....|[-2.6754687573263...|[0.06443650129497...|       1.0|
|      0| 32|            4|       16|         1|     2|[32.0,4.0,16.0,1....|[-4.5240336812732...|[0.01072883305878...|       1.0|
|      0| 27|            4|       14|         5|     4|[27.0,4.0,14.0,5....|[0.16206512668426...|[0.54042783360658...|       0.0|
|      0| 22|            3|       16|         5|     4|[22.0,3.0,16.0,5....|[1.69102697292