In [2]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

from pyspark.ml.classification import LogisticRegression

from pyspark.ml.pipeline import Pipeline

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.datasets import make_classification

In [3]:
import pandas as pd 
import numpy as np

from collections import Counter

import pyspark.pandas as ps

import findspark
from pyspark.sql import SparkSession
findspark.init()
spark = SparkSession \
    .builder \
    .appName("classification_evaluation2") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .master("local[*]") \
    .getOrCreate()



#### 二分类评估

In [4]:
X, y = make_classification(n_samples=1000, n_features=4, random_state=42)
data = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4'])
data['label'] = y
# data['label'] = data['label'].map({1: '是', 0:'否'})
data = ps.from_pandas(data).to_spark()
data.show()

+--------------------+--------------------+-------------------+--------------------+-----+
|            feature1|            feature2|           feature3|            feature4|label|
+--------------------+--------------------+-------------------+--------------------+-----+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|
| -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|    1|
|  2.0468611161145427| -1.6805950177168596| 1.9237179710396748| -1.4272439763682618|    1|
|  1.8288841806481393| -0.2892612860302994| -1.049150931886449|  0.2062714398834348|    0|
|  0.5399095786887405| -0.6708391613721514| 1.0269387108558683|  -0.654529086677752|    1|
|-0.21015535126988671| -0.6526563106588037| 1.6865595485775817| -0.8618761517856935|    1|
| -0.8275441309046899| 0.14449974071098143| 0.4436450947685622| -0.0766994748959056|    0|

In [5]:
feature_columns = ["feature1", "feature2", "feature3", "feature4"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_data = assembler.transform(data)
assembled_data.show()

+--------------------+--------------------+-------------------+--------------------+-----+--------------------+
|            feature1|            feature2|           feature3|            feature4|label|            features|
+--------------------+--------------------+-------------------+--------------------+-----+--------------------+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|[-1.6077112718140...|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|[0.29013138652200...|
| -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|    1|[-1.2250960321892...|
|  2.0468611161145427| -1.6805950177168596| 1.9237179710396748| -1.4272439763682618|    1|[2.04686111611454...|
|  1.8288841806481393| -0.2892612860302994| -1.049150931886449|  0.2062714398834348|    0|[1.82888418064813...|
|  0.5399095786887405| -0.6708391613721514| 1.0269387108558683|  -0.654529086677752|    1|[0.53990957868

In [6]:
from pyspark.ml.feature import StringIndexer 
label_indexer = StringIndexer(inputCol="label", outputCol='label'+str('_as_int'))
indexer_model = label_indexer.fit(assembled_data)
indexer_model

StringIndexerModel: uid=StringIndexer_667e5922dc9b, handleInvalid=error

In [7]:
label_assembled_data = indexer_model.transform(assembled_data)
label_assembled_data.show()

+--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+
|            feature1|            feature2|           feature3|            feature4|label|            features|label_as_int|
+--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|[-1.6077112718140...|         0.0|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|[0.29013138652200...|         1.0|
| -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|    1|[-1.2250960321892...|         0.0|
|  2.0468611161145427| -1.6805950177168596| 1.9237179710396748| -1.4272439763682618|    1|[2.04686111611454...|         0.0|
|  1.8288841806481393| -0.2892612860302994| -1.049150931886449|  0.2062714398834348|    0|[1.82888418064813...|         1.0|


In [8]:
lr = LogisticRegression(featuresCol="features", labelCol="label_as_int")
lr_model = lr.fit(label_assembled_data)
result = lr_model.transform(label_assembled_data)

result.show()

+--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+--------------------+--------------------+----------+
|            feature1|            feature2|           feature3|            feature4|label|            features|label_as_int|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+--------------------+--------------------+----------+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|[-1.6077112718140...|         0.0|[3.77758376023245...|[0.97763378881895...|       0.0|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|[0.29013138652200...|         1.0|[-1.8803396062127...|[0.13234987054261...|       1.0|
| -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|    1|[-1.2250960321892...

In [9]:
rr = result.toPandas()

In [10]:
rr

Unnamed: 0,feature1,feature2,feature3,feature4,label,features,label_as_int,rawPrediction,probability,prediction
0,-1.607711,-0.029398,1.569953,-0.527984,1,"[-1.6077112718140278, -0.029397554536784845, 1...",0.0,"[3.7775837602324507, -3.7775837602324507]","[0.9776337888189529, 0.02236621118104709]",0.0
1,0.290131,0.317681,-0.996519,0.477009,0,"[0.29013138652200965, 0.3176812583459609, -0.9...",1.0,"[-1.8803396062127113, 1.8803396062127113]","[0.13234987054261216, 0.8676501294573878]",1.0
2,-1.225096,0.895911,-0.900324,0.719861,1,"[-1.2250960321892006, 0.8959114872456515, -0.9...",0.0,"[-1.3213880339946944, 1.3213880339946944]","[0.21058745348720354, 0.7894125465127965]",1.0
3,2.046861,-1.680595,1.923718,-1.427244,1,"[2.0468611161145427, -1.6805950177168596, 1.92...",0.0,"[3.616779025888871, -3.616779025888871]","[0.9738339751829868, 0.026166024817013245]",0.0
4,1.828884,-0.289261,-1.049151,0.206271,0,"[1.8288841806481393, -0.2892612860302994, -1.0...",1.0,"[-2.3566415255318, 2.3566415255318]","[0.08653931438678378, 0.9134606856132163]",1.0
...,...,...,...,...,...,...,...,...,...,...
995,1.399763,-1.159334,1.338486,-0.988308,1,"[1.3997628088214704, -1.159334266210187, 1.338...",0.0,"[2.5859964614991893, -2.5859964614991893]","[0.929954879901566, 0.070045120098434]",0.0
996,-0.967111,0.994751,-1.367144,0.919603,0,"[-0.9671113926968453, 0.99475064364074, -1.367...",1.0,"[-2.3295671200658754, 2.3295671200658754]","[0.08870364900964259, 0.9112963509903574]",1.0
997,0.710796,-0.612212,0.733345,-0.530584,1,"[0.7107961366555511, -0.612212027827626, 0.733...",0.0,"[1.5249178437704356, -1.5249178437704356]","[0.8212615168612516, 0.1787384831387484]",0.0
998,2.029842,-0.416705,-0.946026,0.112039,0,"[2.029842299340718, -0.4167051885889812, -0.94...",1.0,"[-2.1958868236233644, 2.1958868236233644]","[0.1001204622757348, 0.8998795377242652]",1.0


In [49]:
result.select('label').dtypes[0][1] == 'string'

True

In [9]:
label_index = StringIndexer(inputCol="label", outputCol='label'+str('_int'))
indexer_model = label_index.fit(result)
indexer_model

StringIndexerModel: uid=StringIndexer_0eb6a8e45c89, handleInvalid=error

In [10]:
dict(enumerate(indexer_model.labels))

{0: '1', 1: '0'}

In [11]:
label_mapping = dict(enumerate(indexer_model.labels))
swapped_dict = {value: key for key, value in label_mapping.items()}
swapped_dict

{'1': 0, '0': 1}

In [12]:
result = indexer_model.transform(result)
result.show()

+--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+--------------------+--------------------+----------+---------+
|            feature1|            feature2|           feature3|            feature4|label|            features|label_as_int|       rawPrediction|         probability|prediction|label_int|
+--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+--------------------+--------------------+----------+---------+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|[-1.6077112718140...|         0.0|[3.77758376023245...|[0.97763378881895...|       0.0|      0.0|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|[0.29013138652200...|         1.0|[-1.8803396062127...|[0.13234987054261...|       1.0|      1.0|
| -1.2250960321892006|  0.8959114872456515|-0.90032415801711

In [66]:
classes = result.select('label').distinct()
classes.show()

classes.filter(classes['label'] != '是').collect()[0][0]

+-----+
|label|
+-----+
|   否|
|   是|
+-----+



'否'

In [13]:
# 计算二分类评估指标
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label_as_int")
areaUnderROC = evaluator.evaluate(result, {evaluator.metricName: "areaUnderROC"})
print(areaUnderROC)

areaUnderPR = evaluator.evaluate(result, {evaluator.metricName: "areaUnderPR"})
print(areaUnderPR)

0.9257868125890009
0.9083405059265925


In [116]:
positive_sample_label = '是'
classes = result.select('label').distinct()
# 根据positive_sample_label获取到negative_sample_label
negative_sample_label = classes.filter(classes['label'] != positive_sample_label).collect()[0][0]

In [21]:
positive_sample_label = 0
negative_sample_label = 1
print(positive_sample_label)
print(negative_sample_label)

0
1


In [19]:
# TP：True Positive，分类器预测结果为正样本，实际也为正样本
TP = result.filter(result['prediction'] == positive_sample_label).filter(result["label"] == positive_sample_label).count()
# FN：False Negative，分类器预测结果为负样本，实际为正样本
FN = result.filter(result['prediction'] == negative_sample_label).filter(result["label"] == positive_sample_label).count()
# TN: True Negative，分类器预测结果为负样本，实际为负样本
TN = result.filter(result['prediction'] == negative_sample_label).filter(result["label"] == negative_sample_label).count()
# FP：False Positive，分类器预测结果为正样本，实际为负样本
FP = result.filter(result['prediction'] == positive_sample_label).filter(result["label"] == negative_sample_label).count()

print('TP:', TP)
print('FN:', FN)
print('TN:', TN)
print('FP:', FP)

TP: 74
FN: 428
TN: 65
FP: 433


In [22]:
# TP：True Positive，分类器预测结果为正样本，实际也为正样本
TP = result.filter(result['prediction'] == positive_sample_label).filter(result["label"] == positive_sample_label).count()
# FN：False Negative，分类器预测结果为负样本，实际为正样本
FN = result.filter(result['prediction'] == negative_sample_label).filter(result["label"] == positive_sample_label).count()
# TN: True Negative，分类器预测结果为负样本，实际为负样本
TN = result.filter(result['prediction'] == negative_sample_label).filter(result["label"] == negative_sample_label).count()
# FP：False Positive，分类器预测结果为正样本，实际为负样本
FP = result.filter(result['prediction'] == positive_sample_label).filter(result["label"] == negative_sample_label).count()

print('TP:', TP)
print('FN:', FN)
print('TN:', TN)
print('FP:', FP)

TP: 65
FN: 433
TN: 74
FP: 428


In [121]:
# positive_sample_label, negative_sample_label = swapped_dict[positive_sample_label], swapped_dict[negative_sample_label], 

In [47]:
accuracy = (TP + TN) / data.count()
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = (2 * precision * recall) / (precision + recall)
# FPR：在所有实际为阴性的样本中，被错误地判断为阳性之比率，要越低越好
fpr = FP / (FP+TN)

In [82]:
type(f"{accuracy:.3f}")

str

In [49]:
print(accuracy)
print(precision)
print(recall)
print(f1)
print(fpr)

0.861
0.8681541582150102
0.852589641434263
0.8603015075376884
0.13052208835341367


In [75]:
ps.DataFrame({"名称": 'dfvsd', "输出变量": 'dsv', 'auc': [3]})

Unnamed: 0,名称,输出变量,auc
0,dfvsd,dsv,3


In [77]:
ps.DataFrame({"名称": 'dfvsd', "输出变量": 'dsv', 'auc': 3}, index=[2])

Unnamed: 0,名称,输出变量,auc
2,dfvsd,dsv,3


In [67]:
# +--------+---------+------+-------------------+-------+-----+-------------+
# |accuracy|precision|recall|false_positive_rate|f1score|  auc|area_under_pr|
# +--------+---------+------+-------------------+-------+-----+-------------+
# |   0.861|    0.868| 0.853|              0.131|  0.860|0.926|        0.908|
# +--------+---------+------+-------------------+-------+-----+-------------+

In [None]:
# --------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+--------------------+--------------------+----------+
# |            feature1|            feature2|           feature3|            feature4|label|            features|label_as_int|       rawPrediction|         probability|prediction|
# +--------------------+--------------------+-------------------+--------------------+-----+--------------------+------------+--------------------+--------------------+----------+
# | -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|   是|[-1.6077112718140...|         0.0|[3.77758376023245...|[0.97763378881895...|       0.0|
# | 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|   否|[0.29013138652200...|         1.0|[-1.8803396062127...|[0.13234987054261...|       1.0|
# | -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|   是|[-1.2250960321892...|         0.0|[-1.3213880339946...|[0.21058745348720...|       1.0|
# |  2.0468611161145427| -1.6805950177168596| 1.9237179710396748| -1.4272439763682618|   是|[2.04686111611454...|         0.0|[3.61677902588887...|[0.97383397518298...|       0.0|
# |  1.8288841806481393| -0.2892612860302994| -1.049150931886449|  0.2062714398834348|   否|[1.82888418064813...|         1.0|[-2.3566415255317...|[0.08653931438678...|       1.0|

#### 多分类评估

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [46]:
X, y = make_classification(n_samples=1000, n_features=4, n_informative=3,
                           n_redundant=0, n_repeated=0, n_classes=5,
                           n_clusters_per_class=1,
                           weights=[0.18, 0.12, 0.4, 0.08, 0.22],
                           class_sep=0.8, random_state=56)
print(sorted(Counter(y).items()))

[(0, 181), (1, 120), (2, 399), (3, 81), (4, 219)]


In [48]:
data = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4'])
data = pd.concat([data, pd.Series(y, name='y')], axis=1)
data['y'] = data['y'].map({0: '第0类', 1: '第1类', 2: '第2类', 3: '第3类', 4: '第4类'})
data = ps.from_pandas(data).to_spark()
data.show()

+--------------------+--------------------+--------------------+--------------------+-----+
|            feature1|            feature2|            feature3|            feature4|    y|
+--------------------+--------------------+--------------------+--------------------+-----+
|  1.1935348262532623|  1.1871044314638282| -1.1117344952061874|  0.5274782864181788|第0类|
| -0.3933771831723913|-0.07590771491388915|-0.46010303960315035|  0.6541795156129617|第4类|
|  0.2618875099965555| -1.1952035742566547|  2.2179078517610895| -0.6773199343487093|第2类|
|  0.7484406905582527|   1.012601737753978| -1.0110116231436872|  -0.565262801725875|第0类|
| -0.4664569919562217|  0.4364024336692316|  3.6138968976559553|  2.9376807712040147|第2类|
| -0.5354205965046313|-0.28360254994682066|  -1.023224476081892|-0.24769280796548876|第2类|
|   0.326241638220264| -1.1788197070040687|0.001218018791641...|  0.6827665980343813|第2类|
| -0.7193915936127249|  0.1970475903151857|  0.4494513099235583|  1.5484605014831632|第1类|
|  1

In [49]:
feature_columns = ["feature1", "feature2", "feature3", "feature4"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_data = assembler.transform(data)
assembled_data.show()

+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|            feature1|            feature2|            feature3|            feature4|    y|            features|
+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|  1.1935348262532623|  1.1871044314638282| -1.1117344952061874|  0.5274782864181788|第0类|[1.19353482625326...|
| -0.3933771831723913|-0.07590771491388915|-0.46010303960315035|  0.6541795156129617|第4类|[-0.3933771831723...|
|  0.2618875099965555| -1.1952035742566547|  2.2179078517610895| -0.6773199343487093|第2类|[0.26188750999655...|
|  0.7484406905582527|   1.012601737753978| -1.0110116231436872|  -0.565262801725875|第0类|[0.74844069055825...|
| -0.4664569919562217|  0.4364024336692316|  3.6138968976559553|  2.9376807712040147|第2类|[-0.4664569919562...|
| -0.5354205965046313|-0.28360254994682066|  -1.023224476081892|-0.24769280796548876|第2类|[-0.5354205965046

In [54]:
label_indexer = StringIndexer(inputCol="y", outputCol='y' + str('_as_int'))
indexer_model_ = label_indexer.fit(assembled_data)
label_assembled_data = indexer_model_.transform(assembled_data)
label_assembled_data.show()

+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------+
|            feature1|            feature2|            feature3|            feature4|    y|            features|y_as_int|
+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------+
|  1.1935348262532623|  1.1871044314638282| -1.1117344952061874|  0.5274782864181788|第0类|[1.19353482625326...|     2.0|
| -0.3933771831723913|-0.07590771491388915|-0.46010303960315035|  0.6541795156129617|第4类|[-0.3933771831723...|     1.0|
|  0.2618875099965555| -1.1952035742566547|  2.2179078517610895| -0.6773199343487093|第2类|[0.26188750999655...|     0.0|
|  0.7484406905582527|   1.012601737753978| -1.0110116231436872|  -0.565262801725875|第0类|[0.74844069055825...|     2.0|
| -0.4664569919562217|  0.4364024336692316|  3.6138968976559553|  2.9376807712040147|第2类|[-0.4664569919562...|     0.0|
| -0.5354205965046313|-0.283602549

In [53]:
dict(enumerate(indexer_model_.labels))

{0: '第2类', 1: '第4类', 2: '第0类', 3: '第1类', 4: '第3类'}

In [56]:
lr = LogisticRegression(featuresCol="features", labelCol="y_as_int")
lr_model = lr.fit(label_assembled_data)
result = lr_model.transform(label_assembled_data)
result.show()

+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------+--------------------+--------------------+----------+
|            feature1|            feature2|            feature3|            feature4|    y|            features|y_as_int|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------+--------------------+--------------------+----------+
|  1.1935348262532623|  1.1871044314638282| -1.1117344952061874|  0.5274782864181788|第0类|[1.19353482625326...|     2.0|[-1.8971000028839...|[0.01288003723198...|       4.0|
| -0.3933771831723913|-0.07590771491388915|-0.46010303960315035|  0.6541795156129617|第4类|[-0.3933771831723...|     1.0|[0.87711892581422...|[0.39375644417189...|       0.0|
|  0.2618875099965555| -1.1952035742566547|  2.2179078517610895| -0.6773199343487093|第2类|[0.26188750999655...|     0.0|[5.2941613

In [59]:
evaluator = MulticlassClassificationEvaluator(labelCol='y_as_int')  # MulticlassClassificationEvaluator

In [60]:
weighted_recall = evaluator.evaluate(result, {evaluator.metricName: "weightedRecall"})
print(weighted_recall)

weighted_precision = evaluator.evaluate(result, {evaluator.metricName: "weightedPrecision"})
print(weighted_precision)

weighted_false_positive_rate = evaluator.evaluate(result, {evaluator.metricName: "weightedFalsePositiveRate"})
print(weighted_false_positive_rate)

weighted_f1_score = evaluator.evaluate(result, {evaluator.metricName: "weightedFMeasure"})
print(weighted_f1_score)

false_positive_rate_by_label = evaluator.evaluate(result, {evaluator.metricName: "falsePositiveRateByLabel"})
print(false_positive_rate_by_label)

precision_by_label = evaluator.evaluate(result, {evaluator.metricName: "precisionByLabel"})
print(precision_by_label)

recall_by_label = evaluator.evaluate(result, {evaluator.metricName: "recallByLabel"})
print(recall_by_label)

f_measure_by_label = evaluator.evaluate(result, {evaluator.metricName: "fMeasureByLabel"})
print(f_measure_by_label)

0.7500000000000001
0.7526904133695197
0.08796714979515059
0.744353616636528
0.12312811980033278
0.8145363408521303
0.8145363408521303
0.8145363408521303


#### string类型转vector

In [187]:
X, y = make_classification(n_samples=1000, n_features=4, random_state=42)
data = pd.DataFrame(X, columns=['feature1', 'feature2', 'feature3', 'feature4'])
data['label'] = y
# data['label'] = data['label'].map({1: '是', 0:'否'})
data = ps.from_pandas(data).to_spark()
data.show()

+--------------------+--------------------+-------------------+--------------------+-----+
|            feature1|            feature2|           feature3|            feature4|label|
+--------------------+--------------------+-------------------+--------------------+-----+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|
| -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|    1|
|  2.0468611161145427| -1.6805950177168596| 1.9237179710396748| -1.4272439763682618|    1|
|  1.8288841806481393| -0.2892612860302994| -1.049150931886449|  0.2062714398834348|    0|
|  0.5399095786887405| -0.6708391613721514| 1.0269387108558683|  -0.654529086677752|    1|
|-0.21015535126988671| -0.6526563106588037| 1.6865595485775817| -0.8618761517856935|    1|
| -0.8275441309046899| 0.14449974071098143| 0.4436450947685622| -0.0766994748959056|    0|

In [188]:
# 拟合逻辑回归模型
feature_columns = ["feature1", "feature2", "feature3", "feature4"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled_data = assembler.transform(data)

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100, threshold=0.5, regParam=0.0, elasticNetParam=0.0,
                       tol=1e-06)
lr_model = lr.fit(assembled_data)
result_ = lr_model.transform(assembled_data)
result_ = result_.drop('features')
result_.show()

+--------------------+--------------------+-------------------+--------------------+-----+--------------------+--------------------+----------+
|            feature1|            feature2|           feature3|            feature4|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-------------------+--------------------+-----+--------------------+--------------------+----------+
| -1.6077112718140278|-0.02939755453678...|  1.569952605061414| -0.5279844217352537|    1|[-3.7775837602324...|[0.02236621118104...|       1.0|
| 0.29013138652200965|  0.3176812583459609|-0.9965194758869388|  0.4770094498983824|    0|[1.88033960621271...|[0.86765012945738...|       0.0|
| -1.2250960321892006|  0.8959114872456515|-0.9003241580171191|  0.7198608301661266|    1|[1.32138803399469...|[0.78941254651279...|       0.0|
|  2.0468611161145427| -1.6805950177168596| 1.9237179710396748| -1.4272439763682618|    1|[-3.6167790258888...|[0.02616602481701...|    

In [189]:
result_.dtypes

[('feature1', 'double'),
 ('feature2', 'double'),
 ('feature3', 'double'),
 ('feature4', 'double'),
 ('label', 'int'),
 ('rawPrediction', 'vector'),
 ('probability', 'vector'),
 ('prediction', 'double')]

In [190]:
ret = result_.withColumn('probability', F.col('probability').cast('string'))  \
                .withColumn('rawPrediction', F.col('rawPrediction').cast('string'))  
#                 .withColumn('features', F.col('features').cast('string'))

In [191]:
print(ret.dtypes)

[('feature1', 'double'), ('feature2', 'double'), ('feature3', 'double'), ('feature4', 'double'), ('label', 'int'), ('rawPrediction', 'string'), ('probability', 'string'), ('prediction', 'double')]


In [192]:
from pyspark.sql.types import DoubleType, ArrayType
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors,VectorUDT

In [175]:
class ConvertToVector:
    
    def __init__(self, data):
        self.data = data
    
    # 定义一个UDF来将字符串解析为数组
    @staticmethod
    def parse_array_string(array_string):
        values = [float(x) for x in array_string.strip("[]").split(",")]
        return values

    # 定义一个UDF来将数组转换为向量类型
    @staticmethod
    def array_to_vector(array):
        return Vectors.dense(array)

    def convert_to_vector(self, column):
        array_to_array_udf = F.udf(self.parse_array_string, ArrayType(DoubleType()))
        array_to_vector_udf = F.udf(self.array_to_vector, VectorUDT())
        # 增加array列：转成array后的columns
        array_df = self.data.withColumn("array", array_to_array_udf(self.data[column]))
        # 增加vector列：将array转为vector
        vectorized_df = array_df.withColumn("vector", array_to_vector_udf(array_df["array"]))
        # 删去中间列array，将vector重命名为原来的名字
        self.data = vectorized_df.drop('array', column).withColumnRenamed('vector', column)
        return self

In [1]:
# ctv = ConvertToVector(ret)
# # eer1 = ctv.convert_to_vector('probability').convert_to_vector('rawPrediction').data
# eer1 = ctv.convert_to_vector('probability').convert_to_vector('rawPrediction').convert_to_vector('features').data

In [183]:
eer1.dtypes

[('feature1', 'double'),
 ('feature2', 'double'),
 ('feature3', 'double'),
 ('feature4', 'double'),
 ('label', 'int'),
 ('prediction', 'double'),
 ('probability', 'vector'),
 ('rawPrediction', 'vector'),
 ('features', 'vector')]

In [19]:
# # 定义一个UDF来将字符串解析为数组
# def parse_array_string(array_string):
#     values = [float(x) for x in array_string.strip("[]").split(",")]
#     return values

# array_to_array_udf = F.udf(parse_array_string, ArrayType(DoubleType()))
# array_df = eer.withColumn("array", array_to_array_udf(eer["features"]))

# def array_to_vector(array):
#     return Vectors.dense(array)
# array_to_vector_udf = F.udf(array_to_vector, VectorUDT())


# vectorized_df = array_df.withColumn("vector", array_to_vector_udf(array_df["array"]))

# vectorized_df.show()

In [122]:
# assembler1 = VectorAssembler(inputCols='features', outputCol="features")
# assembler2 = VectorAssembler(inputCols='rawPrediction', outputCol="rawPrediction")
# assembler3 = VectorAssembler(inputCols='probability', outputCol="probability")

In [None]:
lr = LogisticRegression(featuresCol="vector", labelCol="label", maxIter=100, threshold=0.5, regParam=0.0, elasticNetParam=0.0,
                        tol=1e-06)
lr_model = lr.fit(vectorized_df)
result_ = lr_model.transform(vectorized_df)