In [1]:
MASTER = "local"
NUM_PROCESSORS = "8"
NUM_EXECUTORS = "4"
NUM_PARTITIONS = 10

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
import pyspark.sql.functions as psf
import json



In [2]:
conf = SparkConf()

conf.set("spark.app.name", "one_part_data")
conf.set("spark.master", MASTER)
conf.set("spark.executor.cores", NUM_PROCESSORS)
conf.set("spark.executor.instances", NUM_EXECUTORS)
conf.set("spark.executor.memory", "6g")
conf.set("spark.locality.wait", "0")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.kryoserializer.buffer.max", "2000")
conf.set("spark.executor.heartbeatInterval", "6000s")
conf.set("spark.network.timeout", "10000000s")
conf.set("spark.shuffle.spill", "true")
conf.set("spark.driver.memory", "15g")
conf.set("spark.driver.maxResultSize", "15g")

<pyspark.conf.SparkConf at 0x7f00d6ec36d0>

In [3]:
from pyspark.sql import SparkSession

# Create SparkSession 
spark = SparkSession.builder \
      .config(conf=conf) \
      .master("local[*]") \
      .appName("SparkByExamples.com") \
      .getOrCreate()
spark

23/01/11 14:01:38 WARN Utils: Your hostname, yagor-pc resolves to a loopback address: 127.0.1.1; using 192.168.0.107 instead (on interface enp7s0)
23/01/11 14:01:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/11 14:01:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/11 14:01:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
dataset = spark.read.csv("/home/yagor/Рабочий стол/mipt/lab3/notebook/nutrition_table.csv",header=True,inferSchema=True)

In [5]:
dataset.show()

+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+
|_c0|fat_100g|carbohydrates_100g|sugars_100g|proteins_100g|salt_100g|energy_100g|reconstructed_energy|            g_sum|exceeded|             product|
+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+
|  1|   28.57|             64.29|      14.29|         3.57|      0.0|     2243.0|             2267.85|            96.43|       0|Banana Chips Swee...|
|  2|   17.86|             60.71|      17.86|        17.86|    0.635|     1941.0|             2032.23|96.42999999999999|       0|             Peanuts|
|  3|   57.14|             17.86|       3.57|        17.86|  1.22428|     2540.0|              2835.7|            92.86|       0|Organic Salted Nu...|
|  7|   18.75|             57.81|      15.62|        14.06|   0.1397|     1833.0|             

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
feat_cols = [ #'_c0',
 'fat_100g',
 'carbohydrates_100g',
 'sugars_100g',
 'proteins_100g',
 'salt_100g',
 'energy_100g',
 'reconstructed_energy',
 'g_sum',
 'exceeded',
 #'product'
 ]

In [8]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')

In [9]:
final_data = vec_assembler.transform(dataset)

In [10]:
final_data.show()

+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+
|_c0|fat_100g|carbohydrates_100g|sugars_100g|proteins_100g|salt_100g|energy_100g|reconstructed_energy|            g_sum|exceeded|             product|            features|
+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+
|  1|   28.57|             64.29|      14.29|         3.57|      0.0|     2243.0|             2267.85|            96.43|       0|Banana Chips Swee...|[28.57,64.29,14.2...|
|  2|   17.86|             60.71|      17.86|        17.86|    0.635|     1941.0|             2032.23|96.42999999999999|       0|             Peanuts|[17.86,60.71,17.8...|
|  3|   57.14|             17.86|       3.57|        17.86|  1.22428|     2540.0|              2835.7|            92.86|       0|Organic Sal

In [11]:
from pyspark.ml.feature import StandardScaler

In [12]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [13]:
final_data

DataFrame[_c0: int, fat_100g: double, carbohydrates_100g: double, sugars_100g: double, proteins_100g: double, salt_100g: double, energy_100g: double, reconstructed_energy: double, g_sum: double, exceeded: int, product: string, features: vector]

In [14]:
# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(final_data)

In [15]:
# Normalize each feature to have unit standard deviation.
cluster_final_data = scalerModel.transform(final_data)

## train model

In [16]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [17]:
for k in range(2,15):
    kmeans = KMeans(featuresCol='scaledFeatures',k=k)
    model = kmeans.fit(cluster_final_data)
    predictions = model.transform(cluster_final_data)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("With K={}".format(k))
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    print('--'*30)

With K=2
Silhouette with squared euclidean distance = 0.7668198339263336
------------------------------------------------------------
With K=3
Silhouette with squared euclidean distance = 0.7630900317357944
------------------------------------------------------------
With K=4
Silhouette with squared euclidean distance = 0.45382779775000187
------------------------------------------------------------
With K=5
Silhouette with squared euclidean distance = 0.5329292075618965
------------------------------------------------------------
With K=6
Silhouette with squared euclidean distance = 0.2752095921075341
------------------------------------------------------------
With K=7
Silhouette with squared euclidean distance = 0.27855535014230576
------------------------------------------------------------
With K=8
Silhouette with squared euclidean distance = 0.32041363209041773
------------------------------------------------------------
With K=9
Silhouette with squared euclidean distance = 0.338

In [18]:
k = 2

kmeans = KMeans(featuresCol='scaledFeatures', predictionCol='pred_kmeans_cluster', k=k)

In [19]:
modelKMeans = kmeans.fit(cluster_final_data)

In [20]:
predictions = modelKMeans.transform(cluster_final_data)

## 2. Split on train and test and analys

In [21]:
predictions.show()

+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+--------------------+-------------------+
|_c0|fat_100g|carbohydrates_100g|sugars_100g|proteins_100g|salt_100g|energy_100g|reconstructed_energy|            g_sum|exceeded|             product|            features|      scaledFeatures|pred_kmeans_cluster|
+---+--------+------------------+-----------+-------------+---------+-----------+--------------------+-----------------+--------+--------------------+--------------------+--------------------+-------------------+
|  1|   28.57|             64.29|      14.29|         3.57|      0.0|     2243.0|             2267.85|            96.43|       0|Banana Chips Swee...|[28.57,64.29,14.2...|[1.91358558312757...|                  0|
|  2|   17.86|             60.71|      17.86|        17.86|    0.635|     1941.0|             2032.23|96.42999999999999|       0|             Peanut

In [22]:
evaluator = ClusteringEvaluator(predictionCol='pred_kmeans_cluster')

In [23]:
silhouette = evaluator.evaluate(predictions)

In [24]:
print(f"With k={k} Silhouette with squared euclidean distance = " + str(silhouette))

With k=2 Silhouette with squared euclidean distance = 0.7668198339263336


In [25]:
centers=modelKMeans.clusterCenters()
print("Cluster Centers:")
for center in centers:
    print(center)

Cluster Centers:
[1.19486632 1.8629406  1.15892678 1.16388705 0.11194177 2.2653329
 2.25972459 2.41225435 0.18817334]
[0.24865497 0.44342242 0.33145447 0.50509242 0.11777061 0.58014115
 0.55044094 0.61165374 0.        ]


In [26]:
modelKMeans.transform(cluster_final_data).groupBy('pred_kmeans_cluster').count().show()

+-------------------+-----+
|pred_kmeans_cluster|count|
+-------------------+-----+
|                  1|22546|
|                  0|22482|
+-------------------+-----+



In [27]:
cols = ["_c0", "pred_kmeans_cluster"]
result = predictions.select(*cols)

In [28]:
result.show()

+---+-------------------+
|_c0|pred_kmeans_cluster|
+---+-------------------+
|  1|                  0|
|  2|                  0|
|  3|                  0|
|  7|                  0|
| 12|                  0|
| 15|                  0|
| 16|                  0|
| 19|                  0|
| 20|                  0|
| 21|                  0|
| 22|                  0|
| 23|                  0|
| 24|                  0|
| 26|                  0|
| 27|                  0|
| 28|                  0|
| 29|                  0|
| 31|                  0|
| 32|                  0|
| 34|                  0|
+---+-------------------+
only showing top 20 rows



In [29]:
(trainingData, testData) = predictions.randomSplit([0.7, 0.3])

In [30]:
trainingData.groupBy('pred_kmeans_cluster').count().show()

+-------------------+-----+
|pred_kmeans_cluster|count|
+-------------------+-----+
|                  1|15727|
|                  0|15742|
+-------------------+-----+



In [31]:
testData.count()

13559

In [32]:
trainingData.count()

31469

## 3. Classification model

In [33]:
from pyspark.ml.classification import MultilayerPerceptronClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [34]:
RandomForest = RandomForestClassifier(labelCol="pred_kmeans_cluster", featuresCol="scaledFeatures", \
                        predictionCol='pred_from_randomforest_class', numTrees=20, maxDepth=3)


In [35]:
modelRandomForest = RandomForest.fit(trainingData)

In [36]:
# Make predictions.
predictions_RandomForest = modelRandomForest.transform(testData)

In [37]:

# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="pred_kmeans_cluster", predictionCol="pred_from_randomforest_class", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_RandomForest)
print("Test Error = %g" % (1.0 - accuracy))

+----------------------------+-------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|
+----------------------------+-------------------+--------------------+
|                         0.0|                  0|[1.91358558312757...|
|                         0.0|                  0|[2.4561142223762,...|
|                         0.0|                  0|[0.39852412389251...|
|                         0.0|                  0|[1.33957688703365...|
|                         0.0|                  0|[3.12590266589302...|
+----------------------------+-------------------+--------------------+
only showing top 5 rows

Test Error = 0.0150454


In [38]:
# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures", 'rawPrediction', 'probability').show(5)

+----------------------------+-------------------+--------------------+--------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|       rawPrediction|         probability|
+----------------------------+-------------------+--------------------+--------------------+--------------------+
|                         0.0|                  0|[1.91358558312757...|[19.6653915832780...|[0.98326957916390...|
|                         0.0|                  0|[2.4561142223762,...|[19.6181705200750...|[0.98090852600375...|
|                         0.0|                  0|[0.39852412389251...|[19.1479398846721...|[0.95739699423360...|
|                         0.0|                  0|[1.33957688703365...|[19.7516364636404...|[0.98758182318202...|
|                         0.0|                  0|[3.12590266589302...|[19.6021880065115...|[0.98010940032557...|
+----------------------------+-------------------+--------------------+-----------------

In [39]:
predictions_RandomForest.groupBy('pred_from_randomforest_class').count().show()

+----------------------------+-----+
|pred_from_randomforest_class|count|
+----------------------------+-----+
|                         0.0| 6806|
|                         1.0| 6753|
+----------------------------+-----+



### Train all data

In [40]:
modelRandomForest = RandomForest.fit(predictions)

In [41]:
# Make predictions.
predictions_RandomForest = modelRandomForest.transform(testData)

In [42]:

# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="pred_kmeans_cluster", predictionCol="pred_from_randomforest_class", metricName="accuracy")
accuracy = evaluator.evaluate(predictions_RandomForest)
print("Test Error = %g" % (1.0 - accuracy))

+----------------------------+-------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|
+----------------------------+-------------------+--------------------+
|                         0.0|                  0|[1.91358558312757...|
|                         0.0|                  0|[2.4561142223762,...|
|                         0.0|                  0|[0.39852412389251...|
|                         0.0|                  0|[1.33957688703365...|
|                         0.0|                  0|[3.12590266589302...|
+----------------------------+-------------------+--------------------+
only showing top 5 rows

Test Error = 0.0144553


In [43]:
# Select example rows to display.
predictions_RandomForest.select("pred_from_randomforest_class", "pred_kmeans_cluster", "scaledFeatures", 'rawPrediction', 'probability').show(5)

+----------------------------+-------------------+--------------------+--------------------+--------------------+
|pred_from_randomforest_class|pred_kmeans_cluster|      scaledFeatures|       rawPrediction|         probability|
+----------------------------+-------------------+--------------------+--------------------+--------------------+
|                         0.0|                  0|[1.91358558312757...|[19.5958653690484...|[0.97979326845242...|
|                         0.0|                  0|[2.4561142223762,...|[19.4051662509886...|[0.97025831254943...|
|                         0.0|                  0|[0.39852412389251...|[19.1860806411448...|[0.95930403205724...|
|                         0.0|                  0|[1.33957688703365...|[19.7294927338896...|[0.98647463669448...|
|                         0.0|                  0|[3.12590266589302...|[19.3358259609285...|[0.96679129804642...|
+----------------------------+-------------------+--------------------+-----------------

In [44]:
predictions_RandomForest.groupBy('pred_from_randomforest_class').count().show()

+----------------------------+-----+
|pred_from_randomforest_class|count|
+----------------------------+-----+
|                         0.0| 6860|
|                         1.0| 6699|
+----------------------------+-----+



## 4. Linear regression model and propability (logostic regression)

In [45]:
# LinRegression = LinearRegression(maxIter=30, regParam=0.1, elasticNetParam=0.1, labelCol="pred_from_randomforest_class", featuresCol="scaledFeatures")

# # Fit the model
# modelLinearRegression = LinRegression.fit(predictions_RandomForest)

# # Print the coefficients and intercept for linear regression
# print("Coefficients: %s" % str(modelLinearRegression.coefficients))
# print("Intercept: %s" % str(modelLinearRegression.intercept))

# # Summarize the model over the training set and print out some metrics
# trainingSummary = modelLinearRegression.summary
# print("numIterations: %d" % trainingSummary.totalIterations)
# print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
# trainingSummary.residuals.show()
# print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
# print("r2: %f" % trainingSummary.r2)

In [46]:
columns_to_drop = ['rawPrediction', 'probability']
predictions_RandomForest = predictions_RandomForest.drop(*columns_to_drop)

In [47]:
LogRegression = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=1.0, labelCol="pred_from_randomforest_class", featuresCol="scaledFeatures", probabilityCol="lr_prob", predictionCol='pred_from_logregression_class')

In [48]:
lrModel = LogRegression.fit(predictions_RandomForest)

In [49]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
1 X 9 CSRMatrix
(0,5) -0.2434
(0,6) -0.5471
(0,7) -1.1639
Intercept: [2.810568922526613]
objectiveHistory:
0.693076682585191
0.645808998874504
0.44833530734612487
0.4278920355836153
0.4198770993521608
0.41804537105492373
0.4130563875037308
0.4125837510695403
0.4118426040514606
0.4100892941057106
0.40952883776409743
False positive rate by label:
label 0: 0.01671891327063741
label 1: 0.019533527696793004
True positive rate by label:
label 0: 0.980466472303207
label 1: 0.9832810867293625
Precision by label:
label 0: 0.9836209417958467
label 1: 0.9800624907007885
Recall by label:
label 0: 0.980466472303207
label 1: 0.9832810867293625
F-measure by label:
label 0: 0.982041173893999
label 1: 0.9816691505216095
Accuracy: 0.9818570691053913
FPR: 0.018109510072821668
TPR: 0.9818570691053912
F-measure: 0.98185737091652
Precision: 0.9818628428294189
Recall: 0.9818570691053912


## Create pipeline

In [50]:
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Bucketizer
from pyspark.sql import DataFrame
from typing import Iterable
from pyspark.mllib.evaluation import MulticlassMetrics

In [51]:
# CUSTOM TRANSFORMER ----------------------------------------------------------------
class ColumnDropper(Transformer):
    """
    A custom Transformer which drops all columns that have at least one of the
    words from the banned_list in the name.
    """
    def __init__(self, banned_list: Iterable[str]):
        super(ColumnDropper, self).__init__()
        self.banned_list = banned_list
    def _transform(self, df: DataFrame) -> DataFrame:
        df = df.drop(*[x for x in df.columns if any(y in x for y in self.banned_list)])
        return df

In [52]:
column_dropper = ColumnDropper(banned_list = ['rawPrediction', 'probability'])

In [53]:
vec_assembler = VectorAssembler(inputCols = feat_cols, outputCol='features')
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [54]:
k = 2
kmeans = KMeans(featuresCol='scaledFeatures', predictionCol='pred_kmeans_cluster', k=k)

In [55]:
RandomForest = RandomForestClassifier(labelCol="pred_kmeans_cluster", featuresCol="scaledFeatures", predictionCol='pred_from_randomforest_class', numTrees=20, maxDepth=3)


In [56]:
LogRegression = LogisticRegression(maxIter=10, regParam=0.1, elasticNetParam=1.0, labelCol="pred_from_randomforest_class", \
featuresCol="scaledFeatures", probabilityCol="lr_prob", predictionCol='pred_from_logregression_class')

In [57]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3])

In [58]:
pipeline = Pipeline(stages=[vec_assembler, scaler, kmeans, RandomForest, column_dropper, LogRegression])

model = pipeline.fit(trainingData)
output = model.transform(testData)

In [59]:
evaluator = ClusteringEvaluator(predictionCol='pred_kmeans_cluster', featuresCol='scaledFeatures')
silhouette = evaluator.evaluate(output)
print(f"With k={k} Silhouette with squared euclidean distance = " + str(silhouette))
output.groupBy('pred_kmeans_cluster').count().show()

With k=2 Silhouette with squared euclidean distance = 0.33976540615883033
+-------------------+-----+
|pred_kmeans_cluster|count|
+-------------------+-----+
|                  1| 6794|
|                  0| 6827|
+-------------------+-----+



In [60]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="pred_kmeans_cluster", predictionCol="pred_from_randomforest_class", metricName="accuracy")
accuracy = evaluator.evaluate(output)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0147566


In [61]:
y_true = output.select(['pred_kmeans_cluster']).collect()
y_pred = output.select(['pred_from_randomforest_class']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      6827
           1       0.99      0.98      0.99      6794

    accuracy                           0.99     13621
   macro avg       0.99      0.99      0.99     13621
weighted avg       0.99      0.99      0.99     13621



In [62]:
model.stages[5]

LogisticRegressionModel: uid=LogisticRegression_b7dbaa45e054, numClasses=2, numFeatures=9

In [63]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(model.stages[5].coefficientMatrix))
print("Intercept: " + str(model.stages[5].interceptVector))

trainingSummary = model.stages[5].summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
1 X 9 CSRMatrix
(0,5) -0.4503
(0,6) -0.65
(0,7) -1.045
Intercept: [3.092357938841986]
objectiveHistory:
0.693146409574009
0.6459304743256445
0.44815434558424794
0.4275511517897116
0.4199204817416282
0.41832222831473115
0.4169196662780055
0.4137765716338624
0.4131883506284702
0.4125110577394595
0.4116716198314688
False positive rate by label:
label 0: 0.016131089007906146
label 1: 0.0026076448514914456
True positive rate by label:
label 0: 0.9973923551485085
label 1: 0.9838689109920938
Precision by label:
label 0: 0.9841229996862253
label 1: 0.9973500517063082
Recall by label:
label 0: 0.9973923551485085
label 1: 0.9838689109920938
F-measure by label:
label 0: 0.9907132478362499
label 1: 0.9905636153549878
Accuracy: 0.990639029515713
FPR: 0.009377763375110648
TPR: 0.990639029515713
F-measure: 0.9906385244995379
Precision: 0.9907283132750107
Recall: 0.990639029515713


In [64]:
from operator import add

predict = model.transform(testData)

df_normed = predict.rdd.map(lambda x: (x['pred_from_logregression_class'], x['lr_prob'])) \
    .reduceByKey(add).toDF(['pred_from_logregression_class', 'lr_prob'])

                                                                                

In [65]:
from pyspark.ml.feature import Normalizer

normilize = Normalizer(inputCol='lr_prob', outputCol='normilize', p=1)
norm = normilize.transform(df_normed).rdd.map(lambda x: (x['pred_from_logregression_class'], x['normilize'].toArray().max()))

print('Classification mean propability confidence for classes:')
for label, confidence in norm.collect():
    print(f'label {int(label)}: {confidence}')

Classification mean propability confidence for classes:




label 0: 0.8281598623016101
label 1: 0.8364402755714606
