In [1]:
import findspark

# Local Spark installation directory. Written as a raw string because the
# Windows path contains backslashes (`\S`, `\s`) that would otherwise be
# parsed as (invalid) escape sequences and trigger a warning on modern Python.
# NOTE(review): machine-specific absolute path; adjust for your environment.
SPARK_HOME = r'C:\Spark\spark-3.0.1-bin-hadoop2.7'
findspark.init(SPARK_HOME)

# Keep this import after findspark.init(...): the notebook relies on that call
# having run before pyspark is imported.
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('logreg')
    .getOrCreate()
)

In [3]:
from pyspark.ml.classification import LogisticRegression

In [4]:
# Load the LibSVM-formatted sample file into a DataFrame.
# The "numFeatures" option is passed explicitly to the libsvm reader.
my_data = (
    spark.read
    .format('libsvm')
    .option("numFeatures", "692")
    .load('sample_libsvm_data.txt')
)

In [5]:
# Preview the first five rows of the loaded data.
my_data.show(n=5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
+-----+--------------------+
only showing top 5 rows



In [6]:
# Logistic regression estimator with all parameters left at their defaults.
my_log_reg_model = LogisticRegression()

In [7]:
# Fit on the entire dataset (no hold-out yet; the train/test split comes later).
fitted_logreg = my_log_reg_model.fit(my_data)

In [8]:
# Summary object attached to the fitted model; its `.predictions` DataFrame
# is inspected in the following cells.
log_summary = fitted_logreg.summary

In [9]:
# Show which columns the summary's predictions DataFrame contains.
log_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [10]:
# Preview the first five rows of the summary's predictions.
log_summary.predictions.show(n=5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[19.8534775947478...|[0.99999999761359...|       0.0|
|  1.0|(692,[158,159,160...|[-20.377398194908...|[1.41321555111056...|       1.0|
|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865126979...|       1.0|
|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170303...|       1.0|
|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200604...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



<h3> Train Test Split </h3>

In [11]:
# 70/30 train/test split. A fixed seed keeps the partition (and therefore all
# downstream metrics) reproducible across Restart & Run All.
lr_train, lr_test = my_data.randomSplit([0.7, 0.3], seed=42)

In [12]:
# Fresh, default-parameter estimator for the train/test experiment.
final_model = LogisticRegression()

In [13]:
# Fit on the training portion only.
fit_final = final_model.fit(lr_train)

In [14]:
# Evaluate the fitted model on the held-out test data.
# NOTE: despite its name, this variable holds the evaluation result object;
# the actual rows live in its `.predictions` DataFrame, accessed in later cells.
prediction_and_labels = fit_final.evaluate(lr_test)

In [15]:
# Display the test-set predictions (default row limit).
prediction_and_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|[22.8458791185374...|[0.99999999988028...|       0.0|
|  0.0|(692,[100,101,102...|[7.78835380722706...|[0.99958563704116...|       0.0|
|  0.0|(692,[125,126,127...|[21.8877573457658...|[0.99999999968791...|       0.0|
|  0.0|(692,[126,127,128...|[22.3695667014829...|[0.99999999980723...|       0.0|
|  0.0|(692,[126,127,128...|[21.7974130538080...|[0.99999999965841...|       0.0|
|  0.0|(692,[126,127,128...|[28.5697300219229...|[0.99999999999960...|       0.0|
|  0.0|(692,[126,127,128...|[16.7768026749001...|[0.99999994824799...|       0.0|
|  0.0|(692,[127,128,129...|[21.2165754875946...|[0.99999999938939...|       0.0|
|  0.0|(692,[129,130,131...|[15.8753600062588...|[0.99999987252685...|       0.0|
|  0.0|(692,[153

In [16]:
# Keep only the true label and the predicted label for the test set.
# Derived directly from the fitted model and the test split rather than from
# the previous value of `prediction_and_labels`, so this cell is idempotent:
# re-running it no longer fails with an AttributeError once the name already
# refers to a DataFrame.
prediction_and_labels = (
    fit_final.evaluate(lr_test)
    .predictions
    .select('label', 'prediction')
)

In [17]:
# Display the (label, prediction) pairs (default row limit).
prediction_and_labels.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



<h3> Evaluators </h3>

In [18]:
from pyspark.ml.evaluation import (BinaryClassificationEvaluator,
                                   MulticlassClassificationEvaluator)

# Area under the ROC curve must be computed from the model's continuous scores
# (`rawPrediction`), not from the hard 0/1 `prediction` column: hard labels
# collapse the ROC curve to a single threshold and misstate the AUC.
# `prediction_and_labels` only carries label/prediction, so score the test set
# again to get a DataFrame that still includes `rawPrediction`.
test_predictions = fit_final.transform(lr_test)

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)
evaluator.evaluate(test_predictions)

1.0

In [19]:
# Accuracy evaluator comparing the `prediction` column against `label`.
# Note: this rebinds `evaluator`, replacing the evaluator from the cell above.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [20]:
# Score the (label, prediction) pairs with the current `evaluator`
# (the accuracy evaluator when the cells are run in order) and display it.
acc = evaluator.evaluate(prediction_and_labels)
acc

1.0