In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
# Start (or reuse) the Spark session that every later cell goes through.
spark = (
    SparkSession.builder
    .appName('Logistic-start')
    .getOrCreate()
)

In [3]:
# Read the LIBSVM-formatted sample file into a DataFrame
# (the preview below shows its two columns: label, features).
my_data = spark.read.load('sample_libsvm_data.txt', format='libsvm')

In [4]:
# Preview the loaded rows; show() prints only the top 20 by default.
my_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [5]:
# Unfitted logistic-regression estimator; the column names are the library
# defaults, spelled out to match the label/features columns of my_data.
my_log_reg_model = LogisticRegression(featuresCol='features', labelCol='label')

In [6]:
# Fit on the complete dataset (no train/test split yet); fit() returns the
# fitted model, leaving the estimator itself reusable.
fitted_logreg = my_log_reg_model.fit(my_data)

In [7]:
# Summary object attached to the fitted model; its .predictions attribute
# is inspected in the next cells.
log_summary = fitted_logreg.summary

In [8]:
# Bare expression: displays only the DataFrame's schema repr.
# Call .show() (next cell) to see actual rows.
log_summary.predictions

DataFrame[label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [10]:
# Print the scored rows: the input columns plus rawPrediction, probability
# and prediction. These are predictions on the same data the model was fit on.
log_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[39.9727764450750...|[1.0,4.3655982185...|       0.0|
|  1.0|(692,[158,159,160...|[-35.662380562160...|[3.25105944320044...|       1.0|
|  1.0|(692,[124,125,126...|[-39.336799621156...|[8.24603148700906...|       1.0|
|  1.0|(692,[152,153,154...|[-28.219286248176...|[5.55289803944932...|       1.0|
|  1.0|(692,[151,152,153...|[-28.142070329444...|[5.99865861146384...|       1.0|
|  0.0|(692,[129,130,131...|[37.8748140555402...|[1.0,3.5577649524...|       0.0|
|  1.0|(692,[158,159,160...|[-36.610101257122...|[1.26018713626709...|       1.0|
|  1.0|(692,[99,100,101,...|[-29.504314986871...|[1.53616833535573...|       1.0|
|  0.0|(692,[154,155,156...|[13.6899025280154...|[0.99999886616363...|       0.0|
|  0.0|(692,[127

In [11]:
# Hold out roughly 30% of the rows for evaluation. randomSplit samples
# randomly, so pin the seed: without it the train/test membership (and every
# metric computed from it) changes on each run of the notebook.
lr_train, lr_test = my_data.randomSplit([0.7, 0.3], seed=42)

In [12]:
# Fresh, unfitted estimator for the train/test experiment; the column names
# are the library defaults, written out explicitly.
final_model = LogisticRegression(featuresCol='features', labelCol='label')

In [14]:
# Fit on the training split only, so lr_test stays unseen by the model.
fit_final = final_model.fit(lr_train)

In [15]:
# Score the held-out split. evaluate() returns a summary object, not a
# DataFrame; the scored rows live on its .predictions attribute.
# NOTE(review): the variable name suggests a DataFrame; a later cell rebinds
# this same name to one.
prediction_and_labels = fit_final.evaluate(lr_test)

In [19]:
# Inspect the scored test rows via the summary's .predictions DataFrame.
# This only works while prediction_and_labels is still the summary object,
# i.e. before the cell that rebinds the name to a DataFrame has run.
prediction_and_labels.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[123,124,125...|[25.1954997598396...|[0.99999999998857...|       0.0|
|  0.0|(692,[124,125,126...|[70.9574775938392...|[1.0,1.5260158035...|       0.0|
|  0.0|(692,[124,125,126...|[37.1766900835939...|[1.0,7.1510306795...|       0.0|
|  0.0|(692,[124,125,126...|[24.2066319420712...|[0.99999999996929...|       0.0|
|  0.0|(692,[125,126,127...|[23.5701713593830...|[0.99999999994197...|       0.0|
|  0.0|(692,[126,127,128...|[28.3290568751257...|[0.99999999999950...|       0.0|
|  0.0|(692,[127,128,129...|[24.8234481744146...|[0.99999999998343...|       0.0|
|  0.0|(692,[127,128,129...|[24.6286816848315...|[0.99999999997986...|       0.0|
|  0.0|(692,[129,130,131...|[17.0468031323317...|[0.99999996049360...|       0.0|
|  0.0|(692,[150

In [20]:
# Keep only the two columns the evaluators need.
# Build the frame from fit_final / lr_test instead of from
# prediction_and_labels itself: the self-referential form only worked on the
# first run, because afterwards the name is a DataFrame with no .predictions
# attribute and re-running the cell raised an AttributeError.
prediction_and_labels = (
    fit_final.evaluate(lr_test)
    .predictions
    .select('label', 'prediction')
)

In [21]:
# Eyeball true label vs. predicted label side by side (top 20 rows).
prediction_and_labels.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



In [24]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [26]:
# Binary evaluator, bound to its own name. It used to be assigned to
# `evaluator`, which the accuracy evaluator created in the following cell
# immediately overwrote, so this object could never be used.
# rawPredictionCol points at the hard 0/1 'prediction' column because that is
# the only score column prediction_and_labels carries.
# NOTE(review): for a threshold-sweeping ROC/PR metric, evaluate the full
# predictions frame with rawPredictionCol='rawPrediction' instead.
binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label')

In [27]:
# Accuracy scorer: compares the 'prediction' column against 'label'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [29]:
# Run the evaluator over the (label, prediction) frame; with
# metricName='accuracy' the result is a single float.
acc = evaluator.evaluate(prediction_and_labels)

In [30]:
# Display the accuracy measured on the held-out split.
acc

1.0