In [1]:
import os

from pyspark.sql import SparkSession

# Start (or reuse) a local-mode Spark session for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('logistic-regression-basics')
    .getOrCreate()
)

In [2]:
from pyspark.ml.classification import LogisticRegression

In [3]:
# Location of the LIBSVM-formatted sample file. The original hard-coded path is
# kept as the default; set the SAMPLE_LIBSVM_PATH environment variable to point
# at the file on machines where it lives somewhere else.
DATA_PATH = os.environ.get(
    'SAMPLE_LIBSVM_PATH',
    'D:/learn-ab/learning-PySpark/sample-data/sample-libsvm-data.txt',
)

# Read the file with the libsvm source; the resulting DataFrame has the
# `label` and `features` columns used by every cell below.
data = spark.read.format('libsvm').load(DATA_PATH)

In [4]:
# Preview the first 20 rows; long cell values are cut off for display.
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [5]:
# Logistic-regression estimator; the column names are the library defaults,
# written out so the dependency on the loaded data's schema is visible.
logreg = LogisticRegression(featuresCol='features', labelCol='label')

In [6]:
# Fit the estimator on the complete dataset (no rows are held out here).
logreg_model = logreg.fit(data)

In [7]:
# Keep a handle on the fitted model's summary object; its `predictions`
# DataFrame is what gets inspected next.
logreg_model_summary = logreg_model.summary

In [8]:
# Bare expression: the notebook displays the DataFrame's column names and types.
logreg_model_summary.predictions

DataFrame[label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [9]:
# Print the predictions schema as a tree, including each column's nullability.
logreg_model_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [10]:
# Preview the scored training rows (first 20, long values cut off for display).
logreg_model_summary.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000486...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716171...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298141...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113070...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823781...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446123...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [11]:
# Take the first scored row and print each of its fields on its own line,
# untruncated (label, features, rawPrediction, probability, prediction).
first_prediction_row = logreg_model_summary.predictions.head(1)[0]
for field_value in first_prediction_row:
    print(field_value)

0.0
(692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0,114.0,253.0,22

In [12]:
# Roughly 70/30 train/test split. The fixed seed makes the split -- and so
# every metric computed from it -- repeatable across kernel restarts instead
# of changing on each run.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
# Second estimator, trained on the training split only so the test split
# stays unseen. Column names are the library defaults, written out.
logreg_2 = LogisticRegression(featuresCol='features', labelCol='label')
logreg_model_2 = logreg_2.fit(dataset=train_data)

In [14]:
# Score the held-out split. Despite its name, `pred_and_labels` is the object
# returned by `evaluate`; the scored rows are reached via its `.predictions`.
pred_and_labels = logreg_model_2.evaluate(test_data)

In [15]:
# Preview the scored test rows (first 20, long values cut off for display).
pred_and_labels.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[30.4838258018209...|[0.99999999999994...|       0.0|
|  0.0|(692,[124,125,126...|[50.0626771777086...|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[40.3791533076966...|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[43.1549243548129...|           [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|[24.2073736826804...|[0.99999999996931...|       0.0|
|  0.0|(692,[125,126,127...|[32.9401966713895...|[0.99999999999999...|       0.0|
|  0.0|(692,[126,127,128...|[36.1417644179509...|[0.99999999999999...|       0.0|
|  0.0|(692,[128,129,130...|[24.3415938286769...|[0.99999999997317...|       0.0|
|  0.0|(692,[150,151,152...|[26.9172819365379...|[0.99999999999795...|       0.0|
|  0.0|(692,[151

In [16]:
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator
)

In [17]:
# IPython help lookup: shows the constructor signature and docstring.
?BinaryClassificationEvaluator

Init signature:
BinaryClassificationEvaluator(
    *,
    rawPredictionCol: str = 'rawPrediction',
    labelCol: str = 'label',
    metricName: 'BinaryClassificationEvaluatorMetricType' = 'areaUnderROC',
    weightCol: Optional[str] = None,
    numBins: int = 1000,
)
Docstring:
Evaluator for binary classification, which expects input columns rawPrediction, label
and an optional weight column.
The rawPrediction column can be of type double (binary 0/1 prediction, or probability of 

In [18]:
# Binary evaluator with its default settings written out: area under the ROC
# curve, computed from the `rawPrediction` and `label` columns.
bin_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [19]:
# Score the held-out predictions with the binary evaluator.
bin_roc = bin_evaluator.evaluate(pred_and_labels.predictions)
bin_roc  # bare name so the notebook displays the score

1.0

In [20]:
# IPython help lookup: shows the constructor signature and docstring.
?MulticlassClassificationEvaluator

Init signature:
MulticlassClassificationEvaluator(
    *,
    predictionCol: str = 'prediction',
    labelCol: str = 'label',
    metricName: 'MulticlassClassificationEvaluatorMetricType' = 'f1',
    weightCol: Optional[str] = None,
    metricLabel: float = 0.0,
    beta: float = 1.0,
    probabilityCol: str = 'probability',
    eps: float = 1e-15

In [21]:
# Multiclass evaluator with its default settings written out: F1 score,
# computed from the `prediction` and `label` columns.
mul_evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='f1',
)

In [22]:
# Score the held-out predictions with the multiclass evaluator.
mul_f1 = mul_evaluator.evaluate(pred_and_labels.predictions)
mul_f1  # bare name so the notebook displays the score

1.0