In [None]:
!pip install pyspark

In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
import warnings
# Silence Python warnings so the cell outputs stay readable.
warnings.filterwarnings('ignore')

# Obtain the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("LogisticRegression")
    .getOrCreate()
)

## Logistic Regression

In [2]:
# Read the sample dataset, stored in LIBSVM format, into a DataFrame.
data = spark.read.load("../data/sample_libsvm_data.txt", format="libsvm")

# Logistic regression estimator with its default parameters.
log_reg = LogisticRegression()

# Train on the whole dataset and keep the training summary for inspection.
# NOTE(review): no train/test split is made, so later metrics are computed on
# the same rows the model was fitted on.
model = log_reg.fit(dataset=data)
summary = model.summary

In [3]:
# Predictions produced on the training data during fitting.
training_predictions = summary.predictions
training_predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [7]:
# Evaluate the fitted model on `data` and display the resulting predictions.
evaluation_summary = model.evaluate(data)
evaluation_summary.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [8]:
# Keep the evaluation summary around; its `predictions` attribute is the
# DataFrame displayed below.
y_hat = model.evaluate(dataset=data)
y_hat.predictions.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514872...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198868...|[6.76550380000472...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678716177...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012720...|[4.62137287298144...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874699...|[1.81823629113068...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504196...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212814...|[6.97903542823766...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503550...|[3.00582577446132...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606582...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [9]:
# Build the (label, prediction) DataFrame directly from the model instead of
# from the previous value of `y_hat`. Reassigning `y_hat` from itself made the
# cell fail on a second run, because `y_hat` would already be a DataFrame
# without a `.predictions` attribute.
y_hat = model.evaluate(data).predictions.select("label", "prediction")
y_hat.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



_**Documentación:** <https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.evaluation.MulticlassMetrics.html>_

In [10]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [19]:
# MulticlassMetrics expects an RDD of (prediction, label) pairs, in that order.
# y_hat's columns are ordered (label, prediction), so select them explicitly in
# the expected order; passing y_hat.rdd directly transposes the confusion
# matrix and swaps precision with recall.
prediction_and_labels = y_hat.select("prediction", "label").rdd.map(tuple)
metrics = MulticlassMetrics(prediction_and_labels)

# Class labels to report on, in output order.
labels = (0.0, 1.0)

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusionMatrix().toArray())

# Precision, recall and F1-score, each reported for label 0 and then label 1.
for title, metric_fn in (
    ("Precision", metrics.precision),
    ("Recall", metrics.recall),
    ("F1-Score", metrics.fMeasure),
):
    for label in labels:
        print(f"{title} label {int(label)}")
        print(metric_fn(label=label))

# Accuracy
print("Accuracy")
print(metrics.accuracy)

# False positive rate per label (output headers kept in Spanish, as before).
for label in labels:
    print(f"Falsos positivos label {int(label)}")
    print(metrics.falsePositiveRate(label=label))

Confusion Matrix:
[[43.  0.]
 [ 0. 57.]]
Precision label 0
1.0
Precision label 1
1.0
Recall label 0
1.0
Recall label 1
1.0
F1-Score label 0
1.0
F1-Score label 1
1.0
Accuracy
1.0
Falsos positivos label 0
0.0
Falsos positivos label 1
0.0


### Evaluators

**Documentación:** Binary <br> https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.BinaryClassificationEvaluator.html

**Documentación:** MultiClass <br>
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html

In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [13]:
# Binary evaluator that scores the "prediction" column against "label".
# NOTE(review): "prediction" holds hard 0.0/1.0 class predictions rather than
# raw scores — confirm this is the intended input for the ROC/PR metrics.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="prediction",
)

# By default the metric is the area under the ROC curve.
evaluator.metricName

Param(parent='BinaryClassificationEvaluator_2f3e9ad7a529', name='metricName', doc='metric name in evaluation (areaUnderROC|areaUnderPR)')

In [14]:
# Without metric parameters: the evaluator's configured metric is used.
evaluator.evaluate(y_hat)

1.0

In [15]:
# With parameters: override the metric for this call only.
roc_params = {evaluator.metricName: "areaUnderROC"}
evaluator.evaluate(y_hat, roc_params)

1.0

In [16]:
# Area under the precision-recall curve; this metric can be useful when the
# classes are imbalanced.
pr_params = {evaluator.metricName: "areaUnderPR"}
evaluator.evaluate(y_hat, pr_params)

1.0

In [17]:
# Multiclass evaluator configured to report accuracy.
evaluator_m = MulticlassClassificationEvaluator(predictionCol = "prediction", labelCol = "label", metricName = "accuracy")

# Score with the multiclass evaluator defined above. The binary `evaluator`
# was used here before, which returns an area-under-curve value, not accuracy.
accuracy = evaluator_m.evaluate(y_hat)

accuracy

1.0

In [None]:
################################################################################################################################