# Spark
## Logistic Regression
### Documentation example

In [16]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.feature import StringIndexer

from matplotlib import pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn styling: 'darkgrid' theme, 'notebook' context with line width 2.5.
sns.set_theme(style='darkgrid')
sns.set_context("notebook", rc={"lines.linewidth": 2.5})

In [2]:
# Seed passed to the sampling/splitting cells so their partitions are repeatable.
random_seed = 1234

In [3]:
# Reuse the active Spark session if there is one, otherwise start one
# registered under the application name 'logReg'.
spark = (
    SparkSession.builder
    .appName('logReg')
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble


23/11/03 11:38:28 WARN Utils: Your hostname, Diego-desktop resolves to a loopback address: 127.0.1.1; using 172.27.76.109 instead (on interface eth0)
23/11/03 11:38:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/03 11:38:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the LIBSVM-formatted sample data. Declaring the feature count up front
# lets Spark skip the extra pass over the file that it otherwise needs to
# infer it (692 is the vector size shown in the `features` column).
df = (
    spark.read.format('libsvm')
    .option('numFeatures', '692')
    .load('../data/sample_libsvm_data.txt')
)
df.show()

23/11/03 11:38:44 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.
[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



                                                                                

In [5]:
# Logistic regression estimator with all parameters left at their defaults.
model = LogisticRegression()

In [6]:
# Fit on the complete DataFrame (no hold-out at this stage).
model_fit = model.fit(df)

                                                                                

                                                                                

In [7]:
# Training summary of the fitted model; its `.predictions` DataFrame is inspected next.
summary = model_fit.summary

In [8]:
# Show which columns the model added to the training data.
summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [9]:
# Preview the first five scored training rows.
summary.predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514862...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198867...|[6.76550380001560...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678715831...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012719...|[4.62137287298722...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874697...|[1.81823629113437...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



                                                                                

In [10]:
# 70/30 random split. The seed makes the partition repeatable, so the class
# counts and evaluation metric below do not change from one run to the next.
train, test = df.randomSplit([0.7, 0.3], seed=random_seed)

In [11]:
# Per-class row counts in the training split.
(
    train
    .groupBy('label')
    .count()
    .show()
)



+-----+-----+
|label|count|
+-----+-----+
|  0.0|   31|
|  1.0|   35|
+-----+-----+



                                                                                

In [12]:
# Per-class row counts in the test split.
(
    test
    .groupBy('label')
    .count()
    .show()
)

[Stage 35:>                                                         (0 + 1) / 1]

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   12|
|  1.0|   22|
+-----+-----+



                                                                                

In [13]:
# Fresh default estimator, fitted on the training split only.
# NOTE(review): despite the `_cv` suffix, no cross-validation is performed here.
model_cv = LogisticRegression()

model_cv_fit = model_cv.fit(train)

                                                                                

In [14]:
# Evaluate the fitted model on the test split; the result exposes the scored
# rows through `.predictions`, which the following cells use.
prediction = model_cv_fit.evaluate(test)

In [15]:
# Preview the first five scored test rows.
prediction.predictions.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[124,125,126...|[36.1232896067082...|[0.99999999999999...|       0.0|
|  0.0|(692,[124,125,126...|[27.2252466367894...|[0.99999999999849...|       0.0|
|  0.0|(692,[126,127,128...|[25.3365977483272...|[0.99999999999008...|       0.0|
|  0.0|(692,[126,127,128...|[28.0094412466195...|[0.99999999999931...|       0.0|
|  0.0|(692,[126,127,128...|[33.5353449526587...|[0.99999999999999...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [17]:
# NOTE(review): interactive help lookup left over from development — consider removing before sharing.
BinaryClassificationEvaluator?

[0;31mInit signature:[0m
[0mBinaryClassificationEvaluator[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrawPredictionCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'rawPrediction'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabelCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'label'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetricName[0m[0;34m:[0m [0;34m'BinaryClassificationEvaluatorMetricType'[0m [0;34m=[0m [0;34m'areaUnderROC'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweightCol[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnumBins[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m1000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Evaluator for binary classification, which expects input columns rawPrediction, label
and an optional weight column.
The rawPredi

In [18]:
# Score the test-set predictions with a default evaluator; with no arguments
# its metricName is 'areaUnderROC'.
model_eval = (
    BinaryClassificationEvaluator()
    .evaluate(prediction.predictions)
)

                                                                                

In [19]:
# Display the metric value returned by the evaluator.
model_eval

0.9848484848484849

__________

In [31]:
# Alternative split: sample each label value with the same 0.7 fraction,
# seeded for repeatability, then show how many rows each class received.
train = df.sampleBy(
    'label',
    fractions={0: 0.7, 1: 0.7},
    seed=random_seed,
)

(
    train
    .groupBy('label')
    .count()
    .show()
)

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   29|
|  1.0|   38|
+-----+-----+



In [32]:
# Rows of df that were not sampled into train form the test set.
# NOTE(review): presumably subtract() also collapses duplicate rows — confirm
# this is acceptable for the data at hand.
test = df.subtract(train)

(
    test
    .groupBy('label')
    .count()
    .show()
)

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   14|
|  1.0|   19|
+-----+-----+



In [26]:
# Stratified split: divide each class 70/30 on its own and then recombine, so
# both partitions keep roughly the class balance of the full data set.
df_zeros = df.filter(df['label'] == 0)
df_ones = df.filter(df['label'] == 1)

train_zeros, test_zeros = df_zeros.randomSplit([0.7, 0.3], seed=random_seed)
train_ones, test_ones = df_ones.randomSplit([0.7, 0.3], seed=random_seed)

train = train_zeros.union(train_ones)
# The test set is built from the held-out parts only. It previously unioned
# train_ones, which put training rows into test and left out every label-0 row.
test = test_zeros.union(test_ones)

train.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   31|
|  1.0|   41|
+-----+-----+



In [22]:
# Per-class row counts in the test partition.
(
    test
    .groupBy('label')
    .count()
    .show()
)

+-----+-----+
|label|count|
+-----+-----+
|  0.0|   12|
|  1.0|   16|
+-----+-----+

