# Spark
## Logistic Regression
### Documentation example

In [105]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from pyspark.ml import Pipeline

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.mllib.evaluation import BinaryClassificationMetrics

from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_theme(style='darkgrid')
sns.set_context("notebook", rc={"lines.linewidth": 2.5})

In [106]:
random_seed = 1234

In [107]:
spark = SparkSession.builder.appName('logReg_code_alon').getOrCreate()

23/11/03 14:55:18 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [108]:
df = spark.read.csv('../data/titanic.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [109]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [110]:
df.groupBy('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [111]:
df.groupBy('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [112]:
df.groupBy('SibSp').count().show()

+-----+-----+
|SibSp|count|
+-----+-----+
|    1|  209|
|    3|   16|
|    5|    5|
|    4|   18|
|    8|    7|
|    2|   28|
|    0|  608|
+-----+-----+



In [113]:
df.groupBy('Parch').count().show()

+-----+-----+
|Parch|count|
+-----+-----+
|    1|  118|
|    6|    1|
|    3|    5|
|    5|    5|
|    4|    4|
|    2|   80|
|    0|  678|
+-----+-----+



In [114]:
df.groupBy('Embarked').count().show()

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|    NULL|    2|
|       C|  168|
|       S|  644|
+--------+-----+



In [115]:
df.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [116]:
cols = [
    #'PassengerId',
    'Pclass',
    #'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    #'Ticket',
    'Fare',
    #'Cabin',
    'Embarked',
    'Survived'
 ]

df_new = df.select(cols)

In [117]:

df_new = df_new.na.drop()

In [118]:
sex_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_Index')
sex_encoder = OneHotEncoder(inputCol='Sex_Index', outputCol='Sex_Vec')

embarked_indexer = StringIndexer(inputCol='Embarked', outputCol='Embarked_Index')
embarked_encoder = OneHotEncoder(inputCol='Embarked_Index', outputCol='Embarked_Vec')

In [119]:
features = [
    'Pclass',
    'Sex_Vec',
    'Embarked_Vec',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

target = 'Survived'

In [120]:
assembler = VectorAssembler(
    inputCols=features,
    outputCol='features'
)

In [121]:
LogisticRegression?

[0;31mInit signature:[0m
[0mLogisticRegression[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeaturesCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'features'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlabelCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'label'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpredictionCol[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'prediction'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmaxIter[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mregParam[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0melasticNetParam[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m1e-06[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfitIntercept[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue

In [122]:
model = LogisticRegression(
    featuresCol='features',
    labelCol='Survived'
)

In [123]:
Pipeline?

[0;31mInit signature:[0m [0mPipeline[0m[0;34m([0m[0;34m*[0m[0;34m,[0m [0mstages[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mForwardRef[0m[0;34m([0m[0;34m'PipelineStage'[0m[0;34m)[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
A simple pipeline, which acts as an estimator. A Pipeline consists
of a sequence of stages, each of which is either an
:py:class:`Estimator` or a :py:class:`Transformer`. When
:py:meth:`Pipeline.fit` is called, the stages are executed in
order. If a stage is an :py:class:`Estimator`, its
:py:meth:`Estimator.fit` method will be called on the input
dataset to fit a model. Then the model, which is a transformer,
will be used to transform the dataset as the input to the next
stage. If a stage is a :py:class:`Transformer`, its
:py:meth:`Transformer.transform` method will be called to produce
the dataset for the next stage. The fitted model from a
:py:

In [124]:
pipeline = Pipeline(
    stages=[
        sex_indexer,
        embarked_indexer,
        sex_encoder,
        embarked_encoder,
        assembler,
        model
    ]
)

In [125]:
train, test = df_new.randomSplit([0.7, 0.3])

In [126]:
df_zeros = df_new.filter(df_new['Survived'] == 0)
df_ones = df_new.filter(df_new['Survived'] == 1)

train_zeros, test_zeros = df_zeros.randomSplit([0.7, 0.3], seed=random_seed)
train_ones, test_ones = df_ones.randomSplit([0.7, 0.3], seed=random_seed)

train = train_zeros.union(train_ones)
test = test_zeros.union(test_ones)

In [127]:
model_fit = pipeline.fit(train)

In [128]:
results = model_fit.transform(test)

In [129]:
model_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol=target)

In [130]:
results.select('Survived', 'prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       1.0|
+--------+----------+
only showing top 20 rows



In [131]:
results.groupBy('Survived').count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       0|  135|
|       1|   96|
+--------+-----+



In [132]:
results.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  140|
|       1.0|   91|
+----------+-----+



In [133]:
AUC = model_eval.evaluate(results)
AUC

0.7600694444444444