In [1]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                                   OneHotEncoder, StringIndexer)
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
# Load the Titanic CSV into a Spark DataFrame. The first row supplies the
# column names (header=True) and Spark infers column types (inferSchema=True).
# NOTE(review): `spark` is not created in this notebook; presumably the
# runtime provides the session -- confirm before running elsewhere.
data = spark.read.csv(
    "/FileStore/tables/titanic.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the loaded rows (show() prints the first 20 rows by default).
data.show()

In [4]:
# List the column names available in the loaded DataFrame.
data.columns

In [5]:
# Keep only the label column ('Survived') and the columns used later as
# model inputs; every other column of `data` is left out of `df`.
df = data.select(
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
)
df.show()

In [6]:
# Discard every row that has a null in any of the selected columns
# (how='any' is the default behaviour of na.drop()).
df = df.na.drop(how='any')
df.show()

In [7]:
# Two-step encoding of the categorical 'Sex' column:
#   1. StringIndexer maps each distinct string to a numeric index ('SexIndex').
#   2. OneHotEncoder turns that index into a vector column ('SexVec').
sex_indexer = StringIndexer(
    inputCol='Sex',
    outputCol='SexIndex',
)
sex_encoder = OneHotEncoder(
    inputCol='SexIndex',
    outputCol='SexVec',
)

In [8]:
# Two-step encoding of the categorical 'Embarked' column:
#   1. StringIndexer maps each distinct string to a numeric index ('EmbarkIndex').
#   2. OneHotEncoder turns that index into a vector column ('EmbarkVec').
embark_indexer = StringIndexer(
    inputCol='Embarked',
    outputCol='EmbarkIndex',
)
embark_encoder = OneHotEncoder(
    inputCol='EmbarkIndex',
    outputCol='EmbarkVec',
)

In [9]:
# Combine the numeric columns and the one-hot vectors into the single
# 'features' vector column that the classifier reads.
assembler = VectorAssembler(
    inputCols=[
        'Pclass',
        'SexVec',
        'EmbarkVec',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
    ],
    outputCol='features',
)

In [10]:
# Logistic regression that reads the assembled 'features' vector and is
# trained against the 'Survived' label column.
logreg = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
)

In [11]:
# Chain the preprocessing stages and the classifier into one Pipeline so that
# fitting on the training split fits the indexers/encoders and the model
# together, and the same fitted stages are applied to whatever is scored later.
pipeline = Pipeline(stages=[
    sex_indexer, sex_encoder,        # 'Sex' -> 'SexIndex' -> 'SexVec'
    embark_indexer, embark_encoder,  # 'Embarked' -> 'EmbarkIndex' -> 'EmbarkVec'
    assembler,                       # build the 'features' vector
    logreg,                          # logistic regression on 'features' / 'Survived'
])

# Roughly 70/30 train/test split (randomSplit weights are approximate).
# A fixed seed keeps the split -- and therefore the fitted model and the
# reported metric -- repeatable across re-runs on the same input data.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

# Fit every pipeline stage on the training split only.
fit_model = pipeline.fit(train_data)

In [12]:
# Score the held-out split with the fitted pipeline, then show the true label
# next to the predicted label for a quick visual comparison.
results = fit_model.transform(test_data)
(
    results
    .select('Survived', 'prediction')
    .show()
)

In [13]:
# Evaluate with the model's continuous score column, not the thresholded
# 'prediction' column. Hard 0.0/1.0 labels collapse the ROC curve to a single
# operating point, so the resulting "AUC" does not measure ranking quality.
# LogisticRegression writes its scores to 'rawPrediction' by default.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Survived',
    metricName='areaUnderROC',  # Spark's default metric, stated explicitly
)

In [14]:
# AUC (Area Under the Curve) of the scored test rows, computed with the metric
# the evaluator is configured for (Spark's default is areaUnderROC).
AUC = evaluator.evaluate(results)
print(AUC)