In [1]:
#Code Snippet 29
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkLogReg').getOrCreate()
data = spark.read.csv('brain_tumor_dataset.csv',header=True,inferSchema=True)
print("Initial Data")
data.show(3)
from pyspark.ml.feature import VectorAssembler,VectorIndexer,StringIndexer,OneHotEncoder
#Stage 1
sex_string_indexer = StringIndexer(inputCol='sex',outputCol='sexIndexer')
#Stage 2
sex_encoder = OneHotEncoder(inputCol='sexIndexer',outputCol='sexVector')
#Stage 3
assembler = VectorAssembler(inputCols=['age','sexVector','tumor_size'],outputCol='features')
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
#Stage 4
logreg = LogisticRegression(featuresCol='features',labelCol='cancerous')
#passing the 4 stages directly into a pipeline object
pipeline_object = Pipeline(stages=[sex_string_indexer,sex_encoder,assembler,logreg])
train_data , test_data = data.randomSplit([0.6,0.4])
logreg_model = pipeline_object.fit(train_data)
model_results = logreg_model.transform(test_data)
print("Prediction Data")
model_results.select('cancerous','prediction').show(3)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluation_object = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='cancerous')
AUC = evaluation_object.evaluate(model_results)
print("Area Under the Curve value is {}".format(AUC))

Initial Data
+------+---+----+----------+---------+
|  name|age| sex|tumor_size|cancerous|
+------+---+----+----------+---------+
|Roland| 58|Male|       7.0|        1|
| Adolf| 65|Male|       9.0|        1|
| Klaus| 50|Male|       3.0|        0|
+------+---+----+----------+---------+
only showing top 3 rows

Prediction Data
+---------+----------+
|cancerous|prediction|
+---------+----------+
|        1|       1.0|
|        0|       0.0|
|        0|       1.0|
+---------+----------+
only showing top 3 rows

Area Under the Curve value is 0.6666666666666667
