In [1]:
#Code Snippet 28
# End-to-end PySpark example: load a CSV, encode the categorical 'sex' column,
# assemble a feature vector, fit a logistic regression model on a random
# train/test split, then print predictions, AUC, coefficients and intercept.
#Step 1 - Importing the data and essential libraries
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkLogReg').getOrCreate()
# header=True takes column names from the first row; inferSchema=True lets Spark infer column types
data = spark.read.csv('brain_tumor_dataset.csv',header=True,inferSchema=True)
print("Initial Data")
data.show(3)
#Step 2 - Data pre-processing and converting any categorical data to spark accepted format
from pyspark.ml.feature import VectorAssembler,VectorIndexer,StringIndexer,OneHotEncoder
#Formatting the categorical column - sex
#Creating a String Indexer - To convert every string into a unique number
sex_string_indexer_direct = StringIndexer(inputCol='sex',outputCol='sexIndexer')
indexed_data = sex_string_indexer_direct.fit(data)
final_string_indexed_data = indexed_data.transform(data)
# StringIndexer's default ordering gives the most frequent label index 0.0, the next 1.0, and so on
#Performing OneHotEncoding - convert the numeric index into a sparse vector
sex_encoder_direct = OneHotEncoder(inputCol='sexIndexer',outputCol='sexVector')
# OneHotEncoder is a plain Transformer in Spark 2.x but an Estimator in Spark 3.x
# (it must be fitted first there), so pick the call sequence the installed version supports.
if hasattr(sex_encoder_direct, 'fit'):
    encoded_data = sex_encoder_direct.fit(final_string_indexed_data).transform(final_string_indexed_data)
else:
    encoded_data = sex_encoder_direct.transform(final_string_indexed_data)
# With the default dropLast=True the last category is encoded as an all-zero vector,
# so two categories yield a length-1 vector: index 0.0 -> [1.0], index 1.0 -> [0.0]
print("Data after OneHotEncoding")
encoded_data.show(4)
# Combine the numeric columns and the encoded vector into the single 'features' column Spark ML expects
assembler_direct = VectorAssembler(inputCols=['age','sexVector','tumor_size'],outputCol='features')
assembler_data = assembler_direct.transform(encoded_data)
final_data_direct = assembler_data.select('features','cancerous')
print("Consolidated Data with accepted features and labels")
final_data_direct.show(3)
#Step 3 - Training our Logistic Regression model
from pyspark.ml.classification import LogisticRegression
logreg_direct = LogisticRegression(featuresCol='features',labelCol='cancerous')
# 60/40 train/test split; no seed is passed, so the split (and every metric below) varies between runs
train_data_direct,test_data_direct = final_data_direct.randomSplit([0.6,0.4])
logreg_model_direct = logreg_direct.fit(train_data_direct)
#Step 4 - Evaluating and performing Predictions on our model
#Evaluating our model with testing data
#Direct Evaluation using Trivial method
predictions_labels = logreg_model_direct.evaluate(test_data_direct)
print("Prediction Data")
predictions_labels.predictions.select(['features','cancerous','prediction']).show(3)

#Evaluation using BinaryClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Score on the model's continuous 'rawPrediction' output rather than the hard 0/1
# 'prediction' label: thresholded labels collapse the ROC curve to a single
# operating point and give only a coarse AUC.
direct_evaluation = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='cancerous')
AUC_direct = direct_evaluation.evaluate(predictions_labels.predictions)
print("Area Under the Curve value is {}".format(AUC_direct))
print("\nCoeffecients are {}".format(logreg_model_direct.coefficients))
print("\nIntercept is {}".format(logreg_model_direct.intercept))

Initial Data
+------+---+----+----------+---------+
|  name|age| sex|tumor_size|cancerous|
+------+---+----+----------+---------+
|Roland| 58|Male|       7.0|        1|
| Adolf| 65|Male|       9.0|        1|
| Klaus| 50|Male|       3.0|        0|
+------+---+----+----------+---------+
only showing top 3 rows

Data after OneHotEncoding
+------+---+------+----------+---------+----------+-------------+
|  name|age|   sex|tumor_size|cancerous|sexIndexer|    sexVector|
+------+---+------+----------+---------+----------+-------------+
|Roland| 58|  Male|       7.0|        1|       0.0|(1,[0],[1.0])|
| Adolf| 65|  Male|       9.0|        1|       0.0|(1,[0],[1.0])|
| Klaus| 50|  Male|       3.0|        0|       0.0|(1,[0],[1.0])|
|  Rosh| 26|Female|       2.0|        0|       1.0|    (1,[],[])|
+------+---+------+----------+---------+----------+-------------+
only showing top 4 rows

Consolidated Data with accepted features and labels
+--------------+---------+
|      features|cancerous|
+---