# Introduction to Machine Learning in PySpark

In [2]:
import os

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.sql.functions import log1p

In [4]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("Evaluating a Logistic Regression Model")
    .getOrCreate()
)

In [5]:
# Bare last expression: Jupyter renders the session object created above as the cell output.
spark # spark session I've created

In [6]:
# Location of the breast-cancer CSV file.
# The original machine-specific path is kept as the default so existing runs are
# unchanged; set the BREAST_CANCER_CSV environment variable to point the notebook
# at the file on another machine without editing this cell.
path = os.environ.get(
    'BREAST_CANCER_CSV',
    '/home/aspphem/Desktop/MCE/BigData/archive/breast-cancer.csv',
)  # file path

# Read the CSV: the first row supplies the column names and Spark infers the column types.
df = spark.read.csv(path, header=True, inferSchema=True)

In [7]:
# Inspect column names, inferred types and nullability before building features.
df.printSchema() # print out the schema in tree format

root
 |-- id: integer (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- radius_mean: double (nullable = true)
 |-- texture_mean: double (nullable = true)
 |-- perimeter_mean: double (nullable = true)
 |-- area_mean: double (nullable = true)
 |-- smoothness_mean: double (nullable = true)
 |-- compactness_mean: double (nullable = true)
 |-- concavity_mean: double (nullable = true)
 |-- concave points_mean: double (nullable = true)
 |-- symmetry_mean: double (nullable = true)
 |-- fractal_dimension_mean: double (nullable = true)
 |-- radius_se: double (nullable = true)
 |-- texture_se: double (nullable = true)
 |-- perimeter_se: double (nullable = true)
 |-- area_se: double (nullable = true)
 |-- smoothness_se: double (nullable = true)
 |-- compactness_se: double (nullable = true)
 |-- concavity_se: double (nullable = true)
 |-- concave points_se: double (nullable = true)
 |-- symmetry_se: double (nullable = true)
 |-- fractal_dimension_se: double (nullable = true)
 |-- radi

In [8]:
# Encode the string label as an integer column: 'M' -> 1, every other value -> 0.
df = df.withColumn(
    "diagnosis_no",
    when(col("diagnosis") == "M", 1).otherwise(0),
)

## Building and Evaluating a Logistic Regression Model for Classification

In [10]:
# Input columns that get packed into the single "features" vector column.
feature_columns = [
    "radius_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [11]:
# Add the "features" vector column to every row.
assembled_data = assembler.transform(df)

# Split into training and test sets with weights 0.8 / 0.2; the fixed seed keeps
# the sampling repeatable between runs.
train_data, test_data = assembled_data.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

In [13]:
# Configure a logistic regression estimator that reads the assembled feature
# vector and the numeric diagnosis label.
logistic_reg = LogisticRegression(
    featuresCol="features",
    labelCol="diagnosis_no",
)

# Fit on the training split, then apply the fitted model to the test split.
logistic_reg_model = logistic_reg.fit(train_data)
predictions = logistic_reg_model.transform(test_data)

### Assessing Model Accuracy

In [14]:
# Evaluator that compares the "prediction" column with the "diagnosis_no" label
# and reports the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    labelCol="diagnosis_no",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)  # accuracy of the test-set predictions

print(f"Accuracy: {accuracy}")

Accuracy: 0.9418604651162791


In [16]:
# Preview the true label, the predicted label and the feature vector side by side.
(
    predictions
    .select("diagnosis_no", "prediction", "features")
    .show()
)

+------------+----------+--------------------+
|diagnosis_no|prediction|            features|
+------------+----------+--------------------+
|           0|       0.0|[14.96,97.03,687....|
|           0|       0.0|[12.18,77.79,451....|
|           0|       0.0|[12.63,82.15,480....|
|           0|       0.0|[10.8,68.77,357.6...|
|           0|       0.0|[11.46,73.59,403....|
|           1|       1.0|[23.51,155.1,1747...|
|           1|       0.0|[14.6,93.97,664.7...|
|           0|       0.0|[12.54,81.25,476....|
|           1|       1.0|[13.0,87.5,519.8,...|
|           1|       0.0|[16.02,102.7,797....|
|           1|       1.0|[19.17,132.4,1123...|
|           1|       1.0|[14.68,94.74,684....|
|           1|       1.0|[21.16,137.2,1404...|
|           1|       1.0|[18.61,122.1,1094...|
|           1|       1.0|[16.74,110.1,869....|
|           1|       1.0|[19.07,128.3,1104...|
|           1|       1.0|[18.22,120.3,1033...|
|           1|       1.0|[15.37,100.2,728....|
|           1

In [17]:
# Final cell: shut the session down once all results above have been produced.
spark.stop() # stop spark session