# **Cancer Decision Tree Classification**

In [None]:
# install
!pip install Pyspark

In [2]:
# start spark session
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.getOrCreate()

In [4]:
spark

In [5]:
# read sklearn inbuilt data
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer(as_frame=True)
cancer = cancer.frame
cancer = spark.createDataFrame(cancer)

In [6]:
cancer.show()

+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+------+
|mean radius|mean texture|mean perimeter|mean area|mean smoothness|mean compactness|mean concavity|mean concave points|mean symmetry|mean fractal dimension|radius error|texture error|perimeter error|area error|smoothness error|compactness error|concavity error|concave points error|symmetry error|fractal dimension error|worst radius|worst texture|worst perimeter|worst area|worst smoothness|worst compactness|worst concavity|worst concave points|worst symmetry|worst fractal dimension|target|
+-----------

In [7]:
cancer.printSchema()

root
 |-- mean radius: double (nullable = true)
 |-- mean texture: double (nullable = true)
 |-- mean perimeter: double (nullable = true)
 |-- mean area: double (nullable = true)
 |-- mean smoothness: double (nullable = true)
 |-- mean compactness: double (nullable = true)
 |-- mean concavity: double (nullable = true)
 |-- mean concave points: double (nullable = true)
 |-- mean symmetry: double (nullable = true)
 |-- mean fractal dimension: double (nullable = true)
 |-- radius error: double (nullable = true)
 |-- texture error: double (nullable = true)
 |-- perimeter error: double (nullable = true)
 |-- area error: double (nullable = true)
 |-- smoothness error: double (nullable = true)
 |-- compactness error: double (nullable = true)
 |-- concavity error: double (nullable = true)
 |-- concave points error: double (nullable = true)
 |-- symmetry error: double (nullable = true)
 |-- fractal dimension error: double (nullable = true)
 |-- worst radius: double (nullable = true)
 |-- worst 

In [8]:
cancer.columns

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension',
 'radius error',
 'texture error',
 'perimeter error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst radius',
 'worst texture',
 'worst perimeter',
 'worst area',
 'worst smoothness',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry',
 'worst fractal dimension',
 'target']

In [9]:
from pyspark.ml.feature import VectorAssembler 

In [10]:
featureassembler= VectorAssembler(inputCols=['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension',
 'radius error',
 'texture error',
 'perimeter error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst radius',
 'worst texture',
 'worst perimeter',
 'worst area',
 'worst smoothness',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry',
 'worst fractal dimension',], outputCol='Features')

In [11]:
output = featureassembler.transform(cancer)

In [12]:
output.show()

+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+------+--------------------+
|mean radius|mean texture|mean perimeter|mean area|mean smoothness|mean compactness|mean concavity|mean concave points|mean symmetry|mean fractal dimension|radius error|texture error|perimeter error|area error|smoothness error|compactness error|concavity error|concave points error|symmetry error|fractal dimension error|worst radius|worst texture|worst perimeter|worst area|worst smoothness|worst compactness|worst concavity|worst concave points|worst symmetry|worst fractal dimension

In [13]:
modeldata=output.select('Features','target')

In [14]:
modeldata.show()

+--------------------+------+
|            Features|target|
+--------------------+------+
|[17.99,10.38,122....|     0|
|[20.57,17.77,132....|     0|
|[19.69,21.25,130....|     0|
|[11.42,20.38,77.5...|     0|
|[20.29,14.34,135....|     0|
|[12.45,15.7,82.57...|     0|
|[18.25,19.98,119....|     0|
|[13.71,20.83,90.2...|     0|
|[13.0,21.82,87.5,...|     0|
|[12.46,24.04,83.9...|     0|
|[16.02,23.24,102....|     0|
|[15.78,17.89,103....|     0|
|[19.17,24.8,132.4...|     0|
|[15.85,23.95,103....|     0|
|[13.73,22.61,93.6...|     0|
|[14.54,27.54,96.7...|     0|
|[14.68,20.13,94.7...|     0|
|[16.13,20.68,108....|     0|
|[19.81,22.15,130....|     0|
|[13.54,14.36,87.4...|     1|
+--------------------+------+
only showing top 20 rows



In [15]:
# split data
train_data,test_data=modeldata.randomSplit([0.8,0.2])

In [16]:
train_data.show()

+--------------------+------+
|            Features|target|
+--------------------+------+
|[6.981,13.43,43.7...|     1|
|[8.196,16.84,51.7...|     1|
|[8.219,20.7,53.27...|     1|
|[8.598,20.98,54.6...|     1|
|[8.618,11.79,54.3...|     1|
|[8.888,14.64,58.7...|     1|
|[8.95,15.76,58.74...|     1|
|[9.029,17.33,58.7...|     1|
|[9.173,13.86,59.2...|     1|
|[9.504,12.44,60.3...|     1|
|[9.567,15.91,60.2...|     1|
|[9.72,18.22,60.73...|     1|
|[9.731,15.34,63.7...|     1|
|[9.738,11.97,61.2...|     1|
|[9.742,15.67,61.5...|     1|
|[9.777,16.99,62.5...|     1|
|[9.787,19.94,62.1...|     1|
|[9.876,17.27,62.9...|     1|
|[9.904,18.06,64.6...|     1|
|[10.08,15.11,63.7...|     1|
+--------------------+------+
only showing top 20 rows



In [17]:
# Decision Tree classification model
from pyspark.ml.classification import DecisionTreeClassifier

In [18]:
dt = DecisionTreeClassifier(featuresCol='Features', labelCol='target')

In [19]:
dt = dt.fit(train_data)

In [20]:
# prediction
y_pred = dt.transform(test_data)

In [21]:
y_pred.show()

+--------------------+------+-------------+--------------------+----------+
|            Features|target|rawPrediction|         probability|prediction|
+--------------------+------+-------------+--------------------+----------+
|[8.671,14.45,54.4...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[8.726,15.83,55.8...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[9.465,21.01,60.1...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[9.876,19.4,63.95...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.18,17.53,65.1...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.48,19.86,66.7...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.49,19.29,67.4...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.51,20.19,68.6...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.65,25.22,68.0...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.71,20.39,69.5...|     1|  [1.0,243.0]|[0.00409836065573...|       1.0|
|[10.75,14.9

In [22]:
# confusion matrix
y_pred.groupBy('target', 'prediction').count().show()

+------+----------+-----+
|target|prediction|count|
+------+----------+-----+
|     1|       0.0|    7|
|     0|       1.0|    3|
|     0|       0.0|   38|
|     1|       1.0|   64|
+------+----------+-----+



In [23]:
from sklearn.metrics import confusion_matrix
pred=y_pred.select("prediction").collect()
orig=y_pred.select("target").collect()
print(confusion_matrix(orig, pred))

[[38  3]
 [ 7 64]]


(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| logLoss|hammingLoss)

In [24]:
# evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
evaluator = MulticlassClassificationEvaluator(
    labelCol='target', predictionCol='prediction')

In [26]:
accuracy = evaluator.evaluate(y_pred)
accuracy

0.9114967018152054

In [27]:
# close connection to spark
spark.stop()