In [59]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

In [60]:
spark = SparkSession.builder.appName("Heart Disease Prediction").getOrCreate()

In [61]:
df = spark.read.csv('heart_cleaned.csv', inferSchema=True,header=True)

In [62]:
df.show()

+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
|Age|Sex|ChestPainType|RestingBP|Cholesterol|FastingBS|RestingECG|MaxHR|ExerciseAngina|Oldpeak|ST_Slope|HeartDisease|
+---+---+-------------+---------+-----------+---------+----------+-----+--------------+-------+--------+------------+
| 40|  1|            1|      140|        289|        0|         1|  172|             0|    0.0|       2|           0|
| 49|  0|            2|      160|        180|        0|         1|  156|             0|    1.0|       1|           1|
| 37|  1|            1|      130|        283|        0|         2|   98|             0|    0.0|       2|           0|
| 48|  0|            0|      138|        214|        0|         1|  108|             1|    1.5|       1|           1|
| 54|  1|            2|      150|        195|        0|         1|  122|             0|    0.0|       2|           0|
| 39|  1|            2|      120|        339|        0| 

In [63]:
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: integer (nullable = true)
 |-- ChestPainType: integer (nullable = true)
 |-- RestingBP: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- FastingBS: integer (nullable = true)
 |-- RestingECG: integer (nullable = true)
 |-- MaxHR: integer (nullable = true)
 |-- ExerciseAngina: integer (nullable = true)
 |-- Oldpeak: double (nullable = true)
 |-- ST_Slope: integer (nullable = true)
 |-- HeartDisease: integer (nullable = true)



In [64]:
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|summary|               Age|               Sex|     ChestPainType|         RestingBP|       Cholesterol|          FastingBS|        RestingECG|             MaxHR|     ExerciseAngina|           Oldpeak|          ST_Slope|       HeartDisease|
+-------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+
|  count|               918|               918|               918|               918|               918|                918|               918|               918|                918|               918|               918|                918|
|   mean|53.510893246187365| 0.78976

In [65]:
target_column = df.columns[-1]

In [66]:
feature_columns = df.columns[:-1]

In [67]:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

In [68]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [69]:
layers = [len(feature_columns), 10, 5, 2]

In [70]:
mlp = MultilayerPerceptronClassifier(labelCol=target_column, featuresCol="features", layers=layers, blockSize=128, seed=12)


In [71]:
evaluator = MulticlassClassificationEvaluator(labelCol=target_column, metricName="accuracy")

In [72]:
def show_confusion_matrix(predictions, model_name):
    confusion_matrix = predictions.groupBy(target_column, "prediction").count().orderBy(target_column, "prediction")
    print(f"\nConfusion Matrix for {model_name}:")
    confusion_matrix.show()

In [73]:
lr = LogisticRegression(labelCol=target_column, featuresCol="features")
lr_model = lr.fit(train_df)
lr_predictions = lr_model.transform(test_df)
lr_accuracy = evaluator.evaluate(lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy}")
show_confusion_matrix(lr_predictions, "Logistic Regression")

Logistic Regression Accuracy: 0.8776371308016878

Confusion Matrix for Logistic Regression:
+------------+----------+-----+
|HeartDisease|prediction|count|
+------------+----------+-----+
|           0|       0.0|   83|
|           0|       1.0|   16|
|           1|       0.0|   13|
|           1|       1.0|  125|
+------------+----------+-----+



In [74]:
dt = DecisionTreeClassifier(labelCol=target_column, featuresCol="features")
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_accuracy = evaluator.evaluate(dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")
show_confusion_matrix(dt_predictions, "Decision Tree")

Decision Tree Accuracy: 0.8438818565400844

Confusion Matrix for Decision Tree:
+------------+----------+-----+
|HeartDisease|prediction|count|
+------------+----------+-----+
|           0|       0.0|   79|
|           0|       1.0|   20|
|           1|       0.0|   17|
|           1|       1.0|  121|
+------------+----------+-----+



In [75]:
gbt = GBTClassifier(labelCol=target_column, featuresCol="features", maxIter=100)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_accuracy = evaluator.evaluate(gbt_predictions)
print(f"Gradient Boosted Tree Accuracy: {gbt_accuracy}")
show_confusion_matrix(gbt_predictions, "Gradient Boosted Tree")

Gradient Boosted Tree Accuracy: 0.8565400843881856

Confusion Matrix for Gradient Boosted Tree:
+------------+----------+-----+
|HeartDisease|prediction|count|
+------------+----------+-----+
|           0|       0.0|   86|
|           0|       1.0|   13|
|           1|       0.0|   21|
|           1|       1.0|  117|
+------------+----------+-----+



In [77]:
layers = [len(feature_columns), 36, 12, 2]
mlp = MultilayerPerceptronClassifier(labelCol=target_column, featuresCol="features", layers=layers, blockSize=128, seed=12)
mlp_model = mlp.fit(train_df)
mlp_predictions = mlp_model.transform(test_df)
mlp_accuracy = evaluator.evaluate(mlp_predictions)
print(f"Multilayer Perceptron (MLP) Accuracy: {mlp_accuracy}")
show_confusion_matrix(mlp_predictions, "Multilayer Perceptron (MLP)")

Multilayer Perceptron (MLP) Accuracy: 0.7130801687763713

Confusion Matrix for Multilayer Perceptron (MLP):
+------------+----------+-----+
|HeartDisease|prediction|count|
+------------+----------+-----+
|           0|       0.0|   79|
|           0|       1.0|   20|
|           1|       0.0|   48|
|           1|       1.0|   90|
+------------+----------+-----+

