In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('HA').getOrCreate()

In [4]:
#reading the heart dataset 
df = spark.read.csv('heart.csv',header=True,inferSchema=True)
df.show()

+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
|age|sex| cp|trtbps|chol|fbs|restecg|thalachh|exng|oldpeak|slp|caa|thall|output|
+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+
| 63|  1|  3|   145| 233|  1|      0|     150|   0|    2.3|  0|  0|    1|     1|
| 37|  1|  2|   130| 250|  0|      1|     187|   0|    3.5|  0|  0|    2|     1|
| 41|  0|  1|   130| 204|  0|      0|     172|   0|    1.4|  2|  0|    2|     1|
| 56|  1|  1|   120| 236|  0|      1|     178|   0|    0.8|  2|  0|    2|     1|
| 57|  0|  0|   120| 354|  0|      1|     163|   1|    0.6|  2|  0|    2|     1|
| 57|  1|  0|   140| 192|  0|      1|     148|   0|    0.4|  1|  0|    1|     1|
| 56|  0|  1|   140| 294|  0|      0|     153|   0|    1.3|  1|  0|    2|     1|
| 44|  1|  1|   120| 263|  0|      1|     173|   0|    0.0|  2|  0|    3|     1|
| 52|  1|  2|   172| 199|  1|      1|     162|   0|    0.5|  2|  0|    3|     1|
| 57|  1|  2|   150| 168|  0

In [5]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols=['age','sex','trtbps','chol','fbs','restecg','thalachh','exng','oldpeak','slp','caa','thall'], outputCol='Independent Features')

In [6]:
output = featureassembler.transform(df)

In [7]:
output.show()

+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+--------------------+
|age|sex| cp|trtbps|chol|fbs|restecg|thalachh|exng|oldpeak|slp|caa|thall|output|Independent Features|
+---+---+---+------+----+---+-------+--------+----+-------+---+---+-----+------+--------------------+
| 63|  1|  3|   145| 233|  1|      0|     150|   0|    2.3|  0|  0|    1|     1|[63.0,1.0,145.0,2...|
| 37|  1|  2|   130| 250|  0|      1|     187|   0|    3.5|  0|  0|    2|     1|[37.0,1.0,130.0,2...|
| 41|  0|  1|   130| 204|  0|      0|     172|   0|    1.4|  2|  0|    2|     1|[41.0,0.0,130.0,2...|
| 56|  1|  1|   120| 236|  0|      1|     178|   0|    0.8|  2|  0|    2|     1|[56.0,1.0,120.0,2...|
| 57|  0|  0|   120| 354|  0|      1|     163|   1|    0.6|  2|  0|    2|     1|[57.0,0.0,120.0,3...|
| 57|  1|  0|   140| 192|  0|      1|     148|   0|    0.4|  1|  0|    1|     1|[57.0,1.0,140.0,1...|
| 56|  0|  1|   140| 294|  0|      0|     153|   0|    1.3|  1|  0|    2|     1|[5

In [8]:
output.columns

['age',
 'sex',
 'cp',
 'trtbps',
 'chol',
 'fbs',
 'restecg',
 'thalachh',
 'exng',
 'oldpeak',
 'slp',
 'caa',
 'thall',
 'output',
 'Independent Features']

In [11]:
finalized_df = output.select("Independent Features","output")
finalized_df.show()

+--------------------+------+
|Independent Features|output|
+--------------------+------+
|[63.0,1.0,145.0,2...|     1|
|[37.0,1.0,130.0,2...|     1|
|[41.0,0.0,130.0,2...|     1|
|[56.0,1.0,120.0,2...|     1|
|[57.0,0.0,120.0,3...|     1|
|[57.0,1.0,140.0,1...|     1|
|[56.0,0.0,140.0,2...|     1|
|[44.0,1.0,120.0,2...|     1|
|[52.0,1.0,172.0,1...|     1|
|[57.0,1.0,150.0,1...|     1|
|[54.0,1.0,140.0,2...|     1|
|[48.0,0.0,130.0,2...|     1|
|[49.0,1.0,130.0,2...|     1|
|[64.0,1.0,110.0,2...|     1|
|[58.0,0.0,150.0,2...|     1|
|[50.0,0.0,120.0,2...|     1|
|[58.0,0.0,120.0,3...|     1|
|[66.0,0.0,150.0,2...|     1|
|[43.0,1.0,150.0,2...|     1|
|[69.0,0.0,140.0,2...|     1|
+--------------------+------+
only showing top 20 rows



In [14]:
from pyspark.ml.regression import LinearRegression 
#train test split 
train_data,test_data = finalized_df.randomSplit([0.75,0.25])
regressor = LinearRegression(featuresCol='Independent Features',labelCol='output')
regressor = regressor.fit(train_data)

In [15]:
#coefficients 
regressor.coefficients

DenseVector([0.0025, -0.1679, -0.0033, -0.0005, 0.0978, 0.0662, 0.0037, -0.174, -0.07, 0.0753, -0.1167, -0.1782])

In [16]:
#intercept 
regressor.intercept

0.9998204245152984

In [17]:
#prediction 
pred_results = regressor.evaluate(test_data)

In [18]:
pred_results.predictions.show()

+--------------------+------+-------------------+
|Independent Features|output|         prediction|
+--------------------+------+-------------------+
|(12,[0,2,3,6,9,11...|     1| 1.0179297895548767|
|(12,[0,2,3,6,9,11...|     1| 0.9473560755964334|
|(12,[0,2,3,6,9,11...|     1| 1.0179807810433752|
|[35.0,1.0,120.0,1...|     0|0.22684564662151296|
|[35.0,1.0,122.0,1...|     1|  0.926250212763492|
|[37.0,1.0,130.0,2...|     1| 0.5275200630525806|
|[38.0,1.0,138.0,1...|     1| 0.4195909607658941|
|[38.0,1.0,138.0,1...|     1| 0.4195909607658941|
|[39.0,0.0,94.0,19...|     1| 1.2115747430887642|
|[40.0,1.0,110.0,1...|     0|0.13502349457766039|
|[41.0,0.0,112.0,2...|     1| 0.8554716340577331|
|[41.0,0.0,130.0,2...|     1| 0.9052396132851642|
|[41.0,1.0,112.0,2...|     1| 0.9630286155060348|
|[41.0,1.0,130.0,2...|     1| 0.5999786286803137|
|[41.0,1.0,135.0,2...|     1| 0.8398464674289179|
|[43.0,1.0,130.0,3...|     1|  0.562299977558868|
|[44.0,1.0,120.0,1...|     0| 0.5138247380514618|


In [19]:
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.30169570544461466, 0.14570985755986793)

In [21]:
pred_results.r2

0.4167566616314642