In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv


In [2]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 30 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 25.5 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25h  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612244 sha256=356f9bd6438ad89254491cd6c74efd1dd0ded410c317928a047cc8cde2187838
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1

In [3]:
# Importing the Package
from pyspark import SparkContext
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

print("All Package Loaded")

All Package Loaded


In [4]:
# Load the data: get (or reuse) a local Spark session, then read the CSV input.
session_builder = SparkSession.builder.master('local').appName("Classification Heart Failure")
spark = session_builder.getOrCreate()

# The path is the dataset directory rather than a single file; the first row is
# used as the header and column types are inferred from the values.
data = spark.read.csv(
    "../input/heart-failure-clinical-data",
    header=True,
    inferSchema=True,
)
data.show(n=5, truncate=False)

+----+-------+------------------------+--------+-----------------+-------------------+---------+----------------+------------+---+-------+----+-----------+
|age |anaemia|creatinine_phosphokinase|diabetes|ejection_fraction|high_blood_pressure|platelets|serum_creatinine|serum_sodium|sex|smoking|time|DEATH_EVENT|
+----+-------+------------------------+--------+-----------------+-------------------+---------+----------------+------------+---+-------+----+-----------+
|75.0|0      |582                     |0       |20               |1                  |265000.0 |1.9             |130         |1  |0      |4   |1          |
|55.0|0      |7861                    |0       |38               |0                  |263358.03|1.1             |136         |1  |0      |6   |1          |
|65.0|0      |146                     |0       |20               |0                  |162000.0 |1.3             |129         |1  |1      |7   |1          |
|50.0|1      |111                     |0       |20              

In [5]:
# Check the data: print each column's name and its inferred type
data.printSchema()

root
 |-- age: double (nullable = true)
 |-- anaemia: integer (nullable = true)
 |-- creatinine_phosphokinase: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- ejection_fraction: integer (nullable = true)
 |-- high_blood_pressure: integer (nullable = true)
 |-- platelets: double (nullable = true)
 |-- serum_creatinine: double (nullable = true)
 |-- serum_sodium: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- smoking: integer (nullable = true)
 |-- time: integer (nullable = true)
 |-- DEATH_EVENT: integer (nullable = true)



In [6]:
# Split data into roughly 70% training rows and 30% test rows.
# A fixed seed is passed so the partition (and therefore every metric computed
# from it further down) is repeatable instead of changing on each run.
SPLIT_SEED = 42
data_split = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train, test = data_split
print("train data:\n", train.count(), "\ntest data:\n", test.count())

train data:
 213 
test data:
 86


In [7]:
# Feature engineering: peek at the training rows, then pack the predictor
# columns into a single 'features' vector and rename the target to 'label'.
train.show(n=2)

feature_columns = [
    'age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
    'ejection_fraction', 'high_blood_pressure', 'platelets',
    'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

train_assembled = assembler.transform(train)
trainingdatafinal = train_assembled.select(
    col('features'),
    col('DEATH_EVENT').alias('label'),
)
trainingdatafinal.show(n=5, truncate=False)

+----+-------+------------------------+--------+-----------------+-------------------+---------+----------------+------------+---+-------+----+-----------+
| age|anaemia|creatinine_phosphokinase|diabetes|ejection_fraction|high_blood_pressure|platelets|serum_creatinine|serum_sodium|sex|smoking|time|DEATH_EVENT|
+----+-------+------------------------+--------+-----------------+-------------------+---------+----------------+------------+---+-------+----+-----------+
|40.0|      0|                      90|       0|               35|                  0| 255000.0|             1.1|         136|  1|      1| 212|          0|
|40.0|      0|                     582|       1|               35|                  0| 222000.0|             1.0|         132|  1|      0| 244|          0|
+----+-------+------------------------+--------+-----------------+-------------------+---------+----------------+------------+---+-------+----+-----------+
only showing top 2 rows

+--------------------------------------

In [8]:
# Training: configure a logistic-regression estimator and fit it on the
# assembled training rows.
lr_settings = {
    'featuresCol': 'features',
    'labelCol': 'label',
    'maxIter': 100,
    'regParam': 0.3,
}
Classification = LogisticRegression(**lr_settings)
model = Classification.fit(trainingdatafinal)
print("Training Done")

Training Done


In [9]:
# Prepare the test data with the same assembler that was used for training,
# keeping only the feature vector and the target (renamed to 'label').
test_assembled = assembler.transform(test)
label_column = col('DEATH_EVENT').alias("label")
testingdatafinal = test_assembled.select(col('features'), label_column)
testingdatafinal.show(n=5, truncate=False)

+--------------------------------------------------------------+-----+
|features                                                      |label|
+--------------------------------------------------------------+-----+
|[40.0,0.0,244.0,0.0,45.0,1.0,275000.0,0.9,140.0,0.0,0.0,174.0]|0    |
|[40.0,0.0,478.0,1.0,30.0,0.0,303000.0,0.9,136.0,1.0,0.0,148.0]|0    |
|[40.0,1.0,129.0,0.0,35.0,0.0,255000.0,0.9,137.0,1.0,0.0,209.0]|0    |
|[42.0,0.0,64.0,0.0,30.0,0.0,215000.0,3.8,128.0,1.0,1.0,250.0] |0    |
|[42.0,0.0,64.0,0.0,40.0,0.0,189000.0,0.7,140.0,1.0,0.0,245.0] |0    |
+--------------------------------------------------------------+-----+
only showing top 5 rows



In [10]:
# Run the fitted model over the prepared test rows, then inspect both a few
# scored rows and the columns the model appended.
raw_prediction = model.transform(testingdatafinal)
raw_prediction.show(n=4, truncate=False)
raw_prediction.printSchema()


+--------------------------------------------------------------+-----+------------------------------------------+----------------------------------------+----------+
|features                                                      |label|rawPrediction                             |probability                             |prediction|
+--------------------------------------------------------------+-----+------------------------------------------+----------------------------------------+----------+
|[40.0,0.0,244.0,0.0,45.0,1.0,275000.0,0.9,140.0,0.0,0.0,174.0]|0    |[1.7044296300106812,-1.7044296300106812]  |[0.8461123849622791,0.15388761503772083]|0.0       |
|[40.0,0.0,478.0,1.0,30.0,0.0,303000.0,0.9,136.0,1.0,0.0,148.0]|0    |[1.0978796403606115,-1.0978796403606115]  |[0.7498626032826219,0.2501373967173781] |0.0       |
|[40.0,1.0,129.0,0.0,35.0,0.0,255000.0,0.9,137.0,1.0,0.0,209.0]|0    |[1.5476557125291404,-1.5476557125291404]  |[0.8245748859547668,0.1754251140452332] |0.0       |
|[42

In [11]:
# Final view of the test results: keep only the inputs, the true label and the
# predicted class.
kept_columns = ['features', 'label', 'prediction']
final_prediction = raw_prediction.select(*kept_columns)
final_prediction.show(n=5, truncate=False)

+--------------------------------------------------------------+-----+----------+
|features                                                      |label|prediction|
+--------------------------------------------------------------+-----+----------+
|[40.0,0.0,244.0,0.0,45.0,1.0,275000.0,0.9,140.0,0.0,0.0,174.0]|0    |0.0       |
|[40.0,0.0,478.0,1.0,30.0,0.0,303000.0,0.9,136.0,1.0,0.0,148.0]|0    |0.0       |
|[40.0,1.0,129.0,0.0,35.0,0.0,255000.0,0.9,137.0,1.0,0.0,209.0]|0    |0.0       |
|[42.0,0.0,64.0,0.0,30.0,0.0,215000.0,3.8,128.0,1.0,1.0,250.0] |0    |0.0       |
|[42.0,0.0,64.0,0.0,40.0,0.0,189000.0,0.7,140.0,1.0,0.0,245.0] |0    |0.0       |
+--------------------------------------------------------------+-----+----------+
only showing top 5 rows



In [12]:
# Accuracy: percentage of test rows whose predicted class equals the true label.
is_correct = final_prediction['label'] == final_prediction['prediction']
Total_true = final_prediction.filter(is_correct).count()
Total_data = final_prediction.count()

# Guard the division: with an empty test split the ratio is undefined, so
# report NaN instead of failing with ZeroDivisionError.
accuracy_pct = Total_true/Total_data*100 if Total_data else float('nan')
print("Accuracy is :\n", accuracy_pct, "%")

Accuracy is :
 77.90697674418605 %


#I have used some other methods in non-Spark analyses; please wait while I work on improving this accuracy. Please upvote! Thank you.