## Install Openjdk and Pyspark

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark==2.4.4

## Set environment

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

## Create Spark Session

In [4]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("spark").getOrCreate()

## Cloning and  using the diabetes data set

In [None]:
! git clone https://github.com/education454/diabetes_dataset

In [6]:
! ls diabetes_dataset 

diabetes.csv


## Read the data

In [7]:
df = spark.read.csv('/content/diabetes_dataset/diabetes.csv',header = True , inferSchema= True)

### Check 20 rows of data set

In [8]:
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|      0|33.6|                   0.127| 47|      1|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|
|          0|    145|            0|            0|      0|44.2|                    0.63| 31|      1|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|
|          0|    173|           78|           32|    265|46.5|                   1.159| 58|      0|
|          4|     99|           72|           17|      0|25.6|                   0.294| 28|      0|


## Display data in root fashion

In [9]:
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



##  Check no of observations and total variables

In [14]:
print((df.count(),len(df.columns)))

(2000, 9)


## Check how many patients are diabetic and non diabetic ( 0 - means non diabetic and 1 means person is diabetic )

In [15]:
df.groupby('Outcome').count().show()

+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  684|
|      0| 1316|
+-------+-----+



In [17]:
df.describe().show()

+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+
|summary|      Pregnancies|           Glucose|     BloodPressure|    SkinThickness|          Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+
|  count|             2000|              2000|              2000|             2000|             2000|              2000|                    2000|              2000|              2000|
|   mean|           3.7035|          121.1825|           69.1455|           20.935|           80.254|32.192999999999984|     0.47092999999999974|           33.0905|             0.342|
| stddev|3.306063032730656|32.068635649902916|19.188314815604098|16.103242909926

In [26]:
# find the total no of zeroes in columns :  Glucose|     BloodPressure|    SkinThickness|          Insulin               BMI
def count_zeroes():
  columns_list=['Glucose', 'BloodPressure','SkinThickness','Insulin','BMI']
  for i in columns_list :
    print(i+":"+str(df[df[i]==0].count()))

In [27]:
count_zeroes()

Glucose:13
BloodPressure:90
SkinThickness:573
Insulin:956
BMI:28


## Replacing the 0 values with the mean values to fill the columns for better prediction

In [30]:
from pyspark.sql.functions import *
for i in df.columns[1:6]:
  data=df.agg({i:'mean'}).first()[0]
  print("Mean value for {} is {}".format(i,int(data)))
  df=df.withColumn(i,when(df[i]==0,int(data)).otherwise(df[i]))

Mean value for Glucose is 121
Mean value for BloodPressure is 69
Mean value for SkinThickness is 20
Mean value for Insulin is 80
Mean value for BMI is 32


In [31]:
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|     80|33.6|                   0.127| 47|      1|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|
|          0|    145|           69|           20|     80|44.2|                    0.63| 31|      1|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|
|          0|    173|           78|           32|    265|46.5|                   1.159| 58|      0|
|          4|     99|           72|           17|     80|25.6|                   0.294| 28|      0|


## Getting Correlation value for all columns expect Outcome 

In [33]:
for col in df.columns:
  print("correlation to outcome for {} is {}".format(col,df.stat.corr('Outcome',col)))

correlation to outcome for Pregnancies is 0.22443699263363961
correlation to outcome for Glucose is 0.48796646527321064
correlation to outcome for BloodPressure is 0.17171333286446713
correlation to outcome for SkinThickness is 0.1659010662889893
correlation to outcome for Insulin is 0.1711763270226193
correlation to outcome for BMI is 0.2827927569760082
correlation to outcome for DiabetesPedigreeFunction is 0.1554590791569403
correlation to outcome for Age is 0.23650924717620253
correlation to outcome for Outcome is 1.0


## Getting important features and transforming it into a single column as a vector

In [36]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'],outputCol='features' )
output_data = assembler.transform(df)

In [37]:
output_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)
 |-- features: vector (nullable = true)



In [38]:
output_data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          2|    138|           62|           35|     80|33.6|                   0.127| 47|      1|[2.0,138.0,62.0,3...|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|[0.0,84.0,82.0,31...|
|          0|    145|           69|           20|     80|44.2|                    0.63| 31|      1|[0.0,145.0,69.0,2...|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|[0.0,135.0,68.0,4...|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|[1.0,139.0,62.0,4...|
|          0|    173|           

## Import Logistic Regression

In [39]:
from pyspark.ml.classification import LogisticRegression
final_data = output_data.select('features','Outcome')

In [40]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Outcome: integer (nullable = true)



## 70 % training data and 30 % data

In [41]:
train , test = final_data.randomSplit([0.7 , 0.3])
models = LogisticRegression(labelCol = 'Outcome')
model = models.fit(train)

In [43]:
summary = model.summary

## Training Prediction and Seeing Data Prediction Column

In [44]:
summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|            Outcome|         prediction|
+-------+-------------------+-------------------+
|  count|               1408|               1408|
|   mean|0.34517045454545453|0.25639204545454547|
| stddev| 0.4755927428644417|   0.43679591254823|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



## Import BinaryClassificationEvaluator

In [45]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.evaluate(test)

## Getting Prediction

In [46]:
predictions.predictions.show(10)

+--------------------+-------+--------------------+--------------------+----------+
|            features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[0.0,67.0,76.0,20...|      0|[2.25766168642901...|[0.90530937060898...|       0.0|
|[0.0,73.0,69.0,20...|      0|[4.04161009008745...|[0.98273418421216...|       0.0|
|[0.0,73.0,69.0,20...|      0|[4.04161009008745...|[0.98273418421216...|       0.0|
|[0.0,74.0,52.0,10...|      0|[3.33173973257151...|[0.96550176386567...|       0.0|
|[0.0,78.0,88.0,29...|      0|[2.67176356565568...|[0.93533977250152...|       0.0|
|[0.0,84.0,82.0,31...|      0|[2.56279313171632...|[0.92842828137213...|       0.0|
|[0.0,86.0,68.0,32...|      0|[2.48779754409512...|[0.92328194278224...|       0.0|
|[0.0,91.0,68.0,32...|      0|[2.08331939399655...|[0.88927130945859...|       0.0|
|[0.0,91.0,80.0,20...|      0|[2.34390325876509...|[0.91244840452865...|    

## Getting prediction for Test 

In [47]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='Outcome')
evaluator.evaluate(model.transform(test))

0.8589832333487161

## Accuracy is approx 86 % which is quite good.

In [55]:
# For saving the model
# model.save("save")
from pyspark.ml.classification import LogisticRegressionModel
model = LogisticRegressionModel.load('model')