# **Diabetes Classification**

In [1]:
# install
!pip install Pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# start spark session
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.getOrCreate()

In [4]:
spark

In [5]:
# read external data with url
import pandas as pd
diabetes = pd.read_csv('https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv')
diabetes = spark.createDataFrame(diabetes)

In [6]:
diabetes.show()

+-----------+-------+---------+-------+-------+----+------------------+---+--------+
|pregnancies|glucose|diastolic|triceps|insulin| bmi|               dpf|age|diabetes|
+-----------+-------+---------+-------+-------+----+------------------+---+--------+
|          6|    148|       72|     35|      0|33.6|             0.627| 50|       1|
|          1|     85|       66|     29|      0|26.6|             0.351| 31|       0|
|          8|    183|       64|      0|      0|23.3|             0.672| 32|       1|
|          1|     89|       66|     23|     94|28.1|0.1669999999999999| 21|       0|
|          0|    137|       40|     35|    168|43.1|2.2880000000000003| 33|       1|
|          5|    116|       74|      0|      0|25.6|             0.201| 30|       0|
|          3|     78|       50|     32|     88|31.0|             0.248| 26|       1|
|         10|    115|        0|      0|      0|35.3|             0.134| 29|       0|
|          2|    197|       70|     45|    543|30.5|             

In [7]:
diabetes.printSchema()

root
 |-- pregnancies: long (nullable = true)
 |-- glucose: long (nullable = true)
 |-- diastolic: long (nullable = true)
 |-- triceps: long (nullable = true)
 |-- insulin: long (nullable = true)
 |-- bmi: double (nullable = true)
 |-- dpf: double (nullable = true)
 |-- age: long (nullable = true)
 |-- diabetes: long (nullable = true)



In [8]:
diabetes.columns

['pregnancies',
 'glucose',
 'diastolic',
 'triceps',
 'insulin',
 'bmi',
 'dpf',
 'age',
 'diabetes']

In [9]:
from pyspark.ml.feature import VectorAssembler 

In [10]:
featureassembler= VectorAssembler(inputCols=['pregnancies',
 'glucose',
 'diastolic',
 'triceps',
 'insulin',
 'bmi',
 'dpf',
 'age'], outputCol='Features')

In [11]:
output = featureassembler.transform(diabetes)

In [12]:
output.show()

+-----------+-------+---------+-------+-------+----+------------------+---+--------+--------------------+
|pregnancies|glucose|diastolic|triceps|insulin| bmi|               dpf|age|diabetes|            Features|
+-----------+-------+---------+-------+-------+----+------------------+---+--------+--------------------+
|          6|    148|       72|     35|      0|33.6|             0.627| 50|       1|[6.0,148.0,72.0,3...|
|          1|     85|       66|     29|      0|26.6|             0.351| 31|       0|[1.0,85.0,66.0,29...|
|          8|    183|       64|      0|      0|23.3|             0.672| 32|       1|[8.0,183.0,64.0,0...|
|          1|     89|       66|     23|     94|28.1|0.1669999999999999| 21|       0|[1.0,89.0,66.0,23...|
|          0|    137|       40|     35|    168|43.1|2.2880000000000003| 33|       1|[0.0,137.0,40.0,3...|
|          5|    116|       74|      0|      0|25.6|             0.201| 30|       0|[5.0,116.0,74.0,0...|
|          3|     78|       50|     32|     88

In [13]:
output.select('Features').show()

+--------------------+
|            Features|
+--------------------+
|[6.0,148.0,72.0,3...|
|[1.0,85.0,66.0,29...|
|[8.0,183.0,64.0,0...|
|[1.0,89.0,66.0,23...|
|[0.0,137.0,40.0,3...|
|[5.0,116.0,74.0,0...|
|[3.0,78.0,50.0,32...|
|[10.0,115.0,0.0,0...|
|[2.0,197.0,70.0,4...|
|[8.0,125.0,96.0,0...|
|[4.0,110.0,92.0,0...|
|[10.0,168.0,74.0,...|
|[10.0,139.0,80.0,...|
|[1.0,189.0,60.0,2...|
|[5.0,166.0,72.0,1...|
|[7.0,100.0,0.0,0....|
|[0.0,118.0,84.0,4...|
|[7.0,107.0,74.0,0...|
|[1.0,103.0,30.0,3...|
|[1.0,115.0,70.0,3...|
+--------------------+
only showing top 20 rows



In [14]:
modeldata=output.select('Features','diabetes')

In [15]:
modeldata.show()

+--------------------+--------+
|            Features|diabetes|
+--------------------+--------+
|[6.0,148.0,72.0,3...|       1|
|[1.0,85.0,66.0,29...|       0|
|[8.0,183.0,64.0,0...|       1|
|[1.0,89.0,66.0,23...|       0|
|[0.0,137.0,40.0,3...|       1|
|[5.0,116.0,74.0,0...|       0|
|[3.0,78.0,50.0,32...|       1|
|[10.0,115.0,0.0,0...|       0|
|[2.0,197.0,70.0,4...|       1|
|[8.0,125.0,96.0,0...|       1|
|[4.0,110.0,92.0,0...|       0|
|[10.0,168.0,74.0,...|       1|
|[10.0,139.0,80.0,...|       0|
|[1.0,189.0,60.0,2...|       1|
|[5.0,166.0,72.0,1...|       1|
|[7.0,100.0,0.0,0....|       1|
|[0.0,118.0,84.0,4...|       1|
|[7.0,107.0,74.0,0...|       1|
|[1.0,103.0,30.0,3...|       0|
|[1.0,115.0,70.0,3...|       1|
+--------------------+--------+
only showing top 20 rows



In [16]:
# split data
train_data,test_data=modeldata.randomSplit([0.8,0.2])

In [17]:
train_data.show()

+--------------------+--------+
|            Features|diabetes|
+--------------------+--------+
|(8,[0,1,6,7],[2.0...|       0|
|(8,[0,1,6,7],[2.0...|       0|
|(8,[0,1,6,7],[7.0...|       0|
|(8,[1,5,6,7],[117...|       0|
|(8,[1,5,6,7],[167...|       1|
|[0.0,78.0,88.0,29...|       0|
|[0.0,84.0,64.0,22...|       0|
|[0.0,86.0,68.0,32...|       0|
|[0.0,93.0,60.0,25...|       0|
|[0.0,98.0,82.0,15...|       0|
|[0.0,100.0,88.0,6...|       0|
|[0.0,101.0,64.0,1...|       0|
|[0.0,101.0,65.0,2...|       0|
|[0.0,101.0,76.0,0...|       0|
|[0.0,102.0,52.0,0...|       0|
|[0.0,102.0,75.0,2...|       0|
|[0.0,104.0,64.0,2...|       0|
|[0.0,104.0,76.0,0...|       0|
|[0.0,105.0,64.0,4...|       0|
|[0.0,105.0,68.0,2...|       0|
+--------------------+--------+
only showing top 20 rows



In [18]:
# logistic regression model
from pyspark.ml.classification import LogisticRegression

In [19]:
logit = LogisticRegression(featuresCol='Features', labelCol='diabetes')

In [20]:
logit=logit.fit(train_data)

In [21]:
logit.coefficients

DenseVector([0.1046, 0.034, -0.0138, 0.0022, -0.0015, 0.094, 0.4844, 0.0192])

In [22]:
# prediction
y_pred = logit.transform(test_data)

In [23]:
y_pred.show()

+--------------------+--------+--------------------+--------------------+----------+
|            Features|diabetes|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[131...|       1|[-0.8775347293445...|[0.29368890390497...|       1.0|
|(8,[1,5,6,7],[138...|       1|[-0.7687875768549...|[0.31674143607486...|       1.0|
|[0.0,93.0,100.0,3...|       0|[1.26893485151084...|[0.78056035753217...|       0.0|
|[0.0,95.0,85.0,25...|       1|[2.11955373104362...|[0.89278922164102...|       0.0|
|[0.0,100.0,70.0,2...|       0|[2.26989513285913...|[0.90635288729852...|       0.0|
|[0.0,108.0,68.0,2...|       0|[1.93211044859029...|[0.87348283065706...|       0.0|
|[0.0,125.0,96.0,0...|       0|[2.70152230030653...|[0.93711641166328...|       0.0|
|[0.0,128.0,68.0,1...|       1|[1.06910774034432...|[0.74442719568390...|       0.0|
|[0.0,129.0,80.0,0...|       0|[1.15831511662281...|[0.7610264279

In [24]:
# confusion matrix
y_pred.groupBy('diabetes', 'prediction').count().show()

+--------+----------+-----+
|diabetes|prediction|count|
+--------+----------+-----+
|       1|       0.0|   29|
|       0|       1.0|   11|
|       0|       0.0|   91|
|       1|       1.0|   34|
+--------+----------+-----+



(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| logLoss|hammingLoss)

In [25]:
# evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [26]:
evaluator = MulticlassClassificationEvaluator(
    labelCol='diabetes', predictionCol='prediction', 
    metricName='weightedPrecision')

In [27]:
evaluator.evaluate(y_pred)

0.7572727272727273

In [28]:
# close connection to spark
spark.stop()