<a href="https://colab.research.google.com/github/amanullah20/rdd_project/blob/main/LogisticRegression_%26_Naive_Bayes_with_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=f75aa8595c4ff16a41d5a9f206c7e438ea7dd0efce1ca1628244f3e4a6af0f8f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


# **Importing Library**

In [2]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.sql import SparkSession

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
dataset=spark.read.csv('/content/diabetes.csv',header=True,inferSchema=True)

In [5]:
dataset.show(3)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 3 rows



In [6]:
dataset.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [7]:
from pyspark.sql.functions import col
new_data=dataset.select(*(col(c).cast('float').alias(c) for c in dataset.columns))

In [9]:
new_data.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [10]:
from pyspark.sql.functions import count,isnan,when

In [11]:
new_data.select([count(when(col(c).isNull(),c)).alias(c) for c in new_data.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [13]:
cols=new_data.columns
cols.remove('Outcome')
assembler = VectorAssembler(inputCols=cols,outputCol='features')
data= assembler.transform(new_data)

In [14]:
data.select('Features','Outcome').show(truncate=False)

+-----------------------------------------------------------------------+-------+
|Features                                                               |Outcome|
+-----------------------------------------------------------------------+-------+
|[6.0,148.0,72.0,35.0,0.0,33.599998474121094,0.6269999742507935,50.0]   |1.0    |
|[1.0,85.0,66.0,29.0,0.0,26.600000381469727,0.35100001096725464,31.0]   |0.0    |
|[8.0,183.0,64.0,0.0,0.0,23.299999237060547,0.671999990940094,32.0]     |1.0    |
|[1.0,89.0,66.0,23.0,94.0,28.100000381469727,0.16699999570846558,21.0]  |0.0    |
|[0.0,137.0,40.0,35.0,168.0,43.099998474121094,2.2880001068115234,33.0] |1.0    |
|[5.0,116.0,74.0,0.0,0.0,25.600000381469727,0.20100000500679016,30.0]   |0.0    |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.24799999594688416,26.0]                |1.0    |
|[10.0,115.0,0.0,0.0,0.0,35.29999923706055,0.1340000033378601,29.0]     |0.0    |
|[2.0,197.0,70.0,45.0,543.0,30.5,0.15800000727176666,53.0]              |1.0    |
|[8.0,125.0,96.0

In [15]:
standardscaler= StandardScaler().setInputCol('features').setOutputCol('Scaled_features')
data = standardscaler.fit(data).transform(data)

In [16]:
data.select('features','Outcome','Scaled_features').show()

+--------------------+-------+--------------------+
|            features|Outcome|     Scaled_features|
+--------------------+-------+--------------------+
|[6.0,148.0,72.0,3...|    1.0|[1.78063837321943...|
|[1.0,85.0,66.0,29...|    0.0|[0.29677306220323...|
|[8.0,183.0,64.0,0...|    1.0|[2.37418449762590...|
|[1.0,89.0,66.0,23...|    0.0|[0.29677306220323...|
|[0.0,137.0,40.0,3...|    1.0|[0.0,4.2849165233...|
|[5.0,116.0,74.0,0...|    0.0|[1.48386531101619...|
|[3.0,78.0,50.0,32...|    1.0|[0.89031918660971...|
|[10.0,115.0,0.0,0...|    0.0|[2.96773062203238...|
|[2.0,197.0,70.0,4...|    1.0|[0.59354612440647...|
|[8.0,125.0,96.0,0...|    1.0|[2.37418449762590...|
|[4.0,110.0,92.0,0...|    0.0|[1.18709224881295...|
|[10.0,168.0,74.0,...|    1.0|[2.96773062203238...|
|[10.0,139.0,80.0,...|    0.0|[2.96773062203238...|
|[1.0,189.0,60.0,2...|    1.0|[0.29677306220323...|
|[5.0,166.0,72.0,1...|    1.0|[1.48386531101619...|
|[7.0,100.0,0.0,0....|    1.0|[2.07741143542266...|
|[0.0,118.0,

In [17]:
assembled_data=data.select('Scaled_features','Outcome')
assembled_data.show()

+--------------------+-------+
|     Scaled_features|Outcome|
+--------------------+-------+
|[1.78063837321943...|    1.0|
|[0.29677306220323...|    0.0|
|[2.37418449762590...|    1.0|
|[0.29677306220323...|    0.0|
|[0.0,4.2849165233...|    1.0|
|[1.48386531101619...|    0.0|
|[0.89031918660971...|    1.0|
|[2.96773062203238...|    0.0|
|[0.59354612440647...|    1.0|
|[2.37418449762590...|    1.0|
|[1.18709224881295...|    0.0|
|[2.96773062203238...|    1.0|
|[2.96773062203238...|    0.0|
|[0.29677306220323...|    1.0|
|[1.48386531101619...|    1.0|
|[2.07741143542266...|    1.0|
|[0.0,3.6906580274...|    1.0|
|[2.07741143542266...|    1.0|
|[0.29677306220323...|    0.0|
|[0.29677306220323...|    1.0|
+--------------------+-------+
only showing top 20 rows



# **Spliting Data**

In [18]:
train,test= assembled_data.randomSplit([0.7,0.3])
train.show()

+--------------------+-------+
|     Scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[0.5...|    0.0|
|(8,[0,1,6,7],[2.0...|    0.0|
|(8,[0,1,6,7],[2.9...|    1.0|
|(8,[1,5,6,7],[2.2...|    0.0|
|(8,[1,5,6,7],[3.0...|    0.0|
|(8,[1,5,6,7],[3.6...|    0.0|
|(8,[1,5,6,7],[4.0...|    1.0|
|(8,[1,5,6,7],[4.3...|    1.0|
|(8,[1,5,6,7],[4.4...|    1.0|
|(8,[1,5,6,7],[4.5...|    1.0|
|(8,[1,5,6,7],[5.2...|    1.0|
|(8,[1,6,7],[2.940...|    0.0|
|[0.0,2.0955431172...|    0.0|
|[0.0,2.3144804578...|    0.0|
|[0.0,2.4395875096...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.6272480873...|    0.0|
|[0.0,2.8461854279...|    0.0|
|[0.0,2.8461854279...|    0.0|
+--------------------+-------+
only showing top 20 rows



In [19]:
test.show()

+--------------------+-------+
|     Scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.8...|    0.0|
|(8,[0,1,6,7],[1.7...|    0.0|
|(8,[1,5,6,7],[3.7...|    1.0|
|[0.0,1.7827754878...|    0.0|
|[0.0,2.6898016132...|    0.0|
|[0.0,2.9087389538...|    0.0|
|[0.0,2.9712924797...|    0.0|
|[0.0,2.9712924797...|    1.0|
|[0.0,3.1276762944...|    0.0|
|[0.0,3.1589530573...|    0.0|
|[0.0,3.1589530573...|    0.0|
|[0.0,3.1902298203...|    0.0|
|[0.0,3.1902298203...|    0.0|
|[0.0,3.2527833462...|    0.0|
|[0.0,3.2527833462...|    0.0|
|[0.0,3.2840601091...|    0.0|
|[0.0,3.3778903979...|    0.0|
|[0.0,3.4091671609...|    1.0|
|[0.0,3.6906580274...|    0.0|
|[0.0,3.7532115533...|    0.0|
+--------------------+-------+
only showing top 20 rows



# **Logistic_Regression**

In [20]:
log_reg = LogisticRegression(labelCol='Outcome',featuresCol='Scaled_features',maxIter=40)
model=log_reg.fit(train)

In [21]:
prediction_test=model.transform(test)
prediction_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.8...|    0.0|[4.29473267497911...|[0.98654333426712...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[2.62970139663863...|[0.93274882090397...|       0.0|
|(8,[1,5,6,7],[3.7...|    1.0|[0.63812138444729...|[0.65432867362353...|       0.0|
|[0.0,1.7827754878...|    0.0|[3.80462110008862...|[0.97821741385713...|       0.0|
|[0.0,2.6898016132...|    0.0|[2.45554443943526...|[0.92096596068620...|       0.0|
|[0.0,2.9087389538...|    0.0|[2.70885433449826...|[0.93754710061824...|       0.0|
|[0.0,2.9712924797...|    0.0|[1.38651333654705...|[0.80003503376674...|       0.0|
|[0.0,2.9712924797...|    1.0|[2.40631104614291...|[0.91730728807534...|       0.0|
|[0.0,3.1276762944...|    0.0|[0.89633108805342...|[0.71019495695427...|    

In [22]:
prediction_test.select('Outcome','prediction').show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 10 rows



In [23]:
# compute raw scores on the test set
predictionAndLabels = prediction_test.select('Outcome','prediction').rdd

In [24]:
predictionAndLabels.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, pr

In [25]:
matrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under Roc curve
print('Area under Roc curve = %s' % matrics.areaUnderROC)



Area under Roc curve = 0.787805759730088


# **Evaluator**

In [26]:
evaluator= MulticlassClassificationEvaluator(labelCol='Outcome',predictionCol='prediction',metricName='accuracy')
accuracy_LR= evaluator.evaluate(prediction_test)
print('Accuracy =',accuracy_LR)

Accuracy = 0.7711864406779662


# **NaiveBayes**

In [27]:
naive_bayes= NaiveBayes(featuresCol='Scaled_features',labelCol='Outcome',smoothing=1.0)
model=naive_bayes.fit(train)

In [28]:
prediction_test= model.transform(test)

In [29]:
prediction_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.8...|    0.0|[-11.969371287470...|[0.54697207373221...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[-17.022592205038...|[0.47321438370467...|       1.0|
|(8,[1,5,6,7],[3.7...|    1.0|[-17.672367365023...|[0.61123421723522...|       0.0|
|[0.0,1.7827754878...|    0.0|[-29.090406410516...|[0.70749707867962...|       0.0|
|[0.0,2.6898016132...|    0.0|[-28.614676527489...|[0.76025247153495...|       0.0|
|[0.0,2.9087389538...|    0.0|[-30.277803425196...|[0.71052939230021...|       0.0|
|[0.0,2.9712924797...|    0.0|[-35.194226205083...|[0.73982577474052...|       0.0|
|[0.0,2.9712924797...|    1.0|[-30.576126729529...|[0.77278254562493...|       0.0|
|[0.0,3.1276762944...|    0.0|[-47.691353501785...|[0.76466811952541...|    

In [30]:
prediction_test.select('Outcome','prediction').show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       1.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 10 rows



In [31]:
predictionAndLabels=prediction_test.select('Outcome','prediction').rdd

In [32]:
# select (predicton ture label ) and compute test error
evaluator= MulticlassClassificationEvaluator(labelCol='Outcome',predictionCol='prediction',metricName='accuracy')
accuracy_NB= evaluator.evaluate(prediction_test)

In [33]:
print('Accuracy ' , accuracy_NB)

Accuracy  0.6694915254237288


In [34]:
metrics=BinaryClassificationMetrics(predictionAndLabels)

# Area under ROC curve
print('Area under Roc = %s ' % metrics.areaUnderROC)



Area under Roc = 0.6912039925140362 


# **GBTClassifier**

In [35]:
gradient_boost_class=GBTClassifier(labelCol='Outcome', featuresCol='Scaled_features')

In [36]:
model= gradient_boost_class.fit(train)

In [37]:
prediction_test=model.transform(test)

In [38]:
prediction_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.8...|    0.0|[1.54519759032564...|[0.95649481246098...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[1.19246517261938...|[0.91567092515505...|       0.0|
|(8,[1,5,6,7],[3.7...|    1.0|[0.12892722693245...|[0.56410879608881...|       0.0|
|[0.0,1.7827754878...|    0.0|[0.50466254102808...|[0.73288804593979...|       0.0|
|[0.0,2.6898016132...|    0.0|[1.24351710427666...|[0.92322785642315...|       0.0|
|[0.0,2.9087389538...|    0.0|[1.50809489910745...|[0.95330019202791...|       0.0|
|[0.0,2.9712924797...|    0.0|[0.97309521893644...|[0.87503065336729...|       0.0|
|[0.0,2.9712924797...|    1.0|[1.57591286531759...|[0.95898060042830...|       0.0|
|[0.0,3.1276762944...|    0.0|[1.07825071721789...|[0.89627475039751...|    

In [39]:
prediction_test.select('Outcome','prediction').show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 10 rows



In [40]:
predictionAndLabels = prediction_test.select('Outcome','prediction').rdd

In [41]:
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under ROC curve
print('Area under ROC curve = %s' % metrics.areaUnderROC)



Area under ROC curve = 0.6878092667566352


In [42]:
evaluator = MulticlassClassificationEvaluator(labelCol='Outcome',predictionCol='prediction',metricName='accuracy')
accuracy_GBT= evaluator.evaluate(prediction_test)

In [43]:
print('Accuracy',accuracy_GBT)

Accuracy 0.7203389830508474


# **RandomForestClassifier**

In [44]:
random_forest_classifier= RandomForestClassifier(labelCol='Outcome',featuresCol='Scaled_features',numTrees=40)
model=random_forest_classifier.fit(train)
prediction_test=model.transform(test)

In [45]:
prediction_test.show(5)

+--------------------+-------+--------------------+--------------------+----------+
|     Scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.8...|    0.0|[38.0964586862648...|[0.95241146715662...|       0.0|
|(8,[0,1,6,7],[1.7...|    0.0|[36.0145700879283...|[0.90036425219820...|       0.0|
|(8,[1,5,6,7],[3.7...|    1.0|[33.6569632033812...|[0.84142408008453...|       0.0|
|[0.0,1.7827754878...|    0.0|[31.6283502406600...|[0.79070875601650...|       0.0|
|[0.0,2.6898016132...|    0.0|[36.3476900082084...|[0.90869225020521...|       0.0|
+--------------------+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [46]:
prediction_test.select('Outcome','prediction').show(5)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    1.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 5 rows



In [47]:
predictionAndLabels = prediction_test.select('Outcome','prediction').rdd

In [48]:
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under ROC curve
print('Area under ROC = %s' % metrics.areaUnderROC)



Area under ROC = 0.7410887253002711


In [49]:
evaluator = MulticlassClassificationEvaluator(labelCol='Outcome',predictionCol='prediction',metricName='accuracy')
accuracy_RF=evaluator.evaluate(prediction_test)
print('Accuracy ', accuracy_RF)

Accuracy  0.7584745762711864


# **Overall OutComes**

In [50]:
print('Accuracy of GBT : ' , accuracy_GBT)
print('Accuracy of LR : ', accuracy_LR)
print('Accuracy of NB : ', accuracy_RF)
print('Accuracy of RF : ', accuracy_RF)

Accuracy of GBT :  0.7203389830508474
Accuracy of LR :  0.7711864406779662
Accuracy of NB :  0.7584745762711864
Accuracy of RF :  0.7584745762711864
