**ANGEL METANOSA AFINDA (NIM: 2301212013)**

## **Setup Apache Spark**

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 31 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 27.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=11e4c95cf40c50705498ea352600515fc1b6b9f0983bf80395725a3a6c30557f
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [None]:
# Install a headless Java 8 JDK; -qq and the redirect to /dev/null silence the output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download the Spark 3.0.0 / Hadoop 3.2 binary distribution from the Apache archive.
# If the version changes, update the tar command and SPARK_HOME below as well.
# NOTE(review): the pip cell's recorded output shows pyspark 3.3.1 while this is Spark 3.0.0 — confirm the mismatch is intended.
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# Extract the downloaded tarball into the current working directory.
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# Point JAVA_HOME and SPARK_HOME at the installations above.
# SPARK_HOME assumes the tarball was extracted under /content — TODO confirm the working directory.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# Install findspark quietly; it is imported and initialised in the next cell.
!pip install -q findspark

In [None]:
# Initialise findspark before importing pyspark.
# Presumably this picks up the SPARK_HOME exported in the setup cell — verify against the findspark docs.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session: local master, app name
# "Colab", and the Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# **Import Library**

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt  # for visualisation
import seaborn as sns                 # for visualisation

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases no longer ship the old names, so prefer the new name when
# it is available and fall back to the legacy one on older installs.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# **Import and Load Dataset**

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from under this path below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the obesity dataset CSV on the mounted Google Drive.
dataset_path = "/content/drive/MyDrive/Magister Informatika/Big Data Analysis/Tugas Clustering/ObesityDataSet_raw_and_data_sinthetic.csv"

# Read the CSV using the first row as column names and letting Spark infer column types.
df = spark.read.csv(
    dataset_path,
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names and inferred types of the loaded frame.
df.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Weight: double (nullable = true)
 |-- family_history_with_overweight: string (nullable = true)
 |-- FAVC: string (nullable = true)
 |-- FCVC: double (nullable = true)
 |-- NCP: double (nullable = true)
 |-- CAEC: string (nullable = true)
 |-- SMOKE: string (nullable = true)
 |-- CH2O: double (nullable = true)
 |-- SCC: string (nullable = true)
 |-- FAF: double (nullable = true)
 |-- TUE: double (nullable = true)
 |-- CALC: string (nullable = true)
 |-- MTRANS: string (nullable = true)
 |-- NObeyesdad: string (nullable = true)



# **Data Exploration**

In [None]:
# Preview the first four rows without truncating long cell values.
df.show(n=4, truncate=False)

+------+----+------+------+------------------------------+----+----+---+---------+-----+----+---+---+---+----------+---------------------+------------------+
|Gender|Age |Height|Weight|family_history_with_overweight|FAVC|FCVC|NCP|CAEC     |SMOKE|CH2O|SCC|FAF|TUE|CALC      |MTRANS               |NObeyesdad        |
+------+----+------+------+------------------------------+----+----+---+---------+-----+----+---+---+---+----------+---------------------+------------------+
|Female|21.0|1.62  |64.0  |yes                           |no  |2.0 |3.0|Sometimes|no   |2.0 |no |0.0|1.0|no        |Public_Transportation|Normal_Weight     |
|Female|21.0|1.52  |56.0  |yes                           |no  |3.0 |3.0|Sometimes|yes  |3.0 |yes|3.0|0.0|Sometimes |Public_Transportation|Normal_Weight     |
|Male  |23.0|1.8   |77.0  |yes                           |no  |2.0 |3.0|Sometimes|no   |2.0 |no |2.0|1.0|Frequently|Public_Transportation|Normal_Weight     |
|Male  |27.0|1.8   |87.0  |no                       

In [None]:
# Total number of rows in the loaded frame.
df.count()

2111

In [None]:
# Report the row count, drop every row that contains a null in any column,
# then report the row count again (the printed messages are in Indonesian).
rows_before_drop = df.count()
print("Jumlah data sebelum drop null:", rows_before_drop)

df = df.na.drop(how="any")

rows_after_drop = df.count()
print("Jumlah data setelah drop null : ", rows_after_drop)

Jumlah data sebelum drop null: 2111
Jumlah data setelah drop null :  2111


# **Data Pre-Processing**

## **Convert The Categorical Value to Numeric Value**

In [None]:
# Feature transformers used to encode the categorical columns.
from pyspark.ml.feature import StringIndexer, OneHotEncoder


def _index_column(frame, column):
    """Index one string column into a '<column>_numeric' column.

    Fits a StringIndexer on `frame`, applies it, and returns a 3-tuple:
    (fitted indexer model, frame with the numeric column added,
    that same frame with the original string column dropped).
    """
    fitted = StringIndexer(inputCol=column, outputCol=column + '_numeric').fit(frame)
    with_index = fitted.transform(frame)
    return fitted, with_index, with_index.drop(column)


# Each step consumes the frame produced by the previous one, so the numeric
# columns are appended in exactly this order.
indexer_family, indexed_df_family, indexed_df_family_1 = _index_column(df, 'family_history_with_overweight')
indexer_FAVC, indexed_df_FAVC, indexed_df_FAVC_1 = _index_column(indexed_df_family_1, 'FAVC')
indexer_CAEC, indexed_df_CAEC, indexed_df_CAEC_1 = _index_column(indexed_df_FAVC_1, 'CAEC')
indexer_MTRANS, indexed_df_MTRANS, indexed_df_MTRANS_1 = _index_column(indexed_df_CAEC_1, 'MTRANS')
indexer_SCC, indexed_df_SCC, indexed_df_SCC_1 = _index_column(indexed_df_MTRANS_1, 'SCC')
indexer_CALC, indexed_df_CALC, indexed_df_CALC_1 = _index_column(indexed_df_SCC_1, 'CALC')
indexer_SMOKE, indexed_df_SMOKE, indexed_df_SMOKE_1 = _index_column(indexed_df_CALC_1, 'SMOKE')
indexer_NObeyesdad, indexed_df_NObeyesdad, indexed_df_NObeyesdad_1 = _index_column(indexed_df_SMOKE_1, 'NObeyesdad')
indexer_Gender, indexed_df_Gender, indexed_df_fix = _index_column(indexed_df_NObeyesdad_1, 'Gender')

# Preview the fully numeric frame.
indexed_df_fix.show(n=2, truncate=False)

+----+------+------+----+---+----+---+---+--------------------------------------+------------+------------+--------------+-----------+------------+-------------+------------------+--------------+
|Age |Height|Weight|FCVC|NCP|CH2O|FAF|TUE|family_history_with_overweight_numeric|FAVC_numeric|CAEC_numeric|MTRANS_numeric|SCC_numeric|CALC_numeric|SMOKE_numeric|NObeyesdad_numeric|Gender_numeric|
+----+------+------+----+---+----+---+---+--------------------------------------+------------+------------+--------------+-----------+------------+-------------+------------------+--------------+
|21.0|1.62  |64.0  |2.0 |3.0|2.0 |0.0|1.0|0.0                                   |1.0         |0.0         |0.0           |0.0        |1.0         |0.0          |5.0               |1.0           |
|21.0|1.52  |56.0  |3.0 |3.0|3.0 |3.0|0.0|0.0                                   |1.0         |0.0         |0.0           |1.0        |0.0         |1.0          |5.0               |1.0           |
+----+------+------+

In [None]:
# Count how many rows fall into each indexed target class.
label_distribution = indexed_df_fix.groupBy('NObeyesdad_numeric').count()
label_distribution.show()

+------------------+-----+
|NObeyesdad_numeric|count|
+------------------+-----+
|               0.0|  351|
|               1.0|  324|
|               4.0|  290|
|               3.0|  290|
|               2.0|  297|
|               6.0|  272|
|               5.0|  287|
+------------------+-----+



# **Vector Assembler**

In [None]:
from pyspark.ml.feature import VectorAssembler

# Every predictor (all columns except the target NObeyesdad_numeric) is packed
# into a single vector column named 'features'.
feature_columns = [
    'Gender_numeric', 'Age', 'Height', 'Weight', 'family_history_with_overweight_numeric',
    'FAVC_numeric', 'FCVC', 'NCP', 'CAEC_numeric', 'SMOKE_numeric', 'CH2O', 'SCC_numeric', 'FAF',
    'TUE', 'CALC_numeric', 'MTRANS_numeric',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(indexed_df_fix)

In [None]:
# Preview two assembled rows, untruncated, to inspect the new 'features' column.
output.show(n=2, truncate=False)

+----+------+------+----+---+----+---+---+--------------------------------------+------------+------------+--------------+-----------+------------+-------------+------------------+--------------+--------------------------------------------------------------------+
|Age |Height|Weight|FCVC|NCP|CH2O|FAF|TUE|family_history_with_overweight_numeric|FAVC_numeric|CAEC_numeric|MTRANS_numeric|SCC_numeric|CALC_numeric|SMOKE_numeric|NObeyesdad_numeric|Gender_numeric|features                                                            |
+----+------+------+----+---+----+---+---+--------------------------------------+------------+------------+--------------+-----------+------------+-------------+------------------+--------------+--------------------------------------------------------------------+
|21.0|1.62  |64.0  |2.0 |3.0|2.0 |0.0|1.0|0.0                                   |1.0         |0.0         |0.0           |0.0        |1.0         |0.0          |5.0               |1.0           |[1.0,21.0,

## **Split Data**

In [None]:
# Keep only what the classifiers need: the feature vector and the target label.
model_columns = ('features', 'NObeyesdad_numeric')
final_data = output.select(*model_columns)

In [None]:
# Split final_data into roughly 70% train / 30% test.
# A fixed seed is passed so re-running the notebook reproduces the same split
# (and therefore comparable metrics); 2500 matches the seed used for the
# random forest later in the notebook.
train, test = final_data.randomSplit([0.7, 0.3], seed=2500)

In [None]:
# Peek at two rows of the training split, untruncated.
train.show(n=2, truncate=False)

+------------------------------------------------------------------+------------------+
|features                                                          |NObeyesdad_numeric|
+------------------------------------------------------------------+------------------+
|(16,[0,1,2,3,4,6,7,8,10],[1.0,21.0,1.52,42.0,1.0,3.0,1.0,1.0,1.0])|6.0               |
|(16,[0,1,2,3,4,6,7,8,10],[1.0,21.0,1.52,42.0,1.0,3.0,1.0,1.0,1.0])|6.0               |
+------------------------------------------------------------------+------------------+
only showing top 2 rows



In [None]:
# Peek at two rows of the test split, untruncated.
test.show(n=2, truncate=False)

+------------------------------------------------------------------+------------------+
|features                                                          |NObeyesdad_numeric|
+------------------------------------------------------------------+------------------+
|(16,[0,1,2,3,4,6,7,8,10],[1.0,21.0,1.52,42.0,1.0,3.0,1.0,1.0,1.0])|6.0               |
|(16,[0,1,2,3,4,6,7,8,10],[1.0,21.0,1.52,42.0,1.0,3.0,1.0,1.0,1.0])|6.0               |
+------------------------------------------------------------------+------------------+
only showing top 2 rows



# **Classification Experiments**

## **Logistic Regression**

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the assembled features, fitted on the training split.
lr = LogisticRegression(
    featuresCol='features',
    labelCol="NObeyesdad_numeric",
)
model_log = lr.fit(train)

In [None]:
# Training split: predict with the logistic-regression model, then tally
# how many rows were assigned to each predicted class.
predict_train_log = model_log.transform(train)
lr_train_prediction_counts = predict_train_log.groupBy('prediction').count()
lr_train_prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  264|
|       1.0|  221|
|       4.0|  202|
|       3.0|  208|
|       2.0|  202|
|       6.0|  188|
|       5.0|  163|
+----------+-----+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the logistic-regression predictions on the training split.
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="NObeyesdad_numeric", predictionCol="prediction")
predictionAndTarget = predict_train_log.select("NObeyesdad_numeric", "prediction")

# One evaluate() call per metric, overriding the evaluator's metricName each time.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

print("Accuracy = ", acc)
print("Precision = ", weightedPrecision)  # label spelling corrected from "Precission"
print("F1 Score = ", f1)
print("Recall = ", weightedRecall)

Accuracy =  0.8922651933701657
Precission =  0.8921708646932092
F1 Score =  0.8914202289735079
Recall =  0.8922651933701657


In [None]:
# Test split: predict with the logistic-regression model and preview five rows.
predict_test_log = model_log.transform(test)
predict_test_log.show(n=5)

+--------------------+------------------+--------------------+--------------------+----------+
|            features|NObeyesdad_numeric|       rawPrediction|         probability|prediction|
+--------------------+------------------+--------------------+--------------------+----------+
|(16,[0,1,2,3,4,6,...|               6.0|[-6.7911899365715...|[6.00890304553780...|       6.0|
|(16,[0,1,2,3,4,6,...|               6.0|[-6.7911899365715...|[6.00890304553780...|       6.0|
|(16,[0,1,2,3,4,6,...|               5.0|[-3.3381031304816...|[4.81760417286245...|       3.0|
|(16,[0,1,2,3,4,6,...|               5.0|[-5.1258044549833...|[6.71399434785180...|       6.0|
|(16,[0,1,2,3,4,6,...|               5.0|[-4.0767700134274...|[3.55725326482999...|       5.0|
+--------------------+------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Tally the logistic-regression test predictions per predicted class.
lr_test_prediction_counts = predict_test_log.groupBy('prediction').count()
lr_test_prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  106|
|       1.0|  104|
|       4.0|   85|
|       3.0|   86|
|       2.0|   97|
|       6.0|  102|
|       5.0|   83|
+----------+-----+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator and (label, prediction) pairs for the logistic-regression test predictions.
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="NObeyesdad_numeric",
    predictionCol="prediction",
)
predictionAndTarget = predict_test_log.select("NObeyesdad_numeric", "prediction")

In [None]:
# Compute the four summary metrics for the (label, prediction) pairs prepared
# in the previous cell, overriding the evaluator's metricName per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

print("Accuracy = ", acc)
print("Precision = ", weightedPrecision)  # label spelling corrected from "Precission"
print("F1 Score = ", f1)
print("Recall = ", weightedRecall)

Accuracy =  0.8914027149321267
Precission =  0.8913490211925605
F1 Score =  0.8898511907891172
Recall =  0.8914027149321267


## **Random Forest**

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with 30 trees, maximum depth 10 and a fixed seed, fitted on
# the training split. The fitted forest is bound to the generic name `model`.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol="NObeyesdad_numeric",
    seed=2500,
    maxDepth=10,
    numTrees=30,
)
model = rf.fit(train)

In [None]:
# Training split: predict with the random forest and tally predictions per class.
predict_train = model.transform(train)
rf_train_prediction_counts = predict_train.groupBy('prediction').count()
rf_train_prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|   92|
|       1.0|  103|
|       4.0|   96|
|       3.0|   86|
|       2.0|   95|
|       6.0|   95|
|       5.0|   96|
+----------+-----+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the random-forest predictions on the training split.
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="NObeyesdad_numeric", predictionCol="prediction")
predictionAndTarget = predict_train.select("NObeyesdad_numeric", "prediction")

# One evaluate() call per metric, overriding the evaluator's metricName each time.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

print("Accuracy = ", acc)
print("Precision = ", weightedPrecision)  # label spelling corrected from "Precission"
print("F1 Score = ", f1)
print("Recall = ", weightedRecall)

Accuracy =  0.9979281767955801
Precission =  0.9979387760022583
F1 Score =  0.9979282630660631
Recall =  0.9979281767955803


In [None]:
# Test split: predict with the random forest and preview twenty rows.
predict_test = model.transform(test)
predict_test.show(n=20)

+--------------------+------------------+--------------------+--------------------+----------+
|            features|NObeyesdad_numeric|       rawPrediction|         probability|prediction|
+--------------------+------------------+--------------------+--------------------+----------+
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       6.0|
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       6.0|
|(16,[0,1,2,3,4,6,...|               5.0|[1.05882352941176...|[0.03529411764705...|       5.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,2.22...|[0.0,0.0,0.0,0.07...|       5.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,1.17...|[0.0,0.0,0.0,0.03...|       5.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,2.05...|[0.0,0.0,0.0,0.06...|       5.0|
|(16,[0,1,2,3,4,6,...|               3.0|[1.89189189189189...|[0.06306306306306...|       5.0|
|(16,[0,1,2,3,4,6,...|               6.0|[0.6,0.0,

In [None]:
# Tally the random-forest test predictions per predicted class.
rf_test_prediction_counts = predict_test.groupBy('prediction').count()
rf_test_prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|   92|
|       1.0|  103|
|       4.0|   96|
|       3.0|   86|
|       2.0|   95|
|       6.0|   95|
|       5.0|   96|
+----------+-----+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator and (label, prediction) pairs for the random-forest test predictions.
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="NObeyesdad_numeric",
    predictionCol="prediction",
)
predictionAndTarget = predict_test.select("NObeyesdad_numeric", "prediction")

In [None]:
# Compute the four summary metrics for the (label, prediction) pairs prepared
# in the previous cell, overriding the evaluator's metricName per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

print("Accuracy = ", acc)
print("Precision = ", weightedPrecision)  # label spelling corrected from "Precission"
print("F1 Score = ", f1)
print("Recall = ", weightedRecall)

Accuracy =  0.9351432880844646
Precission =  0.935809777735825
F1 Score =  0.9352609919866893
Recall =  0.9351432880844646


## **Decision Tree**

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# A decision tree limited to depth 2, fitted on the training split.
dt = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='NObeyesdad_numeric',
    maxDepth=2,
)
model_dt = dt.fit(train)

In [None]:
# Training split: predict with the decision tree and preview ten rows.
predictions_train = model_dt.transform(train)
predictions_train.show(n=10)

+--------------------+------------------+--------------------+--------------------+----------+
|            features|NObeyesdad_numeric|       rawPrediction|         probability|prediction|
+--------------------+------------------+--------------------+--------------------+----------+
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,11.0...|[0.0,0.0,0.0,0.04...|       6.0|
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,11.0...|[0.0,0.0,0.0,0.04...|       6.0|
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,11.0...|[0.0,0.0,0.0,0.04...|       6.0|
|(16,[0,1,2,3,4,6,...|               5.0|[180.0,0.0,3.0,19...|[0.26470588235294...|       3.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,11.0...|[0.0,0.0,0.0,0.04...|       6.0|
|(16,[0,1,2,3,4,6,...|               3.0|[180.0,0.0,3.0,19...|[0.26470588235294...|       3.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,11.0...|[0.0,0.0,0.0,0.04...|       6.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,

In [None]:
# Tally the decision-tree training predictions per predicted class.
dt_train_prediction_counts = predictions_train.groupBy('prediction').count()
dt_train_prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       1.0|  221|
|       3.0|  680|
|       2.0|  277|
|       6.0|  270|
+----------+-----+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator and (label, prediction) pairs for the decision-tree training predictions.
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="NObeyesdad_numeric",
    predictionCol="prediction",
)
predictionAndTarget4 = predictions_train.select("NObeyesdad_numeric", "prediction")

In [None]:
# Compute the four summary metrics for the decision-tree training pairs
# (predictionAndTarget4), overriding the evaluator's metricName per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictionAndTarget4, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

print("Accuracy = ", acc)
print("Precision = ", weightedPrecision)  # label spelling corrected from "Precission"
print("F1 Score = ", f1)
print("Recall = ", weightedRecall)

Accuracy =  0.5462707182320442
Precission =  0.3738480800498373
F1 Score =  0.42796381312438225
Recall =  0.5462707182320441


In [None]:
# Test split: predict with the decision tree and preview ten rows.
# This must use model_dt; `model` is the random forest fitted earlier, and
# using it here would report random-forest results under the Decision Tree heading.
predictions_test = model_dt.transform(test)
predictions_test.show(10)

+--------------------+------------------+--------------------+--------------------+----------+
|            features|NObeyesdad_numeric|       rawPrediction|         probability|prediction|
+--------------------+------------------+--------------------+--------------------+----------+
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       6.0|
|(16,[0,1,2,3,4,6,...|               6.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       6.0|
|(16,[0,1,2,3,4,6,...|               5.0|[1.05882352941176...|[0.03529411764705...|       5.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,2.22...|[0.0,0.0,0.0,0.07...|       5.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,1.17...|[0.0,0.0,0.0,0.03...|       5.0|
|(16,[0,1,2,3,4,6,...|               5.0|[0.0,0.0,0.0,2.05...|[0.0,0.0,0.0,0.06...|       5.0|
|(16,[0,1,2,3,4,6,...|               3.0|[1.89189189189189...|[0.06306306306306...|       5.0|
|(16,[0,1,2,3,4,6,...|               6.0|[0.6,0.0,

In [None]:
# Tally the predictions held in predictions_test per predicted class.
dt_test_prediction_counts = predictions_test.groupBy('prediction').count()
dt_test_prediction_counts.show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|   92|
|       1.0|  103|
|       4.0|   96|
|       3.0|   86|
|       2.0|   95|
|       6.0|   95|
|       5.0|   96|
+----------+-----+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator and (label, prediction) pairs for the predictions held in predictions_test.
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="NObeyesdad_numeric",
    predictionCol="prediction",
)
predictionAndTarget3 = predictions_test.select("NObeyesdad_numeric", "prediction")

In [None]:
# Compute the four summary metrics for the test pairs in predictionAndTarget3,
# overriding the evaluator's metricName per call.
acc, f1, weightedPrecision, weightedRecall = (
    evaluatorMulti.evaluate(predictionAndTarget3, {evaluatorMulti.metricName: metric})
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

print("Accuracy = ", acc)
print("Precision = ", weightedPrecision)  # label spelling corrected from "Precission"
print("F1 Score = ", f1)
print("Recall = ", weightedRecall)

Accuracy =  0.9351432880844646
Precission =  0.935809777735825
F1 Score =  0.9352609919866893
Recall =  0.9351432880844646
