# **Wine Random Forest Classification**

In [1]:
# install
!pip install Pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# start spark session
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.getOrCreate()

In [4]:
spark

In [5]:
# read sklearn inbuilt data
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine
wine = load_wine(as_frame=True)
wine = wine.frame
wine = spark.createDataFrame(wine)

In [6]:
wine.show()

+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+
|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|target|
+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+
|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|     0|
|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|                         3.4| 1050.0|     0|
|  13.16|      2.36|2.67|             18.6|    101.0|          2.8|      3.24|                 

In [7]:
wine.printSchema()

root
 |-- alcohol: double (nullable = true)
 |-- malic_acid: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcalinity_of_ash: double (nullable = true)
 |-- magnesium: double (nullable = true)
 |-- total_phenols: double (nullable = true)
 |-- flavanoids: double (nullable = true)
 |-- nonflavanoid_phenols: double (nullable = true)
 |-- proanthocyanins: double (nullable = true)
 |-- color_intensity: double (nullable = true)
 |-- hue: double (nullable = true)
 |-- od280/od315_of_diluted_wines: double (nullable = true)
 |-- proline: double (nullable = true)
 |-- target: long (nullable = true)



In [8]:
wine.columns

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline',
 'target']

In [9]:
from pyspark.ml.feature import VectorAssembler 

In [10]:
featureassembler= VectorAssembler(inputCols=['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline'], outputCol='Features')

In [11]:
output = featureassembler.transform(wine)

In [12]:
output.show()

+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+--------------------+
|alcohol|malic_acid| ash|alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity| hue|od280/od315_of_diluted_wines|proline|target|            Features|
+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+------+--------------------+
|  14.23|      1.71|2.43|             15.6|    127.0|          2.8|      3.06|                0.28|           2.29|           5.64|1.04|                        3.92| 1065.0|     0|[14.23,1.71,2.43,...|
|   13.2|      1.78|2.14|             11.2|    100.0|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|                         3.4| 1050.0|     0|[13.2,1.78,2

In [13]:
modeldata=output.select('Features','target')

In [14]:
modeldata.show()

+--------------------+------+
|            Features|target|
+--------------------+------+
|[14.23,1.71,2.43,...|     0|
|[13.2,1.78,2.14,1...|     0|
|[13.16,2.36,2.67,...|     0|
|[14.37,1.95,2.5,1...|     0|
|[13.24,2.59,2.87,...|     0|
|[14.2,1.76,2.45,1...|     0|
|[14.39,1.87,2.45,...|     0|
|[14.06,2.15,2.61,...|     0|
|[14.83,1.64,2.17,...|     0|
|[13.86,1.35,2.27,...|     0|
|[14.1,2.16,2.3,18...|     0|
|[14.12,1.48,2.32,...|     0|
|[13.75,1.73,2.41,...|     0|
|[14.75,1.73,2.39,...|     0|
|[14.38,1.87,2.38,...|     0|
|[13.63,1.81,2.7,1...|     0|
|[14.3,1.92,2.72,2...|     0|
|[13.83,1.57,2.62,...|     0|
|[14.19,1.59,2.48,...|     0|
|[13.64,3.1,2.56,1...|     0|
+--------------------+------+
only showing top 20 rows



In [15]:
# split data
train_data,test_data=modeldata.randomSplit([0.8,0.2])

In [16]:
train_data.show()

+--------------------+------+
|            Features|target|
+--------------------+------+
|[11.64,2.06,2.46,...|     1|
|[11.65,1.67,2.62,...|     1|
|[11.84,0.89,2.58,...|     1|
|[11.84,2.89,2.23,...|     1|
|[11.96,1.09,2.3,2...|     1|
|[12.08,1.13,2.51,...|     1|
|[12.21,1.19,1.75,...|     1|
|[12.29,1.61,2.21,...|     1|
|[12.33,1.1,2.28,1...|     1|
|[12.37,1.13,2.16,...|     1|
|[12.37,1.17,1.92,...|     1|
|[12.37,1.21,2.56,...|     1|
|[12.64,1.36,2.02,...|     1|
|[12.67,0.98,2.24,...|     1|
|[12.7,3.87,2.4,23...|     1|
|[12.72,1.81,2.2,1...|     1|
|[12.85,1.6,2.52,1...|     0|
|[12.93,3.8,2.65,1...|     0|
|[12.99,1.67,2.6,3...|     1|
|[13.03,0.9,1.71,1...|     1|
+--------------------+------+
only showing top 20 rows



In [17]:
# Random Forest classification model
from pyspark.ml.classification import RandomForestClassifier

In [18]:
rf = RandomForestClassifier(featuresCol='Features', labelCol='target')

In [19]:
rf = rf.fit(train_data)

In [20]:
# prediction
y_pred = rf.transform(test_data)

In [21]:
y_pred.show()

+--------------------+------+--------------------+--------------------+----------+
|            Features|target|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|[11.66,1.88,1.92,...|     1|[0.97727272727272...|[0.04886363636363...|       1.0|
|[12.0,0.92,2.0,19...|     1|      [0.0,20.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|[12.16,1.61,2.31,...|     1|      [0.0,20.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|[12.17,1.45,2.53,...|     1|      [0.0,20.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|[12.33,0.99,1.95,...|     1|      [7.0,13.0,0.0]|     [0.35,0.65,0.0]|       1.0|
|[12.37,0.94,1.36,...|     1|      [0.0,18.0,2.0]|       [0.0,0.9,0.1]|       1.0|
|[13.05,3.86,2.32,...|     1|[3.03333333333333...|[0.15166666666666...|       2.0|
|[13.29,1.97,2.68,...|     0|[19.9545454545454...|[0.99772727272727...|       0.0|
|[13.3,1.72,2.14,1...|     0|[18.9545454545454...|[0.94772727272727...|       0.0|
|[13

In [22]:
# confusion matrix
y_pred.groupBy('target', 'prediction').count().show()

+------+----------+-----+
|target|prediction|count|
+------+----------+-----+
|     1|       2.0|    1|
|     0|       0.0|   11|
|     1|       1.0|   17|
|     2|       2.0|   10|
+------+----------+-----+



In [23]:
from sklearn.metrics import confusion_matrix
pred=y_pred.select("prediction").collect()
orig=y_pred.select("target").collect()
print(confusion_matrix(orig, pred))

[[11  0  0]
 [ 0 17  1]
 [ 0  0 10]]


(f1|accuracy|weightedPrecision|weightedRecall|weightedTruePositiveRate| weightedFalsePositiveRate|weightedFMeasure|truePositiveRateByLabel| falsePositiveRateByLabel|precisionByLabel|recallByLabel|fMeasureByLabel| logLoss|hammingLoss)

In [24]:
# evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
evaluator = MulticlassClassificationEvaluator(
    labelCol='target', predictionCol='prediction')

In [26]:
accuracy = evaluator.evaluate(y_pred)
accuracy

0.9746031746031746

In [27]:
# close connection to spark
spark.stop()