In [4]:
import findspark
findspark.init(r"E:\Spark\spark")

In [5]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [8]:
config = SparkConf()
config.setAppName("MlibExample")
config.setMaster("local[*]")
sc = SparkContext(conf=config)
spark = SparkSession.builder.appName("MlibExample").master("local").getOrCreate()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=MlibExample, master=local) created by getOrCreate at <ipython-input-3-d928510d7e92>:5 

In [9]:
sc

''

In [12]:
sc = spark.sparkContext

In [13]:
sc

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### load data

In [15]:
dataFrame = spark.read.csv("datasets/iris/bezdekIris.data", inferSchema=True)\
.toDF("sep_len", "sep_wid", "pet_len", "pet_width", "label")

In [16]:
type(dataFrame)

pyspark.sql.dataframe.DataFrame

In [17]:
dataFrame.show(5)

+-------+-------+-------+---------+-----------+
|sep_len|sep_wid|pet_len|pet_width|      label|
+-------+-------+-------+---------+-----------+
|    5.1|    3.5|    1.4|      0.2|Iris-setosa|
|    4.9|    3.0|    1.4|      0.2|Iris-setosa|
|    4.7|    3.2|    1.3|      0.2|Iris-setosa|
|    4.6|    3.1|    1.5|      0.2|Iris-setosa|
|    5.0|    3.6|    1.4|      0.2|Iris-setosa|
+-------+-------+-------+---------+-----------+
only showing top 5 rows



###  preapare data

In [54]:
assembler = VectorAssembler(inputCols=["sep_len", "sep_wid", "pet_len", "pet_width"],\
                            outputCol="X")
assembled_vectors = assembler.transform(dataFrame)
assembled_vectors.show(5)

+-------+-------+-------+---------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_width|      label|                X|
+-------+-------+-------+---------+-----------+-----------------+
|    5.1|    3.5|    1.4|      0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|      0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|      0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|    4.6|    3.1|    1.5|      0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|    5.0|    3.6|    1.4|      0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-------+-------+-------+---------+-----------+-----------------+
only showing top 5 rows



In [55]:
X = assembled_vectors.drop("sep_len", "sep_wid", "pet_len", "pet_width")

In [56]:
X.show(10)

+-----------+-----------------+
|      label|                X|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
|Iris-setosa|[4.6,3.1,1.5,0.2]|
|Iris-setosa|[5.0,3.6,1.4,0.2]|
|Iris-setosa|[5.4,3.9,1.7,0.4]|
|Iris-setosa|[4.6,3.4,1.4,0.3]|
|Iris-setosa|[5.0,3.4,1.5,0.2]|
|Iris-setosa|[4.4,2.9,1.4,0.2]|
|Iris-setosa|[4.9,3.1,1.5,0.1]|
+-----------+-----------------+
only showing top 10 rows



In [57]:
label_encoder = StringIndexer(inputCol="label", outputCol="Y")
final_df = label_encoder.fit(X).transform(X)

In [58]:
final_df.show(10)

+-----------+-----------------+---+
|      label|                X|  Y|
+-----------+-----------------+---+
|Iris-setosa|[5.1,3.5,1.4,0.2]|0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|0.0|
|Iris-setosa|[4.6,3.1,1.5,0.2]|0.0|
|Iris-setosa|[5.0,3.6,1.4,0.2]|0.0|
|Iris-setosa|[5.4,3.9,1.7,0.4]|0.0|
|Iris-setosa|[4.6,3.4,1.4,0.3]|0.0|
|Iris-setosa|[5.0,3.4,1.5,0.2]|0.0|
|Iris-setosa|[4.4,2.9,1.4,0.2]|0.0|
|Iris-setosa|[4.9,3.1,1.5,0.1]|0.0|
+-----------+-----------------+---+
only showing top 10 rows



### split data 

In [95]:
(train, test) = final_df.randomSplit([0.75,0.25])

### fit model

In [96]:
decision_tree = DecisionTreeClassifier(labelCol="Y", featuresCol="X")
decision_tree_cls = decision_tree.fit(train)

In [97]:
decision_tree_cls

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4cd69da2729886285103) of depth 5 with 17 nodes

### evaluation

In [98]:
predictions = decision_tree_cls.transform(test)

In [99]:
predictions.show(5)

+-----------+-----------------+---+--------------+-------------+----------+
|      label|                X|  Y| rawPrediction|  probability|prediction|
+-----------+-----------------+---+--------------+-------------+----------+
|Iris-setosa|[4.7,3.2,1.6,0.2]|0.0|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.1]|0.0|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.3]|0.0|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.8,3.4,1.6,0.2]|0.0|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|0.0|[39.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+-----------+-----------------+---+--------------+-------------+----------+
only showing top 5 rows



In [100]:
predictions.select("Y", "prediction").show(5)

+---+----------+
|  Y|prediction|
+---+----------+
|0.0|       0.0|
|0.0|       0.0|
|0.0|       0.0|
|0.0|       0.0|
|0.0|       0.0|
+---+----------+
only showing top 5 rows



In [101]:
evaluator = MulticlassClassificationEvaluator( labelCol="Y", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

In [102]:
accuracy

0.9722222222222222

### implement random forest

In [103]:
from pyspark.ml.classification import RandomForestClassifier

In [104]:
random_forest = RandomForestClassifier(labelCol="Y", featuresCol="X", numTrees=100)
random_forest_cls = random_forest.fit(train)

In [105]:
random_forest_prediction = random_forest_cls.transform(test)

In [106]:
random_forest_prediction.show(5)

+-----------+-----------------+---+---------------+-------------+----------+
|      label|                X|  Y|  rawPrediction|  probability|prediction|
+-----------+-----------------+---+---------------+-------------+----------+
|Iris-setosa|[4.7,3.2,1.6,0.2]|0.0|[100.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.1]|0.0|[100.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.8,3.0,1.4,0.3]|0.0|[100.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.8,3.4,1.6,0.2]|0.0|[100.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|0.0|[100.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+-----------+-----------------+---+---------------+-------------+----------+
only showing top 5 rows



In [107]:
random_forest_evaluator = MulticlassClassificationEvaluator(labelCol="Y", predictionCol="prediction", metricName="accuracy")

In [108]:
random_forest_evaluator.evaluate(random_forest_prediction)

0.9722222222222222

###  naive bayes

In [109]:
from pyspark.ml.classification import NaiveBayes

In [110]:
nb = NaiveBayes(labelCol="Y", featuresCol="X")
nb_cls = nb.fit(train)

In [111]:
nb_cls_predictions = nb_cls.transform(test)

In [112]:
nb_evaluator = MulticlassClassificationEvaluator(labelCol="Y", predictionCol="prediction")
nb_evaluator.evaluate(nb_cls_predictions)

0.9442663817663818