In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()

In [2]:
df = spark.read.csv('iris.CSV', inferSchema=True, header=True)
df.show(5)

+------------+-----------+------------+-----------+-------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|
+------------+-----------+------------+-----------+-------+-----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|
+------------+-----------+------------+-----------+-------+-----------+
only showing top 5 rows



## При помощи VectorAssembler преобразовать все колонки с признаками в одну (использовать Pipeline — опционально).

In [3]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

In [4]:
pipeline = Pipeline(stages=
                    [
                        StringIndexer(inputCol='variety', outputCol='varietyInd'),
                        OneHotEncoder(inputCol='varietyInd', outputCol='varietyOne'),
                        VectorAssembler(inputCols=['sepal_length', 'sepal_width', 'petal_length'], outputCol='features')
                    ]
                   )

In [5]:
pipelineTrained = pipeline.fit(df)

In [6]:
pipelineTrained.transform(df).show()

+------------+-----------+------------+-----------+-------+-----------+----------+-------------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_num|varietyInd|   varietyOne|     features|
+------------+-----------+------------+-----------+-------+-----------+----------+-------------+-------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0|       0.0|(2,[0],[1.0])|[5.1,3.5,1.4]|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0|       0.0|(2,[0],[1.0])|[4.9,3.0,1.4]|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0|       0.0|(2,[0],[1.0])|[4.7,3.2,1.3]|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0|       0.0|(2,[0],[1.0])|[4.6,3.1,1.5]|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0|       0.0|(2,[0],[1.0])|[5.0,3.6,1.4]|
|         5.4|        3.9|         1.7|        0.4| Setosa|          0|       0.0|(2,[0],[1.0])|[5.4,3.9,1.7]|
|

## Разбить данные на train и test.

In [7]:
df_features = pipelineTrained.transform(df)

In [8]:
train, test = df_features.randomSplit([0.8, 0.2], seed=777)

In [9]:
train.show()

+------------+-----------+------------+-----------+----------+-----------+----------+-------------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|variety_num|varietyInd|   varietyOne|     features|
+------------+-----------+------------+-----------+----------+-----------+----------+-------------+-------------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.3,3.0,1.1]|
|         4.4|        2.9|         1.4|        0.2|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.4,2.9,1.4]|
|         4.4|        3.2|         1.3|        0.2|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.4,3.2,1.3]|
|         4.5|        2.3|         1.3|        0.3|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.5,2.3,1.3]|
|         4.6|        3.2|         1.4|        0.2|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.6,3.2,1.4]|
|         4.6|        3.4|         1.4|        0.3|    Setosa|          0|       0.0|(2,

## Создать модель логистической регреcсии или модель дерева и обучить её.

In [10]:
from pyspark.ml.classification import LogisticRegression

In [11]:
lr = LogisticRegression(featuresCol='features', labelCol='variety_num')
lrModel = lr.fit(train)

In [12]:
train_res = lrModel.transform(train)
test_res = lrModel.transform(test)

In [13]:
train_res.show()

+------------+-----------+------------+-----------+----------+-----------+----------+-------------+-------------+--------------------+--------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   variety|variety_num|varietyInd|   varietyOne|     features|       rawPrediction|         probability|prediction|
+------------+-----------+------------+-----------+----------+-----------+----------+-------------+-------------+--------------------+--------------------+----------+
|         4.3|        3.0|         1.1|        0.1|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.3,3.0,1.1]|[597.928886939294...|       [1.0,0.0,0.0]|       0.0|
|         4.4|        2.9|         1.4|        0.2|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.4,2.9,1.4]|[445.935700342105...|[1.0,5.7795693595...|       0.0|
|         4.4|        3.2|         1.3|        0.2|    Setosa|          0|       0.0|(2,[0],[1.0])|[4.4,3.2,1.3]|[691.017748505309...|       [1.0,0.0,0.0]|       0.0

## Воспользоваться MulticlassClassificationEvaluator для оценки качества на train и test множестве.

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
ev = MulticlassClassificationEvaluator(labelCol='variety_num')

In [16]:
ev.evaluate(train_res)

0.9586396884202526

In [17]:
ev.evaluate(test_res)

1.0