In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# 1. Create (or reuse) the Spark session
spark = (
    SparkSession.builder
        .appName("IrisClassification")
        .getOrCreate()
)

# 2. Load the dataset
# The path is case-sensitive on most filesystems; "IRIS.csv" is the spelling
# under which the file is loaded elsewhere in this notebook.
df = spark.read.csv("IRIS.csv", header=True, inferSchema=True)

df.show(5)

# 3. Index the class-name column: "species" (string) -> "label" (numeric)
label_indexer = StringIndexer(
    inputCol="species",
    outputCol="label"
)

# 4. Assemble the model features into a single "features" vector column
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features"
)

# 5. Classification model (logistic regression)
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=20
)

# 6. Build the Pipeline: label indexing -> feature assembly -> classifier
pipeline = Pipeline(stages=[label_indexer, assembler, lr])

# 7. Split into train/test (80/20) with a fixed seed
train, test = df.randomSplit([0.8, 0.2], seed=42)

# 8. Train the model on the training split
model = pipeline.fit(train)

# 9. Predict on the test split
predictions = model.transform(test)

predictions.select("features", "label", "prediction").show(10)

# 10. Evaluate model quality (accuracy of "prediction" against "label")
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"\nAccuracy: {accuracy:.4f}")

# 11. Save the model
# write().overwrite() keeps this cell re-runnable: a plain save() raises an
# error when the target directory already exists from a previous run.
model.write().overwrite().save("iris_model_sparkml")

print("\nМодель сохранена в папку iris_model_sparkml")

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

1. Создаём Spark-сессию

In [2]:
# Configure the builder with the application name, then obtain the session
# (getOrCreate returns an existing session when one is already available).
session_builder = SparkSession.builder.appName("IrisClassification")
spark = session_builder.getOrCreate()

2. Загружаем датасет

In [4]:
# Read the CSV, treating the first row as the header and letting Spark infer
# the column types, then preview the first five rows.
csv_reader = spark.read
df = csv_reader.csv("IRIS.csv", inferSchema=True, header=True)

df.show(n=5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



3. Преобразование столбца с названием класса

In [5]:
# Indexer stage: reads the string column "species", writes the "label" column.
label_indexer = StringIndexer(outputCol="label", inputCol="species")


4. Признаки для модели

In [6]:
# Pack the four measurement columns into a single "features" vector column.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

5. Модель классификации (логистическая регрессия)

In [7]:
# Logistic regression over "features" / "label", limited to 20 iterations.
lr = LogisticRegression(maxIter=20, labelCol="label", featuresCol="features")


6. Создаём Pipeline

In [8]:
# Stage order: index the label, assemble the features, then the classifier.
pipeline_stages = [label_indexer, assembler, lr]
pipeline = Pipeline(stages=pipeline_stages)


7. Разделяем на train/test выборки

In [9]:
# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)


8. Обучаем модель

In [10]:
# Fit the whole pipeline on the training split.
model = pipeline.fit(dataset=train)


9. Предсказываем

In [11]:
# Run the fitted pipeline on the test split and preview ten scored rows.
predictions = model.transform(dataset=test)

preview_columns = ["features", "label", "prediction"]
predictions.select(*preview_columns).show(n=10)


+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.4,3.0,1.3,0.2]|  2.0|       2.0|
|[4.6,3.2,1.4,0.2]|  2.0|       2.0|
|[4.6,3.6,1.0,0.2]|  2.0|       2.0|
|[4.8,3.1,1.6,0.2]|  2.0|       2.0|
|[4.9,3.1,1.5,0.1]|  2.0|       2.0|
|[5.0,2.3,3.3,1.0]|  0.0|       0.0|
|[5.0,3.5,1.3,0.3]|  2.0|       2.0|
|[5.1,3.5,1.4,0.2]|  2.0|       2.0|
|[5.3,3.7,1.5,0.2]|  2.0|       2.0|
|[5.4,3.0,4.5,1.5]|  0.0|       0.0|
+-----------------+-----+----------+
only showing top 10 rows



10. Оцениваем качество модели

In [12]:
# Accuracy of the "prediction" column against "label" on the scored test rows.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", predictionCol="prediction", labelCol="label"
)

accuracy = evaluator.evaluate(dataset=predictions)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 1.0000


11. Сохраняем модель

In [13]:
# Persist the fitted pipeline model to the "iris_model_sparkml" directory.
# write().overwrite() keeps this cell re-runnable: a plain save() raises an
# error when the target directory already exists from a previous run.
model.write().overwrite().save("iris_model_sparkml")

print("\nМодель сохранена в папку iris_model_sparkml")


Модель сохранена в папку iris_model_sparkml
