1. Build a Classification Model with Spark Using a Dataset of Your Choice

Initialize Spark Session

In [17]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session that the rest of the notebook refers to as `spark`.
try:
    spark = (
        SparkSession.builder
        .appName("ClassificationModel")
        .getOrCreate()
    )
    # Limit Spark's own log output to errors so cell output stays readable.
    spark.sparkContext.setLogLevel("ERROR")
    print("Spark session initialized successfully.")
except Exception as e:
    print(f"Error initializing Spark session: {e}")
    # Re-raise so the full traceback is shown and a "Run All" stops at this cell.
    # exit(1) would discard the traceback and act on the interpreter/kernel
    # process instead of simply failing the cell.
    raise

Spark session initialized successfully.


Load Dataset (Iris)

In [18]:
import pandas as pd
from sklearn.datasets import load_iris

# Load Iris and lay it out as a pandas frame: one column per measurement,
# plus the class id from `iris.target` under "label".
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)

# Hand the pandas frame to Spark and preview the first rows.
df = spark.createDataFrame(iris_df)
df.show(n=5)


+-----------------+----------------+-----------------+----------------+-----+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|label|
+-----------------+----------------+-----------------+----------------+-----+
|              5.1|             3.5|              1.4|             0.2|    0|
|              4.9|             3.0|              1.4|             0.2|    0|
|              4.7|             3.2|              1.3|             0.2|    0|
|              4.6|             3.1|              1.5|             0.2|    0|
|              5.0|             3.6|              1.4|             0.2|    0|
+-----------------+----------------+-----------------+----------------+-----+
only showing top 5 rows



Data Preprocessing (Vector Assembler; the labels are already numeric, so no label indexing is applied)

In [19]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Pack the measurement columns into a single vector column named "features",
# then keep only the two columns used downstream: the vector and the label.
feature_cols = iris.feature_names
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
df_assembled = (
    assembler
    .transform(df)
    .select("features", "label")
)
df_assembled.show(n=5)


+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|    0|
|[4.9,3.0,1.4,0.2]|    0|
|[4.7,3.2,1.3,0.2]|    0|
|[4.6,3.1,1.5,0.2]|    0|
|[5.0,3.6,1.4,0.2]|    0|
+-----------------+-----+
only showing top 5 rows



Train/Test Split

In [20]:
# Split the assembled rows with 0.8 / 0.2 weights into training and test sets;
# a fixed seed is passed so the split can be repeated.
train_data, test_data = df_assembled.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

Train Classification Model (Logistic Regression)

In [21]:
from pyspark.ml.classification import LogisticRegression

# Configure logistic regression on the assembled vector and numeric label,
# then fit it on the training split only.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
lr_model = lr.fit(train_data)

Make Predictions

In [22]:
# Score the held-out rows; `transform` adds the model's output columns,
# of which only "prediction" is displayed next to the inputs and true label.
predictions = lr_model.transform(test_data)
(
    predictions
    .select("features", "label", "prediction")
    .show(n=20)
)

+-----------------+-----+----------+
|         features|label|prediction|
+-----------------+-----+----------+
|[4.6,3.1,1.5,0.2]|    0|       0.0|
|[4.8,3.4,1.6,0.2]|    0|       0.0|
|[4.9,3.1,1.5,0.1]|    0|       0.0|
|[5.4,3.7,1.5,0.2]|    0|       0.0|
|[4.6,3.6,1.0,0.2]|    0|       0.0|
|[5.0,3.0,1.6,0.2]|    0|       0.0|
|[5.0,3.2,1.2,0.2]|    0|       0.0|
|[5.4,3.4,1.5,0.4]|    0|       0.0|
|[4.4,3.2,1.3,0.2]|    0|       0.0|
|[5.0,3.5,1.3,0.3]|    0|       0.0|
|[5.1,3.4,1.5,0.2]|    0|       0.0|
|[5.1,3.8,1.6,0.2]|    0|       0.0|
|[5.1,3.8,1.9,0.4]|    0|       0.0|
|[5.9,3.0,4.2,1.5]|    1|       1.0|
|[5.8,2.7,3.9,1.2]|    1|       1.0|
|[6.8,2.8,4.8,1.4]|    1|       1.0|
|[5.1,2.5,3.0,1.1]|    1|       1.0|
|[5.7,2.8,4.1,1.3]|    1|       1.0|
|[5.7,3.0,4.2,1.2]|    1|       1.0|
|[5.8,2.6,4.0,1.2]|    1|       1.0|
+-----------------+-----+----------+
only showing top 20 rows



Accuracy check

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the "prediction" column against "label" on the test predictions
# and report the accuracy metric to four decimal places.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 1.0000


Stop Spark Session

In [24]:
# Stop the Spark session now that evaluation is finished.
spark.stop()