# Decision Tree SPARKML

In [None]:
# Notebook shell escape: install the third-party packages imported below.
!pip install pyspark pillow numpy

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import numpy as np
import os
from PIL import Image


In [None]:
# Create the Spark session used by every later cell, or reuse one that
# already exists in this process.
spark = (
    SparkSession.builder
    .appName("Image_Classification")
    .getOrCreate()
)


In [None]:
def extract_features_from_image(image_path, size=(64, 64)):
    """Load an image and flatten it into a list of grayscale pixel values.

    Args:
        image_path: Path of the image file to read.
        size: ``(width, height)`` the image is resized to before flattening.
            Defaults to ``(64, 64)``.

    Returns:
        A flat list with one grayscale value per pixel of the resized image,
        or ``None`` when the file cannot be opened or processed as an image.
    """
    try:
        # The context manager makes sure the underlying file is released
        # even when decoding fails part-way through.
        with Image.open(image_path) as img:
            grayscale = img.convert("L")  # Convert to grayscale
            resized = grayscale.resize(size)  # Resize to fixed shape
        return np.array(resized).flatten().tolist()  # Flatten into 1D list
    except Exception:
        # Best-effort loader: unreadable or non-image files are skipped by
        # the caller. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return None


In [None]:
def load_image_data(folder, label):
    """Build ``(features, label)`` rows for every readable image in a folder.

    Args:
        folder: Directory whose entries are passed to
            ``extract_features_from_image``.
        label: Class label attached to every row produced from ``folder``.

    Returns:
        A list of ``(features, label)`` tuples. Entries for which feature
        extraction returned ``None`` are left out.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split done later) reproducible.
    for filename in sorted(os.listdir(folder)):
        features = extract_features_from_image(os.path.join(folder, filename))
        if features is not None:
            data.append((features, label))
    return data

In [None]:
# Load both classes: images under "cats" get label 0, "dogs" get label 1.
cat_data, dog_data = (
    load_image_data("cats", label=0),
    load_image_data("dogs", label=1),
)

In [None]:
# Merge the two labelled lists into one and turn them into a Spark DataFrame
# with a "features" column and a "label" column.
columns = ["features", "label"]
image_data = [*cat_data, *dog_data]
df = spark.createDataFrame(image_data, schema=columns)

In [None]:
# VectorAssembler only accepts numeric, boolean and vector input columns, so
# the array-typed "features" column has to be converted to an ML vector first;
# feeding the raw array column to the assembler fails at transform time.
from pyspark.ml.functions import array_to_vector

df = df.withColumn(
    "features",
    array_to_vector(df["features"].cast("array<double>")),
)
vector_assembler = VectorAssembler(inputCols=["features"], outputCol="features_vector")
df = vector_assembler.transform(df).select("features_vector", "label")

In [None]:
# 80/20 train/test split; the fixed seed makes the sampling repeatable.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Configure the decision tree on the assembled feature vectors and fit it
# to the training split.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features_vector",
)
model = dt.fit(train_data)

In [None]:
# Run the fitted model over the held-out split.
predictions = model.transform(dataset=test_data)

In [None]:
# Score the test-set predictions by accuracy and report it as a percentage.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions)
print(f"Decision Tree Model Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Persist the fitted model. A plain save() raises when the target path
# already exists, which breaks re-running the notebook; overwrite() replaces
# the previous copy instead.
model.write().overwrite().save("pyspark_decision_tree_model")