In [4]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import pickle

# Dataset root: the "dataset" folder one level above the working directory,
# resolved to an absolute path.
dataset_path = os.path.abspath(os.path.join(os.pardir, "dataset"))


# The four tomato-leaf classes to load; every name shares the
# "Tomato___" prefix.
classes = [
    f"Tomato___{condition}"
    for condition in ("Early_blight", "Late_blight", "Leaf_Mold", "healthy")
]

# Load every readable image of the selected classes, resize it to a square
# of img_size x img_size pixels and scale its pixel values to [0, 1].
img_size = 128
data = []          # preprocessed images, one array per readable file
labels = []        # class-folder name for the image at the same index in `data`
skipped_files = 0  # entries OpenCV could not decode as an image

for class_name in classes:
    folder_path = os.path.join(dataset_path, class_name)
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order (and therefore any later seeded shuffle/split)
    # reproducible across runs and machines.
    for img_file in sorted(os.listdir(folder_path)):
        # NOTE: cv2.imread returns channels in BGR order; any code that feeds
        # images to a model trained on this data must load them the same way.
        img = cv2.imread(os.path.join(folder_path, img_file))
        if img is None:
            # Not a decodable image (e.g. a stray non-image file): skip it,
            # but keep count so the omission is visible below.
            skipped_files += 1
            continue
        img = cv2.resize(img, (img_size, img_size))
        # Scale to [0, 1] as float32: plain `img / 255.0` would promote to
        # float64 and double the memory needed for the whole dataset.
        data.append(img.astype(np.float32) / 255.0)
        labels.append(class_name)

print(f"✅ Total images loaded: {len(data)}")
if skipped_files:
    print(f"⚠️ Skipped {skipped_files} unreadable file(s)")

# Stack the per-image arrays into one array and turn the class names
# into an array as well.
X = np.array(data)
label_names = np.array(labels)

# One-hot encode the class names: fit the binarizer on them, then
# transform the same names into the indicator matrix `y`.
lb = LabelBinarizer().fit(label_names)
y = lb.transform(label_names)

# Save label classes for later use in GUI.
# The path is relative, so it is resolved against the current working
# directory. open(..., "wb") does not create missing parent folders, so make
# sure the target directory exists instead of failing with FileNotFoundError
# after all images have already been loaded.
label_classes_path = "../model/label_classes.pkl"
os.makedirs(os.path.dirname(label_classes_path), exist_ok=True)
with open(label_classes_path, "wb") as f:
    pickle.dump(lb.classes_, f)

# Hold out 20% of the samples for testing and keep the remaining 80% for
# training; the fixed seed makes the shuffle repeatable for a given
# sample order.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report the training-set shapes as a quick sanity check.
print("✅ X_train shape:", X_train.shape)
print("✅ y_train shape:", y_train.shape)


✅ Total images loaded: 5452
✅ X_train shape: (4361, 128, 128, 3)
✅ y_train shape: (4361, 4)
