# In [1]:
import pandas as pd
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Metadata CSV describing the dataset. load_images() reads each row's 'path'
# column to locate the image file and its 'toxicity' column for the label.
file_path = 'full_metadata.csv'  # Adjust this path as needed

# Read the metadata and keep only the first 250 rows.
plant_data = pd.read_csv(file_path).head(250)

# Target size handed to cv2.resize for every loaded image.
image_size = (128, 128)

def load_images(plant_data, image_size, image_dir=""):
    """Load, resize and label the images listed in a metadata table.

    Args:
        plant_data: DataFrame with a 'path' column (image file location) and
            a 'toxicity' column (label; presumably 0 = non-toxic, 1 = toxic --
            confirm against the dataset).
        image_size: Target size passed to ``cv2.resize`` as ``dsize``; OpenCV
            interprets it as (width, height).
        image_dir: Optional folder prepended to every value in 'path'. The
            default empty string leaves the paths exactly as stored in the
            table; absolute paths in the table are also left unchanged.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays. Rows whose image cannot
        be read are skipped (a warning is printed), so the two arrays always
        have the same length and stay aligned.
    """
    images = []
    labels = []

    # Only two columns are needed, so iterate them directly instead of
    # building a full Series per row with iterrows().
    for rel_path, label in zip(plant_data['path'], plant_data['toxicity']):
        img_path = os.path.join(image_dir, rel_path)

        # cv2.imread signals failure by returning None rather than raising.
        # Successfully decoded images come back in BGR channel order.
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Could not read image {img_path}")
            continue

        images.append(cv2.resize(img, image_size))
        labels.append(label)

    return np.array(images), np.array(labels)

# Load the images and labels, then scale pixel values by 1/255.
X, y = load_images(plant_data, image_size)
X = np.true_divide(X, 255.0)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Persist each split so the model-training step can load it directly.
for _fname, _arr in (
    ('X_train.npy', X_train),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(_fname, _arr)

print(f"Data preprocessing completed: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples.")


# Output: Data preprocessing completed: 200 training samples, 50 test samples.
