# In [6]:
# Week 1: Data Collection & Preprocessing
# Project: AI-Driven Crop Health Monitoring and Yield Prediction

import os
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# -------------------------------
# PART 1: Crop Disease Detection (Image Dataset Preprocessing)
# -------------------------------

def preprocess_images(image_dir, target_size=(224,224)):
    """
    Load, resize, normalize images for CNN models.

    Every sub-directory of ``image_dir`` is treated as one class. Entries of
    ``image_dir`` that are not directories are ignored, so the label indices
    are contiguous (0..n_classes-1) and ``class_dict`` only lists real classes.
    Files that OpenCV cannot decode are skipped.

    Args:
        image_dir: str -> Path to dataset folder (PlantVillage or custom)
        target_size: tuple -> size handed to cv2.resize for every image
            (cv2.resize takes it in (width, height) order)
    Returns:
        X: numpy array of images, pixel values divided by 255
        y: numpy array of labels (int encoded)
        class_dict: dict -> mapping class index to class name
    """
    # Only directories are classes. A stray file (README, .DS_Store, ...)
    # must not consume a label index, otherwise the labels have gaps and
    # class_dict advertises a class with no samples.
    class_names = sorted(
        name for name in os.listdir(image_dir)
        if os.path.isdir(os.path.join(image_dir, name))
    )
    class_dict = dict(enumerate(class_names))

    X = []
    y = []
    for label, crop_class in class_dict.items():
        crop_dir = os.path.join(image_dir, crop_class)
        # os.listdir order is arbitrary; sort so the sample order is reproducible.
        for file in sorted(os.listdir(crop_dir)):
            img_path = os.path.join(crop_dir, file)
            try:
                # NOTE: cv2.imread returns channels in BGR order; convert with
                # cv2.cvtColor if the downstream model expects RGB.
                img = cv2.imread(img_path)
                if img is None:
                    # Not an image / unreadable file: imread signals this with None.
                    continue
                img = cv2.resize(img, target_size)
            except Exception as e:
                # Best effort: report the bad file and keep going.
                print(f"Error loading {img_path}: {e}")
                continue
            X.append(img / 255.0)  # normalize to [0, 1] for 8-bit images
            y.append(label)

    return np.array(X), np.array(y), class_dict


# Example usage (uncomment after downloading PlantVillage dataset):
# X_img, y_img, class_dict = preprocess_images("data/PlantVillage")
# X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(X_img, y_img, test_size=0.2, stratify=y_img)


# -------------------------------
# PART 2: Crop Yield Prediction (Structured Data Preprocessing)
# -------------------------------

def preprocess_yield_data(csv_path):
    """
    Preprocess structured crop yield data.

    The rows are split 70/15/15 into train/validation/test first. The
    imputation means and the StandardScaler are then fitted on the training
    split only and applied to the other two splits, so no statistics from
    validation/test rows leak into training.

    Args:
        csv_path: str -> path to structured dataset (FAO/Kaggle); must contain
            a 'Yield' column, all other columns are used as features
    Returns:
        X_train, X_val, X_test: standardized feature matrices
            (output of StandardScaler)
        y_train, y_val, y_test: 'Yield' values for the same rows
            (pandas Series)
    Raises:
        KeyError: if the CSV has no 'Yield' column
    """
    df = pd.read_csv(csv_path)

    # Example: expected columns -> ['Soil_pH','N','P','K','Moisture','Temperature','Rainfall','Yield']
    # Features & Target
    features = df.drop(columns=['Yield'])
    target = df['Yield']

    # Train-Val-Test split (done BEFORE any statistics are computed).
    # Same sizes and random_state as before, so rows land in the same splits.
    X_train, X_temp, y_train, y_temp = train_test_split(features, target, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Handle missing values with means learned from the training rows only.
    feature_means = X_train.mean(numeric_only=True)
    target_mean = y_train.mean()
    X_train = X_train.fillna(feature_means)
    X_val = X_val.fillna(feature_means)
    X_test = X_test.fillna(feature_means)
    y_train = y_train.fillna(target_mean)
    y_val = y_val.fillna(target_mean)
    y_test = y_test.fillna(target_mean)

    # Standardize numeric features: fit on train, reuse the same scaling
    # for validation and test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, X_val, X_test, y_train, y_val, y_test


# Example usage (after downloading structured dataset):
# X_train, X_val, X_test, y_train, y_val, y_test = preprocess_yield_data("data/crop_yield.csv")


# -------------------------------
# PART 3: Data Augmentation (For Images – Optional, helps CNN generalization)
# -------------------------------

def get_image_generator():
    """
    Build the ImageDataGenerator used to augment training images.

    The generator is configured with random rotation (rotation_range=20),
    horizontal/vertical shifts (0.1 each), zoom (0.1) and horizontal flips;
    pixels exposed by a transform are filled with fill_mode='nearest'.

    Returns:
        A freshly constructed ImageDataGenerator with the settings above.
    """
    augmentation_settings = {
        'rotation_range': 20,
        'width_shift_range': 0.1,
        'height_shift_range': 0.1,
        'zoom_range': 0.1,
        'horizontal_flip': True,
        'fill_mode': 'nearest',
    }
    return ImageDataGenerator(**augmentation_settings)

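# Example usage (after train split):
# datagen = get_image_generator()
# datagen.fit(X_train_img)  # only needed when featurewise_center / featurewise_std_normalization / zca_whitening are enabled
# train_batches = datagen.flow(X_train_img, y_train_img, batch_size=32)  # yields augmented batches for model.fit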
