<a href="https://colab.research.google.com/github/YahyaHajji/AI_Simple_Image_Classification/blob/master/image_classification_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ====================================================
# 🧠 DATASET CREATION WITH ADVANCED IMAGE SCRAPING
# ====================================================

!pip install duckduckgo-search pillow tensorflow scikit-learn matplotlib --quiet

import os, io, requests, warnings
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from duckduckgo_search import DDGS
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# `keras.__version__` is the version of the Keras package, which is not
# guaranteed to equal the TensorFlow release number, so each version is
# reported under its own label.
import tensorflow as tf

print("TensorFlow version:", tf.__version__)
print("Keras version:", keras.__version__)

In [None]:
# ====================================================
# 1️⃣ Dataset structure
# ====================================================

# Labels used throughout the notebook; each one gets its own folder under
# dataset/. Every makedirs call is a no-op when the folder already exists.
classes = ['car', 'truck', 'frog']

os.makedirs("dataset", exist_ok=True)
for cls in classes:
    class_dir = f"dataset/{cls}"
    os.makedirs(class_dir, exist_ok=True)

print("‚úì Dataset structure ready")

In [None]:
# ====================================================
# 🔧 Setup
# ====================================================
!pip install duckduckgo-search pillow requests

import os, io, requests
from PIL import Image
from duckduckgo_search import DDGS

# Labels to scrape; each label doubles as the search term and the folder name.
classes = ['car', 'truck', 'frog']

# Make sure every per-class folder exists (no error if it is already there).
for cls in classes:
    target_dir = f'dataset/{cls}'
    os.makedirs(target_dir, exist_ok=True)


# ====================================================
# 1️⃣ Image fetching function
# ====================================================
def fetch_images(search_term, limit=60):
    """Return image URLs found by a DuckDuckGo image search.

    Args:
        search_term: Query string passed to the image search.
        limit: Maximum number of search results to request.

    Returns:
        A list of unique URLs starting with ``http``, in result order. The
        list may hold fewer than ``limit`` items because results without a
        usable URL and repeated URLs are dropped.
    """
    urls = []
    seen = set()
    with DDGS() as ddgs:
        hits = ddgs.images(
            keywords=search_term,
            region='wt-wt',
            safesearch='moderate',  # accepted values: 'off', 'moderate', 'on'
            max_results=limit,
        )
        for hit in hits:
            link = hit.get("image")
            # Skip results whose URL is missing or not a string instead of
            # raising AttributeError on `.startswith`.
            if not isinstance(link, str) or not link.startswith("http"):
                continue
            # Search results can repeat a URL; keeping the repeats would let
            # the same picture be stored more than once in the dataset.
            if link in seen:
                continue
            seen.add(link)
            urls.append(link)
    return urls


# ====================================================
# 2️⃣ Image validation function
# ====================================================
def is_image_valid(img, min_size=80, max_aspect_deviation=0.7):
    """Check that an image is RGB, large enough, and not too elongated.

    Args:
        img: Image-like object exposing ``mode`` and ``size`` (width, height),
            such as a ``PIL.Image.Image``.
        min_size: Minimum accepted width and height in pixels.
        max_aspect_deviation: How far the long-side / short-side ratio may
            exceed 1. With the default 0.7, ratios up to 1.7 are accepted.

    Returns:
        True if the image passes every check; False otherwise, including when
        ``img`` does not expose the expected attributes.
    """
    try:
        if img.mode != 'RGB':
            return False
        width, height = img.size
        if width < min_size or height < min_size:
            return False
        # Compare the long side to the short side so portrait and landscape
        # images are held to the same limit. Using width / height directly
        # would accept a 100x300 image while rejecting the same image rotated
        # to 300x100.
        long_side = max(width, height)
        short_side = min(width, height)
        return long_side / short_side - 1 <= max_aspect_deviation
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # Malformed input counts as "invalid"; unrelated exceptions such as
        # KeyboardInterrupt are no longer swallowed.
        return False


# ====================================================
# 3️⃣ Download and validate images
# ====================================================
def download_images_advanced(classes, images_per_class=10, search_limit=100):
    """Download, validate, and store images for every class.

    For each class name, candidate URLs come from ``fetch_images``. Each one
    is downloaded, decoded, converted to RGB, and checked with
    ``is_image_valid``. Accepted images are written to
    ``dataset/<class>/img_<k>.jpg`` with ``k`` counting from 1, so a file
    with the same number left by an earlier run is overwritten.

    Args:
        classes: Iterable of class names; each is used both as the search
            term and as the sub-folder of ``dataset/``.
        images_per_class: Stop after this many accepted images per class.
        search_limit: Number of candidate URLs requested per class.

    Returns:
        None. Progress and per-URL failures are reported with ``print``.
    """
    for cls in classes:
        print(f"\nüîç Searching images for: {cls}")
        urls = fetch_images(cls, limit=search_limit)
        count = 0

        for idx, url in enumerate(urls):
            if count >= images_per_class:
                break
            try:
                response = requests.get(url, timeout=8)
                # Fail on 4xx/5xx responses here so an HTML error page is
                # reported as an HTTP error instead of being handed to PIL.
                response.raise_for_status()
                img = Image.open(io.BytesIO(response.content)).convert("RGB")

                if is_image_valid(img):
                    img.save(f"dataset/{cls}/img_{count+1}.jpg")
                    count += 1
                    print(f"‚úì Saved {cls}/img_{count}.jpg ‚úÖ")
                else:
                    print(f"‚úó Skipped (invalid image): {url}")

            except Exception as e:
                # Deliberately broad: one bad URL (network error, undecodable
                # bytes, failed save) must not stop the remaining downloads.
                print(f"‚ö†Ô∏è Error {idx+1}: {e}")

        print(f"‚úÖ {count} valid images saved for '{cls}'.")

# ====================================================
# 4️⃣ Run the scraper
# ====================================================
# Download images for every label. The per-class cap is passed explicitly so
# it visibly matches the "10" quoted in the summary line below.
download_images_advanced(classes, images_per_class=10)
print("\nüéØ All classes now have up to 10 valid images each!")


In [None]:
import os
from PIL import Image
import numpy as np

# Image size and label set; these must match the values used by the other cells.
IMG_SIZE = (128, 128)
classes = ['car', 'truck', 'frog']
class_names = classes  # alias used by the visualization cell
class_to_idx = {cls: i for i, cls in enumerate(classes)}

# PIL sizes are (width, height) while the arrays built from them are
# (height, width, channels), so the expected shape swaps the two entries.
expected_shape = (IMG_SIZE[1], IMG_SIZE[0], 3)

X = []  # image arrays
y = []  # integer class labels, aligned with X
base_dir = "dataset"

print("Loading images and labels from dataset directory...")
for cls_name in classes:
    class_path = os.path.join(base_dir, cls_name)
    class_idx = class_to_idx[cls_name]
    if not os.path.isdir(class_path):
        print(f"Warning: Class directory not found: {class_path}")
        continue
    # A sorted listing keeps the sample order, and therefore any seeded split
    # of X / y, identical from run to run.
    for img_name in sorted(os.listdir(class_path)):
        img_path = os.path.join(class_path, img_name)
        try:
            # The context manager closes the file handle. Converting to RGB
            # before resizing means the resampling runs on RGB pixel data
            # rather than on the file's original mode.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert("RGB").resize(IMG_SIZE)
            img_array = np.array(img)
            if img_array.shape == expected_shape:
                X.append(img_array)
                y.append(class_idx)
            else:
                print(f"Skipping {img_path}: unexpected shape {img_array.shape}")
        except Exception as e:
            # Best effort: an unreadable file is reported and skipped.
            print(f"Error loading {img_path}: {e}")

# Fix dtype and shape explicitly so that even an empty dataset produces a 4-D
# image array and an integer label array.
X = np.array(X, dtype=np.uint8).reshape(-1, *expected_shape)
y = np.array(y, dtype=np.int64)

print(f"Loaded {len(X)} images with labels.")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

In [None]:
# ====================================================
# 5️⃣ Split dataset into train/test folders
# ====================================================

import shutil
import random

def split_dataset(base_dir='dataset', output_dir='data_split', train_ratio=0.8, seed=None):
    """Copy each class folder's files into train/ and test/ sub-trees.

    ``output_dir`` is deleted and recreated on every call. Every non-hidden
    sub-directory of ``base_dir`` is treated as one class; its files are
    shuffled and the first ``int(len(files) * train_ratio)`` go to
    ``<output_dir>/train/<class>``, the rest to ``<output_dir>/test/<class>``.

    Args:
        base_dir: Folder containing one sub-folder per class.
        output_dir: Destination folder (removed first if it exists).
        train_ratio: Fraction of each class assigned to the training set.
        seed: Optional seed for a reproducible shuffle. When None, the
            module-level ``random`` generator is used, as before.

    Returns:
        None. A per-class summary is printed.
    """
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Only real, non-hidden directories are classes. A stray file in base_dir
    # would otherwise crash os.listdir below, and a hidden folder would be
    # turned into a class of its own.
    class_dirs = sorted(
        entry for entry in os.listdir(base_dir)
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(base_dir, entry))
    )

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    for cls in class_dirs:
        src_dir = os.path.join(base_dir, cls)
        # Sort before shuffling so a given seed yields the same split on
        # every filesystem; sub-directories are skipped because shutil.copy
        # only handles files.
        images = sorted(
            name for name in os.listdir(src_dir)
            if os.path.isfile(os.path.join(src_dir, name))
        )
        shuffle(images)
        split_idx = int(len(images) * train_ratio)
        train_files = images[:split_idx]
        test_files = images[split_idx:]

        for subset, files in (('train', train_files), ('test', test_files)):
            dest_dir = os.path.join(output_dir, subset, cls)
            os.makedirs(dest_dir, exist_ok=True)
            for f in files:
                shutil.copy(os.path.join(src_dir, f), os.path.join(dest_dir, f))

        print(f"‚úÖ {cls}: {len(train_files)} train | {len(test_files)} test")

    print("\nüéØ Dataset successfully split into 'train' and 'test' folders!")


# Build data_split/train and data_split/test from dataset/ (80% train); the
# arguments are spelled out because later cells read from these folders.
split_dataset(base_dir='dataset', output_dir='data_split', train_ratio=0.8)


In [None]:
# ====================================================
# ✅ Split the dataset safely + visualize samples
# ====================================================

import math
import warnings
warnings.filterwarnings("ignore", message="datetime.datetime.utcnow", category=DeprecationWarning)

from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

VAL_FRACTION = 0.2  # share of samples held out for validation

# Check dataset balance before splitting. Labels are counted as plain Python
# ints so the printed summary shows bare class indices.
class_distribution = Counter(np.asarray(y).tolist())
print("üìä Class distribution before split:", class_distribution)

if not class_distribution:
    # Same exception type that min() on an empty sequence raised before, but
    # with a message that says what to do.
    raise ValueError("No images were loaded; run the dataset-loading cell before splitting.")

# A stratified split needs at least 2 samples in every class, and both the
# training and the validation part must be at least as large as the number
# of classes. Otherwise fall back to a plain random split.
n_val = math.ceil(VAL_FRACTION * len(y))
n_train = len(y) - n_val
n_present = len(class_distribution)
if min(class_distribution.values()) < 2 or n_val < n_present or n_train < n_present:
    print("‚ö†Ô∏è Not enough samples per class ‚Äî disabling stratify for safety.")
    stratify_param = None
else:
    stratify_param = y

# Split dataset into training (80%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=VAL_FRACTION,
    random_state=42,
    stratify=stratify_param
)

# Convert labels to categorical (one-hot encoding)
num_classes = len(class_names)
y_train_cat = keras.utils.to_categorical(y_train, num_classes)
y_val_cat = keras.utils.to_categorical(y_val, num_classes)

print(f"‚úÖ Train set: {X_train.shape[0]} images")
print(f"‚úÖ Validation set: {X_val.shape[0]} images")
print(f"‚úÖ Classes: {class_names}")
print(f"‚úÖ Image shape: {X_train.shape[1:]}")

# ====================================================
# 🖼️ Visualize some random samples from the training set
# ====================================================

# Never ask for more samples than exist: drawing 6 without replacement from
# a smaller training set raises ValueError.
n_show = min(6, len(X_train))

plt.figure(figsize=(10, 5))
indices = np.random.choice(len(X_train), n_show, replace=False)

for i, idx in enumerate(indices):
    plt.subplot(2, 3, i+1)
    plt.imshow(X_train[idx])
    plt.title(class_names[y_train[idx]])
    plt.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# ====================================================
# 7️⃣ Load dataset and prepare data generators
# ====================================================

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Input resolution and batch size shared by both generators.
IMG_SIZE = (128, 128)
BATCH_SIZE = 16

# Folders written by the train/test folder split.
train_dir = "data_split/train"
test_dir = "data_split/test"

# Random perturbations applied to training images only; both generators
# scale pixel values by 1/255.
augmentation = dict(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation)
test_datagen = ImageDataGenerator(rescale=1./255)

# Both directory iterators share the same size, batch and label settings.
flow_options = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)
train_gen = train_datagen.flow_from_directory(train_dir, **flow_options)
test_gen = test_datagen.flow_from_directory(test_dir, **flow_options)

# The class count comes from the folders found under train_dir.
num_classes = len(train_gen.class_indices)
print(f"\nüìö Detected {num_classes} classes: {list(train_gen.class_indices.keys())}")


In [None]:
# ====================================================
# 8️⃣ Define a simple CNN model
# ====================================================

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Feature extractor: three convolution + max-pooling stages with 32, 64 and
# 128 filters. Only the first convolution declares the input shape
# (IMG_SIZE plus 3 colour channels).
feature_layers = [
    Conv2D(32, (3, 3), activation='relu', input_shape=(*IMG_SIZE, 3)),
    MaxPooling2D(2, 2),
]
for filters in (64, 128):
    feature_layers.append(Conv2D(filters, (3, 3), activation='relu'))
    feature_layers.append(MaxPooling2D(2, 2))

# Classifier head: flatten, dropout, one hidden dense layer, then a softmax
# over the detected classes.
head_layers = [
    Flatten(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
]

model = Sequential(feature_layers + head_layers)

# Categorical cross-entropy matches the one-hot labels produced by the
# generators' class_mode='categorical'.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()


In [None]:
# ====================================================
# 9️⃣ Train the CNN model
# ====================================================

# Number of passes over the training generator.
EPOCHS = 10

# Fit on the training generator; the test generator supplies the per-epoch
# validation metrics. The returned History object is kept for plotting.
history = model.fit(x=train_gen, epochs=EPOCHS, validation_data=test_gen)


In [None]:
# ====================================================
# 🔟 Visualize accuracy and loss
# ====================================================

# One panel per metric, side by side. Each entry is
# (train key, validation key, train label, validation label, panel title).
panels = [
    ('accuracy', 'val_accuracy', 'Train Acc', 'Val Acc', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss'),
]

plt.figure(figsize=(10, 4))
for column, (train_key, val_key, train_label, val_label, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, column)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.legend()

plt.show()


In [None]:
# ====================================================
# 11️⃣ Test predictions on random test images
# ====================================================

import numpy as np
import random

# Label names taken from the training generator's class_indices mapping.
# The prediction helper indexes this list with the model's argmax, which
# assumes the key order follows the class index — TODO confirm.
class_names = list(train_gen.class_indices)

def show_random_predictions(n=5):
    """Plot ``n`` random test images titled with true and predicted class.

    Reads the module-level ``class_names``, ``test_dir``, ``IMG_SIZE`` and
    ``model``. For every panel a class is picked at random, then a random
    file from ``<test_dir>/<class>`` is loaded, resized, scaled by 1/255 and
    passed to ``model.predict``.

    Args:
        n: Number of images to show in a single row.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Only sample from classes that actually have test files: an empty (or
    # missing) folder would otherwise make random.choice / os.listdir raise.
    files_by_class = {}
    for name in class_names:
        class_dir = f"{test_dir}/{name}"
        if os.path.isdir(class_dir):
            files = os.listdir(class_dir)
            if files:
                files_by_class[name] = files
    if not files_by_class:
        print(f"No test images found under {test_dir}; nothing to display.")
        return
    available = list(files_by_class)

    plt.figure(figsize=(12, 5))
    for i in range(n):
        cls = random.choice(available)
        img_name = random.choice(files_by_class[cls])
        img_path = f"{test_dir}/{cls}/{img_name}"

        # Force three channels, as the dataset-loading cell does, so a
        # grayscale or RGBA file cannot produce an input of the wrong shape;
        # the context manager closes the file handle.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB").resize(IMG_SIZE)
        img_array = np.expand_dims(np.array(img) / 255.0, axis=0)
        pred = model.predict(img_array, verbose=0)
        pred_cls = class_names[np.argmax(pred)]

        plt.subplot(1, n, i+1)
        plt.imshow(img)
        plt.axis('off')
        plt.title(f"True: {cls}\nPred: {pred_cls}")
    plt.show()

# Show five random test images with their true and predicted labels.
show_random_predictions(n=5)
