<a href="https://colab.research.google.com/github/aliikhwan99/cassava-leaf-disease-classification/blob/main/cassava_leaf_disease_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate this session with Kaggle; this must run before the cell that
# calls kagglehub.competition_download.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition data and keep whatever kagglehub returns
# (presumably the local download path).
# NOTE(review): later cells read from the hard-coded directory
# '/kaggle/input/cassava-leaf-disease-classification' instead of this
# variable - confirm both refer to the same location in this environment.
cassava_leaf_disease_classification_path = kagglehub.competition_download('cassava-leaf-disease-classification')

print('Data source import complete.')


In [None]:
import os
import shutil

# `plt` must name the pyplot submodule: the top-level matplotlib package has
# no imshow()/show()/axis() functions.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Access the Dataset

In [None]:
# Access the dataset
import os

# List all files in the input directory.
# `input_dir` is reused by later cells to build the CSV, JSON and image paths.
input_dir = '/kaggle/input/cassava-leaf-disease-classification'
os.listdir(input_dir)

# Load CSV and JSON Files

In [None]:
import pandas as pd
import json

# Read the training table (train.csv) into a DataFrame
train_csv_path = os.path.join(input_dir, 'train.csv')
train_df = pd.read_csv(train_csv_path)

# Read the JSON file that maps label numbers to disease names
label_map_path = os.path.join(input_dir, 'label_num_to_disease_map.json')
with open(label_map_path) as label_file:
    label_map = json.load(label_file)

# Preview the first rows of the training data, then the label mapping
print(train_df.head())
print(label_map)


# Load and Visualize Images

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Render one training image inline, without axes
def load_and_display_image(image_id):
    """Open ``train_images/<image_id>`` under ``input_dir`` and show it.

    Args:
        image_id: File name of the image inside the ``train_images`` folder.
    """
    picture = Image.open(os.path.join(input_dir, 'train_images', image_id))
    plt.imshow(picture)
    plt.axis('off')  # no ticks or frame around the picture
    plt.show()

# Display a sample image: the image_id stored in the first row of train_df
sample_image_id = train_df['image_id'].iloc[0]
load_and_display_image(sample_image_id)


# Preprocess Images

In [None]:
from PIL import Image
from torchvision import transforms

# Preprocessing pipeline, built from individually named steps
resize_to_224 = transforms.Resize((224, 224))  # fixed 224x224 spatial size
to_tensor = transforms.ToTensor()              # PIL image -> PyTorch tensor
# Channel-wise normalization; these values match the widely used ImageNet
# statistics.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
preprocess = transforms.Compose([resize_to_224, to_tensor, normalize])

# Function to load an image using PIL
def load_image(image_id,
               image_dir='/kaggle/input/cassava-leaf-disease-classification/train_images'):
    """Load one image from disk as a 3-channel RGB PIL image.

    Args:
        image_id: File name of the image inside ``image_dir``.
        image_dir: Directory holding the images. Defaults to the
            competition's training-image folder on Kaggle.

    Returns:
        A fully loaded ``PIL.Image.Image`` in mode ``"RGB"``.
    """
    image_path = os.path.join(image_dir, image_id)
    # Image.open is lazy. Converting inside the context manager forces the
    # pixel data to be read and lets the file handle be released immediately.
    # The RGB conversion guarantees three channels, which a Normalize
    # transform with 3-value mean/std requires.
    with Image.open(image_path) as img:
        return img.convert('RGB')

# Example: load the first training image and run it through `preprocess`
sample_image_id = train_df['image_id'].iloc[0]  # Replace with actual image ID if needed
img = load_image(sample_image_id)               # Load image
img_tensor = preprocess(img)                    # Apply preprocessing transformations

# Check the shape of the processed image tensor
# (the pipeline resizes to 224x224, so the last two dimensions should be 224)
print(img_tensor.shape)


# Define the Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

# Define your CNN model (simple architecture for image classification)
class CassavaCNN(nn.Module):
    """Small CNN classifier: three conv/ReLU/max-pool stages and two dense layers.

    The convolutional feature map is adaptively average-pooled to 8x8 before
    the classifier head, so the network accepts any input resolution (the
    notebook's pipelines resize to 224x224). For 64x64 inputs the feature map
    is already 8x8 and the adaptive pooling is a no-op. The adaptive pooling
    layer has no parameters, so the ``state_dict`` keys are exactly those of
    the conv and linear layers.

    Args:
        num_classes: Number of output logits (5 cassava leaf classes by default).
    """

    def __init__(self, num_classes=5):
        super().__init__()

        # Convolutional feature extractor; 3x3 kernels with padding=1 keep the
        # spatial size, so only the pooling layers shrink the feature map.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)

        # Halves height and width after every conv stage
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        # Brings the final feature map to the fixed 8x8 grid that fc1 expects,
        # whatever the input resolution was.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((8, 8))

        # Fully connected layers
        self.fc1 = nn.Linear(128 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, num_classes)

        # Dropout layer
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))

        x = self.adaptive_pool(x)
        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this can never silently fold extra features into the batch axis.
        x = torch.flatten(x, 1)

        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x


# Prepare Dataset and DataLoader

In [None]:
import os
# Print the entries found directly under /kaggle/input
print(os.listdir("/kaggle/input"))


In [None]:
import os
# Print the entries found in the competition's input directory
print(os.listdir("/kaggle/input/cassava-leaf-disease-classification"))


In [None]:
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms

# Read the image file names and labels, shuffle the rows with a fixed seed
# (42), renumber the index, and keep only the first 1000 shuffled rows.
train_df = (
    pd.read_csv('/kaggle/input/cassava-leaf-disease-classification/train.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(1000)
)

# Custom Dataset class to handle loading of images and their labels
class CassavaDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs listed in a DataFrame.

    NOTE(review): a later cell defines another class with this same name whose
    constructor takes a CSV path instead of a DataFrame; whichever cell ran
    last wins.

    Args:
        data_frame: Table whose first column holds image file names and whose
            second column holds the class labels.
        root_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, data_frame, root_dir, transform=None):
        self.data_frame = data_frame
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.data_frame)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        img_name = self.data_frame.iloc[idx, 0]  # first column: image file name
        img_path = f"{self.root_dir}/{img_name}"

        # Force three channels so a 3-value Normalize always applies, and load
        # the pixels inside the context manager so the file handle is released.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # Plain Python int, matching the CSV-based CassavaDataset in this file
        label = int(self.data_frame.iloc[idx, 1])  # second column: class label

        if self.transform is not None:
            image = self.transform(image)

        return image, label

import torch  # needed here only for the seeded split generator

# Define transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define batch size
batch_size = 32  # You can adjust this value depending on your memory constraints

# Create the dataset from the sampled rows of train_df
train_dataset = CassavaDataset(
    data_frame=train_df,
    root_dir='/kaggle/input/cassava-leaf-disease-classification/train_images',
    transform=transform
)

# Hold out 20% of the samples for testing (800 / 200 for the 1000 sampled
# rows). Deriving the sizes from the dataset keeps the split valid if the
# sample count changes.
num_samples = len(train_dataset)
test_size = num_samples // 5
train_size = num_samples - test_size

# Seed the split so the same images land in the same subset on every run,
# just as the DataFrame shuffle above it is seeded.
split_generator = torch.Generator().manual_seed(42)
train_subset, test_subset = random_split(
    train_dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders for training and testing
train_loader = DataLoader(dataset=train_subset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_subset, batch_size=batch_size, shuffle=False)

# Now 'train_loader' serves the training subset and 'test_loader' the held-out subset


In [None]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Define the custom dataset class
class CassavaDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs listed in a CSV file.

    Args:
        csv_file: Path (or file-like object) passed to ``pd.read_csv``. The
            first column must hold image file names, the second the labels.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0])

        # Force three channels so a 3-value Normalize always applies, and load
        # the pixels inside the context manager so the file handle is released.
        with Image.open(img_name) as img:
            image = img.convert('RGB')

        label = int(self.data.iloc[idx, 1])

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Preprocessing: fixed 224x224 size, tensor conversion, channel-wise normalization
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Locations of the image folder and the label CSV
img_dir = '/kaggle/input/cassava-leaf-disease-classification/train_images'
csv_file = '/kaggle/input/cassava-leaf-disease-classification/train.csv'

# Dataset over every row of train.csv, served in shuffled mini-batches
batch_size = 32
train_dataset = CassavaDataset(csv_file=csv_file, img_dir=img_dir, transform=preprocess)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Sanity check: pull a single batch and report its shapes
images, labels = next(iter(dataloader))
print(images.shape, labels.shape)  # Should print: torch.Size([32, 3, 224, 224]), torch.Size([32])


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models

# Load a pre-trained model.
# `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` used to select.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, 5)  # 5 classes for the diseases

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Set the model to training mode
model.train()

# Define number of epochs
num_epochs = 1  # You can adjust this value as needed

# NOTE(review): `dataloader` iterates every row of train.csv, which includes
# the images held out in `test_loader`, so the accuracy measured on
# `test_loader` later is not an unseen-data estimate. Confirm whether
# `train_loader` was meant here.

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0  # sum of per-sample losses seen this epoch
    samples_seen = 0

    for images, labels in dataloader:  # Use a DataLoader to load your data
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        samples_in_batch = images.size(0)
        running_loss += loss.item() * samples_in_batch
        samples_seen += samples_in_batch

    # Report the mean loss over the whole epoch rather than whatever the last
    # mini-batch happened to produce (NaN if the loader yielded nothing).
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")


In [None]:
# Sanity-check the prediction results, then write submission.csv.
#
# NOTE(review): `test_image_ids` and `predictions` are not assigned anywhere in
# the visible notebook; they must already exist when this cell runs - confirm
# where they are produced.
# NOTE(review): `test_loader` as built earlier in this notebook holds the
# 200-sample held-out split, so the diagnostic pass below cannot reach the
# expected 1,000 predictions - confirm which loader is intended.
expected_count = 1000

# Ensure the lengths of test_image_ids and predictions match
print(f"Number of test images (expected): {expected_count}")
print(f"Number of test image IDs: {len(test_image_ids)}")
print(f"Number of predictions: {len(predictions)}")

# Check if they are different
if len(test_image_ids) != len(predictions) or len(test_image_ids) != expected_count:
    print("Mismatch detected or incorrect number of samples!")

    # Print some sample information for debugging
    print("First few image IDs:", test_image_ids[:5])
    print("First few predictions:", predictions[:5])

    # Re-run inference over the test DataLoader, counting images per batch
    num_images_processed = 0
    all_predictions = []

    # Predict in eval mode so dropout is disabled and batch-norm uses (and does
    # not update) its running statistics; restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i, (images, _) in enumerate(test_loader):  # labels are ignored here
                # A separate name keeps the notebook-level `batch_size`
                # setting from being overwritten by the last batch's size.
                current_batch_size = images.size(0)
                num_images_processed += current_batch_size  # Count how many images are processed

                print(f"Batch {i}: Batch size = {current_batch_size}, Total images processed so far = {num_images_processed}")

                # Generate predictions for the current batch
                preds = model(images)
                preds = preds.argmax(dim=1).cpu().numpy()  # logits -> class labels

                all_predictions.extend(preds)  # Append predictions from the batch

                print(f"Predictions for Batch {i}: {len(preds)}")  # Check number of predictions per batch
    finally:
        model.train(was_training)

    # Final check after processing all batches
    print(f"Total images processed: {num_images_processed}")
    print(f"Total predictions made: {len(all_predictions)}")

    if len(all_predictions) != expected_count:
        print(f"Total predictions don't match the expected {expected_count:,} samples!")

    # Raise error after diagnostic check
    raise ValueError("Mismatch between the number of test image IDs and predictions.")
else:
    print("Everything looks correct, proceeding to create submission.")

# Reached only when the lengths matched (the mismatch branch raises above)
submission_df = pd.DataFrame({
    'image_id': test_image_ids[:expected_count],  # Ensure only 1,000 IDs are used
    'label': predictions[:expected_count]         # Ensure only 1,000 predictions are used
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)


# Evaluate the Model (Optional)

In [None]:
# Switch the model to inference behaviour
model.eval()

# Running tallies of correct predictions and evaluated samples
correct = 0
total = 0

# Gradients are not needed for evaluation
with torch.no_grad():
    for images, labels in test_loader:  # held-out loader created earlier
        logits = model(images)
        predicted = logits.max(dim=1).indices  # class with the highest score
        total += labels.shape[0]
        correct += int((predicted == labels).sum())

print(f"Test Accuracy: {100 * correct / total:.2f}%")


# Save the Model (Optional)

In [None]:
# Save the model checkpoint: only the model's state_dict is written, to
# 'resnet50_cassava.pth' relative to the current working directory
torch.save(model.state_dict(), 'resnet50_cassava.pth')


In [None]:
# Load the model checkpoint.
# map_location='cpu' lets a checkpoint written from a GPU model load on a
# CPU-only machine; load_state_dict then copies the values into the model's
# existing parameters. weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state_dict contains.
model.load_state_dict(
    torch.load('resnet50_cassava.pth', map_location='cpu', weights_only=True)
)
