In [141]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import torch.multiprocessing as mp
from tqdm import tqdm

In [142]:
class CNN(nn.Module):
    """Stack of conv -> activation -> max-pool stages followed by two dense layers.

    Args:
        num_filters: Output channel count of each convolutional stage, in order.
        kernel_size: Kernel size shared by every convolution (no padding is used,
            so each convolution shrinks the spatial size by ``kernel_size - 1``).
        num_dense: Width of the hidden fully connected layer.
        num_classes: Number of output logits.
        activation: Callable applied after every convolution and after ``fc1``.
        input_shape: ``(channels, height, width)`` of a single input sample. It
            sizes the first convolution and the first dense layer.

    Raises:
        ValueError: If ``input_shape`` does not have exactly three entries.
    """

    def __init__(self, num_filters=(32, 64, 128, 256, 512), kernel_size=3, num_dense=256,
                 num_classes=10, activation=F.relu, input_shape=(3, 224, 224)):
        super().__init__()
        input_shape = tuple(input_shape)
        if len(input_shape) != 3:
            raise ValueError(
                f"input_shape must be (channels, height, width), got {input_shape}"
            )

        self.activation = activation  # Set activation function
        self.pool = nn.MaxPool2d(2, 2)

        # Convolutional layers: each stage consumes the previous stage's channels.
        self.convs = nn.ModuleList()
        in_channels = input_shape[0]
        for out_channels in num_filters:
            self.convs.append(nn.Conv2d(in_channels, out_channels, kernel_size))
            in_channels = out_channels

        # Probe the conv stack once so fc1 matches the real feature count.
        self.flattened_size = self._get_flattened_size(input_shape)

        # Fully connected layers
        self.fc1 = nn.Linear(self.flattened_size, num_dense)
        self.fc2 = nn.Linear(num_dense, num_classes)

    def _features(self, x):
        """Apply every conv -> activation -> pool stage to ``x`` and return the result."""
        for conv in self.convs:
            x = self.pool(self.activation(conv(x)))
        return x

    def _get_flattened_size(self, input_shape):
        """Return the number of features the conv stack yields for one sample.

        A zero tensor of shape ``(1, *input_shape)`` is passed through the conv
        stack without tracking gradients; its element count is the flattened size.
        """
        with torch.no_grad():
            probe = torch.zeros(1, *input_shape)
            return self._features(probe).numel()

    def forward(self, x):
        """Map a batch of images to a ``(batch, num_classes)`` tensor of logits."""
        x = self._features(x)
        x = torch.flatten(x, 1)  # Keep the batch dimension, flatten the rest
        x = self.activation(self.fc1(x))
        return self.fc2(x)
        
# Build the network with the constructor's default hyperparameters.
model = CNN()

In [167]:
# Shape check: flatten a random (1, 512, 5, 5) feature map per sample and feed it to fc1.
x = model.fc1(torch.randn(1, 512, 5, 5).flatten(start_dim=1))
x.shape

torch.Size([1, 256])

In [130]:
# Preprocessing applied to every image of both splits:
# resize to 224x224, convert to a tensor, then normalise each channel with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

# Locations of the two dataset splits (one sub-folder per class, as ImageFolder expects).
train_dir = "../inaturalist_12k/train"
val_dir = "../inaturalist_12k/val"

# Both splits share the same preprocessing pipeline.
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
val_dataset = datasets.ImageFolder(val_dir, transform=transform)

# To inspect the label mapping, look at train_dataset.classes / train_dataset.class_to_idx.

# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


In [138]:
# Training hyperparameters
learning_rate = 1e-3
batch_size = 64
max_epochs = 5

# Pinned host memory only helps host-to-GPU copies on CUDA. On other devices
# (e.g. MPS) PyTorch warns that the flag is unsupported, so enable it only
# when the selected device is CUDA.
use_pinned_memory = device.type == "cuda"

# Create DataLoaders: shuffle only the training split.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=use_pinned_memory,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=use_pinned_memory,
)

In [139]:
# Objective: cross-entropy between the model outputs and the integer class labels.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [140]:
# Train for `max_epochs` passes over the training loader, reporting the mean
# loss of every 10 consecutive batches.
model.to(device)
for epoch in tqdm(range(max_epochs)):
    model.train()
    running_loss = 0.0
    for batch_idx, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(images)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (batch_idx + 1) % 10 == 0:
            print(f'[{epoch + 1}, {batch_idx + 1:5d}] loss: {running_loss / 10:.3f}')
            running_loss = 0.0  # start a fresh 10-batch window

  0%|                                                            | 0/5 [00:00<?, ?it/s]

[1,    10] loss: 2.622
[1,    20] loss: 2.271
[1,    30] loss: 2.192
[1,    40] loss: 2.179
[1,    50] loss: 2.149
[1,    60] loss: 2.133
[1,    70] loss: 2.097
[1,    80] loss: 2.080
[1,    90] loss: 2.126
[1,   100] loss: 2.078
[1,   110] loss: 2.093
[1,   120] loss: 2.058
[1,   130] loss: 2.023
[1,   140] loss: 2.026
[1,   150] loss: 2.030


 20%|██████████▍                                         | 1/5 [00:54<03:38, 54.71s/it]

[2,    10] loss: 2.019
[2,    20] loss: 2.025
[2,    30] loss: 1.972
[2,    40] loss: 2.002
[2,    50] loss: 1.977
[2,    60] loss: 2.015
[2,    70] loss: 1.984
[2,    80] loss: 1.964
[2,    90] loss: 1.939
[2,   100] loss: 1.970
[2,   110] loss: 2.008
[2,   120] loss: 1.928
[2,   130] loss: 1.996
[2,   140] loss: 2.000
[2,   150] loss: 1.952


 40%|████████████████████▊                               | 2/5 [01:47<02:41, 53.82s/it]

[3,    10] loss: 1.924
[3,    20] loss: 1.871
[3,    30] loss: 1.917
[3,    40] loss: 2.025
[3,    50] loss: 1.925
[3,    60] loss: 1.876
[3,    70] loss: 1.897
[3,    80] loss: 1.885
[3,    90] loss: 1.894
[3,   100] loss: 1.880
[3,   110] loss: 1.912
[3,   120] loss: 1.884
[3,   130] loss: 1.950
[3,   140] loss: 1.956
[3,   150] loss: 1.894


 60%|███████████████████████████████▏                    | 3/5 [02:40<01:46, 53.19s/it]

[4,    10] loss: 1.803
[4,    20] loss: 1.848
[4,    30] loss: 1.828
[4,    40] loss: 1.898
[4,    50] loss: 1.834
[4,    60] loss: 1.790
[4,    70] loss: 1.895
[4,    80] loss: 1.857
[4,    90] loss: 1.812
[4,   100] loss: 1.794
[4,   110] loss: 1.835
[4,   120] loss: 1.882
[4,   130] loss: 1.818
[4,   140] loss: 1.913
[4,   150] loss: 1.875


 80%|█████████████████████████████████████████▌          | 4/5 [03:33<00:53, 53.24s/it]

[5,    10] loss: 1.761
[5,    20] loss: 1.742
[5,    30] loss: 1.756
[5,    40] loss: 1.741
[5,    50] loss: 1.790
[5,    60] loss: 1.788
[5,    70] loss: 1.765
[5,    80] loss: 1.827
[5,    90] loss: 1.778
[5,   100] loss: 1.798
[5,   110] loss: 1.875
[5,   120] loss: 1.739
[5,   130] loss: 1.825
[5,   140] loss: 1.797
[5,   150] loss: 1.753


100%|████████████████████████████████████████████████████| 5/5 [04:25<00:00, 53.17s/it]


In [136]:
# Top-1 accuracy on the validation loader.
# NOTE: run this cell after the training cell so the figure reflects the trained weights.
correct = 0
total = 0
model.eval()  # put the model in evaluation mode before scoring
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for images, labels in val_loader:
        # calculate outputs by running images through the network
        images, labels = images.to(device), labels.to(device)
        pred_labels = model(images)
        # the class with the highest score is what we choose as prediction
        predicted = pred_labels.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

if total == 0:
    # An empty loader would otherwise raise ZeroDivisionError below.
    print('Validation loader produced no samples; accuracy is undefined.')
else:
    # Report the real sample count and keep the fractional part of the percentage.
    print(f'Accuracy of the network on the {total} validation images: {100 * correct / total:.2f} %')

Accuracy of the network on the 10000 test images: 34 %
