<a href="https://colab.research.google.com/github/bartolomeoadrian/computational-vision/blob/main/04_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuración

In [20]:
# Imported here as well: this cell sits above the imports cell and references
# torch, so without it a fresh "Restart & Run All" fails with NameError.
import torch

BATCH_SIZE = 64        # samples per mini-batch handed to the DataLoaders
NUM_EPOCHS = 20        # number of full passes over the training set
LEARNING_RATE = 0.001  # step size passed to the SGD optimizer
MOMENTUM = 0.9         # momentum factor passed to the SGD optimizer
# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 1 - Cargar librerías

In [17]:
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from PIL import Image
from torch.utils.data import DataLoader

# 2 - Cargar y normalizar imágenes

In [4]:
# Preprocessing applied to every image: convert to a tensor, then normalize
# each of the three channels with mean 0.5 and standard deviation 0.5.
normalize = transforms.Normalize(mean=(0.5,) * 3, std=(0.5,) * 3)
transform = transforms.Compose([transforms.ToTensor(), normalize])

# CIFAR-10 class index treated as the positive ("frog") class.
FROG_CLASS_IDX = 6


def binary_target_transform(label: int) -> int:
    """Collapse a CIFAR-10 class index into a binary frog indicator.

    Args:
        label: Original class index of the sample.

    Returns:
        1 when ``label`` equals ``FROG_CLASS_IDX`` (frog), 0 otherwise (not_frog).
    """
    return int(label == FROG_CLASS_IDX)


# Arguments shared by the train and test splits: same storage folder, same
# image preprocessing and the same frog / not_frog label mapping.
_cifar_kwargs = dict(
    root="./data",
    download=True,
    transform=transform,
    target_transform=binary_target_transform,
)

# Training split, reshuffled on every pass.
trainset = torchvision.datasets.CIFAR10(train=True, **_cifar_kwargs)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

# Test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(train=False, **_cifar_kwargs)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

100%|██████████| 170M/170M [00:10<00:00, 16.1MB/s]


# 3 - Definir una red neuronal convolucional

In [5]:
class FrogNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)  # Binary output (frog / not_frog)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Build the network, then move its parameters to the selected device.
net = FrogNet()
net = net.to(DEVICE)

# 4 - Definir una función de pérdida y un optimizador

In [6]:
# Cross-entropy loss over the two class scores produced by the network.
criterion = nn.CrossEntropyLoss()

# SGD with momentum, updating every parameter of the network.
optimizer = optim.SGD(
    params=net.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
)

# 5 - Entrenar a la red

In [21]:
# Number of mini-batches averaged into each progress line.
log_every = 100

for epoch in range(NUM_EPOCHS):
    net.train()  # make sure the model is in training mode for this epoch
    running_loss = 0.0
    for i, batch in enumerate(trainloader, start=1):
        inputs, labels = (tensor.to(DEVICE) for tensor in batch)

        # Standard optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % log_every != 0:
            continue
        # Report the mean loss of the last window, then start a new window.
        print(f"[Epoch {epoch+1}/{NUM_EPOCHS} | Batch {i}] Loss: {running_loss/log_every:.4f}")
        running_loss = 0.0

[Epoch 1/20 | Batch 100] Loss: 0.1743
[Epoch 1/20 | Batch 200] Loss: 0.1807
[Epoch 1/20 | Batch 300] Loss: 0.1832
[Epoch 1/20 | Batch 400] Loss: 0.1853
[Epoch 1/20 | Batch 500] Loss: 0.1750
[Epoch 1/20 | Batch 600] Loss: 0.1798
[Epoch 1/20 | Batch 700] Loss: 0.1722
[Epoch 2/20 | Batch 100] Loss: 0.1763
[Epoch 2/20 | Batch 200] Loss: 0.1790
[Epoch 2/20 | Batch 300] Loss: 0.1761
[Epoch 2/20 | Batch 400] Loss: 0.1666
[Epoch 2/20 | Batch 500] Loss: 0.1652
[Epoch 2/20 | Batch 600] Loss: 0.1748
[Epoch 2/20 | Batch 700] Loss: 0.1708
[Epoch 3/20 | Batch 100] Loss: 0.1684
[Epoch 3/20 | Batch 200] Loss: 0.1760
[Epoch 3/20 | Batch 300] Loss: 0.1756
[Epoch 3/20 | Batch 400] Loss: 0.1765
[Epoch 3/20 | Batch 500] Loss: 0.1661
[Epoch 3/20 | Batch 600] Loss: 0.1595
[Epoch 3/20 | Batch 700] Loss: 0.1660
[Epoch 4/20 | Batch 100] Loss: 0.1668
[Epoch 4/20 | Batch 200] Loss: 0.1610
[Epoch 4/20 | Batch 300] Loss: 0.1625
[Epoch 4/20 | Batch 400] Loss: 0.1702
[Epoch 4/20 | Batch 500] Loss: 0.1688
[Epoch 4/20 

# 6 - Prueba de la red

In [23]:
# Score the trained network on the held-out test split.
net.eval()
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed while scoring
    for inputs, labels in testloader:
        inputs = inputs.to(DEVICE)
        labels = labels.to(DEVICE)
        outputs = net(inputs)
        # Predicted class = index of the larger of the two scores.
        predicted = torch.max(outputs, dim=1).indices
        total += len(labels)
        correct += int(predicted.eq(labels).sum())

print(f"Test Accuracy: {100 * correct / total:.2f}% ({correct}/{total})")

Test Accuracy: 94.81% (9481/10000)


# 7 - Predicción

In [24]:
# Remote toad photos used to probe the frog classifier.
toads = [
    'https://raw.githubusercontent.com/bartolomeoadrian/computational-vision/refs/heads/main/assets/images/toad_1.jpg',
    'https://raw.githubusercontent.com/bartolomeoadrian/computational-vision/refs/heads/main/assets/images/toad_2.jpg',
    'https://raw.githubusercontent.com/bartolomeoadrian/computational-vision/refs/heads/main/assets/images/toad_3.jpg'
]

# Local paths of the downloaded files, in the same order as `toads`.
images = []

# Download the images
for i, image_url in enumerate(toads):
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(image_url, timeout=30)
    # Fail here on an HTTP error instead of saving an error page as a .jpg,
    # which would only surface later as a confusing image-decoding error.
    response.raise_for_status()
    image_path = f'toad_{i}.jpg'
    with open(image_path, 'wb') as f:
        f.write(response.content)
    images.append(image_path)

# Prediction function
def predict_image(path: str) -> str:
    """Classify the image stored at ``path`` as ``"frog"`` or ``"not_frog"``.

    The image is converted to RGB, resized to 32x32, run through the
    module-level ``transform`` and scored by the module-level ``net`` on
    ``DEVICE``.

    Args:
        path: Location of an image file that Pillow can open.

    Returns:
        ``"frog"`` when the highest-scoring output index is 1, otherwise
        ``"not_frog"``.
    """
    # The context manager releases the file handle even if decoding fails;
    # convert() and resize() return new in-memory images that outlive it.
    with Image.open(path) as raw:
        img = raw.convert("RGB").resize((32, 32))  # CIFAR-10 resolution
    # Add a leading batch dimension of size 1 before moving to the device.
    tensor = transform(img).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        outputs = net(tensor)
        _, pred = torch.max(outputs, 1)

    return "frog" if pred.item() == 1 else "not_frog"

# Classify every downloaded toad image and report the verdict.
for i, image in enumerate(images):
    print(f'Toad {i} is: {predict_image(image)}')

Toad 0 is: frog
Toad 1 is: frog
Toad 2 is: not_frog


# Conclusión

El modelo fue entrenado con imágenes de ranas, no de sapos; aun así, se puede notar que, dependiendo de la cantidad de epochs con que se entrene la red, obtendremos cierto grado de predicción acertada sobre las tres imágenes de sapos a probar. Para mejorar este modelo, se podría ampliar el dataset para incluir imágenes de sapos e incluso también realizar un trabajo de segmentación sobre las imágenes a predecir, para aislarlas de su fondo y obtener mejores resultados.
También se podría modificar la última capa de la CNN para que realice una clasificación multiclase en lugar de binaria; de esta manera habría una mayor posibilidad de obtener los resultados esperados.