In [22]:
import torch
import torch.nn as nn
import torchvision.transforms.functional as F
from ultralytics import YOLO
from PIL import Image, ImageDraw
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\breno\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [23]:
# Shared runtime settings: inference runs on the GPU when CUDA is available,
# otherwise on the CPU.
args = dict(device='cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
class CatDogClassifier(nn.Module):
    """Small CNN that scores an RGB image as cat vs. dog.

    Three conv -> ReLU -> batch-norm -> 2x2 max-pool stages feed a fully
    connected head that ends in a single raw logit (no sigmoid applied).
    The first linear layer expects 128 * 16 * 16 features, which matches a
    128x128 input after the three halving pools.
    """

    def __init__(self):
        super().__init__()

        # Channel widths entering/leaving each convolutional stage.
        widths = [3, 32, 64, 128]
        stages = []
        for in_channels, out_channels in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm2d(out_channels),
                nn.MaxPool2d(2, 2),
            ]
        # Layers are created in the same order as their Sequential index so
        # that checkpoint keys (conv_layers.0, conv_layers.2, ...) stay stable.
        self.conv_layers = nn.Sequential(*stages)

        head = [
            nn.Flatten(),
            nn.Linear(128 * 16 * 16, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1),  # single output unit, emitted as a logit
        ]
        self.fc_layers = nn.Sequential(*head)

    def forward(self, x):
        """Run the convolutional stack and the dense head; return raw logits."""
        features = self.conv_layers(x)
        logits = self.fc_layers(features)
        return logits

In [25]:
# Build the classifier on the selected device and restore the trained weights.
net = CatDogClassifier().to(args['device'])
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
# The file is consumed by load_state_dict, i.e. it only needs to hold tensors.
net.load_state_dict(
    torch.load(
        'cat_dog_classifier.pth',
        map_location=torch.device(args['device']),
        weights_only=True,
    )
)
# Switch to inference mode (dropout off, batch norm uses running statistics).
# As the last expression of the cell this also displays the model summary.
net.eval()


CatDogClassifier(
  (conv_layers): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=32768, out_features=128, bias=True)
    (2): ReLU()
    (

In [26]:
def infer_image(model, image_path, device):
    """Classify one image file as dog ("Cachorro") or cat ("Gato").

    The image is loaded as RGB, resized to 128x128, converted to a tensor and
    normalized with mean 0.5 / std 0.5 per channel before being fed to
    ``model`` as a batch of one. The sigmoid of the model output is printed
    and then thresholded at 0.5.

    Args:
        model (nn.Module): Classifier producing a single logit per image.
        image_path (str): Path of the image to classify.
        device (torch.device or str): Device the input tensor is moved to.

    Returns:
        str: "Cachorro" when the probability is above 0.5, otherwise "Gato".
    """
    preprocess = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    rgb_image = Image.open(image_path).convert('RGB')
    # unsqueeze(0) adds the batch dimension the model expects.
    batch = preprocess(rgb_image).unsqueeze(0).to(device)

    model.eval()

    with torch.no_grad():
        logit = model(batch)
        probability = torch.sigmoid(logit).item()

    print(f'Probabilidade: {probability}')
    return "Cachorro" if probability > 0.5 else "Gato"


In [27]:
# Classify a single cat photo and show the predicted label.
inference = infer_image(
    model=net,
    image_path='inferences/gato.jpg',
    device=args['device'],
)
print(inference)


Probabilidade: 1.2716908486254397e-06
Gato


In [28]:
def detect_with_faster_rcnn_and_classify(image_path, faster_rcnn, classifier, device, threshold=0.5,
                                         allowed_labels=None):
    """Detect objects with Faster R-CNN and tag each detection as cat or dog.

    Every detection whose score exceeds ``threshold`` is cropped from the
    image, classified by ``classifier`` and drawn (red box plus label) on a
    copy of the image.

    Args:
        image_path (str): Path to the image file.
        faster_rcnn (nn.Module): Detection model. The function does not move
            it; it is expected to already live on ``device``.
        classifier (nn.Module): Binary classifier returning one logit per
            image; sigmoid > 0.5 is labelled "Cachorro", otherwise "Gato".
        device (torch.device or str): Device the input tensors are moved to.
        threshold (float): Minimum detection score for a box to be used.
        allowed_labels (collection of int, optional): Detector label ids to
            keep. ``None`` (default) keeps every detection regardless of what
            the detector thinks it is, so persons or furniture are also
            labelled cat/dog. With torchvision's COCO-pretrained weights,
            ``{17, 18}`` keeps only cats and dogs.

    Returns:
        PIL.Image.Image: Copy of the input image with boxes and labels drawn.
    """
    # Load the pixels and release the file handle right away.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    # Crops are taken from `image`; all drawing happens on this copy.
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)

    # Faster R-CNN takes a list of un-normalized image tensors.
    image_tensor = transforms.ToTensor()(image).to(device)

    # Both models are used for inference only; set the mode once up front.
    faster_rcnn.eval()
    classifier.eval()
    with torch.no_grad():
        prediction = faster_rcnn([image_tensor])[0]

    # Same preprocessing the classifier is fed elsewhere in this notebook.
    classifier_transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    detections = zip(prediction['boxes'], prediction['scores'], prediction['labels'])
    for box, score, detector_label in detections:
        if not score > threshold:
            continue
        if allowed_labels is not None and int(detector_label) not in allowed_labels:
            continue

        x1, y1, x2, y2 = map(int, box.tolist())
        # Integer truncation can collapse a thin box to zero area; such a
        # crop has no pixels to classify, so skip it.
        if x2 <= x1 or y2 <= y1:
            continue

        cropped_image = image.crop((x1, y1, x2, y2))
        cropped_tensor = classifier_transform(cropped_image).unsqueeze(0).to(device)

        with torch.no_grad():
            output = classifier(cropped_tensor)
            probability = torch.sigmoid(output).item()
        label = "Cachorro" if probability > 0.5 else "Gato"

        # Draw the bounding box and the predicted class on the output copy.
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
        draw.text((x1, y1), f"{label}", fill="red")

    return annotated

In [29]:
# Forward slashes keep the path valid on every OS (Windows accepts them too).
image_path = "inferences/humano_com_gato2.jpg"
image_path_2 = r'test_set/test_set/cats/cat.4010.jpg'
# `pretrained=True` is deprecated since torchvision 0.13; weights="DEFAULT"
# selects the same COCO-trained checkpoint through the current API.
faster_rcnn = fasterrcnn_resnet50_fpn(weights="DEFAULT").to(args['device'])
result = detect_with_faster_rcnn_and_classify(image_path, faster_rcnn, net, args['device'])
result.show()




In [34]:
def detect_with_yolo_and_classify(image_path, yolo_model_path, classifier_model, device, threshold=0.5,
                                  allowed_class_ids=None):
    """Detect objects with YOLO and tag each detection as cat or dog.

    Every detection whose confidence exceeds ``threshold`` is cropped from
    the untouched image, classified by ``classifier_model`` and drawn (red
    box plus label) on a separate copy, so annotations of earlier boxes never
    leak into the crops of later, overlapping boxes.

    Args:
        image_path (str): Path to the image file.
        yolo_model_path (str): Path/name of the YOLO weights to load.
        classifier_model (torch.nn.Module): Binary classifier returning one
            logit per image; sigmoid > 0.5 is labelled "Cachorro", otherwise
            "Gato".
        device (torch.device or str): Device the classifier input is moved to.
        threshold (float): Minimum detection confidence for a box to be used.
        allowed_class_ids (collection of int, optional): YOLO class ids to
            keep. ``None`` (default) keeps every detection regardless of its
            YOLO class. For COCO-pretrained weights ``{15, 16}`` keeps only
            cats and dogs; custom models may use different ids.

    Returns:
        PIL.Image.Image: Copy of the input image with boxes and labels drawn.
    """
    # Load the YOLO model and run detection on the image file.
    yolo_model = YOLO(yolo_model_path)
    results = yolo_model(image_path)

    # Load the pixels and release the file handle right away.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    # Crops come from the clean `image`; drawing goes to this copy only.
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)

    # Preprocessing expected by the classifier.
    transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Inference only; set the mode once instead of on every detection.
    classifier_model.eval()

    # Each row unpacks to x1, y1, x2, y2, confidence, class id.
    for detection in results[0].boxes.data:
        x1, y1, x2, y2, confidence, class_id = detection.tolist()
        if not confidence > threshold:
            continue
        if allowed_class_ids is not None and int(class_id) not in allowed_class_ids:
            continue

        # Round to whole pixels the same way PIL's crop does, so degenerate
        # (zero-area) boxes can be detected and skipped before cropping.
        left, top, right, bottom = (int(round(v)) for v in (x1, y1, x2, y2))
        if right <= left or bottom <= top:
            continue

        cropped_image = image.crop((left, top, right, bottom))
        cropped_tensor = transform(cropped_image).unsqueeze(0).to(device)

        with torch.no_grad():
            output = classifier_model(cropped_tensor)
            probability = torch.sigmoid(output).item()
        label = "Cachorro" if probability > 0.5 else "Gato"

        # Draw the bounding box and the predicted class on the output copy.
        draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
        draw.text((x1, y1), f"{label}", fill="red")

    return annotated

In [37]:
# Forward slashes keep the path valid on every OS (Windows accepts them too).
image_path = "inferences/humano_com_gato.jpg"
result = detect_with_yolo_and_classify(image_path, 'yolov8n.pt', net, args['device'])
result.show()



image 1/1 c:\Users\breno\OneDrive\Documentos\GitHub\study-deeplearning\notebooks\Estudos Individuais\Cat or Dog\inferences\humano_com_gato.jpg: 384x640 2 persons, 1 cat, 1 couch, 0.0ms
Speed: 0.0ms preprocess, 0.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)
