In [1]:
import torch
from ultralytics import YOLO
from PIL import Image
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import os
import cv2
# Model paths are resolved against the current working directory of the kernel.
HOME = os.getcwd()
yolo_path = os.path.join(HOME, 'train17/weights/best.pt')
# Load the YOLO detector from the trained weights file at yolo_path.
yolo_model = YOLO(yolo_path)  # Adjust yolo_path if your .pt file lives elsewhere

# Load the pretrained CLIP model and its processor from the Hugging Face hub.
# Both are loaded from the same checkpoint name.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_yolo_predictions(image_path, confidence_threshold=0.5):
    """Detect objects in an image with the module-level ``yolo_model``.

    Args:
        image_path: Path of the image file to read and run detection on.
        confidence_threshold: Detections whose confidence is not strictly
            greater than this value are discarded.

    Returns:
        A ``(bboxes, class_names, img)`` tuple. ``bboxes`` holds one
        ``(xmin, ymin, xmax, ymax)`` row per kept detection, ``class_names``
        holds the matching labels looked up in ``yolo_model.names``, and
        ``img`` is the image converted to RGB. ``bboxes`` and ``class_names``
        are empty lists when the detector reports no boxes at all.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # fail here with a clear message instead of crashing inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; convert to RGB

    results = yolo_model(image_path)  # Run inference

    if len(results[0].boxes) == 0:
        print("No objects detected.")
        return [], [], img  # Return empty lists if no objects detected

    predictions = results[0].boxes.data.cpu().numpy()  # Convert to NumPy array

    # Column layout used below: xmin, ymin, xmax, ymax, confidence, class id
    bboxes = predictions[:, :4]
    confidences = predictions[:, 4]
    class_ids = predictions[:, 5].astype(int)  # Integer IDs index yolo_model.names

    # Keep only detections above the confidence threshold
    valid_indices = confidences > confidence_threshold
    bboxes = bboxes[valid_indices]
    class_ids = class_ids[valid_indices]

    # Convert class IDs to class names
    class_names = [yolo_model.names[class_id] for class_id in class_ids]

    return bboxes, class_names, img

def crop_image(image, bbox):
    """Return the region of ``image`` enclosed by ``bbox`` as a PIL image.

    ``bbox`` is ``(xmin, ymin, xmax, ymax)``; every coordinate is truncated
    to an integer before the crop is taken.
    """
    left, top, right, bottom = (int(coord) for coord in bbox)

    # PIL performs the crop, so the array is wrapped in a PIL image first.
    return Image.fromarray(image).crop((left, top, right, bottom))

def get_clip_predictions(cropped_img, class_names):
    """Score a cropped image against candidate labels with CLIP.

    Args:
        cropped_img: The image crop to classify.
        class_names: List of candidate text labels.

    Returns:
        ``logits_per_image`` from the module-level ``clip_model``: raw
        image-to-text similarity scores, one column per entry of
        ``class_names``. These are logits, not probabilities; apply
        ``softmax(dim=1)`` to normalise them.
    """
    inputs = clip_processor(text=class_names, images=cropped_img, return_tensors="pt", padding=True)

    # Put the inputs on the same device as the model so inference also works
    # after the model has been moved to a GPU.
    device = getattr(clip_model, "device", None)
    if device is not None:
        inputs = {key: value.to(device) for key, value in inputs.items()}

    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    return outputs.logits_per_image

# Define the main pipeline
def process_image(image_path):
    """Detect objects with YOLO, then classify every crop with CLIP.

    Each detected box is cropped out of the image and scored by CLIP against
    the distinct class names YOLO detected in this image. The best-scoring
    label and its softmax probability are printed, and the crop is displayed.

    Note: the probability is normalised over the candidate labels only, so it
    is trivially 1.0 when YOLO detected a single distinct class.

    Args:
        image_path: Path of the image file to process.
    """
    # Get YOLO predictions (bounding boxes and class names)
    bboxes, class_names, img = get_yolo_predictions(image_path)

    # Candidate labels offered to CLIP: each detected class once, in
    # first-seen order. The predicted index must be looked up in this same
    # list, otherwise the wrong label is reported.
    candidate_labels = list(dict.fromkeys(class_names))

    for bbox in bboxes:
        cropped_img = crop_image(img, bbox)

        # CLIP returns raw similarity logits; softmax turns them into
        # probabilities over the candidate labels.
        logits = get_clip_predictions(cropped_img, candidate_labels)
        probabilities = logits.softmax(dim=1)

        predicted_class_idx = probabilities.argmax(dim=1).item()
        predicted_class = candidate_labels[predicted_class_idx]

        print(f"Predicted class: {predicted_class}, with probability: {probabilities[0][predicted_class_idx].item():.4f}")

        # Display the cropped image with its predicted label
        plt.imshow(cropped_img)
        plt.title(f"Predicted: {predicted_class}")
        plt.show()



  return torch.load(file, map_location='cpu'), file  # load


In [2]:
# Show the working directory that the model weight path is resolved against.
print(HOME)

/data/tphuawir/medBox_2025/MedicineBoxRecognition/code/pipeline


In [3]:
# Run the function-based YOLO + CLIP pipeline on one sample image.
# NOTE(review): absolute, machine-specific path; consider deriving it from HOME.
process_image("/data/tphuawir/medBox_2025/MedicineBoxRecognition/code/sample/unisom.jpg")

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [4]:
import torch
from ultralytics import YOLO
from PIL import Image
import matplotlib.pyplot as plt
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import os
import cv2
# Model paths are resolved against the current working directory of the kernel.
HOME = os.getcwd()

In [5]:
class BasePredictor:
    """Interface for models that score an image crop against class labels."""

    def predict(self, image, class_names):
        """Return a ``(1, num_classes)`` tensor of scores for ``image``.

        Subclasses must override this method.
        """
        raise NotImplementedError

## CLIP-based Predictor
class CLIPPredictor(BasePredictor):
    """Scores an image against free-text candidate labels using CLIP."""

    def __init__(self, clip_model, clip_processor):
        self.clip_model = clip_model
        self.clip_processor = clip_processor

    def predict(self, image, class_names):
        """Return CLIP's ``logits_per_image`` for ``image`` vs ``class_names``.

        The result holds raw similarity logits, one column per entry of
        ``class_names``; apply ``softmax(dim=1)`` to obtain probabilities.
        """
        inputs = self.clip_processor(text=class_names, images=image, return_tensors="pt", padding=True)

        # Put the inputs on the same device as the model so inference also
        # works after the model has been moved to a GPU.
        device = getattr(self.clip_model, "device", None)
        if device is not None:
            inputs = {key: value.to(device) for key, value in inputs.items()}

        # Inference only: skip autograd bookkeeping to save memory, matching
        # what ClassifierPredictor does.
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
        return outputs.logits_per_image

## Classifier-based Predictor
class ClassifierPredictor(BasePredictor):
    """Scores an image with a fixed-label classifier model."""

    def __init__(self, model, transform, class_names):
        self.model = model
        self.transform = transform
        # Labels corresponding to the model's output columns
        # (assumed to be in output order; TODO confirm against the model).
        self.class_names = class_names

    def predict(self, image, _):
        """Return the model's raw outputs for ``image``.

        The second argument is ignored: the classifier always scores against
        its own label set.
        """
        img_tensor = self.transform(image).unsqueeze(0)  # add a batch dimension
        with torch.no_grad():
            outputs = self.model(img_tensor)
        return outputs

In [6]:
## Pipeline

class ImageObjectPipeline:
    """Two-stage pipeline: YOLO finds boxes, a predictor classifies each crop.

    Args:
        yolo_model: Detector; called with an image path, and its ``names``
            mapping is used to turn class IDs into labels.
        predictor: Object with ``predict(image, class_names)`` returning a
            ``(1, num_classes)`` score tensor.
        confidence_threshold: Detections whose confidence is not strictly
            greater than this value are discarded.
    """

    def __init__(self, yolo_model, predictor, confidence_threshold=0.5):
        self.yolo_model = yolo_model
        self.predictor = predictor
        self.confidence_threshold = confidence_threshold

    def get_yolo_predictions(self, image_path):
        """Run the detector on ``image_path``.

        Returns:
            ``(bboxes, class_names, img)``: one ``(xmin, ymin, xmax, ymax)``
            row and one label per kept detection, plus the RGB image.
            ``bboxes`` and ``class_names`` are empty lists when the detector
            reports no boxes.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread reports a missing or unreadable file by returning
            # None; fail with a clear message instead of inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR

        results = self.yolo_model(image_path)

        if len(results[0].boxes) == 0:
            print("No objects detected.")
            return [], [], img

        # Column layout used below: xmin, ymin, xmax, ymax, confidence, class id
        predictions = results[0].boxes.data.cpu().numpy()
        bboxes = predictions[:, :4]
        confidences = predictions[:, 4]
        class_ids = predictions[:, 5].astype(int)

        valid_indices = confidences > self.confidence_threshold
        bboxes = bboxes[valid_indices]
        class_ids = class_ids[valid_indices]

        class_names = [self.yolo_model.names[class_id] for class_id in class_ids]

        return bboxes, class_names, img

    def crop_image(self, image, bbox):
        """Crop ``image`` to ``bbox`` (coordinates truncated to int) via PIL."""
        x_min, y_min, x_max, y_max = map(int, bbox)
        pil_img = Image.fromarray(image)
        return pil_img.crop((x_min, y_min, x_max, y_max))

    def process_image(self, image_path):
        """Detect, crop, classify, print and display every object in an image.

        Each crop is scored against the distinct class names YOLO detected in
        this image. The printed probability is a softmax over the predictor's
        scores, so it is trivially 1.0 when only one label is scored.
        """
        bboxes, class_names, img = self.get_yolo_predictions(image_path)

        # Candidate labels handed to the predictor: each detected class once,
        # in first-seen order.
        candidate_labels = list(dict.fromkeys(class_names))

        # A predictor with its own fixed label set (e.g. ClassifierPredictor)
        # ignores the candidates, so its output columns map to its own labels.
        labels = getattr(self.predictor, "class_names", None)
        if labels is None:
            labels = candidate_labels

        for bbox in bboxes:
            cropped_img = self.crop_image(img, bbox)

            # Predictors return raw scores; softmax yields probabilities.
            scores = self.predictor.predict(cropped_img, candidate_labels)
            probs = scores.softmax(dim=1)

            predicted_class_idx = probs.argmax(dim=1).item()
            # Index the same label list the scores were computed against.
            predicted_class = labels[predicted_class_idx]

            print(f"Predicted class: {predicted_class}, with probability: {probs[0][predicted_class_idx].item():.4f}")
            plt.imshow(cropped_img)
            plt.title(f"Predicted: {predicted_class}")
            plt.show()


In [7]:
# Trained detector weights, resolved against the working directory stored in HOME.
yolo_path = os.path.join(HOME, 'train17/weights/best.pt')
# Load the YOLO detector from the trained weights file at yolo_path.
yolo_model = YOLO(yolo_path)  # Adjust yolo_path if your .pt file lives elsewhere

# Load the pretrained CLIP model and its processor from the Hugging Face hub.
# Both are loaded from the same checkpoint name.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



  return torch.load(file, map_location='cpu'), file  # load


In [8]:
# Build the pipeline with CLIP as the second-stage predictor.
clip_pipeline = ImageObjectPipeline(yolo_model, CLIPPredictor(clip_model, clip_processor))
# Alternative second stage (disabled): a fixed-label classifier instead of CLIP.
# classifier_pipeline = ImageObjectPipeline(yolo_model, ClassifierPredictor(my_classifier_model, transform, class_names))


# Run the pipeline on one sample image.
# NOTE(review): absolute, machine-specific path; consider deriving it from HOME.
clip_pipeline.process_image("/data/tphuawir/medBox_2025/MedicineBoxRecognition/code/sample/unisom.jpg")
# classifier_pipeline.process_image(img_path)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
