In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.2.74-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu1

In [None]:
# prompt: read all the images from data/input_images/(image) and perform the same as you did above

from ultralytics import YOLO
import os

# NOTE: `ultralytics` has to be installed before this cell runs (the install cell
# at the top of the notebook does that). The install command that used to sit
# here came *after* the import above, so it could never repair a missing package.

# File extensions treated as images (compared case-insensitively).
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Load a pretrained YOLOv8n model
model = YOLO('yolov8n.pt')

input_dir = "data/input_images/"
output_dir = "data/output/"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so re-runs behave the same.
for image_filename in sorted(os.listdir(input_dir)):
    # Lower-case the name so files such as "photo.JPG" are not silently skipped.
    if not image_filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    image_path = os.path.join(input_dir, image_filename)

    # Perform object detection on the image
    results = model(image_path)

    # Save the identified output image (one result per input image, hence [0])
    output_filename = os.path.join(output_dir, f"identified_{image_filename}")
    results[0].save(filename=output_filename)

    print(f"Identified output for {image_filename} saved to: {output_filename}")


Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 77.8MB/s]



image 1/1 /content/data/input_images/cycle.jpg: 352x640 5 persons, 2 bicycles, 10 cars, 312.7ms
Speed: 18.2ms preprocess, 312.7ms inference, 35.2ms postprocess per image at shape (1, 3, 352, 640)
Identified output for cycle.jpg saved to: data/output/identified_cycle.jpg


In [None]:
# prompt: segment the image form data/input_images and save the segments as different images in segmented_images2 folder

from ultralytics import YOLO # Import the YOLO library
import os
import cv2

# Load a pretrained YOLOv8n model
model = YOLO('yolov8n.pt') # Re-initialize the YOLO model

input_dir = "data/input_images/"  # Directory containing the source images
output_dir = "data/segmented_images2/"  # Directory to save the cropped objects

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Sorted for a reproducible processing order (os.listdir order is arbitrary).
for image_filename in sorted(os.listdir(input_dir)):
    # Case-insensitive match so "photo.JPG" / "scan.PNG" are processed too.
    if not image_filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

    image_path = os.path.join(input_dir, image_filename)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; slicing None further down would fail with a TypeError.
        print(f"Skipping {image_filename}: could not be read as an image")
        continue

    # Perform object detection on the image
    results = model(image_path, save=False)

    # Each row of boxes.data is read as
    # (xmin, ymin, xmax, ymax, confidence, class id).
    for idx, det in enumerate(results[0].boxes.data):
        xmin, ymin, xmax, ymax, _conf, class_id = det.tolist()
        class_name = model.names[int(class_id)]

        # Crop the bounding box of the detected object out of the image
        cropped_img = img[int(ymin):int(ymax), int(xmin):int(xmax)]
        if cropped_img.size == 0:
            # A box can truncate to zero width/height after int(); cv2.imwrite
            # refuses empty images, so skip it rather than abort the loop.
            print(f"Skipping empty crop for {class_name}{idx+1} in {image_filename}")
            continue

        # Save the crop with label + detection index as filename.
        # NOTE(review): the name does not include the source image, so crops
        # from different input images with the same label/index overwrite
        # each other in output_dir.
        output_filename = os.path.join(output_dir, f"{class_name}{idx+1}.jpg")
        cv2.imwrite(output_filename, cropped_img)

        print(f"Segmented {class_name}{idx+1} from {image_filename} saved to: {output_filename}")


image 1/1 /content/data/input_images/cycle.jpg: 352x640 5 persons, 2 bicycles, 10 cars, 222.8ms
Speed: 8.1ms preprocess, 222.8ms inference, 3.3ms postprocess per image at shape (1, 3, 352, 640)
Segmented person1 from cycle.jpg saved to: data/segmented_images2/person1.jpg
Segmented car2 from cycle.jpg saved to: data/segmented_images2/car2.jpg
Segmented car3 from cycle.jpg saved to: data/segmented_images2/car3.jpg
Segmented car4 from cycle.jpg saved to: data/segmented_images2/car4.jpg
Segmented person5 from cycle.jpg saved to: data/segmented_images2/person5.jpg
Segmented car6 from cycle.jpg saved to: data/segmented_images2/car6.jpg
Segmented person7 from cycle.jpg saved to: data/segmented_images2/person7.jpg
Segmented person8 from cycle.jpg saved to: data/segmented_images2/person8.jpg
Segmented car9 from cycle.jpg saved to: data/segmented_images2/car9.jpg
Segmented bicycle10 from cycle.jpg saved to: data/segmented_images2/bicycle10.jpg
Segmented car11 from cycle.jpg saved to: data/segme

In [None]:
# prompt: now caption the output of the above code in the table, name the table as captioning the (input_images name). in the table, give seq number, then name output of segmented_images (eg. car2) and caption of segments

# Imported here so the cell also works on a fresh kernel: nothing above this
# cell brings the BLIP classes or PIL's Image into scope.
import os

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the captioning model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# Kept under its own name so the YOLO detector bound to `model` by the earlier
# cells is not overwritten by the captioning model.
model_caption = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

input_dir = "data/segmented_images2/"  # Directory containing segmented images

# Collect the image files up front; sorted so the sequence numbers are stable
# between runs (os.listdir order is arbitrary), matched case-insensitively.
segment_filenames = sorted(
    name for name in os.listdir(input_dir)
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Prepare the table data: one [seq number, file name, caption] row per image
table_data = []
for seq_number, image_filename in enumerate(segment_filenames, start=1):
    image_path = os.path.join(input_dir, image_filename)

    # Load and preprocess the image
    raw_image = Image.open(image_path).convert('RGB')
    inputs = processor(raw_image, return_tensors="pt")

    # Generate caption
    out = model_caption.generate(**inputs)
    caption = processor.batch_decode(out, skip_special_tokens=True)[0]

    table_data.append([seq_number, image_filename, caption])

# Print the table (replace 'input_images name' with the actual name)
print("### Captioning the (input_images name)")
print("| Seq Number | Segmented Image | Caption |")
print("|---|---|---|")
for row in table_data:
    print(f"| {row[0]} | {row[1]} | {row[2]} |")




### Captioning the (input_images name)
| Seq Number | Segmented Image | Caption |
|---|---|---|
| 1 | bicycle10.jpg | a man riding a bike down a street |
| 2 | person8.jpg | a woman is bending her leg |
| 3 | car4.jpg | a white car driving down a street |
| 4 | car14.jpg | a car is seen in the middle of a traffic jam in the capital of cairo |
| 5 | car11.jpg | a blur of a car driving down a road |
| 6 | car12.jpg | a man in a white shirt is walking down the street |
| 7 | person1.jpg | a woman riding a bike down a street |
| 8 | bicycle17.jpg | a man riding a bike down a street |
| 9 | car9.jpg | a sign that says no parking |
| 10 | car3.jpg | a police car is parked on the side of the road |
| 11 | car16.jpg | a man is seen in the middle of a traffic cone |
| 12 | person13.jpg | a man riding a bike down a street |
| 13 | car2.jpg | a group of people walking down the street |
| 14 | person5.jpg | a man walking down a street with a dog |
| 15 | car6.jpg | a white car driving down a stree

In [None]:
# prompt: create an app using streamlit, using all above code where take input image from user, segment the image and show it. create a table below with the caption with the same layout as above.

!pip install streamlit
!pip install ultralytics

import os

import cv2
import numpy as np
import streamlit as st
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from ultralytics import YOLO

# Streamlit re-executes the whole script on every widget interaction, so the
# model loading is wrapped in st.cache_resource: each model is created once and
# the same object is handed back on later reruns.
@st.cache_resource
def _load_detector():
    """Create the pretrained YOLOv8n detector."""
    return YOLO('yolov8n.pt')


@st.cache_resource
def _load_captioner():
    """Create the BLIP processor and captioning model; returns (processor, model)."""
    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return blip_processor, blip_model


# Load the YOLO model
model = _load_detector()

# Load the captioning model and processor
processor, model_caption = _load_captioner()

def segment_image(image, output_dir):
    """Detect objects in ``image`` and write one cropped JPEG per detection.

    Args:
        image: Image array. It is passed to the YOLO model, sliced as
            ``image[y0:y1, x0:x1]`` and written with ``cv2.imwrite``.
        output_dir: Directory that receives the crops; created if missing.

    Returns:
        A list of ``(output_path, class_name)`` tuples, one per crop that was
        actually written, in detection order.
    """
    os.makedirs(output_dir, exist_ok=True)
    results = model(image, save=False)
    segmented_images = []

    # Each row of boxes.data is read as
    # (xmin, ymin, xmax, ymax, confidence, class id).
    for idx, det in enumerate(results[0].boxes.data):
        xmin, ymin, xmax, ymax, _conf, class_id = det.tolist()
        class_name = model.names[int(class_id)]

        cropped_img = image[int(ymin):int(ymax), int(xmin):int(xmax)]
        if cropped_img.size == 0:
            # A box can truncate to zero width/height after int(); cv2.imwrite
            # refuses empty images, so skip this detection.
            continue

        output_filename = os.path.join(output_dir, f"{class_name}{idx+1}.jpg")
        # cv2.imwrite reports failure through its return value; only report
        # crops that really exist on disk so callers can open them safely.
        if cv2.imwrite(output_filename, cropped_img):
            segmented_images.append((output_filename, class_name))

    return segmented_images

def generate_caption(image_path):
    """Return the caption the BLIP model generates for the image at ``image_path``.

    The file is opened with PIL, converted to RGB, encoded by the module-level
    ``processor``, run through ``model_caption.generate`` and decoded back to
    text with special tokens stripped.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    encoded = processor(rgb_image, return_tensors="pt")
    token_ids = model_caption.generate(**encoded)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    # One image in, so the first (only) decoded string is the caption.
    return decoded[0]

# Streamlit app: upload an image, crop every detected object, caption each crop
# and list the captions in a table.
st.title("Image Segmentation and Captioning")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Normalise to 3-channel RGB so palette, greyscale or alpha-channel uploads
    # produce the array layout the RGB->BGR conversion below expects.
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption='Uploaded Image.', use_column_width=True)

    if st.button("Segment and Caption"):
        temp_dir = "temp_segmentation"
        os.makedirs(temp_dir, exist_ok=True)

        # Segment the image (segment_image writes with OpenCV, which works in
        # BGR channel order, hence the conversion from PIL's RGB).
        bgr_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        segmented_images = segment_image(bgr_image, temp_dir)

        if not segmented_images:
            st.info("No objects were detected in this image.")

        # Display segmented images and collect one table row per crop
        table_lines = [
            "| Seq Number | Segmented Image | Caption |",
            "|---|---|---|",
        ]
        for seq_number, (image_path, class_name) in enumerate(segmented_images, start=1):
            caption = generate_caption(image_path)
            st.image(Image.open(image_path), caption=f"{class_name}", use_column_width=True)
            table_lines.append(f"| {seq_number} | {os.path.basename(image_path)} | {caption} |")

        # Display the table. The whole table goes into ONE markdown element:
        # separate st.markdown calls are separate elements, so header, divider
        # and rows emitted one by one never render as a table.
        st.markdown("### Captioning Results")
        st.markdown("\n".join(table_lines))


Collecting streamlit
  Downloading streamlit-1.37.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.37.1-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m48.2 MB

2024-08-08 15:46:13.078 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [None]:
# prompt: save all the script into my desktop in a folder named as 1.Project

# Recursively zip the /content directory into /content/1.Project.zip.
# NOTE(review): this archives everything under /content — the cell output shows
# the .config directory and its logs being added too — consider narrowing the
# path if only the project files are wanted.
!zip -r /content/1.Project.zip /content
from google.colab import files
# Hand the archive to google.colab's files.download helper.
files.download("/content/1.Project.zip")


  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/.last_update_check.json (deflated 23%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2024.08.06/ (stored 0%)
  adding: content/.config/logs/2024.08.06/13.31.38.932663.log (deflated 93%)
  adding: content/.config/logs/2024.08.06/13.32.20.145101.log (deflated 57%)
  adding: content/.config/logs/2024.08.06/13.32.20.715204.log (deflated 56%)
  adding: content/.config/logs/2024.08.06/13.32.09.494122.log (deflated 85%)
  adding: content/.config/logs/2024.08.06/13.31.59.628425.log (deflated 57%)
  adding: content/.config/logs/2024.08.06/13.32.10.540609.log (deflated 58%)
  ad

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>