In [40]:
import ssl
import certifi
import torch
import open_clip
import cv2
import os
import numpy as np
import pandas as pd
from ultralytics import YOLO
from PIL import Image

# Fix SSL issues
ssl._create_default_https_context = ssl._create_unverified_context
ssl._create_default_https_context = ssl.create_default_context(cafile=certifi.where())

# Load YOLOv8 model
yolo_model = YOLO("yolov8n.pt")  # Use "yolov8m.pt" or "yolov8l.pt" for better accuracy

# Load OpenCLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Define text descriptions for classification
text_descriptions = [
    "A Saree",
    "A woman wearing a saree",
    "A traditional Indian saree with embroidery",
    "A long dress",
    "A person wearing a t-shirt",
    "A person wearing pants",
    "A Kurta",
    "A t-shirt",
    "A Necklace"


]

# Tokenize text descriptions for CLIP
text_inputs = open_clip.tokenize(text_descriptions).to(device)

# Define the folder containing frames
frame_folder = "/Users/aarushia/Downloads/shows/frames/kyuki saas/ksbkbt_s1e1_thres30"  # Update with the correct path

# Ensure the frame folder exists
if not os.path.exists(frame_folder):
    raise FileNotFoundError(f"Frame folder not found: {frame_folder}")

# Get all frame images
frame_files = sorted([f for f in os.listdir(frame_folder) if f.endswith((".jpg", ".png"))])

# Create an empty DataFrame to store results
df_results = pd.DataFrame(columns=["Frame", "Person_ID", "Best_Match", "Score"])

# Loop through all frames
for frame_id, frame_file in enumerate(frame_files):
    frame_path = os.path.join(frame_folder, frame_file)
    image = cv2.imread(frame_path)
    if image is None:
        continue  # Skip if the image cannot be loaded

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for PIL

    # Run YOLOv8 detection on the frame
    results = yolo_model(image)

    # Process each detected object
    person_count = 0
    for result in results:
        for box, cls in zip(result.boxes.xyxy, result.boxes.cls):
            if int(cls) == 0:  # Class 0 in COCO dataset is "Person"
                x1, y1, x2, y2 = map(int, box)

                # Crop lower body (remove face region)
                height = y2 - y1
                y1 += int(height * 0.3)  # Remove top 30% to focus on clothing

                cropped_image = image_rgb[y1:y2, x1:x2]
                if cropped_image.shape[0] == 0 or cropped_image.shape[1] == 0:
                    continue  # Skip if cropping resulted in an empty image

                cropped_pil = Image.fromarray(cropped_image)  # Convert to PIL format

                # Preprocess cropped image for CLIP
                image_tensor = preprocess(cropped_pil).unsqueeze(0).to(device)

                # Compute CLIP image and text embeddings
                with torch.no_grad():
                    image_features = model.encode_image(image_tensor)
                    text_features = model.encode_text(text_inputs)

                # Compute cosine similarity
                similarity = (image_features @ text_features.T).softmax(dim=-1).cpu().numpy()

                # Get best matching label
                best_match_idx = similarity.argmax()
                best_match = text_descriptions[best_match_idx]
                best_match_score = similarity[0][best_match_idx]

                # Append to DataFrame
                df_results = pd.concat([df_results, pd.DataFrame([{
                    "Frame": frame_file,
                    "Person_ID": person_count + 1,
                    "Best_Match": best_match,
                    "Score": round(best_match_score, 4)
                }])], ignore_index=True)

                person_count += 1  # Increment person count in the frame


print(df_results)





0: 384x640 5 potted plants, 71.1ms
Speed: 5.9ms preprocess, 71.1ms inference, 10.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 chair, 1 couch, 1 potted plant, 41.2ms
Speed: 1.7ms preprocess, 41.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 30.9ms
Speed: 1.5ms preprocess, 30.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 chairs, 1 couch, 1 bed, 40.5ms
Speed: 1.1ms preprocess, 40.5ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)


  df_results = pd.concat([df_results, pd.DataFrame([{



0: 384x640 (no detections), 34.8ms
Speed: 1.4ms preprocess, 34.8ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 couch, 1 bed, 29.9ms
Speed: 1.3ms preprocess, 29.9ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.2ms
Speed: 1.1ms preprocess, 40.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 1 couch, 1 bed, 1 dining table, 24.5ms
Speed: 8.3ms preprocess, 24.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 31.6ms
Speed: 1.1ms preprocess, 31.6ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 couch, 1 bed, 1 dining table, 24.3ms
Speed: 1.7ms preprocess, 24.3ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 51.8ms
Speed: 1.1ms preprocess, 51.8ms inference, 0.4ms postprocess per image at shape (1, 

In [41]:
df_results.to_csv("results_2.csv", index=False)  # Save results to CSV file

In [45]:
import ssl
import certifi
import torch
import open_clip
import cv2
import os
import numpy as np
import pandas as pd
from ultralytics import YOLO
from PIL import Image

# Fix SSL issues
ssl._create_default_https_context = ssl._create_unverified_context
ssl._create_default_https_context = ssl.create_default_context(cafile=certifi.where())

# Load YOLOv8 model
yolo_model = YOLO("yolov8n.pt")  # Use "yolov8m.pt" or "yolov8l.pt" for better accuracy

# Load OpenCLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Define text descriptions for classification
text_descriptions = [
    "A Saree",
    "A woman wearing a saree",
    "A traditional Indian saree with embroidery",
    "A long dress",
    "t-shirt",
    "pants",
    "A Kurta",
    "A t-shirt",
    "A Necklace",
    "A bag",
    "A chair",
    "A car",
    "A plant",
    "Spectacles",
    "A pair of shoes",
    "A lamp",
    "A watch",
    "A spoon",
    "Flowers",
    "A bed",
    "A rangoli",
    "A book",
    "A ring"
]

# Tokenize text descriptions for CLIP
text_inputs = open_clip.tokenize(text_descriptions).to(device)

# Define the folder containing frames
frame_folder = "/Users/aarushia/Downloads/shows/frames/kyuki saas/ksbkbt_s1e1_thres30"  # Update with the correct path

# Ensure the frame folder exists
if not os.path.exists(frame_folder):
    raise FileNotFoundError(f"Frame folder not found: {frame_folder}")

# Get all frame images
frame_files = sorted([f for f in os.listdir(frame_folder) if f.endswith((".jpg", ".png"))])

# Create an empty DataFrame to store results
df_results = pd.DataFrame(columns=["Frame", "Object_ID", "YOLO_Label", "Best_Match", "Score"])

# Loop through all frames
for frame_id, frame_file in enumerate(frame_files):
    frame_path = os.path.join(frame_folder, frame_file)
    image = cv2.imread(frame_path)
    if image is None:
        continue  # Skip if the image cannot be loaded

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for PIL

    # Run YOLOv8 detection on the frame
    results = yolo_model(image)

    # Process each detected object
    object_count = 0
    for result in results:
        for box, cls in zip(result.boxes.xyxy, result.boxes.cls):
            x1, y1, x2, y2 = map(int, box)

            # Get YOLO's class label
            yolo_label = yolo_model.names[int(cls)]  # Retrieve object name

            # Crop the detected object
            cropped_image = image_rgb[y1:y2, x1:x2]
            if cropped_image.shape[0] == 0 or cropped_image.shape[1] == 0:
                continue  # Skip if cropping resulted in an empty image

            cropped_pil = Image.fromarray(cropped_image)  # Convert to PIL format

            # Preprocess cropped image for CLIP
            image_tensor = preprocess(cropped_pil).unsqueeze(0).to(device)

            # Compute CLIP image and text embeddings
            with torch.no_grad():
                image_features = model.encode_image(image_tensor)
                text_features = model.encode_text(text_inputs)

            # Compute cosine similarity
            similarity = (image_features @ text_features.T).softmax(dim=-1).cpu().numpy()

            # Get best matching label
            best_match_idx = similarity.argmax()
            best_match = text_descriptions[best_match_idx]
            best_match_score = similarity[0][best_match_idx]

            # Append to DataFrame
            df_results = pd.concat([df_results, pd.DataFrame([{
                "Frame": frame_file,
                "Object_ID": object_count + 1,
                "YOLO_Label": yolo_label,
                "Best_Match": best_match,
                "Score": round(best_match_score, 4)
            }])], ignore_index=True)

            object_count += 1  # Increment object count in the frame

# Save results to CSV
df_results.to_csv("detected_objects_clip_1.csv", index=False)

# Display the results
print(df_results)





0: 384x640 5 potted plants, 48.5ms
Speed: 2.1ms preprocess, 48.5ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)


  df_results = pd.concat([df_results, pd.DataFrame([{



0: 384x640 1 chair, 1 couch, 1 potted plant, 26.3ms
Speed: 1.1ms preprocess, 26.3ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 24.0ms
Speed: 4.3ms preprocess, 24.0ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 4 chairs, 1 couch, 1 bed, 24.0ms
Speed: 1.1ms preprocess, 24.0ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 44.3ms
Speed: 1.6ms preprocess, 44.3ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 1 chair, 1 couch, 1 bed, 63.2ms
Speed: 1.2ms preprocess, 63.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 40.4ms
Speed: 1.3ms preprocess, 40.4ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 1 couch, 1 bed, 1 dining table, 79.0ms
Speed: 2.3ms preprocess, 79.0ms inference, 0.5ms postprocess per image at shape (1,

In [42]:
# uses gpt to generate text descriptions for detected objects in frames CHECK THIS HAVENT RUN YET
import ssl
import certifi
import torch
import open_clip
import cv2
import os
import numpy as np
import pandas as pd
from ultralytics import YOLO
from PIL import Image
from transformers import pipeline

# Fix SSL issues
ssl._create_default_https_context = ssl._create_unverified_context
ssl._create_default_https_context = ssl.create_default_context(cafile=certifi.where())

# Load YOLOv8 model
yolo_model = YOLO("yolov8n.pt")  # Use "yolov8m.pt" or "yolov8l.pt" for better accuracy

# Load OpenCLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess, _ = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
tokenizer = open_clip.get_tokenizer("ViT-B-32")

# Define the folder containing frames
frame_folder = "/Users/aarushia/Downloads/shows/frames/kyuki saas/ksbkbt_s1e1_thres30"  # Update with the correct path

# Ensure the frame folder exists
if not os.path.exists(frame_folder):
    raise FileNotFoundError(f"Frame folder not found: {frame_folder}")

# Get all frame images
frame_files = sorted([f for f in os.listdir(frame_folder) if f.endswith((".jpg", ".png"))])

# Create an empty DataFrame to store results
df_results = pd.DataFrame(columns=["Frame", "Object_ID", "YOLO_Label", "Best_Match", "Score"])

# Load GPT model for text generation (to create dynamic descriptions)
generator = pipeline("text-generation", model="gpt-3.5-turbo")

# Loop through all frames
for frame_id, frame_file in enumerate(frame_files):
    frame_path = os.path.join(frame_folder, frame_file)
    image = cv2.imread(frame_path)
    if image is None:
        continue  # Skip if the image cannot be loaded

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert to RGB for PIL

    # Run YOLOv8 detection on the frame
    results = yolo_model(image)

    # Process each detected object
    object_count = 0
    for result in results:
        for box, cls in zip(result.boxes.xyxy, result.boxes.cls):
            x1, y1, x2, y2 = map(int, box)

            # Get YOLO's class label
            yolo_label = yolo_model.names[int(cls)]

            # Crop the detected object
            cropped_image = image_rgb[y1:y2, x1:x2]
            if cropped_image.shape[0] == 0 or cropped_image.shape[1] == 0:
                continue  # Skip if cropping resulted in an empty image

            cropped_pil = Image.fromarray(cropped_image)  # Convert to PIL format

            # Generate text descriptions dynamically using GPT
            description_prompt = f"Describe an object that looks like {yolo_label} in an image."
            text_descriptions = generator(description_prompt, max_length=50, num_return_sequences=5)
            text_descriptions = [t['generated_text'] for t in text_descriptions]

            # Preprocess cropped image for CLIP
            image_tensor = preprocess(cropped_pil).unsqueeze(0).to(device)

            # Tokenize text descriptions
            text_inputs = open_clip.tokenize(text_descriptions).to(device)

            # Compute CLIP image and text embeddings
            with torch.no_grad():
                image_features = model.encode_image(image_tensor)
                text_features = model.encode_text(text_inputs)

            # Compute cosine similarity
            similarity = (image_features @ text_features.T).softmax(dim=-1).cpu().numpy()

            # Get best matching label
            best_match_idx = similarity.argmax()
            best_match = text_descriptions[best_match_idx]
            best_match_score = similarity[0][best_match_idx]

            # Append to DataFrame
            df_results = pd.concat([df_results, pd.DataFrame([{
                "Frame": frame_file,
                "Object_ID": object_count + 1,
                "YOLO_Label": yolo_label,
                "Best_Match": best_match,
                "Score": round(best_match_score, 4)
            }])], ignore_index=True)

            object_count += 1  # Increment object count in the frame

# Save results to CSV
df_results.to_csv("detected_objects_clip.csv", index=False)

# Display the results
print(df_results)




OSError: gpt-3.5-turbo is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`