In [None]:
import os
import json
import torch
from PIL import Image
import clip  # OpenAI CLIP package
from tqdm import tqdm  # For progress bar with ETA
from collections import Counter

# -------------------------
# Setup: Paths and Device
# -------------------------
# Input folder with the images to annotate (update as needed) and the JSON
# file that will receive the pseudo-labels.
image_dir = '/root/Downloads/Human Action Recognition/test'
output_file = 'harTest_annotated.json'

# Run on the GPU whenever PyTorch reports CUDA as available, else on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# -------------------------
# Load the CLIP Model
# -------------------------
# We use the "ViT-B/32" CLIP model along with its preprocessing function.
# clip.load returns the network together with its image preprocessing callable.
clip_checkpoint = "ViT-B/32"
model, preprocess = clip.load(clip_checkpoint, device=device)
# The script only runs inference, so switch the network to evaluation mode.
model.eval()

# -------------------------
# Define Candidate Activity Labels
# -------------------------
# Update this list with your desired activity categories.
# Candidate activities; the index of a label here is also the index of its
# prompt (and of its score) in the activity classification step.
activity_labels = [
    "standing", "sitting", "walking", "running",
    "jumping", "lying down", "bending",
]

# One zero-shot prompt per activity label, in the same order as the labels.
text_prompts = [
    f"a photo of a person {activity_name}" for activity_name in activity_labels
]
# Tokenize the prompts, then move the token tensor to the selected device.
text_tokens = clip.tokenize(text_prompts)
text_tokens = text_tokens.to(device)

# Create person count detection prompts
# Prompt order matters: the annotation loop treats index 0 ("single person")
# as the only acceptable outcome.
person_count_prompts = [
    "a photo of a single person",
    "a photo of multiple people",
    "a photo with no people",
]
person_count_tokens = clip.tokenize(person_count_prompts)
person_count_tokens = person_count_tokens.to(device)

# Create activity clarity prompts
# Prompt order matters: the annotation loop keeps an image only when index 0
# ("clearly identifiable") is the better match.
activity_clarity_prompts = [
    "a photo with a clearly identifiable human activity",
    "a photo with an ambiguous or unclear human activity",
]
activity_clarity_tokens = clip.tokenize(activity_clarity_prompts)
activity_clarity_tokens = activity_clarity_tokens.to(device)

# -------------------------
# Initialize Annotations Dictionary and Counters
# -------------------------
# Image file name -> pseudo-label record for every accepted image.
annotations = dict()
# Number of accepted images per predicted activity.
activity_distribution = Counter()
# Images successfully opened, and images that passed every filter.
total_images = valid_images = 0

# -------------------------
# Get List of Image Files
# -------------------------
# Keep only directory entries whose name ends with a supported image
# extension; the comparison is case-insensitive.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')
image_files = []
for entry_name in os.listdir(image_dir):
    if entry_name.lower().endswith(image_extensions):
        image_files.append(entry_name)

# -------------------------
# Loop Over Images to Generate Pseudo-Labels with Progress Bar
# -------------------------
# Scale applied to the image/prompt similarities before the softmax, and the
# minimum softmax probability an activity prediction needs in order to be kept.
SIMILARITY_SCALE = 100.0
CONFIDENCE_THRESHOLD = 0.5  # You can adjust this threshold


def _encode_prompts(tokens):
    """Return L2-normalized CLIP text features for already-tokenized prompts."""
    with torch.no_grad():
        features = model.encode_text(tokens)
        return features / features.norm(dim=-1, keepdim=True)


# The prompt features do not depend on the image, so each prompt set is encoded
# once up front instead of re-running the text encoder for every image.
person_count_features = _encode_prompts(person_count_tokens)
clarity_features = _encode_prompts(activity_clarity_tokens)
text_features = _encode_prompts(text_tokens)

for img_name in tqdm(image_files, desc="Processing images", unit="image"):
    img_path = os.path.join(image_dir, img_name)
    try:
        # The context manager guarantees the file opened by Image.open is
        # closed, including when convert() fails part-way through decoding.
        with Image.open(img_path) as opened_image:
            image = opened_image.convert("RGB")
    except Exception as e:
        # Deliberately broad: an unreadable or corrupt file is reported and
        # skipped instead of aborting the whole run.
        print(f"\nError opening {img_path}: {e}")
        continue
    total_images += 1

    # Preprocess the image for CLIP and add the batch dimension
    image_input = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Compute image features and normalize them
        image_features = model.encode_image(image_input)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # 1. Keep the image only if "a photo of a single person" (index 0)
        #    is the best-matching person-count prompt.
        person_count_similarity = (
            SIMILARITY_SCALE * image_features @ person_count_features.T
        ).softmax(dim=-1)
        person_count_idx = person_count_similarity.argmax().item()
        if person_count_idx != 0:
            continue

        # 2. Keep the image only if "clearly identifiable human activity"
        #    (index 0) beats the "ambiguous or unclear" prompt.
        clarity_similarity = (
            SIMILARITY_SCALE * image_features @ clarity_features.T
        ).softmax(dim=-1)
        clarity_idx = clarity_similarity.argmax().item()
        if clarity_idx != 0:
            continue

        # 3. Classify the activity: softmax over all activity prompts
        activity_similarity = (
            SIMILARITY_SCALE * image_features @ text_features.T
        ).softmax(dim=-1)

        # Row 0 is the only image in the batch
        confidence_scores = activity_similarity[0].cpu().numpy()
        best_idx = activity_similarity.argmax().item()
        predicted_activity = activity_labels[best_idx]
        confidence = float(confidence_scores[best_idx])

        # Drop predictions whose top probability is below the threshold
        if confidence < CONFIDENCE_THRESHOLD:
            continue

    # Only images that passed all three checks reach this point: count them
    # and record their pseudo-label.
    valid_images += 1
    activity_distribution[predicted_activity] += 1
    annotations[img_name] = {
        'pseudo_label': predicted_activity,
    }

# -------------------------
# Save the Pseudo-Labels to a JSON File
# -------------------------
# Write the collected pseudo-labels as JSON; indent=4 pretty-prints the file.
with open(output_file, mode='w') as annotations_handle:
    json.dump(annotations, annotations_handle, indent=4)

# -------------------------
# Display Summary Statistics
# -------------------------
# Headline figures first, then the header of the per-activity breakdown.
for summary_line in (
    f"\nAnnotation complete. Results saved to {output_file}",
    f"Total images processed: {total_images}",
    f"Valid images (single person with clear activity): {valid_images}",
    "\nActivity Distribution:",
    "-" * 40,
):
    print(summary_line)

# most_common() yields (activity, count) pairs, highest count first.
for activity, count in activity_distribution.most_common():
    if valid_images > 0:
        percentage = (count / valid_images) * 100
    else:
        # Avoid dividing by zero when no image was accepted.
        percentage = 0
    print(f"{activity}: {count} images ({percentage:.1f}%)")