# In [None]:
import os
import sys
import subprocess
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import json
from datetime import datetime

# Configuration
# Directory that is listed (non-recursively) for input video files.
VIDEO_FOLDER = '/kaggle/input/sequences'
# Root directory under which one sub-folder per video is created for the
# extracted frames and the JSON results file.
OUTPUT_FOLDER = '/kaggle/working/detection_results'
# Minimum bounding-box area expressed as a FRACTION of the frame area: the
# value is multiplied directly by the frame's pixel count, so 0.005 means
# 0.5% of the frame (despite the "PERCENTAGE" in the name).
MIN_AREA_PERCENTAGE = 0.005 

def setup_environment():
    """Install the third-party packages the pipeline depends on.

    Every entry of the package list is installed with its own
    ``python -m pip install`` invocation. A failing install is reported on
    stdout but does not raise, so the remaining entries are still attempted.
    """
    print("🚀 Setting up Person Detection Pipeline...")

    # Install dependencies
    packages = [
        "torch torchvision transformers",
        "opencv-python-headless",
        "tqdm",
        "matplotlib",
    ]

    for package in packages:
        print(f"📦 Installing {package}...")
        # Use the running interpreter's pip rather than whichever `pip` is
        # first on PATH, so packages land in the environment this process
        # imports from. Passing an argument list avoids going through a shell.
        command = [sys.executable, "-m", "pip", "install", *package.split()]
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"❌ Error installing {package}: {result.stderr}")
        else:
            print(f"✅ {package} installed successfully")

def verify_gpu():
    """Report the available compute device and return its name.

    Returns:
        ``"cuda"`` when torch reports a CUDA device (after selecting device 0,
        enabling cuDNN benchmark mode and emptying the CUDA cache),
        otherwise ``"cpu"``.
    """
    print("\n🔧 GPU VERIFICATION")
    print("=" * 40)

    # Guard clause: nothing to configure when CUDA is unavailable.
    if not torch.cuda.is_available():
        print("⚠️ No GPU available, using CPU")
        return "cpu"

    name = torch.cuda.get_device_name(0)
    properties = torch.cuda.get_device_properties(0)
    memory_gb = properties.total_memory / 1024**3
    print(f"✅ GPU Available: {name}")
    print(f"💾 GPU Memory: {memory_gb:.1f} GB")

    # Setup for optimal performance
    torch.cuda.set_device(0)
    torch.backends.cudnn.benchmark = True
    torch.cuda.empty_cache()

    return "cuda"

class PersonDetector:
    """Person detection pipeline built on the Grounding DINO zero-shot detector.

    Construction picks the compute device via ``verify_gpu()`` and loads the
    ``IDEA-Research/grounding-dino-tiny`` processor and model.

    Attributes:
        device: ``"cuda"`` or ``"cpu"``, as returned by ``verify_gpu()``.
        processor: processor loaded with ``AutoProcessor.from_pretrained``.
        model: detection model moved to ``device`` and switched to eval mode.
    """

    def __init__(self):
        """Load the Grounding DINO processor and model onto the chosen device."""
        self.device = verify_gpu()
        print(f"\n🔄 Loading Grounding DINO model on {self.device}...")

        # Load Grounding DINO
        model_id = "IDEA-Research/grounding-dino-tiny"
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
        self.model = self.model.to(self.device)
        self.model.eval()

        print(f"✅ Model loaded successfully!")
        print(f"📊 Model parameters: {sum(p.numel() for p in self.model.parameters()):,}")

        if self.device == "cuda":
            torch.cuda.empty_cache()
            allocated = torch.cuda.memory_allocated() / 1024**3
            print(f"🔋 GPU Memory Used: {allocated:.2f} GB")

    @staticmethod
    def _clip_box(box, width, height):
        """Return ``box`` as plain-int ``(x1, y1, x2, y2)`` clamped to the image.

        Coordinates are truncated toward zero and limited to ``[0, width]``
        (x) and ``[0, height]`` (y). Clamping matters because a negative
        coordinate used as a NumPy slice bound wraps around to the other side
        of the array, and plain ``int`` values (unlike ``np.int64``) can be
        written by ``json.dump``.
        """
        x1, y1, x2, y2 = (int(value) for value in box)
        x1 = min(max(x1, 0), width)
        x2 = min(max(x2, 0), width)
        y1 = min(max(y1, 0), height)
        y2 = min(max(y2, 0), height)
        return x1, y1, x2, y2

    def detect_persons(self, image, confidence_threshold=0.3):
        """Detect persons in an image and drop boxes below the minimum area.

        Args:
            image: a NumPy array (3-channel arrays are treated as BGR and
                converted to RGB) or an image object that is used as a PIL
                image unchanged.
            confidence_threshold: forwarded as ``box_threshold`` to the
                processor's post-processing step.

        Returns:
            ``(detections, pil_image)``. ``detections`` is a list of dicts
            with the keys ``box`` (``[x1, y1, x2, y2]`` as Python ints,
            clamped to the image), ``confidence``, ``area_pixels`` and
            ``area_percentage``; ``pil_image`` is the image that was passed
            to the processor.
        """
        print(f"\n🔍 Detecting persons...")

        # Convert to PIL if needed
        if isinstance(image, np.ndarray):
            if image.ndim == 3 and image.shape[2] == 3:
                pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            else:
                pil_image = Image.fromarray(image)
        else:
            pil_image = image

        width, height = pil_image.size
        total_area = width * height
        min_area = total_area * MIN_AREA_PERCENTAGE
        # MIN_AREA_PERCENTAGE is a fraction; derive the label from it so the
        # log text always matches the threshold that is actually applied.
        area_label = f"{MIN_AREA_PERCENTAGE * 100:g}%"

        print(f"📏 Image size: {width} x {height}")
        print(f"📊 Total area: {total_area:,} pixels")
        print(f"🎯 Min area ({area_label}): {min_area:,} pixels")

        # Prepare inputs
        text_prompt = [["a person"]]
        inputs = self.processor(images=pil_image, text=text_prompt, return_tensors="pt")
        device_inputs = {
            key: value.to(self.device) if isinstance(value, torch.Tensor) else value
            for key, value in inputs.items()
        }

        # Run detection
        with torch.no_grad():
            outputs = self.model(**device_inputs)

        # Process results
        target_sizes = torch.tensor([[height, width]]).to(self.device)
        results = self.processor.post_process_grounded_object_detection(
            outputs,
            input_ids=device_inputs.get('input_ids'),
            box_threshold=confidence_threshold,
            text_threshold=0,
            target_sizes=target_sizes
        )[0]

        # Filter detections
        detections = []
        if len(results['boxes']) > 0:
            boxes = results['boxes'].cpu().numpy()
            scores = results['scores'].cpu().numpy()

            print(f"📦 Raw detections: {len(boxes)}")

            for i, (box, score) in enumerate(zip(boxes, scores)):
                # Clamp to the frame: predicted boxes may extend past the
                # image border, which would corrupt later array crops.
                x1, y1, x2, y2 = self._clip_box(box, width, height)
                box_width = x2 - x1
                box_height = y2 - y1
                box_area = box_width * box_height
                area_percentage = (box_area / total_area) * 100

                print(f"   Box {i+1}: [{x1}, {y1}, {x2}, {y2}]")
                print(f"   Size: {box_width}x{box_height}, Area: {box_area:,} ({area_percentage:.1f}%)")
                print(f"   Confidence: {score:.3f}")

                # Apply area filter (a box clamped down to nothing is never kept)
                if box_area > 0 and box_area >= min_area:
                    detections.append({
                        'box': [x1, y1, x2, y2],
                        'confidence': float(score),
                        'area_pixels': int(box_area),
                        'area_percentage': float(area_percentage)
                    })
                    print(f"   ✅ KEPT (area ≥ {area_label})")
                else:
                    print(f"   ❌ FILTERED (area < {area_label})")

        print(f"🎯 Final detections: {len(detections)}")
        return detections, pil_image

    def visualize_detections(self, image, detections, frame_name=""):
        """Show an overview figure plus one figure per cropped detection.

        Args:
            image: PIL image or NumPy array to draw on.
            detections: list of detection dicts as produced by
                ``detect_persons`` (reads ``box``, ``confidence``,
                ``area_pixels`` and ``area_percentage``).
            frame_name: label used in titles and console output.

        Returns:
            None. Prints a notice and returns immediately when ``detections``
            is empty.
        """
        if not detections:
            print("⚠️ No detections to visualize")
            return

        # Convert image to numpy array
        img_array = np.array(image) if isinstance(image, Image.Image) else image
        img_height, img_width = img_array.shape[:2]
        area_label = f"{MIN_AREA_PERCENTAGE * 100:g}%"

        # Crop every detection once; boxes are clamped so negative or
        # oversized coordinates cannot wrap around in the slice.
        crops = []
        for det in detections:
            cx1, cy1, cx2, cy2 = self._clip_box(det['box'], img_width, img_height)
            crops.append(img_array[cy1:cy2, cx1:cx2])

        # First show the overview with all detections
        num_detections = len(detections)
        fig_width = min(20, 5 * (num_detections + 1))
        fig, axes = plt.subplots(1, num_detections + 1, figsize=(fig_width, 6))
        # There are always at least two columns here, so subplots returns an
        # ndarray of Axes. Flatten it; wrapping it in a list would make
        # axes[0] the array itself instead of an Axes object.
        axes = np.atleast_1d(axes).ravel()

        # Show original image with all bounding boxes
        axes[0].imshow(img_array)
        axes[0].set_title(f"Original Frame\n{len(detections)} Person(s) Detected",
                          fontsize=12, fontweight='bold')
        axes[0].axis('off')

        colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (255, 0, 255), (0, 255, 255)]

        # Draw all bounding boxes on original image
        for i, det in enumerate(detections):
            x1, y1, x2, y2 = det['box']
            color = np.array(colors[i % len(colors)]) / 255.0

            rect = patches.Rectangle((x1, y1), x2-x1, y2-y1,
                                     linewidth=3, edgecolor=color, facecolor='none')
            axes[0].add_patch(rect)

            # Add label
            axes[0].text(x1, y1-5, f"Person {i+1}\n{det['confidence']:.2f}",
                         color=color, fontsize=10, fontweight='bold',
                         bbox=dict(boxstyle="round,pad=0.3", facecolor='white', alpha=0.8))

        # Show individual cropped detections
        for i, (det, crop) in enumerate(zip(detections, crops)):
            col = i + 1

            if crop.size > 0:
                axes[col].imshow(crop)
                title = f"Person {i+1}\nConf: {det['confidence']:.3f}\nArea: {det['area_percentage']:.1f}%"
                axes[col].set_title(title, fontsize=11, fontweight='bold')
            else:
                axes[col].text(0.5, 0.5, "Empty\nCrop", ha='center', va='center',
                               transform=axes[col].transAxes, fontsize=12)
                axes[col].set_title(f"Person {i+1} - Error", fontsize=11)

            axes[col].axis('off')

        # Main title
        main_title = f"Person Detection Results - {frame_name}"
        main_title += f" | {len(detections)} Person(s) ≥ {area_label} Area"

        plt.suptitle(main_title, fontsize=14, fontweight='bold', y=0.95)
        plt.tight_layout()
        plt.show()
        # Release the figure explicitly so figures do not pile up when many
        # frames are visualized in one run.
        plt.close(fig)

        # Now show individual cropped images in console
        print(f"\n🖼️ DISPLAYING CROPPED BOUNDING BOXES - {frame_name}")
        print("=" * 70)

        for i, (det, crop) in enumerate(zip(detections, crops)):
            x1, y1, x2, y2 = det['box']

            if crop.size > 0:
                print(f"\n👤 PERSON {i+1} CROPPED IMAGE:")
                print(f"   📍 Bounding Box: [{x1}, {y1}, {x2}, {y2}]")
                print(f"   📏 Crop Size: {crop.shape[1]}x{crop.shape[0]} pixels")
                print(f"   📊 Area: {det['area_pixels']:,} pixels ({det['area_percentage']:.1f}%)")
                print(f"   🎯 Confidence: {det['confidence']:.3f}")

                # Display the cropped image
                crop_fig = plt.figure(figsize=(6, 8))
                plt.imshow(crop)
                plt.title(f"Person {i+1} - Confidence: {det['confidence']:.3f}\nArea: {det['area_percentage']:.1f}% | Size: {crop.shape[1]}x{crop.shape[0]}",
                          fontsize=14, fontweight='bold')
                plt.axis('off')
                plt.tight_layout()
                plt.show()
                plt.close(crop_fig)
                print(f"   ✅ Cropped image displayed above ↑")
            else:
                print(f"\n❌ PERSON {i+1}: Empty crop (invalid bounding box)")

        print("=" * 70)

def process_single_frame(detector, frame_path, confidence_threshold=0.5):
    """Run person detection and visualization on one image file.

    Args:
        detector: object providing ``detect_persons``, ``visualize_detections``
            and a ``device`` attribute (a ``PersonDetector``).
        frame_path: path of the image to load with ``cv2.imread``.
        confidence_threshold: detection threshold forwarded to
            ``detector.detect_persons``. Defaults to 0.5, the value that was
            previously hard-coded here.

    Returns:
        A dict with ``frame_name``, ``detections_count`` and ``detections``,
        or ``None`` when the image cannot be loaded, nothing passes the area
        filter, or an error occurs (errors are printed, not raised).
    """
    frame_name = os.path.basename(frame_path)
    print(f"\n🎬 PROCESSING: {frame_name}")
    print("=" * 50)

    try:
        # Load image
        image = cv2.imread(frame_path)
        if image is None:
            print(f"❌ Could not load image: {frame_path}")
            return None

        # Detect persons
        detections, pil_image = detector.detect_persons(
            image, confidence_threshold=confidence_threshold
        )

        if not detections:
            # Derive the label from the configured fraction so the message
            # reflects the threshold that was actually applied.
            print(f"❌ No valid person detections (≥{MIN_AREA_PERCENTAGE * 100:g}% area)")
            return None

        # Visualize results
        detector.visualize_detections(pil_image, detections, frame_name)

        return {
            'frame_name': frame_name,
            'detections_count': len(detections),
            'detections': detections
        }

    except Exception as e:
        # Best-effort by design: one bad frame must not abort the whole video.
        print(f"❌ Error processing frame: {e}")
        return None

    finally:
        # Clean GPU memory on every exit path, including the early returns.
        if detector.device == "cuda":
            torch.cuda.empty_cache()

def process_video_frames(detector, video_path, max_frames=10, skip_frames=30):
    """Sample frames from a video, run detection on each and save a JSON summary.

    Every ``skip_frames``-th frame (starting with frame 0) is written as a
    JPEG into ``OUTPUT_FOLDER/<video name>/`` and passed to
    ``process_single_frame``. The summary is written next to the frames as
    ``<video name>_detection_results.json``.

    Args:
        detector: PersonDetector instance
        video_path: Path to video file
        max_frames: Maximum number of frames to process
        skip_frames: Process every Nth frame (1 = every frame, 30 = every
            30th frame). Values below 1 are treated as 1.

    Returns:
        The summary dict that was written to disk, or ``None`` when the video
        cannot be opened, no frame could be processed, or an error occurs
        (errors are printed with a traceback, not raised).
    """
    # splitext keeps dots inside the name ("a.b.mp4" -> "a.b"), so two such
    # videos do not end up sharing one output folder.
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    print(f"\n📹 PROCESSING VIDEO: {video_name}")
    print("=" * 60)

    if skip_frames < 1:
        # A step of 0 would divide by zero below; fall back to every frame.
        print(f"⚠️ Invalid skip_frames={skip_frames}; processing every frame instead")
        skip_frames = 1

    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"❌ Could not open video: {video_path}")
            return None

        # Get video properties
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Keep the fractional frame rate (e.g. 29.97): truncating it to an int
        # skews every timestamp. Unusable values (0, negative, NaN) become 0.
        fps = float(cap.get(cv2.CAP_PROP_FPS))
        if not fps > 0:
            fps = 0.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        duration_seconds = total_frames / fps if fps > 0 else 0
        # Frames 0, N, 2N, ... are sampled, i.e. ceil(total / N) of them.
        sampled_frames = -(-max(total_frames, 0) // skip_frames)

        print(f"📊 Video Properties:")
        print(f"   📏 Resolution: {width}x{height}")
        print(f"   🎞️ FPS: {fps:.2f}")
        print(f"   📊 Total frames: {total_frames}")
        print(f"   ⏱️ Duration: {duration_seconds:.1f} seconds")
        print(f"⚙️ Processing Settings:")
        print(f"   🎯 Max frames to process: {max_frames}")
        print(f"   ⏭️ Skip frames: {skip_frames} (process every {skip_frames} frames)")
        print(f"   📈 Estimated frames to process: {min(max_frames, sampled_frames)}")

        # Create output directory
        output_dir = os.path.join(OUTPUT_FOLDER, video_name)
        os.makedirs(output_dir, exist_ok=True)

        frame_results = []
        frames_processed = 0
        frame_count = 0
        total_detections = 0

        print(f"\n🚀 Starting frame processing...")

        while frames_processed < max_frames and cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % skip_frames == 0:
                frame_filename = f"frame_{frame_count:06d}.jpg"
                frame_path = os.path.join(output_dir, frame_filename)
                cv2.imwrite(frame_path, frame)
                # Process frame
                result = process_single_frame(detector, frame_path)
                if result:
                    result['frame_number'] = frame_count
                    result['timestamp_seconds'] = frame_count / fps if fps > 0 else 0
                    frame_results.append(result)
                    total_detections += result['detections_count']
                    print(f"✅ Frame processed successfully: {result['detections_count']} detections")
                else:
                    print(f"⚠️ Frame processed: No valid detections")

                frames_processed += 1

                # Show progress
                remaining = max_frames - frames_processed
                print(f"\n📈 PROGRESS: {frames_processed}/{max_frames} frames processed")
                print(f"📊 Frames with detections: {len(frame_results)}")
                print(f"👥 Total detections so far: {total_detections}")
                if remaining > 0:
                    print(f"⏳ Remaining: {remaining} frames")
                print(f"{'='*80}")

            frame_count += 1

        frames_with_detections = len(frame_results)
        if frames_processed > 0:
            detection_rate = frames_with_detections / frames_processed
            avg_detections = total_detections / frames_processed
        else:
            detection_rate = 0
            avg_detections = 0

        # Save results
        results_summary = {
            'video_name': video_name,
            'video_path': video_path,
            'processing_settings': {
                'max_frames': max_frames,
                'skip_frames': skip_frames,
                'frames_actually_processed': frames_processed
            },
            'video_properties': {
                'total_frames': total_frames,
                'fps': fps,
                'width': width,
                'height': height,
                'duration_seconds': duration_seconds
            },
            'processing_info': {
                'frames_processed': frames_processed,
                'frames_with_detections': frames_with_detections,
                'total_detections': total_detections,
                'detection_rate': detection_rate,
                'avg_detections_per_frame': avg_detections
            },
            'frame_results': frame_results,
            'timestamp': datetime.now().isoformat()
        }

        results_path = os.path.join(output_dir, f"{video_name}_detection_results.json")
        with open(results_path, 'w', encoding='utf-8') as f:
            # Detections may carry NumPy scalars (e.g. box coordinates), which
            # the json module rejects unless they are converted.
            json.dump(results_summary, f, indent=2, default=_json_default)

        if frames_processed == 0:
            # Nothing could be read: report it as a failure (None) instead of
            # dividing by zero in the statistics below.
            print(f"⚠️ No frames were processed for video: {video_name}")
            print(f"   💾 Results saved: {results_path}")
            return None

        print(f"\n✅ VIDEO PROCESSING COMPLETE!")
        print(f"📊 Final Statistics:")
        print(f"   🎞️ Frames processed: {frames_processed}")
        print(f"   🎭 Frames with detections: {frames_with_detections} ({detection_rate*100:.1f}%)")
        print(f"   👥 Total persons detected: {total_detections}")
        print(f"   📈 Average persons per frame: {avg_detections:.1f}")
        print(f"   💾 Results saved: {results_path}")

        return results_summary

    except Exception as e:
        print(f"❌ Error processing video: {e}")
        import traceback
        traceback.print_exc()
        return None

    finally:
        # Release the capture on every exit path, including errors.
        if cap is not None:
            cap.release()


def _json_default(value):
    """Fallback for ``json.dump``: turn NumPy scalars and arrays into builtins."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

def main():
    """Run the detection pipeline over every video found in ``VIDEO_FOLDER``.

    Prints the configuration, installs dependencies, collects the video files
    (by extension, in sorted order), processes each one with
    ``process_video_frames`` and prints overall and per-video statistics.
    Returns early, after listing what is available, when the input folder or
    the videos are missing.
    """
    print("🚀 PERSON DETECTION PIPELINE")
    print("=" * 60)
    # Derive the banner from the configured fraction so it cannot drift from
    # the threshold that is actually applied.
    print(f"🎯 Detects persons with bounding box area ≥ {MIN_AREA_PERCENTAGE * 100:g}%")
    print("📦 Shows cropped detections with confidence values")
    print("🖼️ Displays individual cropped images in console")
    print("⏭️ Configurable frame skipping for efficient processing")
    print("=" * 60)

    MAX_FRAMES_PER_VIDEO = 1000
    SKIP_FRAMES = 10
    # NOTE(review): CONFIDENCE_THRESHOLD is only reported below; this function
    # does not pass it on to the processing call.
    CONFIDENCE_THRESHOLD = 0.5

    print(f"⚙️ PROCESSING CONFIGURATION:")
    print(f"   🎞️ Max frames per video: {MAX_FRAMES_PER_VIDEO}")
    print(f"   ⏭️ Skip frames: {SKIP_FRAMES} (process every {SKIP_FRAMES} frames)")
    print(f"   🎯 Confidence threshold: {CONFIDENCE_THRESHOLD}")
    print(f"   📊 Min area requirement: {MIN_AREA_PERCENTAGE*100}% of image")
    print("=" * 60)

    # Setup environment
    setup_environment()

    # Check input folder
    if not os.path.exists(VIDEO_FOLDER):
        print(f"❌ Video folder not found: {VIDEO_FOLDER}")
        print("📁 Available paths:")
        if os.path.exists('/kaggle/input'):
            for item in os.listdir('/kaggle/input'):
                print(f"   - /kaggle/input/{item}")
        return

    # Find video files. os.listdir returns entries in arbitrary order, so sort
    # to make the processing order reproducible between runs.
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.wmv')
    video_files = sorted(
        name for name in os.listdir(VIDEO_FOLDER)
        if name.lower().endswith(video_extensions)
    )

    if not video_files:
        print(f"❌ No video files found in {VIDEO_FOLDER}")
        print("📁 Available files:")
        for item in os.listdir(VIDEO_FOLDER):
            print(f"   - {item}")
        return

    print(f"\n🎯 Found {len(video_files)} videos:")
    for i, video in enumerate(video_files, 1):
        print(f"   {i}. {video}")

    # Create output directory
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Initialize detector
    detector = PersonDetector()

    # Process videos
    all_results = []
    total_videos = len(video_files)

    for i, video_file in enumerate(video_files, 1):
        video_path = os.path.join(VIDEO_FOLDER, video_file)
        print(f"🎬 PROCESSING VIDEO {i}/{total_videos}: {video_file}")

        try:
            result = process_video_frames(
                detector,
                video_path,
                max_frames=MAX_FRAMES_PER_VIDEO,
                skip_frames=SKIP_FRAMES
            )
            if result:
                all_results.append(result)
                print(f"✅ Video {i} completed successfully!")
            else:
                print(f"❌ Video {i} processing failed!")

        except Exception as e:
            # Keep going: one failing video must not stop the remaining ones.
            print(f"❌ Error processing video {i}: {e}")
            import traceback
            traceback.print_exc()

        # Clean GPU memory between videos
        if detector.device == "cuda":
            torch.cuda.empty_cache()
            print(f"🧹 GPU memory cleared between videos")
        if i < total_videos:
            print(f"\n{'⏳'*30}")
            print(f"⏳ Moving to next video... ({i}/{total_videos} completed)")
            print(f"{'⏳'*30}")

    print(f"\n{'🎉'*50}")
    print(f"🎉 ALL VIDEOS PROCESSING COMPLETE!")
    print(f"{'🎉'*50}")

    successful_videos = len(all_results)
    failed_videos = total_videos - successful_videos

    print(f"📊 OVERALL STATISTICS:")
    print(f"   📹 Total videos found: {total_videos}")
    print(f"   ✅ Videos processed successfully: {successful_videos}")
    print(f"   ❌ Videos failed: {failed_videos}")

    if successful_videos > 0:
        total_detections = sum(r['processing_info']['total_detections'] for r in all_results)
        total_frames = sum(r['processing_info']['frames_processed'] for r in all_results)
        total_frames_with_detections = sum(r['processing_info']['frames_with_detections'] for r in all_results)

        # Guard the ratios: a result set whose videos yielded no frames would
        # otherwise raise ZeroDivisionError here.
        if total_frames > 0:
            overall_rate = total_frames_with_detections / total_frames * 100
            persons_per_frame = total_detections / total_frames
        else:
            overall_rate = 0.0
            persons_per_frame = 0.0

        print(f"   🎞️ Total frames processed: {total_frames}")
        print(f"   🎭 Frames with detections: {total_frames_with_detections} ({overall_rate:.1f}%)")
        print(f"   👥 Total persons detected: {total_detections}")
        print(f"   📊 Average persons per frame: {persons_per_frame:.1f}")

        print(f"\n📋 PER-VIDEO BREAKDOWN:")
        print("-" * 80)
        print(f"{'Video':<25} | {'Frames':<7} | {'Detected':<9} | {'Rate':<6} | {'Persons':<7}")
        print("-" * 80)
        for result in all_results:
            frames = result['processing_info']['frames_processed']
            det_frames = result['processing_info']['frames_with_detections']
            rate = det_frames / frames * 100 if frames > 0 else 0
            persons = result['processing_info']['total_detections']
            print(f"{result['video_name']:<25} | {frames:<7} | {det_frames:<9} | {rate:<6.1f}% | {persons:<7}")
        print("-" * 80)

    print(f"📁 All results saved in: {OUTPUT_FOLDER}")
    if detector.device == "cuda":
        torch.cuda.empty_cache()
        print("🧹 Final GPU memory cleanup completed")

    print(f"🏁 Pipeline execution finished!")

# Run the pipeline when this file is executed directly (``__name__`` is
# "__main__"), but not when it is imported as a module.
if __name__ == "__main__":
    main()