In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from google.colab import drive

if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# for dirname, _, filenames in os.walk('/content/drive/MyDrive/Capstone/Videos/Virat_Dataset/'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [2]:
!pip install ultralytics
!pip install PyWavelets



In [3]:
import os
import subprocess
import math
import cv2
import numpy as np
import pywt
import torch
from ultralytics import YOLO
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
model = YOLO('yolov8x.pt')

In [5]:
def detect_humans(frame):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = model(frame_rgb)

    # Get bounding box information in a general format.
    boxes = results[0].boxes.xyxy.cpu().numpy()  # (x1, y1, x2, y2) coordinates
    confidences = results[0].boxes.conf.cpu().numpy()  # Confidence scores
    classes = results[0].boxes.cls.cpu().numpy()  # Class IDs

    human_detections = []
    for i, (x1, y1, x2, y2) in enumerate(boxes):
        conf = confidences[i]
        cls = int(classes[i])
        if cls == 0:  # 'person' class
            human_detections.append({
                'bbox': (int(x1), int(y1), int(x2), int(y2)),
                'confidence': float(conf)
            })
    return human_detections


def detect_motion(prev_frame, current_frame, motion_threshold=5000):
    gray_prev = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    gray_current = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)

    frame_diff = cv2.absdiff(gray_prev, gray_current)
    _, thresh = cv2.threshold(frame_diff, 30, 255, cv2.THRESH_BINARY)

    motion_score = np.sum(thresh)
    return motion_score > motion_threshold

def sliding_window_detection(human_detections_list, window_size=15):
    for i in range(max(0, len(human_detections_list) - window_size), len(human_detections_list)):
        if len(human_detections_list[i]) > 0:
            return True
    return False

In [6]:
input_path = '/content/drive/MyDrive/Capstone/Videos/Virat_Dataset/VIRAT_S_000201_02_000590_000623.mp4'
output_ff_path = '/content/drive/MyDrive/Capstone/Output/v623_frame_filtered_sliding_window_output.mp4'

In [7]:
cap = cv2.VideoCapture(input_path)
if not cap.isOpened():
    raise IOError(f"Cannot open input video file: {input_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

In [8]:
ffmpeg_command = [
    'ffmpeg', '-y', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-pix_fmt', 'bgr24',
    '-s', f'{width}x{height}', '-r', str(fps), '-i', '-', '-an', '-c:v', 'libx264',
    '-preset', 'slow', '-crf', '16', output_ff_path
]
process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE)

frame_count = 0
saved_frame_count = 0
human_detections_list = []
prev_frame = None

In [9]:
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_count += 1
        human_detections = detect_humans(frame)
        human_detections_list.append(human_detections)

        keep_frame_human = sliding_window_detection(human_detections_list, window_size=15)
        keep_frame_motion = False
        if prev_frame is not None:
            keep_frame_motion = detect_motion(prev_frame, frame)

        if keep_frame_human or keep_frame_motion:
            process.stdin.write(frame.tobytes())
            saved_frame_count += 1

        prev_frame = frame.copy()

        if len(human_detections_list) > 15:
            human_detections_list.pop(0)

        if frame_count % 100 == 0:
            print(f"Processed {frame_count} frames, saved {saved_frame_count} kept frames")

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    cap.release()
    process.stdin.close()
    process.wait()
    print("FFmpeg process finished.")


0: 384x640 1 person, 19 cars, 1 truck, 271.1ms
Speed: 2.6ms preprocess, 271.1ms inference, 2284.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 19 cars, 1 truck, 117.9ms
Speed: 5.8ms preprocess, 117.9ms inference, 20.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 19 cars, 1 truck, 69.7ms
Speed: 10.9ms preprocess, 69.7ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 19 cars, 1 truck, 80.9ms
Speed: 2.2ms preprocess, 80.9ms inference, 17.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 19 cars, 1 truck, 73.0ms
Speed: 4.4ms preprocess, 73.0ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 19 cars, 1 truck, 56.8ms
Speed: 2.3ms preprocess, 56.8ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 19 cars, 1 truck, 56.9ms
Speed: 2.2ms preprocess, 56.9ms inference, 2.2ms postprocess per image at 

In [10]:
def print_file_sizes(input_file_path, output_file_path):
    # Get size of input file
    input_size = os.path.getsize(input_file_path)

    # Get size of output file
    output_size = os.path.getsize(output_file_path)

    # Convert sizes to MB for easier readability
    input_size_mb = input_size / (1024 * 1024)
    output_size_mb = output_size / (1024 * 1024)

    print(f"Input file size: {input_size_mb:.2f} MB")
    print(f"Output file size: {output_size_mb:.2f} MB")

In [11]:
print_file_sizes(input_path, output_ff_path)

Input file size: 32.59 MB
Output file size: 23.58 MB


In [12]:
def compress_background(frame, boxes, wavelet='haar', level=1, threshold=0.5):
    mask = np.zeros((frame.shape[0], frame.shape[1]), dtype=np.uint8)
    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        mask[y1:y2, x1:x2] = 255  # Mark foreground (humans)

    channels = cv2.split(frame)
    compressed_channels = []

    for channel in channels:
        coeffs = pywt.wavedec2(channel, wavelet, level=level)
        coeffs_thresholded = [coeffs[0]]
        coeffs_thresholded += [(pywt.threshold(cH, threshold * np.max(cH), mode='soft'),
                                pywt.threshold(cV, threshold * np.max(cV), mode='soft'),
                                pywt.threshold(cD, threshold * np.max(cD), mode='soft'))
                               for cH, cV, cD in coeffs[1:]]

        compressed_background = pywt.waverec2(coeffs_thresholded, wavelet)
        compressed_background = np.clip(compressed_background, 0, 255).astype(np.uint8)

        combined_channel = np.where(mask == 255, channel, compressed_background)
        compressed_channels.append(combined_channel)

    compressed_frame = cv2.merge(compressed_channels)
    return compressed_frame

In [13]:
def compress_video_with_foreground(input_path, output_path, wavelet='haar', level=1, threshold=0.5):
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error: Could not open input video file {input_path}")
        return

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Run YOLOv8 on the frame to detect humans
        results = model(frame)  # Run inference on the frame
        boxes = []

        # Extract bounding boxes from results
        for det in results[0].boxes:  # Access boxes from the results
            if int(det.cls) == 0:  # Class ID for 'person'
                x1, y1, x2, y2 = det.xyxy[0]  # Get bounding box coordinates
                boxes.append([x1, y1, x2, y2])  # Add to boxes
        compressed_frame = compress_background(frame, boxes, wavelet=wavelet, level=level, threshold=threshold)
        out.write(compressed_frame)

        frame_count += 1
        print(f"Processed frame: {frame_count}")

    cap.release()
    out.release()
    print("Video compression with foreground preservation complete. Output saved to:", output_path)

In [14]:
def get_file_size(path):
    size_bytes = os.path.getsize(path)
    size_mb = size_bytes / (1024 * 1024)  # Convert bytes to MB
    return size_mb

In [15]:
input_video_path = '/content/drive/MyDrive/Capstone/Output/v623_frame_filtered_sliding_window_output.mp4'  # Path to input video file
output_video_path = '/content/drive/MyDrive/Capstone/Output/compressed_v623_dwt.mp4'  # Path to save compressed video in MP4 format
wavelet = 'haar'  # Type of wavelet ('haar' or 'db1' for Daubechies 1)
level = 1  # Level of decomposition
threshold = 0.1  # Compression threshold (tune this for more compression)

In [16]:
compress_video_with_foreground(input_video_path, output_video_path, wavelet, level, threshold)


0: 384x640 1 person, 19 cars, 1 truck, 63.1ms
Speed: 2.3ms preprocess, 63.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)
Processed frame: 1

0: 384x640 3 persons, 19 cars, 1 truck, 63.1ms
Speed: 2.0ms preprocess, 63.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)
Processed frame: 2

0: 384x640 2 persons, 19 cars, 1 truck, 63.1ms
Speed: 1.7ms preprocess, 63.1ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)
Processed frame: 3

0: 384x640 2 persons, 19 cars, 1 truck, 63.1ms
Speed: 2.7ms preprocess, 63.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)
Processed frame: 4

0: 384x640 2 persons, 19 cars, 1 truck, 63.2ms
Speed: 2.1ms preprocess, 63.2ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)
Processed frame: 5

0: 384x640 2 persons, 19 cars, 1 truck, 63.3ms
Speed: 2.1ms preprocess, 63.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)
Processed frame: 6

0: 384x640 2 per

In [17]:
input_size = get_file_size(input_video_path)
output_size = get_file_size(output_video_path)
print(f"Input video size: {input_size:.2f} MB")
print(f"Output video size: {output_size:.2f} MB")

Input video size: 23.58 MB
Output video size: 15.40 MB
