# S3 Download and Video Count

This notebook uses the same S3 configuration as `app.py` to:
- Count the number of videos in your S3 bucket
- Download data (videos, images, color data) into the local `download_data/` structure

Make sure the environment variables are set before running:
- `S3_BUCKET_NAME`
- `S3_PREFIX` (optional)
- `AWS_REGION` (optional)
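- AWS credentials available to boto3 through its default credential chain (for example the `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` environment variables, or a profile created with `aws configure`). Never paste access keys into a notebook cell.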


In [2]:
import os
import json
import boto3
from botocore.exceptions import ClientError
from typing import Optional, Dict, Any
# --- Configuration ------------------------------------------------------------
# Settings come from the environment, as described in the introduction. The
# fallbacks are the non-secret values this notebook previously hard-coded, so
# it behaves the same when the variables are unset.
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "injection-detection")
AWS_REGION = os.getenv("AWS_REGION", "eu-north-1")
S3_PREFIX = os.getenv("S3_PREFIX", "prod/")  # optional; set to "" for no prefix

if not S3_BUCKET_NAME:
    raise ValueError("S3_BUCKET_NAME must be set in environment")

# SECURITY: no credentials are passed here on purpose. boto3 resolves them from
# its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, the shared
# credentials file written by `aws configure`, or an attached IAM role).
# NOTE(review): the access key pair that used to be embedded in this cell must
# be treated as compromised -- deactivate and rotate it in IAM.
if AWS_REGION:
    s3_client = boto3.client("s3", region_name=AWS_REGION)
else:
    s3_client = boto3.client("s3")

print(f"Using bucket: {S3_BUCKET_NAME}")
if S3_PREFIX:
    print(f"Prefix: {S3_PREFIX}")

# Validate access up front so later cells fail early with a clear message.
# Exception (not just ClientError) is caught because missing credentials
# surface as a botocore error that is not a ClientError.
try:
    s3_client.head_bucket(Bucket=S3_BUCKET_NAME)
    print("✅ Connected to S3 bucket")
except Exception as e:
    # Chain the original exception so the full traceback stays available.
    raise RuntimeError(f"Failed to access bucket: {e}") from e

def _s3_key(key: str) -> str:
    """Return ``key`` as an object key located under ``S3_PREFIX``.

    Leading slashes are removed from ``key``. If ``S3_PREFIX`` is non-empty,
    its trailing slashes are dropped and it is joined to the key with a single
    ``/``; otherwise the stripped key is returned as is.
    """
    relative = key.lstrip("/")
    if not S3_PREFIX:
        return relative
    base = S3_PREFIX.rstrip("/")
    return base + "/" + relative

def upload_bytes_to_s3(
    content_bytes: bytes,
    key: str,
    content_type: Optional[str] = None,
    metadata: Optional[dict] = None,
) -> Optional[str]:
    """Upload raw bytes to the configured bucket under the prefixed ``key``.

    Args:
        content_bytes: Payload sent as the object body.
        key: Object key relative to ``S3_PREFIX``; ``_s3_key`` adds the prefix.
        content_type: Sent as ``ContentType`` when truthy.
        metadata: Sent as ``Metadata`` when truthy. Entries whose value is
            ``None`` are dropped and the remaining values are converted
            with ``str``.

    Returns:
        The ``key`` argument exactly as passed in (without the prefix) on
        success, or ``None`` when ``put_object`` raises ``ClientError`` (the
        error is printed).
    """
    put_options: Dict[str, Any] = {}
    if content_type:
        put_options["ContentType"] = content_type
    if metadata:
        stringified = {}
        for name, value in metadata.items():
            if value is not None:
                stringified[name] = str(value)
        put_options["Metadata"] = stringified

    try:
        s3_client.put_object(
            Bucket=S3_BUCKET_NAME,
            Key=_s3_key(key),
            Body=content_bytes,
            **put_options,
        )
    except ClientError as err:
        print(f"Failed to upload object to S3: {err}")
        return None
    return key


Using bucket: injection-detection
Prefix: prod/
✅ Connected to S3 bucket


In [9]:
from typing import Tuple

def count_videos_in_bucket(prefix: str = "") -> Tuple[int, int]:
    """Tally the objects, and the ``.mp4`` objects among them, under a prefix.

    ``prefix`` is resolved through ``_s3_key`` before listing, and the listing
    is paginated with ``list_objects_v2``. The ``.mp4`` check is
    case-insensitive.

    Returns:
        ``(total_objects_scanned, total_videos_found)``
    """
    pages = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=S3_BUCKET_NAME,
        Prefix=_s3_key(prefix),
    )

    scanned_total = 0
    mp4_total = 0
    for page in pages:
        # A page with no matching objects has no 'Contents' entry.
        names = [entry['Key'] for entry in page.get('Contents', [])]
        scanned_total += len(names)
        mp4_total += sum(1 for name in names if name.lower().endswith('.mp4'))
    return scanned_total, mp4_total

# Count everything stored under the download_data/ folder and report the totals.
scanned, videos = count_videos_in_bucket(prefix="download_data/")
print(f"Scanned objects: {scanned}")
print(f"Videos (.mp4) found: {videos}")


Scanned objects: 142
Videos (.mp4) found: 14


In [3]:
from typing import Tuple

def count_videos_in_bucket(prefix: str = "") -> Tuple[int, int]:
    """Count objects and ``.mp4`` objects stored under ``prefix``.

    The prefix is passed through ``_s3_key`` and the bucket is listed page by
    page with the ``list_objects_v2`` paginator. The extension match ignores
    case.

    Returns:
        ``(total_objects_scanned, total_videos_found)``
    """
    # NOTE(review): this repeats the definition from the previous cell and
    # shadows it when both cells run; consider keeping only one of them.
    lister = s3_client.get_paginator('list_objects_v2')
    full_prefix = _s3_key(prefix)
    n_objects, n_videos = 0, 0

    for batch in lister.paginate(Bucket=S3_BUCKET_NAME, Prefix=full_prefix):
        for item in batch.get('Contents', []):
            n_objects += 1
            if item['Key'].lower().endswith('.mp4'):
                n_videos += 1
    return n_objects, n_videos

# Report object and video totals for the download_data/ folder.
scanned, videos = count_videos_in_bucket(prefix="download_data/")
print(f"Scanned objects: {scanned}")
print(f"Videos (.mp4) found: {videos}")


Scanned objects: 370
Videos (.mp4) found: 38


In [4]:
import boto3

# List every object under download_data/, newest first.
#
# Reuse the client and settings from the setup cell instead of building a
# second client with credentials written into the notebook.
s3 = s3_client
bucket = S3_BUCKET_NAME
prefix = _s3_key("download_data/")

# A single list_objects_v2 call returns at most 1000 keys, so walk all pages;
# otherwise the "newest first" ordering would only cover the first page.
paginator = s3.get_paginator("list_objects_v2")
files = sorted(
    (
        obj
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
        for obj in page.get("Contents", [])  # absent when a page has no objects
    ),
    key=lambda obj: obj["LastModified"],
    reverse=True,  # newest first
)

for obj in files:
    print(obj["Key"], obj["LastModified"])


prod/download_data/0ee4ef77-0391-421a-9e54-d8dc3f04b6f2/images/1756962824852_black.png 2025-09-04 05:14:03+00:00
prod/download_data/0ee4ef77-0391-421a-9e54-d8dc3f04b6f2/videos/0ee4ef77-0391-421a-9e54-d8dc3f04b6f2.mp4 2025-09-04 05:14:02+00:00
prod/download_data/0ee4ef77-0391-421a-9e54-d8dc3f04b6f2/color_data/0ee4ef77-0391-421a-9e54-d8dc3f04b6f2.json 2025-09-04 05:13:52+00:00
prod/download_data/e94fbea6-5755-4785-872e-f11c537693c1/images/1756894145943_yellow.png 2025-09-03 10:09:34+00:00
prod/download_data/e94fbea6-5755-4785-872e-f11c537693c1/images/1756894145205_black.png 2025-09-03 10:09:33+00:00
prod/download_data/e94fbea6-5755-4785-872e-f11c537693c1/images/1756894144492_transparent.png 2025-09-03 10:09:31+00:00
prod/download_data/e94fbea6-5755-4785-872e-f11c537693c1/videos/e94fbea6-5755-4785-872e-f11c537693c1.mp4 2025-09-03 10:09:29+00:00
prod/download_data/e94fbea6-5755-4785-872e-f11c537693c1/color_data/e94fbea6-5755-4785-872e-f11c537693c1.json 2025-09-03 10:09:14+00:00
prod/downlo