In [41]:
import pandas as pd
import os
import math
import subprocess
import re

from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient, ContainerClient
from tqdm import tqdm

In [3]:
load_dotenv()

account_url = os.getenv('AZURE_ACCOUNT_URL', 'https://your-account.blob.core.windows.net')
account_name = os.getenv('AZURE_ACCOUNT_NAME', 'your-account-name')
container = os.getenv('AZURE_CONTAINER', 'your-container-name')
sas_token = os.getenv('AZURE_SAS_TOKEN', 'your-sas-token-here')

In [7]:
container_url = f"https://{account_name}.blob.core.windows.net/{container}?{sas_token}"
container_client = ContainerClient.from_container_url(container_url)

## 1. Validate video path

In [4]:
videos_df = pd.read_csv('./../../data/videos.csv')

In [9]:
print(len(videos_df))
videos_df.head()

1643


Unnamed: 0,video_base,start_frame12,t_start_sec,t_end_sec,frame12,window_activity,lat,long,timestamp,id,id_person,camera
0,columpios_cam4-2024-11-21 18:53:10.mp4,166.0,13.833333,17.833333,166,7.713624,25.653061,-100.285843,2024-11-22 00:53:10.041613+00:00,columpios_cam4-2024-11-21_185310.mp4:17527661-...,17527661-d095-4e94-bad5-cf142e0d4a79,columpios_cam4
1,columpioscam3-2024-09-25 11:41:35.mp4,75.0,6.25,10.25,75,7.713624,25.653034,-100.286041,2024-09-25 17:41:35.045100+00:00,columpioscam3-2024-09-25_114135.mp4:96:0,96,columpioscam3
2,columpioscam3-2024-10-03 18:29:30.mp4,155.0,12.916667,16.916667,155,7.713624,25.652994,-100.286118,2024-10-04 00:29:30.051348+00:00,columpioscam3-2024-10-03_182930.mp4:66a106a0-b...,66a106a0-bd03-49e9-8418-8c6ae98d020c,columpioscam3
3,columpioscam1-2024-11-17 10:25:24.mp4,120.0,10.0,14.0,120,7.713624,25.653151,-100.285637,2024-11-17 16:25:24.041494+00:00,columpioscam1-2024-11-17_102524.mp4:85745e56-a...,85745e56-a3e9-4b62-bb2d-c1e8ee152f86,columpioscam1
4,columpios_cam4-2024-11-21 19:16:11.mp4,76.0,6.333333,10.333333,76,7.713624,25.653048,-100.285912,2024-11-22 01:16:11.041568+00:00,columpios_cam4-2024-11-21_191611.mp4:addad6d5-...,addad6d5-1fe6-484f-964d-3302e3899325,columpios_cam4


In [38]:
videos_df.shape

(1643, 12)

In [10]:
def validate_videos_in_azure(video_names, container_client):
    """
    Validate which videos exist in Azure Blob Storage.
    Returns a DataFrame with validation status and total GB size.
    """
    records = []

    print(f"🔍 Validating {len(video_names)} videos in Azure...")
    for name in video_names:
        try:
            blob_client = container_client.get_blob_client(name)
            props = blob_client.get_blob_properties()

            records.append({
                "video_name": name,
                "exists_in_azure": True,
                "size_bytes": props.size
            })
        except Exception as e:
            records.append({
                "video_name": name,
                "exists_in_azure": False,
                "error": str(e)
            })

    df_validation = pd.DataFrame(records)

    total_gb = df_validation.loc[df_validation["exists_in_azure"], "size_bytes"].sum() / (1024 ** 3)

    print(f"\n✅ Found {df_validation['exists_in_azure'].sum()} videos in Azure")
    print(f"❌ Missing {(~df_validation['exists_in_azure']).sum()} videos")
    print(f"Total size of found videos: {total_gb:.2f} GB")

    return df_validation, total_gb

In [11]:
unique_videos = videos_df['video_base'].unique()
validation_results = validate_videos_in_azure(unique_videos, container_client)

🔍 Validating 1326 videos in Azure...

✅ Found 1326 videos in Azure
❌ Missing 0 videos
Total size of found videos: 3.72 GB


## 2. Get videos

In [21]:
temp_dir = "./../../data/temp"
os.makedirs(temp_dir, exist_ok=True)

In [22]:
def download_videos(videos_df):
    for _, row in tqdm(videos_df.iterrows(), total=len(videos_df), desc="Downloading videos"):
        blob_name = row["video_base"]
        local_path = os.path.join(temp_dir, blob_name)

        if os.path.exists(local_path):
            print(f"Skipping {blob_name}, already exists.")
            continue

        try:
            blob_client = container_client.get_blob_client(blob_name)
            with open(local_path, "wb") as f:
                download_stream = blob_client.download_blob()
                f.write(download_stream.readall())
            print(f"Downloaded: {blob_name}")
        except Exception as e:
            print(f"⚠️ Error downloading {blob_name}: {e}")

    print("\nAll downloads complete.")

In [None]:
download_videos(videos_df)

> Actual Size 4GB, Cell output cleared for tidiness

## 3. Trim videos to activity window

In [39]:
output_dir = "./../../data/temp/videos/trimmed"
os.makedirs(output_dir, exist_ok=True)

In [40]:
def sanitize_times(start_t: float, end_t: float, duration: float | None = None):
    """
    Validate and clamp start/end times for trimming.
    Returns (start, end) or None if invalid.
    """
    if math.isnan(start_t) or math.isnan(end_t):
        return None

    s = max(0.0, float(start_t))
    e = float(end_t)

    if duration is not None:
        e = min(e, float(duration))

    if e <= s:
        return None

    return s, e

In [42]:
def safe_filename(name: str) -> str:
    return re.sub(r'[\\/:\s]+', '_', name)

In [44]:
def format_hms(seconds: float) -> str:
    total = int(round(seconds))
    h = total // 3600
    m = (total % 3600) // 60
    s = total % 60
    return f"{h:02d}-{m:02d}-{s:02d}"

> We use emojis for faster debug of logs

In [None]:
trim_records = []

for idx, row in tqdm(videos_df.iterrows(), total=len(videos_df), desc="Trimming videos (ffmpeg copy)"):
    video_name = row["video_base"]
    start_t = float(row["t_start_sec"])
    end_t   = float(row["t_end_sec"])

    in_path = os.path.join(temp_dir, video_name)

    safe_name = safe_filename(os.path.splitext(video_name)[0])

    st_en = sanitize_times(start_t, end_t)
    if st_en is None:
        print(f"⚠️ Invalid times ({start_t}, {end_t}) for {video_name}, skipping.")
        continue
    s, e = st_en

    start_tag = format_hms(s)
    end_tag   = format_hms(e)
    out_name  = f"trimmed_{safe_name}_{start_tag}_to_{end_tag}_{idx}.mp4"
    out_path  = os.path.join(output_dir, out_name)

    if not os.path.exists(in_path):
        print(f"⚠️ Missing file: {video_name}")
        continue
    if os.path.exists(out_path):
        print(f"Already exists: {out_name}")
    else:
        cmd = [
            "ffmpeg",
            "-ss", str(s),
            "-to", str(e),
            "-i", in_path,
            "-c", "copy",
            "-movflags", "+faststart",
            "-y", out_path
        ]
        try:
            subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
            print(f"Trimmed: {out_name}")
        except subprocess.CalledProcessError as e:
            print(f"❌ Error trimming {video_name}: {e}")
            continue

    trim_records.append({
        "video_base": video_name,
        "t_start_sec": s,
        "t_end_sec": e,
        "trimmed_name": out_name,
        "trimmed_path": os.path.abspath(out_path)
    })

print("\nAll trimmed clips saved to:", output_dir)


> Total size: 1.56GB

In [46]:
total_files = len([
    f for f in os.listdir(output_dir)
    if f.endswith(".mp4")
])

print(f"Total trimmed videos: {total_files}")

Total trimmed videos: 1643


In [47]:
trimmed_videos_df = pd.DataFrame(trim_records)
print(f"\nTrimmed videos recorded: {len(trimmed_videos_df)}")


Trimmed videos recorded: 1643


In [48]:
trimmed_videos_df.head()

Unnamed: 0,video_base,t_start_sec,t_end_sec,trimmed_name,trimmed_path
0,columpios_cam4-2024-11-21 18:53:10.mp4,13.833333,17.833333,trimmed_columpios_cam4-2024-11-21_18_53_10_00-...,/Users/alexeidelgado/Desktop/mpgcn-playground-...
1,columpioscam3-2024-09-25 11:41:35.mp4,6.25,10.25,trimmed_columpioscam3-2024-09-25_11_41_35_00-0...,/Users/alexeidelgado/Desktop/mpgcn-playground-...
2,columpioscam3-2024-10-03 18:29:30.mp4,12.916667,16.916667,trimmed_columpioscam3-2024-10-03_18_29_30_00-0...,/Users/alexeidelgado/Desktop/mpgcn-playground-...
3,columpioscam1-2024-11-17 10:25:24.mp4,10.0,14.0,trimmed_columpioscam1-2024-11-17_10_25_24_00-0...,/Users/alexeidelgado/Desktop/mpgcn-playground-...
4,columpios_cam4-2024-11-21 19:16:11.mp4,6.333333,10.333333,trimmed_columpios_cam4-2024-11-21_19_16_11_00-...,/Users/alexeidelgado/Desktop/mpgcn-playground-...


In [53]:
enriched_trimmed_df = trimmed_videos_df.merge(
    videos_df,
    on='video_base',
    how='left'
)

print(f"Original trimmed_videos_df columns: {list(trimmed_videos_df.columns)}")
print(f"Enriched dataframe columns: {list(enriched_trimmed_df.columns)}")
print(f"Shape: {enriched_trimmed_df.shape}")

Original trimmed_videos_df columns: ['video_base', 't_start_sec', 't_end_sec', 'trimmed_name', 'trimmed_path']
Enriched dataframe columns: ['video_base', 't_start_sec_x', 't_end_sec_x', 'trimmed_name', 'trimmed_path', 'start_frame12', 't_start_sec_y', 't_end_sec_y', 'frame12', 'window_activity', 'lat', 'long', 'timestamp', 'id', 'id_person', 'camera']
Shape: (2277, 16)


In [58]:
enriched_trimmed_df = enriched_trimmed_df.drop(columns=[
    't_start_sec_y',
    't_end_sec_y',
    'trimmed_path',
    'frame12'
], errors='ignore')

In [59]:
enriched_trimmed_df.head()

Unnamed: 0,video_base,t_start_sec_x,t_end_sec_x,trimmed_name,start_frame12,window_activity,lat,long,timestamp,id,id_person,camera
0,columpios_cam4-2024-11-21 18:53:10.mp4,13.833333,17.833333,trimmed_columpios_cam4-2024-11-21_18_53_10_00-...,166.0,7.713624,25.653061,-100.285843,2024-11-22 00:53:10.041613+00:00,columpios_cam4-2024-11-21_185310.mp4:17527661-...,17527661-d095-4e94-bad5-cf142e0d4a79,columpios_cam4
1,columpioscam3-2024-09-25 11:41:35.mp4,6.25,10.25,trimmed_columpioscam3-2024-09-25_11_41_35_00-0...,75.0,7.713624,25.653034,-100.286041,2024-09-25 17:41:35.045100+00:00,columpioscam3-2024-09-25_114135.mp4:96:0,96,columpioscam3
2,columpioscam3-2024-10-03 18:29:30.mp4,12.916667,16.916667,trimmed_columpioscam3-2024-10-03_18_29_30_00-0...,155.0,7.713624,25.652994,-100.286118,2024-10-04 00:29:30.051348+00:00,columpioscam3-2024-10-03_182930.mp4:66a106a0-b...,66a106a0-bd03-49e9-8418-8c6ae98d020c,columpioscam3
3,columpioscam1-2024-11-17 10:25:24.mp4,10.0,14.0,trimmed_columpioscam1-2024-11-17_10_25_24_00-0...,120.0,7.713624,25.653151,-100.285637,2024-11-17 16:25:24.041494+00:00,columpioscam1-2024-11-17_102524.mp4:85745e56-a...,85745e56-a3e9-4b62-bb2d-c1e8ee152f86,columpioscam1
4,columpios_cam4-2024-11-21 19:16:11.mp4,6.333333,10.333333,trimmed_columpios_cam4-2024-11-21_19_16_11_00-...,76.0,7.713624,25.653048,-100.285912,2024-11-22 01:16:11.041568+00:00,columpios_cam4-2024-11-21_191611.mp4:addad6d5-...,addad6d5-1fe6-484f-964d-3302e3899325,columpios_cam4


In [60]:
enriched_trimmed_df.to_csv("./../../data/videos-trimmed.csv", index=False)

## 4. Cleanup original videos

In [49]:
original_dir = "./../../data/temp"
trimmed_dir = os.path.join(original_dir, "videos", "trimmed")

if not os.path.exists(original_dir):
    raise FileNotFoundError(f"Folder not found: {original_dir}")
if not os.path.exists(trimmed_dir):
    print(f"Trimmed folder not found: {trimmed_dir}")

to_delete = [
    os.path.join(original_dir, f)
    for f in os.listdir(original_dir)
    if f.endswith(".mp4")
]

print(f"Found {len(to_delete)} original video(s) to delete in {original_dir}")

Found 1326 original video(s) to delete in ./../../data/temp


In [50]:
# confirm manually for safety
confirm = input("Type 'yes' to confirm deletion: ").strip().lower()
if confirm == "yes":
    for f in tqdm(to_delete, desc="Deleting original videos"):
        try:
            os.remove(f)
        except Exception as e:
            print(f"⚠Could not delete {f}: {e}")
    print(f"\nDeleted {len(to_delete)} original videos.")
else:
    print("Deletion cancelled.")

Deleting original videos: 100%|██████████| 1326/1326 [00:00<00:00, 5148.57it/s]


Deleted 1326 original videos.



