Testing out image captioner

In [None]:
from models.image.llava_onevision import LLAVAOneVision
from models.image.internvl import InternVL
# Instantiate both image-captioning wrappers once; later cells call them by these names.
image_captioner = LLAVAOneVision()
image_captioner_2 = InternVL()

In [None]:
# Send a free-form, text-only prompt to the InternVL wrapper (no image is passed here).
image_captioner_2.ask_question("choose 100 random videos from a directory in Python")

In [None]:
# Run the LLaVA-OneVision wrapper's batch captioning with no arguments.
# NOTE(review): inputs and outputs are decided inside caption_images(); presumably this is
# the 10k-image run mentioned in the next note (src/image_captions.py) — confirm.
image_captioner.caption_images()

10k images (src/image_captions.py)

Clustering of words in descriptions

In [5]:
import pandas as pd
from plotnine import *
from collections import Counter
import re
# Caption table; later cells read its "image_path" and "caption" columns.
descriptions = pd.read_csv("tenkframe_descriptions.csv")
# Detection table; the next cell reads its "class_name" column.
# NOTE(review): the path suggests YOLOE bounding boxes for the 10k-frame set — confirm.
objects = pd.read_csv("/ccn2/dataset/babyview/outputs_20250312/yoloe/cdi_10k/bounding_box_predictions.csv")

In [None]:
# 1. Count object occurrences (the "person" class is excluded)
objects_filtered = objects[objects["class_name"] != "person"]

object_counts = objects_filtered["class_name"].value_counts()

# Keep only classes detected more than 10 times
object_counts_filtered = object_counts[object_counts > 10]

# 2. Count how often each class name is mentioned in descriptions
# A missing caption loads as NaN (a float), which makes str.join raise TypeError,
# so drop those rows and coerce the rest to str before building one lowercase corpus.
caption_text = " ".join(descriptions["caption"].dropna().astype(str)).lower()
caption_counts = {}
for label in object_counts_filtered.index:
    pattern = r'\b' + re.escape(label.lower()) + r's?\b'  # match singular/plural
    caption_counts[label] = len(re.findall(pattern, caption_text))

# 3. Combine into a DataFrame (one row per retained class)
df = pd.DataFrame({
    "class_name": object_counts_filtered.index,
    "object_count": object_counts_filtered.values,
    "caption_count": [caption_counts[k] for k in object_counts_filtered.index]
})

# 4. Scatter plot using plotnine: detections on x, caption mentions on y,
#    with a linear fit and each point labelled by its class name
plot = (
    ggplot(df, aes(x="object_count", y="caption_count", label="class_name")) +
    geom_point(size=4, color="steelblue") +
    geom_smooth(method="lm") +
    geom_text(nudge_y=0.2, size=10, ha='left') +
    labs(
        title="Objects vs. Mentions in Captions",
        x="Count in objects",
        y="Mentions in captions"
    )  + theme(
        figure_size=(20, 12),  # Increase plot size
        axis_text_x=element_text(rotation=45, hjust=1)
    )
)

print(plot)

In [None]:
plot

In [None]:
import numpy as np
from scipy.stats import pearsonr
# Pearson correlation between per-class detection counts and caption mentions.
# NOTE(review): expects `df` to still be the per-class table with "object_count" and
# "caption_count" columns; other cells rebind `df`, so run this right after that table is built.
correlation, p_value = pearsonr(df["object_count"], df["caption_count"])
print(f"Pearson correlation: {correlation:.2f}")
print(f"p-value: {p_value}")

In [None]:
df

Comparing images to objects

In [None]:
# comparing descriptions to objects mentioned --
# TODO: the path below is an empty placeholder; fill in the CSV to compare before running.
# NOTE(review): this also rebinds `df`, which other cells use for different tables.
df = pd.read_csv("")

Video chunking

1 minute long chunks

In [None]:
from video_utils import split_video
# Source recording to split.
vid_path = "/ccn2/dataset/babyview/unzip_2025/babyview_main_storage/00220001_2024-05-31_1_acd11db79d/00220001_2024-05-31_1_acd11db79d_processed.MP4"
# Split with a chunk size of 60 (presumably seconds, matching the "1 minute long chunks" note)
# into the per-video chunks directory; keep_audio=False is passed explicitly.
split_video(vid_path, 60, "/ccn2/dataset/babyview/outputs_20250312/activities/chunks/00220001_2024-05-31_1_acd11db79d", keep_audio=False)

Trying out VideoChat_Flash and LLava video "describe this video"

In [None]:
import os
from models.video.videochat_flash import VideoFlash
# Instantiate the VideoFlash wrapper; the video captioning cells below call it as `video_generator`.
video_generator = VideoFlash()

Of these four activity types, which do you think these videos fall under? 

Kinetics 400 but for kids?

In [None]:
# Directory holding the chunks of one recording
output_dir = "/ccn2/dataset/babyview/outputs_20250312/activities/chunks/00820001_2024-04-09_2_4aa5f86d25_processed"
# Closed set of activity labels offered to the model
activities = ["being held", "eating", "drinking", "playing with toy", "getting changed", "crawling", "crying", "exploring", "cooking", "cleaning", "gardening", "watching tv", "driving", "reading"]
# Join the options outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
activity_options = ", ".join(activities)
# caption_video returns a pair; only the first element is used here.
activity, _ = video_generator.caption_video(
    f"{output_dir}/chunk003.mp4",
    f"Answer with one word what activity is going on in this video, taken with a camera attached to the head of a child, from the following options: {activity_options}",
)
print(activity)

Of these four areas, which do you think these videos are in? Dining room, living room, outside, bedroom. Can we extend Lew-Williams 2023?

In [None]:
# Ask the video model which room the chunk at label 0 of df["chunk_path"] takes place in.
# NOTE(review): relies on `df` currently holding a table with a "chunk_path" column
# (e.g. selected_chunk_transcripts.csv, loaded further down) — run that load first.
caption, _ = video_generator.caption_video(df["chunk_path"][0], question="Of dining room, living room, bedroom, kitchen and outside, which of these do you think this video takes place in?")
print(caption)

Comparing areas in the two videos to objects detected. 

10 minute long video -- maybe try out InternVL2.5_HiCo_R64 and see if we can get timestamp information. also look at descriptions from the other internvideo model.

In [None]:
# NOTE(review): caption_video() is called with no arguments here, while the other calls in
# this notebook pass a video path and a question — presumably an unfinished cell for the
# 10-minute video experiment; supply both before running.
caption, _ = video_generator.caption_video()
print(caption)

Creating 1000 random video chunks and testing out activity captioning with them. To do this we're using 100 random videos.

In [5]:
import os
import random
import pandas as pd

# Base directory
base_dir = '/ccn2/dataset/babyview/unzip_2025/babyview_main_storage'

# Sampling configuration: a fixed seed makes re-running the cell reproduce the same sample
SAMPLE_SEED = 0
N_SAMPLED_VIDEOS = 100

# List all immediate subdirectories. Sorted because os.listdir returns entries in
# arbitrary order, which would otherwise defeat the fixed seed.
subdirs = sorted(
    os.path.join(base_dir, d) for d in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, d))
)

# Randomly sample up to N_SAMPLED_VIDEOS subdirectories with a dedicated, seeded RNG
sampled_subdirs = random.Random(SAMPLE_SEED).sample(
    subdirs, min(N_SAMPLED_VIDEOS, len(subdirs))
)

# For each sampled subdir, take the alphabetically first .mp4 file inside
sampled_video_paths = []
for subdir in sampled_subdirs:
    for file in sorted(os.listdir(subdir)):
        if file.lower().endswith('.mp4'):
            sampled_video_paths.append(os.path.join(subdir, file))
            break  # Assume one MP4 per subdir

# Save to CSV
output_csv = 'random_video_paths.csv'
df = pd.DataFrame({'video_path': sampled_video_paths})
df.to_csv(output_csv, index=False)

print(f"Saved {len(sampled_video_paths)} video paths to {output_csv}")

Saved 100 video paths to random_video_paths.csv


In [2]:
len(subdirs)

5566

Ok just getting all of the video paths

In [6]:
# Walk every subdirectory (not only the sampled ones) and keep one video from each:
# the first directory entry whose name ends in ".mp4", compared case-insensitively.
all_video_paths = []
for subdir in subdirs:
    first_mp4 = next(
        (name for name in os.listdir(subdir) if name.lower().endswith('.mp4')),
        None,
    )
    if first_mp4 is not None:  # a subdir without any MP4 contributes nothing
        all_video_paths.append(os.path.join(subdir, first_mp4))

# Write the collected paths out as a single-column CSV
output_csv = 'all_video_paths.csv'
df = pd.DataFrame({'video_path': all_video_paths})
df.to_csv(output_csv, index=False)

print(f"Saved {len(all_video_paths)} video paths to {output_csv}")

Saved 5566 video paths to all_video_paths.csv


In [None]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from video_utils import split_video
# Reload the sampled video list written by the sampling cell.
df = pd.read_csv("random_video_paths.csv")
sampled_video_paths = df["video_path"]
# Split every sampled video with a chunk size of 60, one output directory per video.
for vid_path in tqdm(sampled_video_paths, desc="Chunking videos"):
    # File name without extension doubles as the per-video output folder name.
    curr_video_id = Path(vid_path).stem
    split_video(vid_path, 60, f"/ccn2/dataset/babyview/outputs_20250312/activities/chunks/{curr_video_id}")

In [13]:
# Closed label sets offered to the model in the prompts below.
# Each location appears exactly once so no option is repeated in the prompt.
locations = ["bathroom", "bedroom", "car", "closet", "garage", "living room", "hallway", "outside", "kitchen", "deck"]
activities = ["being held", "eating", "drinking", "playing with toy", "getting changed", "crawling", "crying"]

In [None]:
from tqdm import tqdm
descriptions = pd.read_csv("tenkframe_descriptions.csv")
# Join the option lists once, outside the f-strings: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
location_options = ", ".join(locations)
activity_options = ", ".join(activities)

# One record per image: the model's location answer, its activity answer, and the
# existing caption carried over from the descriptions table.
records = []
for image_path, caption in tqdm(
    zip(descriptions["image_path"], descriptions["caption"]),
    total=len(descriptions),  # zip has no length, so give tqdm the total explicitly
    desc="Getting image descriptions",
):
    location = image_captioner_2.caption_image(image_path, f"Answer with one word what location this image is in from the following: {location_options}")
    activity = image_captioner_2.caption_image(image_path, f"Answer with one word what activity is going on in this image in from the following: {activity_options}")
    # Keys must be string literals; bare names would key each row by its own values
    # and produce a different set of columns for every image.
    records.append({
        "image_path": image_path,
        "location": location,
        "activity": activity,
        "caption": caption,
    })

# Build the table once from the accumulated records.
df = pd.DataFrame(records)

In [None]:
descriptions["image_path"][0]

In [None]:
from IPython.display import Image, display

def render_image(image_path):
    """Show the image file at ``image_path`` inline in the notebook output."""
    picture = Image(filename=image_path)
    display(picture)
render_image(descriptions["image_path"][0])

Time for video chunking. Let's see if we can pull transcripts too — that would be neat.

In [2]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from video_utils import split_video_simple
# Video list for this run comes from the "video_path" column of the transcripts table.
df = pd.read_csv("selected_chunk_transcripts.csv")
sampled_video_paths = df["video_path"]
# Split every listed video with a chunk size of 60, one output directory per video.
for vid_path in tqdm(sampled_video_paths, desc="Chunking videos"):
    # File name without extension doubles as the per-video output folder name.
    curr_video_id = Path(vid_path).stem
    split_video_simple(vid_path, 60, f"/ccn2/dataset/babyview/outputs_20250312/activities/chunks/{curr_video_id}")

In [None]:
#

In [None]:
from models.video.llava_video import LLAVAVideo
import pandas as pd
# Table of selected chunks; other cells read its "video_path" and "chunk_path" columns.
df = pd.read_csv("selected_chunk_transcripts.csv")
# Second video model wrapper (LLaVA-Video), kept alongside `video_generator`.
video_generator2 = LLAVAVideo()


In [None]:
df["chunk_path"][1]

plotting location frequencies

In [None]:
from plotnine import *
import pandas as pd
# Load the per-image answers and lower-case the location answer for matching.
df = pd.read_csv("image_activities_locations_10k.csv")
df['location_clean'] = df['location'].str.lower()
# Candidate location labels, tried in this order by first_match below.
# NOTE(review): "garage" is listed twice; the second entry can never be the first match.
keywords = ["bathroom", "bedroom", "car", "closet", "garage", "living room", "hallway", "outside", "garage", "kitchen", "deck"]

# Find the first matching keyword
def first_match(text, candidates=None):
    """Return the first keyword contained in ``text``, or ``None`` if there is none.

    Matching is plain substring containment, so the order of the candidates decides
    which label wins when several occur in the same text.

    Args:
        text: String to search. Non-string values (e.g. the NaN pandas uses for a
            missing answer) are treated as "no match".
        candidates: Optional iterable of keywords to try, in priority order.
            Defaults to the notebook-level ``keywords`` list.

    Returns:
        The first candidate that occurs as a substring of ``text``, else ``None``.
    """
    if not isinstance(text, str):
        # `word in NaN` would raise TypeError; a missing answer simply has no match.
        return None
    if candidates is None:
        candidates = keywords
    for word in candidates:
        if word in text:
            return word
    return None

# Map each lower-cased answer to the first keyword it contains (None when nothing matches)
df['matched'] = df['location_clean'].apply(first_match)

# Drop unmatched
df = df.dropna(subset=['matched'])

# Plot: one bar per matched location, bar height = number of rows
plot = (
    ggplot(df, aes(x='matched')) +
    geom_bar() +
    labs(
        title='Location counts',
        x='Location',
        y='Count'
    )
)


print(plot)

In [None]:
df

In [None]:
plot

In [None]:
from plotnine import *
import pandas as pd
df = pd.read_csv("image_activities_locations_10k.csv")
# Intermediate column: lower-cased raw location answer (dropped again before saving).
df['location_clean_p1'] = df['location'].str.lower()
# Candidate location labels, tried in this order by first_match below.
# NOTE(review): "garage" is listed twice; the second entry can never be the first match.
keywords = ["bathroom", "bedroom", "car", "closet", "garage", "living room", "hallway", "outside", "garage", "kitchen", "deck"]

# Find the first matching keyword
def first_match(text, candidates=None):
    """Return the first keyword contained in ``text``, or ``None`` if there is none.

    Matching is plain substring containment, so the order of the candidates decides
    which label wins when several occur in the same text.

    Args:
        text: String to search. Non-string values (e.g. the NaN pandas uses for a
            missing answer) are treated as "no match".
        candidates: Optional iterable of keywords to try, in priority order.
            Defaults to the notebook-level ``keywords`` list.

    Returns:
        The first candidate that occurs as a substring of ``text``, else ``None``.
    """
    if not isinstance(text, str):
        # `word in NaN` would raise TypeError; a missing answer simply has no match.
        return None
    if candidates is None:
        candidates = keywords
    for word in candidates:
        if word in text:
            return word
    return None

# Canonical location: the first keyword found in the lower-cased answer (None when nothing matches)
df['location_clean'] = df['location_clean_p1'].apply(first_match)

# Drop unmatched
df = df.dropna(subset=['location_clean'])
# Remove the raw answer and the intermediate column, then persist the cleaned table.
df = df.drop(columns=["location", "location_clean_p1"])
df.to_csv("image_locations_clean.csv", index=False)

df

Activities

In [None]:
import pandas as pd
from plotnine import *
# Load the per-video answers and lower-case the "activity_transcript" column for matching.
df = pd.read_csv("video_activities_locations_all.csv")
df['activity_clean'] = df['activity_transcript'].str.lower()
# Candidate activity labels, tried in this order by first_match below.
keywords = ["being held", "eating", "drinking", "playing with toy", "getting changed", "crawling", "crying", "exploring", "cooking", "cleaning", "gardening", "watching tv", "driving", "reading", "on a phone call", "dancing", "packing", "looking at phone", "instrument playing", "exercising", "working on laptop", "nothing"]

# Find the first matching keyword
def first_match(text, candidates=None):
    """Return the first keyword contained in ``text``, or ``None`` if there is none.

    Matching is plain substring containment, so the order of the candidates decides
    which label wins when several occur in the same text.

    Args:
        text: String to search. Non-string values (e.g. the NaN pandas uses for a
            missing answer) are treated as "no match".
        candidates: Optional iterable of keywords to try, in priority order.
            Defaults to the notebook-level ``keywords`` list.

    Returns:
        The first candidate that occurs as a substring of ``text``, else ``None``.
    """
    if not isinstance(text, str):
        # `word in NaN` would raise TypeError; a missing answer simply has no match.
        return None
    if candidates is None:
        candidates = keywords
    for word in candidates:
        if word in text:
            return word
    return None

# Map each lower-cased answer to the first keyword it contains (None when nothing matches)
df['matched'] = df['activity_clean'].apply(first_match)

# Drop unmatched
df = df.dropna(subset=['matched'])

# Plot: one bar per matched activity, bar height = number of rows
plot = (
    ggplot(df, aes(x='matched')) +
    geom_bar() +
    labs(
        title='Activity counts',
        x='Activity',
        y='Count'
    )
)


print(plot)


In [None]:
# Activity label set: the same entries as the activity `keywords` list above, minus "nothing".
activities = ["being held", "eating", "drinking", "playing with toy", "getting changed", "crawling", "crying", "exploring", "cooking", "cleaning", "gardening", "watching tv", "driving", "reading", "on a phone call", "dancing", "packing", "looking at phone", "instrument playing", "exercising", "working on laptop"]
activities

In [None]:
plot