Imports

In [None]:
!pip install openai
!pip install ffmpeg-python
!pip install av
!pip install scenedetect

In [None]:
import os
import cv2
import time
import json
import torch
import random
import ffmpeg
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import soundfile as sf
from openai import OpenAI
from json import loads,dumps
import matplotlib.pyplot as plt
from scipy.signal import resample
import typing_extensions as typing
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from scenedetect import open_video, VideoStreamCv2, SceneManager
from scenedetect.detectors import ContentDetector

In [None]:
# === Step 1: Load and filter ground truth ===
# Normalise the label and id columns BEFORE filtering, then take an explicit
# copy: assigning a column on a filtered slice triggers pandas'
# SettingWithCopyWarning and may write to a temporary object.
ground_df = pd.read_csv('/kaggle/input/youtube-data/ground_labels_new.csv')
ground_df['Primary Label'] = ground_df['Primary Label'].str.lower()
ground_df['Video Id'] = ground_df['Video Id'].astype(str).str.strip()
ground_df = ground_df[ground_df['Primary Label'].isin(['appropriate', 'inappropriate'])].copy()

# === Step 2: Filter based on remaining IDs ===
# Ids in this file may carry a leading apostrophe, which is stripped here.
remaining_ids_df = pd.read_csv('/kaggle/input/dynamic-few-shot/filtered_ground_truth.csv')
remaining_ids_df['Video Id'] = remaining_ids_df['Video Id'].astype(str).str.lstrip("'").str.strip()
filtered_ids = remaining_ids_df['Video Id'].tolist()
ground_df = ground_df[ground_df['Video Id'].isin(filtered_ids)].copy()

# === Step 3: Load and merge with transcriptions ===
trans_df = pd.read_csv('/kaggle/input/youtube-data/eng-complete-transcriptions.csv')
trans_df['Video Id'] = trans_df['Video Id'].astype(str).str.strip()

# Keep only the transcription column and Video Id
trans_df = trans_df[['Video Id', 'Transcription']]

# Merge with ground truth (inner join: videos without a transcription row are dropped)
merged_df = pd.merge(ground_df[['Video Id', 'Primary Label']], trans_df, on='Video Id', how='inner')

# === Step 4: Load and merge with metadata ===
meta_df = pd.read_csv('/kaggle/input/dynamic-few-shot/ad_metadata_filtered.csv')
meta_df = meta_df.rename(columns={"id": "Video Id"})
meta_df["Video Id"] = meta_df["Video Id"].astype(str).str.strip()

# Keep only required metadata fields
meta_df = meta_df[['Video Id', 'title', 'channelTitle', 'tags', 'description', 'thumbnail']]

# Final merge (inner join: videos without a metadata row are dropped)
df = pd.merge(merged_df, meta_df, on='Video Id', how='inner')

# === Final check ===
print(f"Final merged dataframe shape: {df.shape}")
df.head()

In [None]:
# Pull the id, label and transcription columns of the merged frame out as
# plain Python lists (all three share the row order of `df`).

video_ids = df['Video Id'].tolist()
primary_labels = df['Primary Label'].tolist()
all_transcriptions = df['Transcription'].tolist()

In [None]:
# Extracting data from transcripts
# For every transcription keep only the text that precedes the first
# occurrence of the substring "chunks", and record that text's length.

transcriptions = []
lengths = []

for raw_transcription in all_transcriptions:
    # Empty CSV cells arrive from pandas as NaN (a float), which has no
    # .split(); treat anything that is not a string as an empty transcript.
    if isinstance(raw_transcription, str):
        text = raw_transcription.split("chunks")[0]
    else:
        text = ''
    transcriptions.append(text)
    lengths.append(len(text))


In [None]:
import os 

# Names of the entries in the Ads directory. The inference loop builds each
# ad's folder path from its video id and skips ids that are not listed here.
available_ids = os.listdir('/kaggle/input/youtube-data/Ads/Ads') 
len(available_ids) 

Extracting Images 

In [None]:
def detect_scenes(video_path, threshold = 30):
    """Detect scenes, lowering the detector threshold until enough are found.

    The threshold is integer-halved and a fresh ``ContentDetector`` pass is
    run over the video, repeatedly, until at least 6 scenes are returned or
    the threshold has reached 0.

    NOTE(review): the halving happens *before* each pass, so the value passed
    in is never itself used as a detector threshold (the default 30 is first
    applied as 15), and the last possible pass runs with threshold 0. Confirm
    this is intended.

    Parameters:
    - video_path: str, path to the video file.
    - threshold: starting value that is halved before every detection pass.

    Returns:
    - The scene list of the last pass, as returned by
      ``SceneManager.get_scene_list()``; an empty list if no pass was run.
    """
    min_scene_count = 6
    sensitivity = threshold
    detected = []

    while sensitivity > 0 and len(detected) < min_scene_count:
        sensitivity //= 2

        # A new video handle and manager per pass: each pass scans from the start.
        manager = SceneManager()
        manager.add_detector(ContentDetector(threshold=sensitivity))
        manager.detect_scenes(open_video(video_path))
        detected = manager.get_scene_list()

    return detected


def get_top_n_longest_scenes(scene_list, n):
    '''Pick the n longest scenes.

    Each scene is a (start, end) pair; its length is ``end - start``. Scenes
    are ranked longest first (equal lengths keep their input order) and the
    first n are returned as (start, start + length) pairs.
    '''
    ranked = sorted(
        ((begin, finish - begin) for begin, finish in scene_list),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(begin, begin + length) for begin, length in ranked[:n]]


def sort_scenes_by_frame(scenes_list):
    '''Return a new list of the scenes ordered by the frame number of their start.

    Each scene's first element must provide ``get_frames()``; the input list
    is left untouched.
    '''
    def start_frame(scene):
        return scene[0].get_frames()

    ordered = list(scenes_list)
    ordered.sort(key=start_frame)
    return ordered


def get_num_grids(video_path):
    '''Get number of grids to be created.

    One grid is used per started minute of video, with at least 1 and at
    most 5 grids. The duration is derived from the frame count and FPS
    reported by OpenCV.

    Parameters:
    - video_path: str, path to the video file.

    Returns:
    - int between 1 and 5.

    Raises:
    - ValueError: if OpenCV reports a non-positive FPS (e.g. the file could
      not be opened), so the duration cannot be computed.
    '''
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Always give the decoder handle back, even if a property read fails.
        cap.release()

    if fps <= 0:
        raise ValueError(
            f"Cannot determine the duration of {video_path!r}: reported FPS is {fps}"
        )

    duration = round(total_frames / fps, 2)

    # Started minutes, clamped to [1, 5]; the lower clamp guards against a
    # negative frame count being reported.
    return max(1, min(int(duration // 60) + 1, 5))
        

def extract_k_frames_from_scene(video_path, scene, k):
    '''Extract k frames evenly spaced from a scene.

    Parameters:
    - video_path: str, path to the video file.
    - scene: (start, end) pair whose elements provide ``get_frames()``.
    - k: int, number of frames to sample; values <= 0 yield an empty list
      without opening the video.

    Returns:
    - list of frames as returned by ``cv2.VideoCapture.read()``. The list can
      be shorter than k because frames that fail to decode are skipped.
    '''
    if k <= 0:
        return []

    # Sample strictly inside the scene: one frame after its start, one before its end.
    start_frame = scene[0].get_frames() + 1
    end_frame = scene[1].get_frames() - 1

    # k equally spaced frame indices within the scene's range (k == 1 -> start_frame only)
    frame_indices = np.linspace(start_frame, end_frame, k, dtype=int)

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        for frame_no in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_no))
            ret, frame = cap.read()
            if ret:
                frames.append(frame)
    finally:
        # Release the capture even if seeking/decoding raises.
        cap.release()

    return frames


def create_image_grid(frames, grid_size=(1000, 1000)):
    '''Arrange 6 frames into a grid of 3 rows x 2 columns and resize it.

    Only the first six frames are used; each is resized to 640x360 so the
    tiles can be concatenated. The channel order of the frames is not changed.

    Parameters:
    - frames: sequence of at least 6 image arrays.
    - grid_size: (width, height) passed to ``PIL.Image.resize``.

    Returns:
    - numpy array holding the resized grid image.

    Raises:
    - ValueError: if fewer than 6 frames are supplied. (Previously this
      surfaced as a NumPy concatenation ValueError; callers that treat the
      failure as "not enough frames" keep working.)
    '''
    frames = list(frames)
    if len(frames) < 6:
        raise ValueError(
            f"create_image_grid needs at least 6 frames, got {len(frames)}"
        )

    # Ensure all tiles have the same size for concatenation
    tiles = [cv2.resize(frame, (640, 360)) for frame in frames[:6]]
    rows = [np.concatenate(tiles[i:i + 2], axis=1) for i in range(0, 6, 2)]
    image_grid = np.concatenate(rows, axis=0)

    return np.array(Image.fromarray(image_grid).resize(grid_size))

In [None]:
def get_images(video_path, n=6):
    '''Build k image grids from the longest scenes of a video.

    Steps:
        1. Detect scenes.
        2. Determine k, the number of grids.
        3. Keep the n*k longest scenes and order them by start frame.
        4. Take one frame from each kept scene.
        5. Tile every consecutive group of n frames into one grid.

    No check is made that n*k frames were obtained: when a group comes up
    short, the error raised by create_image_grid propagates to the caller
    (the inference loop catches it and falls back to get_images_2).

    Returns:
    - list of k grid images.
    '''
    detected = detect_scenes(video_path)
    k = get_num_grids(video_path)
    chosen = sort_scenes_by_frame(get_top_n_longest_scenes(detected, n * k))

    # One representative frame per chosen scene, in chronological order
    frames = [
        frame
        for scene in chosen
        for frame in extract_k_frames_from_scene(video_path, scene, 1)
    ]

    return [
        create_image_grid(frames[offset:offset + n], grid_size=(1000, 1000))
        for offset in (index * n for index in range(k))
    ]

In [None]:
def get_images_2(video_path, n=6):
    '''
    Extracts image grids from video based on scenes.
    Falls back to uniform sampling over the whole video if no scenes are detected.

    The n*k frames needed (k = number of grids) are spread as evenly as
    possible over the detected scenes. If fewer than n frames could be
    decoded they are repeated cyclically up to n. Only complete groups of n
    frames are turned into grids, so fewer than k grids may be returned.

    Parameters:
    - video_path: str, path to the video file.
    - n: int, frames per grid (create_image_grid itself tiles six frames).

    Returns:
    - list of grid images; empty if n <= 0.

    Raises:
    - ValueError: if no frame at all could be decoded from the video.
    '''
    if n <= 0:
        return []

    # Step 1: Detect scenes
    scene_list = detect_scenes(video_path)

    # Step 2: Get number of grids and total frames needed
    k = get_num_grids(video_path)
    total_frames_needed = n * k

    def extract_nk_frames(video_path, total_frames_needed):
        '''Sample `total_frames_needed` evenly spaced frames from the whole video.'''
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames == 0:
                return []

            selected_indices = np.linspace(0, total_frames - 1, total_frames_needed, dtype=int)

            sampled = []
            for idx in selected_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                success, frame = cap.read()
                if success:
                    sampled.append(frame)
            return sampled
        finally:
            cap.release()

    if not scene_list:
        # Case 1: No scenes detected -> use full video sampling
        frames = extract_nk_frames(video_path, total_frames_needed)
    else:
        # Case 2: Scenes detected -> share the frame budget between the scenes;
        # the first `extra` scenes take one additional frame each.
        frames_per_scene, extra = divmod(total_frames_needed, len(scene_list))

        frames = []
        for i, scene in enumerate(scene_list):
            num_frames = frames_per_scene + (1 if i < extra else 0)
            frames.extend(extract_k_frames_from_scene(video_path, scene, num_frames))

        frames = frames[:total_frames_needed]

    # Step 3: Make sure at least one full grid can be built
    if not frames:
        raise ValueError(f"No frames could be decoded from {video_path!r}")
    if len(frames) < n:
        # Repeat the available frames cyclically until there are n of them
        frames = [frames[i % len(frames)] for i in range(n)]

    # Step 4: Create image grids from complete groups of n frames only; a
    # partial trailing group cannot be tiled and is dropped.
    complete_groups = min(k, len(frames) // n)
    grids = []
    for i in range(complete_groups):
        grid_frames = frames[i * n:(i + 1) * n]
        grids.append(create_image_grid(grid_frames, grid_size=(1000, 1000)))

    return grids

In [None]:
# OpenAI credentials. Paste a key here for a quick run, or leave it blank to
# pick the key up from the OPENAI_API_KEY environment variable so that the
# secret does not have to be stored in the notebook.
apikey = ""
if not apikey:
    apikey = os.environ.get("OPENAI_API_KEY", "")

client = OpenAI(api_key=apikey) 

In [None]:
# JSON Schema sent as the `response_format` of the classification request.
# The reply must be an object with:
#   - label:         "inappropriate" or "appropriate"
#   - justification: free text, at least 10 characters
#   - languages:     list of strings
# NOTE(review): the prompt text asks for a "language" string, whereas this
# schema requires a "languages" array; the inference loop reads either key.
output_schema = {
  "type": "object",
  "properties": {
    "label": {
      "type": "string",
      "enum": ["inappropriate", "appropriate"]
    },
    "justification": {
      "type": "string",
      "minLength": 10
    }, 
    "languages": {
        "type": "array", 
        "items": {
            "type": "string" 
        }
        
    }
  },
  "required": ["label", "justification", "languages"]
} 

In [None]:
import base64
import mimetypes

def encode_image(path):
    """Read an image file and return ``(media_type, base64_text)``.

    The media type is guessed from the file name; when it cannot be guessed,
    "image/png" is used. The file content is returned base64-encoded as a
    UTF-8 string.
    """
    with open(path, "rb") as handle:
        raw_bytes = handle.read()
    payload = base64.b64encode(raw_bytes).decode("utf-8")

    guessed_type = mimetypes.guess_type(path)[0]
    if guessed_type:
        return guessed_type, payload
    return "image/png", payload

In [None]:
def classify_video_with_images(text_input, audio_transcription, image_paths, metadata=None, timeout=120):
    """
    Sends text, audio transcription, metadata, and multiple images to the
    OpenAI chat-completions endpoint for classification.

    Parameters:
    - text_input: str, the instruction/prompt text.
    - audio_transcription: str, transcription of the audio.
    - image_paths: list of str, paths to the images (sent as base64 data URLs).
    - metadata: dict, optional metadata with the keys 'title', 'channel_title',
      'tags' (list or string) and 'description'. Missing values (None / NaN)
      are rendered as empty text instead of the literal "nan".
    - timeout: seconds to wait for the HTTP request before giving up.

    Returns:
    - response: the ``requests.Response`` of the POST; the caller is
      responsible for decoding it (e.g. ``response.json()``).

    Raises:
    - requests.exceptions.RequestException: on connection errors or when the
      timeout expires.
    """
    # Imported locally because the notebook only imports `requests` in a later cell.
    import requests

    def _as_text(value):
        # pandas passes empty CSV cells on as NaN, a float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return value

    # === Encode all images to base64 ===
    encoded_images = []
    for path in image_paths:
        media_type, encoded_data = encode_image(path)
        encoded_images.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{media_type};base64,{encoded_data}"
            }
        })

    # === Format metadata ===
    if metadata and isinstance(metadata, dict):
        tags = metadata.get('tags', '')
        if isinstance(tags, list):
            tags_text = ', '.join(map(str, tags))
        else:
            tags_text = _as_text(tags)
        metadata_text = (
            f"Video Title: {_as_text(metadata.get('title', ''))}\n"
            f"Channel Title: {_as_text(metadata.get('channel_title', ''))}\n"
            f"Tags: {tags_text}\n"
            f"Description: {_as_text(metadata.get('description', ''))}"
        )
    else:
        metadata_text = "No metadata provided."

    # === Prepare request payload ===
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {apikey}"
    }

    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Instruction:\n{text_input}"},
                    {"type": "text", "text": f"Metadata:\n{metadata_text}"},
                    {"type": "text", "text": f"Audio Transcription:\n{audio_transcription}"}
                ] + encoded_images
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "output_schema",
                "schema": output_schema
            }
        },
        "max_tokens": 300,
        "temperature": 0.0
    }

    # A timeout keeps one stalled connection from hanging the whole batch run.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    return response

In [None]:
import torch
# Print whether PyTorch can see a CUDA device in this session
print(torch.cuda.is_available())

Prompt 

In [None]:
# Instruction text sent with every classification request: the definition of
# "inappropriate for children", the task description and the reply format.
# NOTE(review): the literal opens with four double quotes, so the prompt text
# itself starts with a stray `"` character.
# NOTE(review): the reply format below asks for a "language" string, while
# output_schema requires a "languages" array.
prompt =  """"A video can be considered inappropriate for children if it contains physical violence (cartoonish, realistic, or gory), interpersonal violence (bullying, pranks, meanness, belittling, controlling behavior, talking down to others, or manipulation), self-harm or suicide (depictions of harm inflicted on oneself or suicidal thoughts/tendencies), extreme stunts (life-endangering, high-risk activities/challenges that require adult supervision), dangerous products or services (like paintball, airsoft, fireworks, weapons, hunting equipment, graffiti products), scary content (horror, suspense, loud violence, zombies, skeletons, masks, scary clowns, blood, dangerous fire, car crashes, medical procedures or other scary visuals), sexual content (innuendos, sexual behavior, nudity, suggestiveness), intimacy (romantic conduct, couples expressing love, sensual contact, dating, relationships or any expression of love or loss for a significant other,  lyrics about or mentions of love or romance, heartbreak), inappropriate clothing (significant skin exposure or sheer clothing outside of appropriate contexts such as sunbathing, swinmwear,etc), substance use (mention, depictions, or use of alcohol, drugs, tobacco), gambling (betting, gambling, casinos, online or real world casino games, online trading), offensive language (from mild to severe with words such as “stupid” or “idiot”), stereotypes (racial, ethnic, gender, body shaming), crude humor (inappropriate humor, innuendos), weapons (guns, swords, knives, etc but toy guns such as nerf guns are fine), unsuitable media (ads for violent or adult-themed video games, casino games, games with romance or battle or first person-shooter games), fight sports (boxing, wrestling, martial arts), financial content (difference, rolling spot forex, financial spread betting and synonymous products but banking ads are fine), health and beauty services (cosmetic procedures, weight loss, tanning, piercings, tattoos, cosmetics and makeup 
(no matter their essential benefits) and other products related to external personal care focused on body image but ads on fashion products and essential hygiene products like sunscreen, shampoo, toothpaste, etc. are fine, however, products like sanitary care and hair removal is inappropriate), ads or sponsored content for consumable food or drink products (regardless of nutritional content), where the product is consumed by a person or shown with branding, packaging, or promotional intent are inappropriate; general content like cooking shows, recipes, or scenes where food is simply present or being eaten without promotion are acceptable, pet/animal food products and healthcare, healthcare or medical topics (reproductive health, mental health, substance abuse, eating disorders, pharmaceuticals, health insurance, any nutritional or dietary supplements), sensitive topics (politics, religion, controversial social issues, astrology, occult, paranormal, religious sermons), privacy or safety risks (services or competitions requiring personal information, social networks for strangers, spy services, or video game skin trading), or any explicit depictions or discussions of death or dying. Consider that you are an expert content moderator. Based on the image grids provided containing frames from key scenes from within the video, the video title, channel title, description, thumbnail, tags and its transcription, is the video advertisement inappropriate for children on this definition? If it is, label it as “inappropriate.” If it is not, label it as “appropriate.” Justify your conclusion. Also, mention what language is being spoken if there is any. 
Respond in **JSON format only**. Do **not** include any commentary.

Return only this format (no preamble):

{
  "label": "appropriate" or "inappropriate",
  "justification": "brief reason",
  "language": "language spoken in the video"
}"""

prompt 

Run inference

In [None]:
# Number of ads in the merged dataset
len(video_ids)

In [None]:
import pandas as pd
import requests
import json
import time
import os
import re
from PIL import Image

# === Parameters ===
# Rows [start, end) of the merged dataframe are processed in this run.
start = 0
end = 2306
TEMP_THUMB_PATH = "/kaggle/working/temp_thumb.jpg"
img_dir = '/kaggle/working/Images'
os.makedirs(img_dir, exist_ok=True)

# === Output containers ===
# Parallel lists: one entry per successfully classified ad.
ids = []
predicted_labels = []
languages = []
responses = []
ground_truths_ = []
# Ids of ads that failed at any stage (frame extraction or classification).
remaining = []

# === Slice metadata ===
metadata_df = df.iloc[start:end].reset_index(drop=True)
print(f"Filtered metadata contains {len(metadata_df)} remaining ads.")

# Set membership is O(1); the directory listing is checked once per row.
available_id_set = set(available_ids)

for i, row in metadata_df.iterrows():
    ad_id = row["Video Id"]
    if ad_id not in available_id_set:
        continue

    print(f'\n ID: {ad_id}')
    try:
        # === File paths ===
        # The ad folder holds the video plus (optionally) an extracted audio.mp3.
        ad_path = f'/kaggle/input/youtube-data/Ads/Ads/{ad_id}'
        contents = [name for name in os.listdir(ad_path) if name != 'audio.mp3']
        if not contents:
            raise FileNotFoundError(f"No video file found in {ad_path}")
        video_path = os.path.join(ad_path, contents[0])

        # === Transcription ===
        audio = row['Transcription'] if pd.notna(row.get('Transcription')) else ''

        # === Extract frames ===
        # Scene-based grids first; on any failure (e.g. too few scenes) fall
        # back to the more tolerant extractor. `Exception` rather than a bare
        # except, so that interrupting the kernel still stops the run.
        try:
            images = get_images(video_path)
        except Exception as scene_error:
            print(f"Scene-based extraction failed ({scene_error}); using fallback sampling.")
            images = get_images_2(video_path)

        image_paths = []
        for idx, img in enumerate(images):
            # Frames decoded by OpenCV are in BGR channel order, while PIL
            # treats a 3-channel array as RGB; convert so the saved grids
            # keep their true colours.
            image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            path = os.path.join(img_dir, f"{ad_id}_{idx + 1}.png")
            image.save(path)
            image_paths.append(path)
            print("Saved:", path)

        # === Metadata ===
        title = row.get('title', '')
        description = row.get('description', '')
        channel_title = row.get('channelTitle', '')
        tags = row.get('tags', '')
        primary_label = row.get('Primary Label', '')

        # === Download & save thumbnail ===
        # Best effort: a missing thumbnail must not fail the ad.
        try:
            r = requests.get(f"https://i.ytimg.com/vi/{ad_id}/default.jpg", timeout=30)
            if r.status_code == 200:
                with open(TEMP_THUMB_PATH, "wb") as f:
                    f.write(r.content)
                image_paths.append(TEMP_THUMB_PATH)
                print("Thumbnail saved:", TEMP_THUMB_PATH)
            else:
                print("Thumbnail download failed.")
        except Exception as e:
            print(f"Thumbnail error: {e}")

        # === JSON input for metadata ===
        json_input = {
            "title": title,
            "description": description,
            "channel_title": channel_title,
            "tags": tags
        }

        # === Inference ===
        try:
            print("Making inference...")
            classification_response = classify_video_with_images(
                prompt, audio, image_paths, json_input
            )

            response_json = classification_response.json()

            # Error replies from the API carry no 'choices' entry.
            if 'choices' not in response_json:
                raise ValueError(f"Missing 'choices' in response: {response_json}")

            message_content = response_json['choices'][0]['message']['content']
            parsed_json = json.loads(message_content)

            temp_id = ad_id
            temp_label = primary_label
            temp_predicted_label = parsed_json.get('label')
            temp_response = message_content
            # The prompt asks for 'language', the schema for 'languages'; accept both.
            temp_languages = parsed_json.get('language') or parsed_json.get('languages')

            ids.append(temp_id)
            ground_truths_.append(temp_label)
            predicted_labels.append(temp_predicted_label)
            responses.append(temp_response)
            languages.append(temp_languages)

            print(f"\nCompleted: {i} | ID: {temp_id} | GT: {temp_label} | Pred: {temp_predicted_label}")
            print("Parsed response:", parsed_json)

        except Exception as e:
            print(f"Failed to parse classification for {ad_id} | Error: {e}")
            remaining.append(ad_id)

    except Exception as e:
        print('\nImage extraction failed for ', i, 'Error:', str(e))
        # Record the id of THIS row: `i` indexes the sliced frame, so looking
        # it up in the full `video_ids` list is wrong whenever start != 0.
        remaining.append(ad_id)

    # Simple pacing between API calls
    time.sleep(2) 

# === Final Summary ===
print("\nRemaining ads with issues:", remaining)

In [None]:
# Folder that receives the result CSVs; exist_ok makes re-running the cell
# safe without a separate (race-prone) existence check.
results_dir = '/kaggle/working/results'
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Persist the ids of the ads that failed so that they can be re-run later

remaining_df = pd.DataFrame({'Video Id': remaining})

# Save to CSV (the file name encodes the processed row range)
remaining_path = f'/kaggle/working/results/DAVSP-gpt-metadata-eng-remaining-{start}-{end}.csv'
remaining_df.to_csv(remaining_path, index=False)

# Report the file that was actually written
print(f"Remaining videos saved to {remaining_path}")
print("Remaining videos with errors:", remaining) 

In [None]:
# Assemble the per-ad results (id, ground truth, prediction, raw model reply
# and reported languages) into one table.
result_columns = {
    'Video Id': ids,
    'Primary Label': ground_truths_,
    'Predicted Label': predicted_labels,
    'Response': responses,
    'Languages': languages,
}
new_df = pd.DataFrame(result_columns)

new_df.head() 

In [None]:
# Write the results table to CSV; the file name encodes the processed row range
new_df.to_csv(f'/kaggle/working/results/davsp_gpt-metadata-eng-new-{start}-{end}.csv', index=False)

In [None]:
# Encode the labels as integers: 1 for 'inappropriate', 0 for anything else

predictions = [int(pred == 'inappropriate') for pred in predicted_labels]
ground_truths = [int(label == 'inappropriate') for label in ground_truths_]

In [None]:
# Obtaining classification report 
# (per-class metrics for the binary labels: 0 = appropriate, 1 = inappropriate)
from sklearn.metrics import classification_report 

report = classification_report(ground_truths, predictions) 
print(report) 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Axis labels in the order of the encoded classes (0, 1)
class_names = ['Appropriate', 'Inappropriate']

# labels=[0, 1] pins the matrix to 2x2 even if only one class occurs in the
# data, so the tick labels always line up with the cells.
cm = confusion_matrix(ground_truths, predictions, labels=[0, 1])

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()