# Imports

In [None]:
!pip install anthropic
!pip install ffmpeg-python
!pip install av

In [None]:
import os
import pandas as pd
import anthropic
from json import loads,dumps
import time
from glob import glob
import json
import seaborn as sns
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import cv2
import torch
import random
import ffmpeg
import warnings
import numpy as np
from PIL import Image
from glob import glob
import soundfile as sf
import matplotlib.pyplot as plt
from scipy.signal import resample
import typing_extensions as typing


# Data Loading

**Defining Model** 

In [None]:
# The API key is left blank here; fill it in before running the notebook.
# NOTE(review): this assignment replaces any ANTHROPIC_API_KEY that is already
# set in the environment with the (empty) string below.
os.environ["ANTHROPIC_API_KEY"] = "" 
# Constructed with no arguments — presumably the SDK reads the key from the
# environment variable set above; confirm against the Anthropic SDK docs.
client = anthropic.Anthropic() 

**Prompt** 

In [None]:
# Moderation rubric plus output contract sent to the model with every request.
# Two deliberate details:
#   * the literal opens with exactly three quotes — a fourth one would become
#     part of the text and make the prompt start with a stray '"';
#   * the JSON template uses straight double quotes, so a reply that mirrors
#     the template is something json.loads can parse.
# The wording of the rubric itself is left untouched.
prompt = """A video can be considered inappropriate for children if it contains physical violence (cartoonish, realistic, or gory), interpersonal violence (bullying, pranks, meanness, belittling, controlling behavior, talking down to others, or manipulation), self-harm or suicide (depictions of harm inflicted on oneself or suicidal thoughts/tendencies), extreme stunts (life-endangering, high-risk activities/challenges that require adult supervision), dangerous products or services (like paintball, airsoft, fireworks, weapons, hunting equipment, graffiti products), scary content (horror, suspense, loud violence, zombies, skeletons, masks, scary clowns, blood, dangerous fire, car crashes, medical procedures or other scary visuals), sexual content (innuendos, sexual behavior, nudity, suggestiveness), intimacy (romantic conduct, couples expressing love, sensual contact, dating, relationships or any expression of love or loss for a significant other,  lyrics about or mentions of love or romance, heartbreak), inappropriate clothing (significant skin exposure or sheer clothing outside of appropriate contexts such as sunbathing, swinmwear,etc), substance use (mention, depictions, or use of alcohol, drugs, tobacco), gambling (betting, gambling, casinos, online or real world casino games, online trading), offensive language (from mild to severe with words such as “stupid” or “idiot”), stereotypes (racial, ethnic, gender, body shaming), crude humor (inappropriate humor, innuendos), weapons (guns, swords, knives, etc but toy guns such as nerf guns are fine), unsuitable media (ads for violent or adult-themed video games, casino games, games with romance or battle or first person-shooter games), fight sports (boxing, wrestling, martial arts), financial content (difference, rolling spot forex, financial spread betting and synonymous products but banking ads are fine), health and beauty services (cosmetic procedures, weight loss, tanning, piercings, tattoos, cosmetics and makeup 
(no matter their essential benefits) and other products related to external personal care focused on body image but ads on fashion products and essential hygiene products like sunscreen, shampoo, toothpaste, etc. are fine, however, products like sanitary care and hair removal is inappropriate), ads or sponsored content for consumable food or drink products (regardless of nutritional content), where the product is consumed by a person or shown with branding, packaging, or promotional intent are inappropriate; general content like cooking shows, recipes, or scenes where food is simply present or being eaten without promotion are acceptable, pet/animal food products and healthcare, healthcare or medical topics (reproductive health, mental health, substance abuse, eating disorders, pharmaceuticals, health insurance, any nutritional or dietary supplements), sensitive topics (politics, religion, controversial social issues, astrology, occult, paranormal, religious sermons), privacy or safety risks (services or competitions requiring personal information, social networks for strangers, spy services, or video game skin trading), or any explicit depictions or discussions of death or dying. Consider that you are an expert content moderator. Based on the provided title, description, channel title, tags, and thumbnail, is the video advertisement using this auido inappropriate for children on this definition? If it is, label it as “inappropriate.” If it is not, label it as “appropriate.” Justify your conclusion. Also, mention what language is being spoken if there is any. 
Make the outputs in JSON format: {"label": "appropriate" or "inappropriate", "justification": "brief explanation of the classification", "language": "language detected in the video"}."""
prompt 

In [None]:
import base64
import mimetypes

def encode_image(image_path):
    """
    Read an image file and return ``(media_type, base64_data)``.

    The media type is guessed from the file name; when no type can be
    guessed, "image/jpeg" is used. The second element is the file's bytes,
    base64-encoded and decoded to a UTF-8 string.
    """
    guessed_type = mimetypes.guess_type(image_path)[0]
    media_type = guessed_type if guessed_type is not None else "image/jpeg"

    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    return media_type, base64.b64encode(raw_bytes).decode("utf-8")


def classify_video_with_images(text_input, audio_transcription, image_paths):
    """
    Send images plus two text parts to the model in a single user message.

    Each path in ``image_paths`` is encoded with ``encode_image`` and attached
    as a base64 image part. The image parts come first, followed by
    ``text_input`` and then ``audio_transcription`` as separate text parts.

    Returns the response object from ``client.messages.create`` unchanged.
    """
    image_parts = []
    for path in image_paths:
        media_type, payload = encode_image(path)
        image_parts.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": payload,
                },
            }
        )

    text_parts = [
        {"type": "text", "text": text_input},
        {"type": "text", "text": audio_transcription},
    ]
    user_message = {"role": "user", "content": image_parts + text_parts}

    return client.messages.create(
        model="claude-sonnet-4-20250514",
        system="You are a content classification assistant that evaluates image frames and audio transcriptions from videos according to strict guidelines and returns response in JSON.",
        max_tokens=1024,
        temperature=0.0,  # lowest temperature setting for this request
        messages=[user_message],
    )

**Running Model on Dataset** 

In [None]:
import pandas as pd
import requests
import json
import re

start_idx = 0
end_idx = 2306  # placeholder; replaced by the merged row count further down

# Upper bound (seconds) for a single thumbnail download, so one stalled
# connection cannot block the whole run.
THUMBNAIL_TIMEOUT_SECONDS = 30

# === Load CSVs ===
ad_df = pd.read_csv('/kaggle/input/metadata/old_ad_details.csv')
ground_truth_df = pd.read_csv('/kaggle/input/metadata/filtered_ground_truth (1).csv')

# === Ensure ID columns are strings ===
ad_df['id'] = ad_df['id'].astype(str)
ground_truth_df['Video Id'] = ground_truth_df['Video Id'].astype(str)

print("ad_df columns:", ad_df.columns)
print("ground_truth_df columns:", ground_truth_df.columns)

# === Filter ad_df to only include rows with IDs present in ground_truth ===
filtered_ad_df = ad_df[ad_df['id'].isin(ground_truth_df['Video Id'])]

# === Drop duplicate IDs (keep first) ===
filtered_ad_df = filtered_ad_df.drop_duplicates(subset='id', keep='first')

# === Merge using ad_df.id and ground_truth_df['Video Id'] ===
merged_df = pd.merge(ground_truth_df, filtered_ad_df, left_on='Video Id', right_on='id', how='inner')

# === Save merged DataFrame ===
merged_df.to_csv('/kaggle/working/merged_df.csv', index=False)

# === Inspect output ===
print("merged_df columns:", merged_df.columns)
print(f"Total merged rows: {len(merged_df)}")

# Process every merged row: the hard-coded end index above is overridden here.
end_idx = len(merged_df)


# Slice by row range
sliced_df = merged_df.iloc[start_idx:end_idx]
print("sliced df", len(sliced_df))

# Output containers. ids / predicted_labels / languages / responses /
# ground_truths_ are parallel lists (one entry per successfully classified
# ad); `remaining` collects the IDs of ads that failed at any stage.
ids = []
predicted_labels = []
languages = []
responses = []
ground_truths_ = []
remaining = []

# The thumbnail of the ad currently being processed is written here and
# overwritten on every iteration.
TEMP_THUMB_PATH = "/kaggle/working/temp_thumb.jpg"


# Process each valid ad
for i in range(start_idx, end_idx):
    row = merged_df.iloc[i]
    ad_id = row['Video Id']
    title = row['title_x']
    description = row['description']
    channel_title = row['channelTitle_x']
    tags = row['tags_x']
    label = row['Primary Label']

    thumbnail_url = f"https://i.ytimg.com/vi/{ad_id}/default.jpg"

    try:
        # Download thumbnail (bounded by a timeout; a timeout raises and is
        # handled by the outer `except`, which records the ad as remaining).
        img_response = requests.get(thumbnail_url, timeout=THUMBNAIL_TIMEOUT_SECONDS)
        if img_response.status_code == 200:
            with open(TEMP_THUMB_PATH, "wb") as f:
                f.write(img_response.content)
        else:
            print(f"Thumbnail not found for {ad_id}")
            remaining.append(ad_id)
            continue

        # Prepare input JSON
        json_input = {
            "title": title,
            "description": description,
            "channel_title": channel_title,
            "tags": tags,
        }

        try:
            json_str = json.dumps(json_input, indent=2)

            # The metadata JSON goes in `text_input`; the moderation prompt is
            # passed through the `audio_transcription` parameter.
            response = classify_video_with_images(
                text_input=json_str,
                audio_transcription=prompt,
                image_paths=[TEMP_THUMB_PATH]  # single image as list
            )
        except Exception as e:
            print(f"Error making inference: {e}")
            remaining.append(ad_id)
            continue

        try:
            print("Completed for video number:", i, ' ', ad_id)

            response_text = response.content[0].text.strip()
            # Strip a surrounding Markdown code fence (``` or ```json) if present.
            if response_text.startswith("```"):
                response_text = re.sub(r"^```(?:json)?\n?", "", response_text)
                response_text = re.sub(r"\n?```$", "", response_text)
            dictionary = json.loads(response_text)

            print('True Label:', label, 'Response:', dictionary)

            # Read every required key BEFORE touching the output lists. If a
            # key is missing, nothing has been appended yet, so the parallel
            # lists stay the same length and the ad is only recorded in
            # `remaining`.
            predicted_label = dictionary['label']
            language = dictionary['language']
            justification = dictionary['justification']

            ids.append(ad_id)
            predicted_labels.append(predicted_label)
            languages.append(language)
            responses.append(justification)
            ground_truths_.append(label)

        except Exception as e:
            print(f"Error processing response.text: {e}")
            remaining.append(ad_id)
            continue

    except Exception as e:
        print(f"Unexpected error: {e}")
        remaining.append(ad_id)


In [None]:
# At the end, print remaining videos 

# Persist the IDs of ads that could not be processed so they can be re-run.
remaining_path = f'remaining-{start_idx}-{end_idx}.csv'
remaining_df = pd.DataFrame({'Video Id': remaining})

# Save to CSV
remaining_df.to_csv(remaining_path, index=False)

# Report the path that was actually written (the name includes the row range).
print(f"Remaining videos saved to {remaining_path}")
print("Remaining videos with errors:", remaining) 

In [None]:
# Print each ground-truth label next to the model's prediction, one line per
# collected response.
for _justification, true_label, predicted in zip(responses, ground_truths_, predicted_labels):
    print('True Label: ', true_label, '\tPrediction: ', predicted)

In [None]:
# Collect the parallel result lists into one table, one row per classified ad.
result_columns = {
    'Video Id': ids,
    'Primary Label': ground_truths_,
    'Predicted Label': predicted_labels,
    'Response': responses,
    'Languages': languages,
}
new_df = pd.DataFrame(result_columns)

new_df.head() 

In [None]:
# Write the per-video results table to the working directory; the processed
# row range is embedded in the filename.
# NOTE(review): the filename says "gemini" although this notebook calls an
# Anthropic model — confirm whether that is intentional before renaming, since
# other code may read this exact path.
new_df.to_csv(f'/kaggle/working/gemini-only-metadata-{start_idx}-{end_idx}.csv', index=False)

In [None]:
# Changing to binary lists 

def _to_binary(label):
    """Return 1 when `label` reads 'inappropriate', otherwise 0.

    The comparison ignores case and surrounding whitespace, so variants such
    as "Inappropriate" or " inappropriate " are not silently counted as the
    negative class. Non-string values (e.g. missing labels) map to 0.
    """
    return 1 if str(label).strip().lower() == 'inappropriate' else 0


# 1 = inappropriate, 0 = anything else.
predictions = [_to_binary(pred) for pred in predicted_labels]
ground_truths = [_to_binary(label) for label in ground_truths_]

In [None]:
# Obtaining classification report 
from sklearn.metrics import classification_report 

# Text summary of per-class metrics for predictions vs. ground truth.
report = classification_report(y_true=ground_truths, y_pred=predictions)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly (0 = appropriate, 1 = inappropriate) so the
# matrix is always 2x2 and lines up with the tick labels, even when one class
# never occurs in the ground truth or the predictions.
class_names = ['Appropriate', 'Inappropriate']
cm = confusion_matrix(ground_truths, predictions, labels=[0, 1])

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()