Installation and Imports

In [None]:
!pip install ffmpeg-python
!pip install av
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate
!pip install anthropic


In [None]:
import json
from json import loads
import mimetypes
import os
import random
import re
import time
import warnings

import anthropic
import cv2
import ffmpeg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import soundfile as sf
import torch
import typing_extensions as typing
from PIL import Image
from scipy.signal import resample
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Build the evaluation frame: ground-truth labels joined with transcriptions.

# Ground truth, with labels lower-cased and restricted to the two target classes.
df = pd.read_csv('/kaggle/input/dynamic-few-shot/filtered_ground_truth.csv')
df = df.assign(**{'Primary Label': df['Primary Label'].str.lower()})
df = df.loc[df['Primary Label'].isin(['appropriate', 'inappropriate'])]

# Allowed ids: the same CSV again, with leading apostrophes stripped from each id.
# NOTE(review): df itself keeps the unstripped ids, so this filter drops exactly
# the rows whose 'Video Id' starts with an apostrophe — confirm that is intended.
ids_df = pd.read_csv('/kaggle/input/dynamic-few-shot/filtered_ground_truth.csv')
ids_df = ids_df.assign(**{'Video Id': ids_df['Video Id'].str.lstrip("'")})
ids_ = ids_df['Video Id'].to_list()
df = df.loc[df['Video Id'].isin(ids_)]

# Inner join: only videos that have a transcription row survive.
transcriptions_df = pd.read_csv('/kaggle/input/youtube-data/eng-complete-transcriptions.csv')
df = df.merge(transcriptions_df, on='Video Id', how='inner')

df

In [None]:
# Pull the three aligned columns out of df as plain lists (row order preserved).

video_ids, primary_labels, all_transcriptions = (
    list(df[column_name])
    for column_name in ('Video Id', 'Primary Label', 'Transcription')
)

len(video_ids)

In [None]:
# Keep only the text that precedes the first "chunks" marker of every raw
# transcription, and record the length of that text.

transcriptions = [raw_text.partition("chunks")[0] for raw_text in all_transcriptions]
lengths = [len(trimmed) for trimmed in transcriptions]

# Names of the entries present in the ads directory.
available_ids = os.listdir('/kaggle/input/youtube-data/Ads/Ads')
len(available_ids)

Extracting Images 

In [None]:
def extract_single_image(video_path, num_frames=6):
    """Sample frames evenly from a video and tile them into one 1000x1000 image.

    Frames are read at indices ``0, interval, 2 * interval, ...`` where
    ``interval = max(1, total_frames // num_frames)``. They are laid out two
    per row, the rows are stacked vertically, and the mosaic is resized to
    1000x1000 pixels.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        num_frames: Number of frames to sample. Must be at least 1. An odd
            value leaves the last row with a single frame, which is expected
            to fail when the rows are stacked.

    Returns:
        The resized mosaic as a NumPy array.

    Raises:
        ValueError: If ``num_frames`` is not positive, fewer than
            ``num_frames`` frames could be read, or the frames cannot be
            concatenated.
    """
    if num_frames < 1:
        raise ValueError("num_frames must be a positive integer.")

    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Calculate the interval at which to sample frames
        interval = max(1, total_frames // num_frames)
        frames = []

        # Extract frames at regular intervals; unreadable positions are skipped
        for i in range(num_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * interval)
            ret, frame = cap.read()
            if ret:
                frames.append(frame)
    finally:
        # Release the capture even if seeking or decoding raised.
        cap.release()

    # Ensure the correct number of frames were extracted
    if len(frames) < num_frames:
        raise ValueError("Not enough frames extracted from the video.")

    # Concatenate all frames into a single image: two frames per row
    try:
        concatenated_rows = [
            np.concatenate(frames[i:i + 2], axis=1)
            for i in range(0, num_frames, 2)
        ]
        concatenated_image = np.concatenate(concatenated_rows, axis=0)
    except Exception as e:
        raise ValueError(f"Error during frame concatenation: {e}") from e

    # Resize the final image
    # NOTE(review): OpenCV normally yields BGR frames while PIL assumes RGB; the
    # channel order is left untouched here — confirm what consumers expect.
    final_image = Image.fromarray(concatenated_image).resize((1000, 1000))
    return np.array(final_image)

In [None]:
# The API key must come from the environment, never from the notebook source.
# Assigning an empty string here would overwrite a key that is already
# configured, so the variable is only checked and the run stops immediately
# with a clear message when it is missing.
if not os.environ.get("ANTHROPIC_API_KEY"):
    raise RuntimeError(
        "ANTHROPIC_API_KEY is not set. Provide it through the environment "
        "before running this cell instead of writing it into the notebook."
    )

# No key is passed explicitly; presumably the SDK reads ANTHROPIC_API_KEY itself.
client = anthropic.Anthropic()

In [None]:
import base64
import httpx

# Function to encode the image

def encode_image(image_path):
    """Read an image file and return ``(media_type, base64_data)``.

    The media type is guessed from the file extension, so JPEG, GIF or WebP
    files are no longer mislabelled as PNG. When the extension is missing or
    does not map to an ``image/*`` type, ``"image/png"`` is returned as
    before.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A tuple of the media type string and the file contents encoded as a
        base64 UTF-8 string.
    """
    image_media_type, _ = mimetypes.guess_type(image_path)
    if image_media_type is None or not image_media_type.startswith("image/"):
        image_media_type = "image/png"

    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode('utf-8')
    return image_media_type, image_data

In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this session.
print(torch.cuda.is_available())

Define Model 

In [None]:
def classify_video_with_text(text_input, audio_transcription,
                             model="claude-sonnet-4-20250514", max_tokens=1024):
    """Send the prompt and one transcription to the model; return the raw reply.

    The request is one user message with two text blocks (the instruction
    prompt, then the transcription) and a fixed system message asking for a
    JSON answer. Temperature is 0.0.

    Args:
        text_input: Instruction prompt placed in the first text block.
        audio_transcription: Transcription text placed in the second block.
        model: Model identifier passed to the API. Defaults to the model the
            notebook was written against.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The response object returned by ``client.messages.create``, unmodified.
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": text_input},
            {"type": "text", "text": audio_transcription},
        ],
    }

    # `client` is the module-level Anthropic client created earlier in the notebook.
    response = client.messages.create(
        model=model,
        system="You are a content classification assistant that evaluates audio transcriptions from videos according to strict guidelines and gives a response in JSON format.",
        max_tokens=max_tokens,
        temperature=0.0,
        messages=[user_message],
    )

    return response

Prompt 

In [None]:

# Instruction prompt sent as the first text block of every request. It spells
# out the "inappropriate for children" criteria and asks for a JSON object with
# "label", "justification" and "language" keys.
# NOTE(review): the text is kept verbatim (including typos such as "swinmwear")
# because editing it would change the model input.
prompt = """A video can be considered inappropriate for children if it contains physical violence (cartoonish, realistic, or gory), interpersonal violence (bullying, pranks, meanness, belittling, controlling behavior, talking down to others, or manipulation), self-harm or suicide (depictions of harm inflicted on oneself or suicidal thoughts/tendencies), extreme stunts (life-endangering, high-risk activities/challenges that require adult supervision), dangerous products or services (like paintball, airsoft, fireworks, weapons, hunting equipment, graffiti products), scary content (horror, suspense, loud violence, zombies, skeletons, masks, scary clowns, blood, dangerous fire, car crashes, medical procedures or other scary visuals), sexual content (innuendos, sexual behavior, nudity, suggestiveness), intimacy (romantic conduct, couples expressing love, sensual contact, dating, relationships or any expression of love or loss for a significant other,  lyrics about or mentions of love or romance, heartbreak), inappropriate clothing (significant skin exposure or sheer clothing outside of appropriate contexts such as sunbathing, swinmwear,etc), substance use (mention, depictions, or use of alcohol, drugs, tobacco), gambling (betting, gambling, casinos, online or real world casino games, online trading), offensive language (from mild to severe with words such as “stupid” or “idiot”), stereotypes (racial, ethnic, gender, body shaming), crude humor (inappropriate humor, innuendos), weapons (guns, swords, knives, etc but toy guns such as nerf guns are fine), unsuitable media (ads for violent or adult-themed video games, casino games, games with romance or battle or first person-shooter games), fight sports (boxing, wrestling, martial arts), financial content (difference, rolling spot forex, financial spread betting and synonymous products but banking ads are fine), health and beauty services (cosmetic procedures, weight loss, tanning, piercings, tattoos, cosmetics and makeup 
(no matter their essential benefits) and other products related to external personal care focused on body image but ads on fashion products and essential hygiene products like sunscreen, shampoo, toothpaste, etc. are fine, however, products like sanitary care and hair removal is inappropriate), ads or sponsored content for consumable food or drink products (regardless of nutritional content), where the product is consumed by a person or shown with branding, packaging, or promotional intent are inappropriate; general content like cooking shows, recipes, or scenes where food is simply present or being eaten without promotion are acceptable, pet/animal food products and healthcare, healthcare or medical topics (reproductive health, mental health, substance abuse, eating disorders, pharmaceuticals, health insurance, any nutritional or dietary supplements), sensitive topics (politics, religion, controversial social issues, astrology, occult, paranormal, religious sermons), privacy or safety risks (services or competitions requiring personal information, social networks for strangers, spy services, or video game skin trading), or any explicit depictions or discussions of death or dying. Consider that you are an expert content moderator. Based on the provided audio transcription, is the video advertisement inappropriate for children on this definition? If it is, label it as “inappropriate.” If it is not, label it as “appropriate.” Justify your conclusion. Also, mention what language is being spoken if there is any. 
Respond in **JSON format only**. Do **not** include any commentary.

Return only this format (no preamble):

{
  "label": "appropriate" or "inappropriate",
  "justification": "brief reason",
  "language": "language spoken in the video"
}"""

# Display the prompt as the cell output.
prompt 

Run Inference

In [None]:
# Show how many videos are in the evaluation list.
len(video_ids)

In [None]:
# Half-open index range [start, end) into video_ids that this run processes.
start = 0
# Clamp the upper bound to the data actually loaded, so a smaller dataset
# cannot cause an IndexError in the inference loop.
end = min(2306, len(video_ids))

In [None]:
# Result accumulators; the first five lists stay index-aligned with each other.
ids = []
labels = []
responses = []
predicted_labels = []
languages = []
remaining = []         # ids with no usable prediction (request error or bad reply)
failed_json_logs = []  # To store tuples of (video_id, raw_response)

_VALID_LABELS = ('appropriate', 'inappropriate')


def _parse_model_json(raw_text):
    """Recover the JSON object from a model reply.

    Tries the text as-is (after removing a leading ```json fence), then falls
    back to the outermost ``{...}`` span found with a regular expression.

    Returns:
        ``(parsed_dict, json_text)``, where ``json_text`` is the string that
        was successfully parsed.

    Raises:
        ValueError: If no JSON object can be recovered from the reply.
    """
    cleaned = raw_text
    if cleaned.startswith("```json"):
        cleaned = cleaned.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as direct_error:
        print(f"Direct JSON parsing failed: {direct_error}")
        print("Attempting regex-based extraction...")

        json_match = re.search(r'\{[\s\S]*\}', cleaned)
        if json_match is None:
            raise ValueError("No JSON found in response.") from direct_error

        cleaned = json_match.group(0)
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError as regex_error:
            raise ValueError(
                f"Regex-based JSON parsing error: {regex_error}\nExtracted JSON:\n{cleaned}"
            ) from regex_error

    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}.")
    return parsed, cleaned


def _extract_label(parsed):
    """Return the normalised label from a parsed reply, or raise ValueError.

    Accepts the 'label' key (or 'classification' as a fallback), ignoring case
    and surrounding whitespace. Anything other than the two known labels is
    rejected so it cannot be silently scored as "appropriate" later.
    """
    label = parsed.get('label') or parsed.get('classification')
    if isinstance(label, str):
        label = label.strip().lower()
    if label not in _VALID_LABELS:
        raise ValueError(f"Unusable label in response: {label!r}")
    return label


# Set lookup instead of scanning the directory listing for every video.
_available_id_set = set(available_ids)

# The upper bound is clamped so an `end` beyond the data cannot raise IndexError.
for i in range(start, min(end, len(video_ids))):
    temp_id = video_ids[i]
    if temp_id not in _available_id_set:
        continue

    try:
        print(i)

        audio_transcription = transcriptions[i]

        # Make inference
        print("Making inference for i", i)
        classification_response = classify_video_with_text(prompt, audio_transcription)
        print("classification_response", classification_response)

        # Extract text from response
        raw_text = classification_response.content[0].text.strip()
        print("Raw response text:", temp_id, repr(raw_text))

        try:
            parsed, cleaned_text = _parse_model_json(raw_text)
            pred_temp = _extract_label(parsed)
        except ValueError as parse_error:
            # Unusable reply: keep the raw text for inspection and mark for retry.
            print(parse_error)
            remaining.append(temp_id)
            failed_json_logs.append((temp_id, raw_text))
        else:
            lang_temp = parsed.get('language')
            temp_label = primary_labels[i]

            print(f"Id: {temp_id}. Primary Label: {temp_label}\nParsed Response: {parsed}")

            ids.append(temp_id)
            labels.append(temp_label)
            responses.append(cleaned_text)
            predicted_labels.append(pred_temp)
            languages.append(lang_temp)

    except Exception as e:
        # Best effort: any other failure (e.g. the API call) marks the video for retry.
        print('failed for ', i)
        print("Error", e)
        remaining.append(temp_id)

    # Fixed pause after every attempted request, including ones whose reply
    # could not be parsed (presumably for rate limiting).
    time.sleep(30)

print("Remaining video with errors", len(remaining))

In [None]:
# Output directory for every CSV written below. exist_ok makes the cell safe
# to re-run and avoids the check-then-create race.
results_dir = '/kaggle/working/results'
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Persist the ids that produced no usable prediction, then report them.
remaining_df = pd.DataFrame(remaining, columns=['Video Id'])
remaining_df.to_csv('/kaggle/working/results/remaining.csv', index=False)

print("Remaining videos saved to remaining.csv")
print("Remaining videos with errors:", remaining)

In [None]:
# List ground truth next to the model prediction for every scored video.
for true_label, predicted in zip(labels, predicted_labels):
    print('True Label: ', true_label, '\tPrediction: ', predicted)

In [None]:
# Assemble the per-video results table from the aligned accumulator lists.
result_columns = {}
result_columns['Video Id'] = ids
result_columns['Primary Label'] = labels
result_columns['Predicted Label'] = predicted_labels
result_columns['Response'] = responses
result_columns['Languages'] = languages

new_df = pd.DataFrame(result_columns)

new_df.head()

In [None]:
# Write the per-video predictions to a CSV whose name records the start/end index range.
new_df.to_csv(f'/kaggle/working/results/claude-eng-audio-{start}-{end}.csv', index=False)

In [None]:
# Changing to binary lists 
predictions = [1 if pred == 'inappropriate' else 0 for pred in predicted_labels] 
ground_truths = [1 if label == 'inappropriate' else 0 for label in labels] 

In [None]:
# Obtaining classification report 
from sklearn.metrics import classification_report 
report = classification_report(ground_truths, predictions) 
print(report) 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Class order matches the binary encoding: index 0 = appropriate, 1 = inappropriate.
class_names = ['Appropriate', 'Inappropriate']

# Passing labels=[0, 1] keeps the matrix 2x2 even when only one class occurs in
# the data, so the tick labels below always line up with the cells.
cm = confusion_matrix(ground_truths, predictions, labels=[0, 1])

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()