Notes: perhaps we can improve model performance on the validation set by removing images with purely neutral ratings. The thinking is that neutral ratings may dilute the "positive" and "negative" classes, making the decision boundary less clear.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

FOLDERNAME = "/content/drive/MyDrive/Deep Learning/DL_Project/"
# FOLDERNAME = # update your folder name here
assert FOLDERNAME is not None, "[!] Enter the foldername."

import sys
# FOLDERNAME is already an absolute path, so it goes on the import path as-is.
# Formatting it into '/content/drive/My Drive/{}' (as before) produced
# '/content/drive/My Drive//content/drive/MyDrive/...', which does not exist.
if FOLDERNAME not in sys.path:
    sys.path.append(FOLDERNAME)

In [None]:
# !pip install google-api-python-client requests

In [None]:
from googleapiclient.discovery import build
import requests
import os

# API key to grab samples of YouTube thumbnails with age-restricted content.
# Read it from the YOUTUBE_API_KEY environment variable when available so the
# real key never has to be typed into (and saved with) the notebook; the
# placeholder is only used when the variable is not set.
API_key = os.environ.get("YOUTUBE_API_KEY", "ADD YOUR API KEY HERE")

# config
SAVE_DIR = "thumbnails"
os.makedirs(SAVE_DIR, exist_ok=True)

# set up YT client
youtube = build("youtube", "v3", developerKey=API_key)


## Age-restricted videos sample

In [None]:

# trial and error with search terms to get restricted content
queries_mature = ["war footage", "17+ content", "explicit", "blood and gore", "deadly footage", "gun violence", "evil clown"]
# Also tried the terms below; none of them are restricted:
#queries_radesky = ["PewDiePie", "Fortnite", "DanTDM", "Minecraft", "MrBeast", "FGTeeV", "Flamingo", "memes", "unspeakable", "try not to laugh", "Roblox", "SML"]

# search.list only accepts maxResults between 0 and 50, so ask for the largest valid page
MAX_SEARCH_RESULTS = 50

age_restricted_videos = {}

for q in queries_mature:
  # search for videos
  response = youtube.search().list(
      q=q,
      part="id,snippet",
      type="video",
      maxResults=MAX_SEARCH_RESULTS
  ).execute()

  # collect title/thumbnail per video id, keeping the order of the search results
  candidates = {}
  for item in response.get('items', []):
    video_id = item['id'].get("videoId")
    if not video_id:
      # without an id there is nothing to look up
      continue
    candidates[video_id] = {
        "title": item['snippet'].get("title", ""),
        "thumbnail": item['snippet']['thumbnails']['high']['url']
    }
  if not candidates:
    continue

  # get content ratings for the whole page with ONE videos.list call
  # (the id parameter takes a comma-separated list) instead of one call per video
  video_response = youtube.videos().list(
      part="contentDetails",
      id=",".join(candidates)
  ).execute()
  content_ratings = {
      video_data["id"]: video_data["contentDetails"].get("contentRating", {})
      for video_data in video_response.get("items", [])
  }

  for video_id, info in candidates.items():
    # ids missing from the videos.list response have no rating and are skipped
    age_restricted = content_ratings.get(video_id, {}).get("ytRating") == "ytAgeRestricted"

    if age_restricted:
      print(f"Title: {info['title']}")

      # add to collection
      age_restricted_videos[video_id] = info


In [None]:
# number of age-restricted videos collected by the search loop
len(age_restricted_videos)

One other idea is to generate valence/arousal ratings via NLP on the descriptions but that is also exploratory so let's stick with providing the ratings ourselves!

## Nonrestricted videos sample

In [None]:
# For class balance, collect roughly 30 thumbnails of uncensored, upbeat content.
# Search terms are our own, mixed with some of the terms referenced by Radesky et al.
queries = ["memes", "funny cat videos", "try not to laugh", "Roblox", "travel vlog", "Fortnite"]
nonrestricted_videos = {}

for term in queries:
  search_hits = youtube.search().list(
      q=term,
      part="id,snippet",
      type="video",
      maxResults=5).execute()

  # look up the content rating of every hit and keep only the unrestricted ones
  for hit in search_hits['items']:
    video_id = hit['id'].get("videoId", {})
    title = hit['snippet'].get("title", {})
    thumbnail = hit['snippet']['thumbnails']['high']['url']

    details = youtube.videos().list(
        part="contentDetails,snippet",
        id=video_id).execute()
    if not details["items"]:
      continue

    rating = details["items"][0]["contentDetails"].get("contentRating", {})
    if rating.get("ytRating") == "ytAgeRestricted":
      # restricted videos belong to the other sample
      continue

    print(f"Title: {title}")
    nonrestricted_videos[video_id] = {
        "title": title,
        "thumbnail": thumbnail
    }



In [None]:
# number of non-restricted videos collected by the search loop
print(len(nonrestricted_videos))
# nonrestricted_videos  (uncomment to inspect the whole dict)

# Module for team members to provide valence and arousal ratings

In [None]:
from IPython.display import display
import ipywidgets as widgets
import requests
import os
import pandas as pd
from io import BytesIO
import random

## 1. Practice assigning valence and arousal ratings

In [None]:
# NAPS_H subfolder of the NAPS data (the high-quality image set, per the original note).
# FOLDERNAME is the Google Drive project path defined in the first cell.
NAPS_DIR = FOLDERNAME + "NAPS_data/NAPS_H"
print(NAPS_DIR)

In [None]:
def load_file(filepath,file_types):
  """Recursively collect the files under ``filepath`` that end with a given suffix.

  Args:
    filepath: Root directory; it and all of its subdirectories are searched.
    file_types: One suffix (e.g. ``".jpg"``) or an iterable of suffixes
      (e.g. ``[".jpg", ".png"]``). Matching uses ``str.endswith`` on the file
      name, so ``"table.csv"`` works as a suffix too.

  Returns:
    Sorted list of full paths of the matching files (empty if none match).
  """
  # A bare string is ONE suffix. Iterating over it would test every character
  # separately (".csv" -> ".", "c", "s", "v") and match unrelated files.
  if isinstance(file_types, str):
    suffixes = (file_types,)
  else:
    suffixes = tuple(file_types)

  all_files = []

  # os.walk visits the root and every nested directory
  for root, dirs, files in os.walk(filepath):
    all_files.extend(
        os.path.join(root, file) for file in files if file.endswith(suffixes)
    )

  # os.walk order depends on the filesystem; sorting makes the result (and any
  # seeded sampling done on it) the same on every machine
  return sorted(all_files)

In [None]:
# Locate the NAPS ratings table and every NAPS image on Drive
NAPS_path = FOLDERNAME + '/NAPS_data'
mapping_path_NAPS = load_file(NAPS_path,["table.csv"])[0] # first file whose name ends in table.csv
NAPS_data = pd.DataFrame(pd.read_csv(mapping_path_NAPS)).assign(source = "NAPS")
list1 = load_file(NAPS_path,[".jpg"]) # paths of all NAPS .jpg images

# Same approach as in the other file: one row per image path, keyed by the
# file name without its extension, inner-joined to the ratings table on ID.
image_data = (
    pd.DataFrame(list1, columns = ["image_path"])
    .assign(image = lambda frame: frame["image_path"].apply(lambda x: x.split("/")[-1].split(".")[-2]))
    .merge(NAPS_data, left_on = "image", right_on = "ID", how = "inner")
    .drop(["image"], axis = 1) # the key duplicates ID after the merge
)

In [None]:
# subsample image_data for the practice round (fixed random_state for repeatability)
image_sample = image_data.sample(n=30, random_state=42)
image_sample = image_sample.reset_index(drop=True)
# Select just the cols we need, by NAME rather than by position (was iloc[:, [0,1,6,10]]),
# so a change in the column order of the NAPS table cannot silently pick other columns.
# These are exactly the four columns the next cell reads.
sample = image_sample[["image_path", "ID", "All_ValenceM", "All_ArousalM"]]
# sample

In [None]:
# Convert each sampled row into the entry dict format the annotation widget reads.
# Rows are addressed by label 0..n-1, which works because the index was reset above.
formatted_entries = [
    {
        "id": sample["ID"][row],
        "source_type": "local",
        "path": sample["image_path"][row],
        "true_valence": sample["All_ValenceM"][row],
        "true_arousal": sample["All_ArousalM"][row]
    }
    for row in range(len(sample["ID"]))
]


In [None]:
# update_image() and on_next() read the global `all_images`, so point it at the practice entries
all_images = formatted_entries

In [None]:
import time
# Define helper functions

# CSV that on_next() writes the collected ratings to after the last image.
# Update the file name before a new session so an earlier file is not overwritten.
save_path = FOLDERNAME + "annotations.csv"

# different approach to load depending on source
def load_image(entry):
    """Return the raw (still encoded) image bytes for one annotation entry.

    Args:
        entry: dict with a "source_type" key. "local" entries are read from
            the file at entry["path"]; "url" entries are downloaded from
            entry["url"].

    Returns:
        bytes: the image data, as assigned to ``img_widget.value``.

    Raises:
        ValueError: if "source_type" is neither "local" nor "url" (previously
            this fell through and returned None).
    """
    source_type = entry["source_type"]
    if source_type == "local":
        with open(entry["path"], "rb") as f:
            return f.read()
    if source_type == "url":
        # timeout so a stalled thumbnail request cannot hang the widget forever
        response = requests.get(entry["url"], timeout=10)
        # response.content is already bytes; no BytesIO round-trip needed
        return response.content
    raise ValueError(f"Unknown source_type: {source_type!r}")

# show the image at the current position
def update_image():
    """Load the entry at ``index["i"]`` into the image widget and refresh the progress label.

    Reads the globals ``all_images`` and ``index``; writes ``img_widget.value``
    and ``progress_label.value``.
    """
    position = index["i"]
    img_widget.value = load_image(all_images[position])
    progress_label.value = f"Image {position+1} of {len(all_images)}"

# assign annotation and move on to next # can only take "b" = "button" as argument
def on_next(b):
    """Button callback: record the slider ratings for the current image and advance.

    Uses the globals ``all_images``, ``index``, ``annotations``,
    ``valence_slider``, ``arousal_slider`` and ``save_path``. After the last
    image has been rated, every annotation is written to ``save_path`` as CSV.

    Args:
        b: the clicked button (passed in by ``on_click``; not used).
    """
    # Ignore clicks made after the last image was rated; without this guard the
    # lookup below raises IndexError once the session is finished.
    if index["i"] >= len(all_images):
        return

    entry = all_images[index["i"]]
    annotations.append({
        "id": entry["id"],
        "valence": valence_slider.value,
        "arousal": arousal_slider.value,
        "true_valence": entry.get("true_valence"),
        "true_arousal": entry.get("true_arousal")
    })

    # only entries that carry reference ratings (the practice set) show them
    if "true_valence" in entry:
        print(f"True ratings → Valence: {entry['true_valence']:.2f}, Arousal: {entry['true_arousal']:.2f}")
        # add a pause so user can read it before updating image
        time.sleep(2.5)

    index["i"] += 1
    if index["i"] < len(all_images):
        update_image()
    else:
        df = pd.DataFrame(annotations)
        df.to_csv(save_path, index=False)
        # report the file that was actually written (the old message named a different file)
        print(f"Annotation complete-- Saved to {save_path}")

In [None]:
# Configure widget
img_widget = widgets.Image(format='jpg', width=400, height=300)
title_box = widgets.HTML()
# description_width='initial' lets the label take the width it needs; at the
# default width the long valence description is cut off. The wider layout
# leaves room for the slider itself next to the label.
valence_slider = widgets.IntSlider(
    min=1, max=9,
    description="Valence (low is negative, high is positive):",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='600px'))
arousal_slider = widgets.IntSlider(
    min=1, max=9,
    description="Arousal:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='600px'))
button_next = widgets.Button(description="Next image")
progress_label = widgets.Label()

display(title_box, img_widget, valence_slider, arousal_slider, button_next, progress_label)

# fresh state for this session: collected ratings and position in all_images
annotations = []
index = {"i": 0}


button_next.on_click(on_next)
update_image()



## 2. Assign valence and arousal ratings

In [None]:
# 1. Local images: every .jpg/.png in the Radesky folder on Drive
LOCAL_DIR = FOLDERNAME + "Radesky_YT_Dataset" # FOLDERNAME is the Drive project path from the first cell
local_files = []
for fname in sorted(os.listdir(LOCAL_DIR)):
    if not fname.endswith(('.jpg', '.png')):
        continue
    local_files.append({
        "id": os.path.splitext(fname)[0],
        "title": f"Local image: {fname}",
        "source_type": "local",
        "path": os.path.join(LOCAL_DIR, fname),
    })

# 2. URL-based images from the dictionaries created via the YT API
# (on a shared video id the age-restricted entry wins, as with the | operator)
image_dict = {**nonrestricted_videos, **age_restricted_videos}

url_images = []
for vid, data in image_dict.items():
    url_images.append({
        "id": vid,
        "title": data["title"],
        "source_type": "url",
        "url": data["thumbnail"],
    })

# Both sources in one list, shuffled with a fixed seed so the order is repeatable
all_images = local_files + url_images
random.seed(303)
random.shuffle(all_images)
print(f"Total images to annotate: {len(all_images)}")

# Configure widget
img_widget = widgets.Image(format='jpg', width=400, height=300)
title_box = widgets.HTML()
# description_width='initial' lets the label take the width it needs; at the
# default width the long valence description is cut off. The wider layout
# leaves room for the slider itself next to the label.
valence_slider = widgets.IntSlider(
    min=1, max=9,
    description="Valence (low is negative, high is positive):",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='600px'))
arousal_slider = widgets.IntSlider(
    min=1, max=9,
    description="Arousal:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='600px'))
button_next = widgets.Button(description="Next image")
progress_label = widgets.Label()

display(title_box, img_widget, valence_slider, arousal_slider, button_next, progress_label)

# fresh state for this session: collected ratings and position in all_images
annotations = []
index = {"i": 0}

button_next.on_click(on_next)
update_image()



In a way, this exercise is making me think we could have a class imbalance problem, because really we're looking for low-valence, high-arousal thumbnails to censor, and those are not equally represented in the dataset. I feel like many are neutral. It is something we can try to mitigate, and either way we can write about it in our paper.

In [None]:
# turn the list of image-entry dicts into a DataFrame (one row per image)
YT_images_df = pd.DataFrame(all_images)

In [None]:
# save df to google drive
# index=False keeps the row index out of the file (so no "Unnamed: 0" column
# appears when it is read back), like the other CSV writes in this notebook.
YT_images_df.to_csv(FOLDERNAME + "YT_images.csv", index=False)
# a bare expression only renders when it is the LAST line of a cell, so the preview goes here
YT_images_df.head()

In [None]:
# display the Drive project folder path
FOLDERNAME

# Now let's pull in the total ratings

In [None]:
def load_paths(filepath,file_types):
  """Recursively list the files under ``filepath`` whose names end with a given suffix.

  Args:
    filepath: Root directory; it and all of its subdirectories are searched.
    file_types: One suffix (e.g. ``".csv"``) or an iterable of suffixes
      (e.g. ``[".jpg", ".png"]``), matched with ``str.endswith``.

  Returns:
    Sorted list of full paths of the matching files (empty if none match).
  """
  # Treat a bare string as a single suffix. Looping over it character by
  # character (".csv" -> ".", "c", "s", "v") would match any file whose name
  # merely ends in one of those characters.
  suffixes = (file_types,) if isinstance(file_types, str) else tuple(file_types)

  #Creating a list that returns all filepaths
  all_files = []

  #os.walk goes through each file and each directory. recursively joins all
  for root,dirs,files in os.walk(filepath):
    for file in files:
      if file.endswith(suffixes):
        all_files.append(os.path.join(root,file))

  # sorted so the order does not depend on filesystem traversal order
  all_files.sort()
  return all_files

In [None]:
# Pass the suffix in a list so ".csv" is matched as a whole suffix.
annotation_files = load_paths(FOLDERNAME, [".csv"])
# keep the per-annotator files, skip previously written final_* outputs
annotation_files = sorted(x for x in annotation_files if 'annotations' in x and 'final' not in x)
if not annotation_files:
  raise FileNotFoundError(f"No annotation CSV files found under {FOLDERNAME}")

# Stack every file's ratings and take the per-image mean.
# The earlier running-sum divided by the number of files, which under-counts
# any id missing from some file and left the other numeric columns
# (true_valence / true_arousal) as sums instead of averages.
all_ratings = pd.concat([pd.read_csv(link) for link in annotation_files], ignore_index=True)
results = all_ratings.groupby("id", as_index=False).mean(numeric_only=True)

print(results.head())

# Separate name on purpose: on_next() writes to the global `save_path`, so
# reassigning that here would redirect later annotation sessions to this file.
final_save_path = FOLDERNAME + "final_annotations.csv"

results.to_csv(final_save_path, index=False)
print("Annotation complete-- Saved to final_annotations.csv")

In [None]:
import requests
import pandas as pd # Import pandas

# load the YT image table written earlier in this notebook (YT_images.csv)
yt_dataset_path = FOLDERNAME + "YT_images.csv"
ytdf = pd.read_csv(yt_dataset_path) # one row per image entry; the download loop reads its "id" and "url" columns

# define fx to download images from url
def download_image(image_url, save_path):
    """
    Downloads an image from a given URL and saves it to a specified path.

    Failures are reported with a printed message rather than raised, so a loop
    over many URLs keeps going after a bad one.

    Args:
        image_url (str): The URL of the image to download.
        save_path (str): The local path where the image will be saved,
                         including the filename and extension (e.g., "my_image.jpg").

    Returns:
        bool: True if the image was saved, False if the download or write failed.
    """
    started_writing = False
    try:
        # timeout: a stalled server must not hang the notebook;
        # `with`: a streamed response holds its connection until it is closed
        with requests.get(image_url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            with open(save_path, 'wb') as file:
                started_writing = True
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading image: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    else:
        print(f"Image downloaded successfully to: {save_path}")
        return True

    # Failure path: don't leave a truncated file that later code would load as an image
    if started_writing and os.path.exists(save_path):
        os.remove(save_path)
    return False

In [None]:
save_location_folder = FOLDERNAME + "YT_images_exploratory/"
# make sure the target folder exists before writing into it
os.makedirs(save_location_folder, exist_ok=True)

# Iterate over the DataFrame, not the string path.
# The loop variable is NOT called `index`: that name is the global dict the
# annotation widget callbacks use, and overwriting it breaks them.
for row_idx, row in ytdf.iterrows():
  image_url = row["url"]
  if pd.isna(image_url):
    # local images have no URL in the table; nothing to download
    continue
  # Ensure 'id' column contains valid names for filenames
  image_id = row["id"]
  save_location = save_location_folder + str(image_id) + ".jpg"
  if os.path.exists(save_location):
    # the data is actually in drive already -> skip instead of re-downloading
    continue
  download_image(image_url, save_location)

# combine all exploratory images into one list of file paths
# (named image_folders, not `all_images`, which the annotation widget reads)
radesky_images_folder = FOLDERNAME + "Radesky_YT_Dataset"
image_folders = [save_location_folder, radesky_images_folder]

final_images = []
for folder in image_folders:
  final_images.extend(load_file(folder, [".jpg",".png"]))

print(f"Number of images: {len(final_images)}")