<a href="https://colab.research.google.com/github/YinterestingProjects/human-wildlife-interactions/blob/main/API_crawl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# api installs
!pip install --upgrade google-api-python-client
# !pip install --upgrade google-auth-oauthlib google-auth-httplib2 # not using oauth currently but leaving just in case someone wants to change that

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# necessary imports
import pandas as pd
import googleapiclient.discovery
import googleapiclient.errors
from google.colab import drive
import requests
import re
import os
import time
import json

In [3]:
# get list of ids from yt8m api
def get_entity_videoIds(entity_name, timeout=30):
  '''Get the list of YT8M training-set video ids tagged with an entity name.

  The entity name is translated to its id through the module-level
  `entity2id` mapping, the per-entity listing is downloaded from the YT8M
  data bucket, and the video ids are extracted from the response text.

  Args:
    entity_name: key of `entity2id` (e.g. 'Wildlife').
    timeout: seconds to wait on the HTTP request. Without a timeout a
      stalled connection would block the notebook indefinitely.

  Returns:
    list of str: the dataset-specific video ids found in the listing.

  Raises:
    KeyError: if `entity_name` is not a key of `entity2id`.
    requests.HTTPError: if the server answers with an error status.
    requests.Timeout: if the server does not answer within `timeout`.
  '''
  entity_id = entity2id[entity_name]

  url = f'https://storage.googleapis.com/data.yt8m.org/2/j/v/{entity_id}.js'
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()

  # Tokenise the payload into word-character runs; the video ids start at
  # index 2, so the first two tokens are dropped.
  ids = re.findall(r'\w+', response.text)[2:]
  print(f'{entity_name}({entity_id}): {len(ids)} videos found')

  return ids

In [4]:
# convert from id to url
def generate_url(id, timeout=30):
  '''Convert a Youtube8M dataset-specific video id to the true YouTube id and url.

  Args:
    id: dataset-specific video id. Its first two characters are also used
      as a path component of the lookup url.
    timeout: seconds to wait on the HTTP request before giving up.

  Returns:
    tuple (yt_id, url): the YouTube catalog id and the matching watch url.

  Raises:
    requests.HTTPError: if the lookup service answers with an error status.
    requests.Timeout: if the service does not answer within `timeout`.
    IndexError: if the response body does not contain the expected
      double-quoted fields.
  '''
  call_str = f'http://data.yt8m.org/2/j/i/{id[:2]}/{id}.js'
  res = requests.get(call_str, timeout=timeout)
  # Fail loudly on an error status: otherwise an error page that happens to
  # contain double quotes would be parsed and returned as a "YouTube id".
  res.raise_for_status()

  # The YouTube id is the fourth piece when the body is split on double quotes.
  res_lst = res.text.split('"')
  yt_id = res_lst[3]
  url = f'https://www.youtube.com/watch?v={yt_id}'

  return yt_id, url

In [5]:
# get video data (currently grabs more than we need)
def video_data_grabber(id, api_key):
  '''Fetch the YouTube Data API v3 `videos` resource for a single video.

  Args:
    id: YouTube video id to look up.
    api_key: API key handed to the client as `developerKey`.

  Returns:
    The response returned by `videos().list(...).execute()`.

  Raises:
    Whatever the API client raises for a failed request (for
    google-api-python-client this is normally
    `googleapiclient.errors.HttpError`); nothing is caught here.
  '''
  api_service_name = "youtube"
  api_version = "v3"

  youtube = googleapiclient.discovery.build(
      api_service_name, api_version, developerKey=api_key
      )
  request = youtube.videos().list(
    part="contentDetails,  id, liveStreamingDetails, localizations, player,  recordingDetails, snippet, statistics, status, topicDetails",
    # Look up the id that was passed in. This used to read a global named
    # `real_id`, so the function only worked when such a variable happened
    # to exist in the kernel, and it ignored its own argument.
    id=id
  )

  response = request.execute()

  return response

In [6]:
# get comments
def comment_grabber(id, api_key):
  '''Fetch comment threads for a single YouTube video.

  Args:
    id: YouTube video id whose comment threads are requested.
    api_key: API key handed to the client as `developerKey`.

  Returns:
    The response returned by `commentThreads().list(...).execute()`.
    Only this single response is returned; no further pages are requested.

  Raises:
    Whatever the API client raises for a failed request (for
    google-api-python-client this is normally
    `googleapiclient.errors.HttpError`); nothing is caught here.
  '''
  api_service_name = "youtube"
  api_version = "v3"

  youtube = googleapiclient.discovery.build(
  api_service_name, api_version, developerKey=api_key
  )

  request = youtube.commentThreads().list(
      part="id, snippet",
      # `videoId` selects the threads of a video. The `id` filter used
      # before expects comment-thread ids, so passing a video id there
      # never matched the video's comments.
      videoId=id
  )

  response = request.execute()

  return response

In [7]:

# write results to file
def file_writer(storage_dict, cur_count, base_path="/content/drive/MyDrive/API_output"):
  '''Serialise one batch of results to `<base_path>/batch_<cur_count>.txt` as JSON.

  Args:
    storage_dict: JSON-serialisable dict holding the batch.
    cur_count: batch number, used in the file name.
    base_path: output directory. It is created if it does not exist, so the
      first write no longer fails when the folder is missing.

  Returns:
    None. The written path is printed.

  Raises:
    TypeError: if `storage_dict` contains values `json` cannot serialise.
    OSError: if the directory or file cannot be created.
  '''
  os.makedirs(base_path, exist_ok=True)
  file_name = "batch_" + str(cur_count) + ".txt"
  full_path = os.path.join(base_path, file_name)
  # "w" truncates any previous file of the same name; the file is never
  # read back here, so read access ("w+") is not needed.
  with open(full_path, "w") as f:
    json.dump(storage_dict, f)
  print("Wrote {}".format(full_path))

In [10]:
# main

# crawl settings
ENTITY_NAME = 'Wildlife'
PAUSE_EVERY = 100    # lookups between pauses
PAUSE_SECONDS = 60   # length of each pause, just in case there is a rate limit
BATCH_SIZE = 400     # lookups per output file


def _read_api_key(path):
  '''Return the last non-blank line of the key file, stripped of whitespace.'''
  with open(path) as key_file:
    key_lines = [line.strip() for line in key_file if line.strip()]
  # A trailing newline (or blank last line) must not end up in the key.
  return key_lines[-1]


# mount drive
drive.mount('/content/drive')

# api keys (read from Drive so they never appear in the notebook itself)
api_key1 = _read_api_key("/content/drive/MyDrive/youtubeapi.txt")
api_key2 = _read_api_key("/content/drive/MyDrive/youtubeapi2.txt")


# get all of the video ids
new_url = 'https://research.google.com/youtube8m/csv/2/vocabulary.csv'
new_vocab = pd.read_csv(new_url)
animal_df = new_vocab[(new_vocab.Vertical1 == 'Pets & Animals') | (new_vocab.Vertical2 == 'Pets & Animals')] # Pets & Animal only present in V1&2
summary_df = animal_df.groupby(['Name','KnowledgeGraphId']).agg({'TrainVideoCount':'sum'}).reset_index()
# entity name -> KnowledgeGraphId with its first three characters removed
entity2id = dict(zip(summary_df.Name, summary_df.KnowledgeGraphId.str[3:]))
videoIds = get_entity_videoIds(ENTITY_NAME)

# go through all values of videoIds and get the video information and comments list
# failed lookups are recorded so we can pass back through them later
storage_dict = {}
failure_list = []
count = 1
for idx, video_id in enumerate(videoIds):
  try:
    real_id, vid_url = generate_url(video_id)
  except Exception:
    # Without a real id there is nothing to look up. Record the failure and
    # skip the API calls; falling through would reuse the previous
    # iteration's real_id/vid_url and store another video's data here.
    failure_list.append((video_id, idx))
  else:
    # `except Exception` (not a bare except) keeps lookups best-effort while
    # still letting KeyboardInterrupt stop the crawl.
    try:
      video_details = video_data_grabber(real_id, api_key1)
    except Exception:
      video_details = "Invalid lookup."
    try:
      video_comments = comment_grabber(real_id, api_key2)
    except Exception:
      video_comments = "Invalid lookup"
    # create sub dictionary
    storage_dict[video_id] = {"real_id":real_id, "url":vid_url,
                              "details":video_details,"comments":video_comments}

  # sleep after every PAUSE_EVERY lookups just in case there is a rate limit
  if idx % PAUSE_EVERY == 0 and idx != 0:
    time.sleep(PAUSE_SECONDS)

  # write to file every BATCH_SIZE lookups; counting from idx + 1 keeps the
  # first batch the same size as the rest
  if (idx + 1) % BATCH_SIZE == 0:
    file_writer(storage_dict, count)
    # reset storage_dict
    storage_dict = {}
    count += 1

# write whatever is left over (skipped when the last batch was already flushed)
if storage_dict:
  file_writer(storage_dict, count)

print("{} ids could not be converted to a YouTube id (see failure_list)".format(len(failure_list)))
  

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Wildlife(01280g): 4243 videos found
Wrote /content/drive/MyDrive/API_output/batch_1.txt
Wrote /content/drive/MyDrive/API_output/batch_2.txt
Wrote /content/drive/MyDrive/API_output/batch_3.txt
Wrote /content/drive/MyDrive/API_output/batch_4.txt
Wrote /content/drive/MyDrive/API_output/batch_5.txt
Wrote /content/drive/MyDrive/API_output/batch_6.txt
Wrote /content/drive/MyDrive/API_output/batch_7.txt
Wrote /content/drive/MyDrive/API_output/batch_8.txt
Wrote /content/drive/MyDrive/API_output/batch_9.txt
Wrote /content/drive/MyDrive/API_output/batch_10.txt
Wrote /content/drive/MyDrive/API_output/batch_11.txt
