In [1]:
%load_ext autoreload
%autoreload 2

In [69]:
import os
from os.path import join, basename
from tqdm import tqdm

from googleapiclient.discovery import build

from utils.io import load_youtube_api

In [5]:
api = load_youtube_api()

### Access `Cricket Australia` channel

In [7]:
channel_url = "https://www.youtube.com/channel/UCkBY0aHJP9BwjZLDYxAQrKg"

In [9]:
channel_id = basename(channel_url)

In [10]:
channel_id

'UCkBY0aHJP9BwjZLDYxAQrKg'

In [11]:
api_service_name = "youtube"
api_version = "v3"

In [12]:
youtube = build(api_service_name, api_version, developerKey=api)

### Playground

In [42]:
request = youtube.channels().list(part="contentDetails",id=channel_id)
# request = youtube.channels().list(part="contentDetails", forUsername="Cricket Australia")

In [43]:
response = request.execute()

In [44]:
response

{'kind': 'youtube#channelListResponse',
 'etag': 'TXUYsCt4yxEkjVYiQ-wpgHkvFWw',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'PpKv4TgmAoNS77VTeBYYywSl-8I',
   'id': 'UCkBY0aHJP9BwjZLDYxAQrKg',
   'contentDetails': {'relatedPlaylists': {'likes': '',
     'uploads': 'UUkBY0aHJP9BwjZLDYxAQrKg'}}}]}

In [46]:
uploads_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

In [47]:
uploads_id

'UUkBY0aHJP9BwjZLDYxAQrKg'

In [63]:
uploads_id

'UUkBY0aHJP9BwjZLDYxAQrKg'

In [77]:
request = youtube.playlistItems().list(part="snippet", playlistId=uploads_id, maxResults=100)
response = request.execute()

### Extract all videos from a channel

In [78]:
scraped = []

# scrape the first 50 videos
request = youtube.playlistItems().list(
    part="snippet",
    playlistId=uploads_id,
    maxResults=50,
)
response = request.execute()
scraped.extend(response["items"])

total_results = response["pageInfo"]["totalResults"]

# keep scraping till you cover all videos
pbar = tqdm(
    total=(total_results // 50) + 1,
    desc="Extracting all videos of a channel",
    bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',
)
while len(scraped) < total_results:
    request = youtube.playlistItems().list(
        part="snippet",
        playlistId=uploads_id,
        maxResults=50,
        pageToken=response["nextPageToken"],
    )
    response = request.execute()
    scraped.extend(response["items"])
    
    pbar.update(1)

pbar.close()

Extracting all videos of a channel: 100%|█████████▉| 212/213 [00:16<00:00, 12.60it/s]                                                   


### Get video metadata for all scrapped items

In [92]:
video_ids = []
for i in tqdm(range(len(scraped)), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',):
    video_id = scraped[i]["snippet"]["resourceId"]["videoId"]
    video_ids.append(video_id)

100%|██████████| 10615/10615 [00:00<00:00, 30881.09it/s]                                                                                


In [104]:
start = 0
delta = 50
videos = []

pbar = tqdm(
    total=(total_results // delta) + 1,
    desc="Extracting all videos",
    bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',
)
while len(videos) < len(video_ids):
    request = youtube.videos().list(
        part="snippet,statistics,contentDetails",
        id=video_ids[start: start + delta],
    )
    response = request.execute()
    videos.extend(response["items"])
    
    start += delta
    pbar.update(1)

pbar.close()

Extracting all videos: 100%|██████████| 213/213 [01:05<00:00,  3.25it/s]                                                                


In [105]:
len(videos)

10615

In [106]:
videos[0]

{'kind': 'youtube#video',
 'etag': 'dfExaqa8WOjs3Q9neUo-_x7hiVI',
 'id': 'p-K9gVSTQV4',
 'snippet': {'publishedAt': '2012-12-02T10:56:33Z',
  'channelId': 'UCkBY0aHJP9BwjZLDYxAQrKg',
  'title': 'Mickey Arthur post match - Dec 2nd',
  'description': 'Mickey Arthur chats to CATV after day 3 of the Vodafone Test against South Africa at the WACA.',
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/p-K9gVSTQV4/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/p-K9gVSTQV4/mqdefault.jpg',
    'width': 320,
    'height': 180},
   'high': {'url': 'https://i.ytimg.com/vi/p-K9gVSTQV4/hqdefault.jpg',
    'width': 480,
    'height': 360},
   'standard': {'url': 'https://i.ytimg.com/vi/p-K9gVSTQV4/sddefault.jpg',
    'width': 640,
    'height': 480},
   'maxres': {'url': 'https://i.ytimg.com/vi/p-K9gVSTQV4/maxresdefault.jpg',
    'width': 1280,
    'height': 720}},
  'channelTitle': 'cricket.com.au',
  'tags': ['2003029255001',
   'youtube',
  