In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
import re

In [120]:
class YtApiUtil:
    """Small client for the YouTube Data API v3 endpoints used in this notebook.

    The API key is read once, at construction time, from ``youtube_key.json``
    in the current working directory and stored on ``self.API_key``.
    """

    def __init__(self):
        """Load the API key from ``youtube_key.json`` in the working directory.

        Raises:
            OSError: the key file cannot be opened.
            json.JSONDecodeError: the key file is not valid JSON.
            KeyError: the JSON object has no ``"API_key"`` entry.
        """
        key_path = os.path.join(os.getcwd(), "youtube_key.json")
        # Context manager so the file handle is closed even if parsing fails.
        with open(key_path) as key_file:
            self.API_key = json.load(key_file)["API_key"]

    def _get_json(self, url):
        """GET ``url`` and return the response body decoded from JSON."""
        resp = requests.get(url)
        return json.loads(resp.text)

    def get_all_playlists(self, channel_id, page_token=""):
        """Fetch one page (up to 50 entries) of a channel's playlists.

        Args:
            channel_id: value sent as the ``channelId`` query parameter.
            page_token: value sent as ``pageToken``; the default empty string
                is what the callers in this file use for the first page.

        Returns:
            The decoded JSON response.
        """
        API_key = self.API_key
        url = f"https://youtube.googleapis.com/youtube/v3/playlists?part=contentDetails&part=snippet&part=id&channelId={channel_id}&maxResults=50&key={API_key}&pageToken={page_token}"
        return self._get_json(url)

    def get_all_playlist_items(self, playlist_id, page_token=""):
        """Fetch one page (up to 50 entries) of a playlist's items.

        Args:
            playlist_id: value sent as the ``playlistId`` query parameter.
            page_token: value sent as ``pageToken``; empty for the first page.

        Returns:
            The decoded JSON response.
        """
        API_key = self.API_key
        url = f"https://youtube.googleapis.com/youtube/v3/playlistItems?part=snippet&part=contentDetails&maxResults=50&playlistId={playlist_id}&key={API_key}&pageToken={page_token}"
        return self._get_json(url)

    def get_video_info(self, video_id):
        """Fetch the ``videos`` resource for a single video id.

        Returns:
            The first element of the response's ``"items"`` list, or ``None``
            when the body is not valid JSON or contains no usable ``"items"``.
        """
        API_key = self.API_key
        url = f"https://youtube.googleapis.com/youtube/v3/videos?part=contentDetails&part=snippet&id={video_id}&key={API_key}"
        resp = requests.get(url)

        try:
            return json.loads(resp.text)["items"][0]
        except (ValueError, KeyError, IndexError, TypeError):
            # Only "malformed or empty response" is treated as "no info".
            # A bare except would also swallow KeyboardInterrupt/SystemExit.
            return None
        
class YtApiCrawler(YtApiUtil):
    """Crawls every playlist of a channel and flattens it into per-video records."""

    def __init__(self):
        """Announce construction, then load the API key via ``YtApiUtil``."""
        print("===YtApiCrawler init ===")
        super().__init__()

    def _collect_all_items(self, fetch_page, resource_id):
        """Concatenate the ``"items"`` of every page of a paginated listing.

        Args:
            fetch_page: callable ``(resource_id, page_token) -> dict``; called
                with an empty token first, then with each ``"nextPageToken"``.
            resource_id: id forwarded unchanged to ``fetch_page``.

        Returns:
            A list with the items of all pages, in the order received.

        Raises:
            KeyError: a page has no ``"items"`` key.
        """
        items = []
        page_token = ""
        while True:
            page = fetch_page(resource_id, page_token)
            items += page["items"]
            if "nextPageToken" not in page:
                return items
            page_token = page["nextPageToken"]

    def fetch_data(self, url):
        """Fetch all playlists of a channel together with their playlist items.

        Args:
            url: channel URL; the channel id is what remains after removing
                the ``https://www.youtube.com/channel/`` prefix.

        Returns:
            A list with one dict per playlist, holding ``channel_id``,
            ``channel_name``, ``channel_url``, ``playlist_id``,
            ``playlist_title`` and ``video_items`` (the playlist's item
            resources exactly as returned by ``get_all_playlist_items``).
        """
        print("===YtApiCrawler fetch_data ===")
        channel_id = url.replace("https://www.youtube.com/channel/","")
        playlists = self._collect_all_items(self.get_all_playlists, channel_id)

        # Phase 1: build the metadata for every playlist before any
        # playlist-item request is made.
        playlist_list = [
            {
                "channel_id": item["snippet"]["channelId"],
                "channel_name": item["snippet"]["channelTitle"],
                "channel_url": url,
                "playlist_id": item["id"],
                "playlist_title": item["snippet"]["title"],
                "video_items": []
            }
            for item in playlists
        ]

        # Phase 2: fill each playlist with all pages of its items.
        for playlist_dict in playlist_list:
            playlist_dict["video_items"] += self._collect_all_items(
                self.get_all_playlist_items, playlist_dict["playlist_id"]
            )

        return playlist_list

    @staticmethod
    def _build_video_record(playlist_dict, item, video_info):
        """Combine a playlist dict, one of its items and the video info into a flat record."""
        snippet = item["snippet"]
        video_id = snippet["resourceId"]["videoId"]
        thumbnails = snippet["thumbnails"]
        # The "standard" thumbnail is optional; fall back to an empty link.
        img_link = thumbnails["standard"]["url"] if "standard" in thumbnails else ""

        return {
            "channel_id": playlist_dict["channel_id"],
            "channel_url": playlist_dict["channel_url"],
            "channel_name": playlist_dict["channel_name"],
            "video_id": video_id,
            "video_url": f"https://www.youtube.com/watch?v={video_id}",
            "published": snippet["publishedAt"],
            "title": snippet["title"],
            "img_link": img_link,
            "description": snippet["description"],
            # Tags come from the video resource; a video may have none.
            "tag_list": video_info["snippet"].get("tags", []),
            "playlist_id": playlist_dict["playlist_id"],
            "playlist_title": playlist_dict["playlist_title"],
            "playlist_position": snippet["position"]
        }

    def get_data_json(self, playlist_items_list):
        """Flatten the result of ``fetch_data`` into one record per video.

        ``get_video_info`` is called once per playlist item. Items for which it
        returns a falsy value are skipped. Every record is printed as it is
        produced.

        Args:
            playlist_items_list: list of playlist dicts as returned by
                ``fetch_data``.

        Returns:
            A list of flat dicts (channel, video and playlist fields).
        """
        data_json = []
        for playlist_dict in playlist_items_list:
            for item in playlist_dict["video_items"]:
                video_id = item["snippet"]["resourceId"]["videoId"]
                video_info = self.get_video_info(video_id)
                if not video_info:
                    continue
                data = self._build_video_record(playlist_dict, item, video_info)
                data_json.append(data)
                print(data)

        return data_json


In [119]:
# Crawl one channel through the YouTube Data API.
# YtApiCrawler() reads the API key from youtube_key.json in the working directory.
crawler = YtApiCrawler()
url = "https://www.youtube.com/channel/UCLNBEt_42kYuX7fgZiubgXQ"
# fetch_data returns a list with one dict per playlist; the raw playlist items
# of each playlist are stored under its "video_items" key.
data = crawler.fetch_data(url)
# Flattening into per-video records is left disabled: get_data_json calls
# get_video_info once for every playlist item.
# data_json = crawler.get_data_json(data)


===YtApiCrawler init ===
===YtApiCrawler fetch_data ===


{'channel_id': 'UCLNBEt_42kYuX7fgZiubgXQ',
 'channel_name': '均一教育平台(軟體組)',
 'channel_url': 'https://www.youtube.com/channel/UCLNBEt_42kYuX7fgZiubgXQ',
 'playlist_id': 'PLtKoBWTkVkTc6I-WrLDDcCgMy2gdXLaQY',
 'playlist_title': '微分享(公開版)',
 'video_items': [{'kind': 'youtube#playlistItem',
   'etag': '-5cKJWFG2BGP-R74yFMXEzX1rRU',
   'id': 'UEx0S29CV1RrVmtUYzZJLVdyTEREY0NnTXkyZ2RYTGFRWS4wMTcyMDhGQUE4NTIzM0Y5',
   'snippet': {'publishedAt': '2022-05-10T05:29:41Z',
    'channelId': 'UCLNBEt_42kYuX7fgZiubgXQ',
    'title': '【微分享】Mob Programming',
    'description': '・分享者：沈家緯\n・分享內容：Mob Programming 大家一起開發\n・來源：均一微分享\n----------------------------------------------------------------------------------------------------\n・「微分享」是均一軟體組每天的都有的 15min 技術分享（一個不希望大家特別準備/隨意/低成本的技術分享）\n・「均一」一個希望幫助孩子成為終身學習者的線上學習平台 https://www.junyiacademy.org/"',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/JLkYTc6U-g0/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.yt

## data["items"][0]["snippet"]["thumbnails"]["medium"]["url"]

In [53]:
# Inspect the first item of the first crawled playlist.
# `data` is the list returned by crawler.fetch_data(url): one dict per playlist,
# with that playlist's raw item resources stored under "video_items".
item = data[0]["video_items"][0]
snippet = item["snippet"]
video_id = snippet["resourceId"]["videoId"]
video_url = f"https://www.youtube.com/watch?v={video_id}"
published = snippet["publishedAt"]
title = snippet["title"]
# Not every thumbnail size is present on every item, so take the largest one
# available instead of requiring "maxres".
thumbnails = snippet["thumbnails"]
img_link = next(
    (
        thumbnails[size]["url"]
        for size in ("maxres", "standard", "high", "medium", "default")
        if size in thumbnails
    ),
    "",
)
description = snippet["description"]
playlist_position = snippet["position"]

playlist_position

0

In [None]:
# get all playlists in a channel (the second request is a search-endpoint call for the same channel, using a pageToken)
GET https://youtube.googleapis.com/youtube/v3/playlists?part=contentDetails&part=snippet&part=id&channelId=UCyDIP-h6xNBAiXITkbvubhA&maxResults=50&key=[YOUR_API_KEY] HTTP/1.1
GET https://youtube.googleapis.com/youtube/v3/search?channelId=UCyDIP-h6xNBAiXITkbvubhA&pageToken=CAUQAA&key=[YOUR_API_KEY] HTTP/1.1

    
# get all playlist items in a playlist
GET https://youtube.googleapis.com/youtube/v3/playlistItems?part=snippet&part=contentDetails&maxResults=50&playlistId=PLI6pJZaOCtF1vPNVcsR6K31FbqZ2V6PVi&key=[YOUR_API_KEY] HTTP/1.1

# get video info by video_id
GET https://youtube.googleapis.com/youtube/v3/videos?part=contentDetails&part=snippet&id=aFx44mglAbs&maxResults=50&key=[YOUR_API_KEY] HTTP/1.1

    
# get all videos by channel_id
GET https://youtube.googleapis.com/youtube/v3/search?part=snippet&channelId={channelId}&key={API_KEY}


In [20]:
import os
import json
import requests
from bs4 import BeautifulSoup
import re

class YoutubeRequestsCrawler:
    """Finds the playlist a video belongs to by scraping YouTube HTML pages."""

    def __init__(self):
        """Announce construction; the crawler keeps no state."""
        print("===crawler init ===")

    def _get_page_source(self, url, params=None):
        """GET ``url`` and return the page re-serialized through BeautifulSoup."""
        resp = requests.get(url, params=params)
        return str(BeautifulSoup(resp.text, 'html.parser'))

    def _find_playlist_id(self, url, channel_name):
        """Search YouTube for "<video title> <channel name>" and return the first playlist id.

        Returns an empty string when the video page exposes no
        ``<meta name="title">`` content or the result page has no playlist.
        """
        resp = requests.get(url)
        video_soup = BeautifulSoup(resp.text, 'html.parser')
        title_tag = video_soup.select_one('meta[name="title"]')
        video_title = title_tag.get("content") if title_tag is not None else None
        if not video_title:
            # Without a title there is nothing to search for.
            return ""

        # Passing the query through `params` lets requests URL-encode it, so
        # titles containing "&", "#" or "+" no longer corrupt the query string.
        page_source = self._get_page_source(
            "https://www.youtube.com/results",
            params={"sp": "mAEB", "search_query": f"{video_title} {channel_name}"},
        )
        playlist_pattern = r'{"playlistRenderer":{"playlistId":"(.*?)"'
        match = re.search(playlist_pattern, page_source)
        return match.group(1) if match else ""

    def get_yt_playlist_with_video(self, url, channel_name):
        """Locate the playlist containing a video and the video's position in it.

        Args:
            url: watch URL of the video (``https://www.youtube.com/watch?v=...``).
            channel_name: channel name appended to the video title in the search.

        Returns:
            ``{"playlist_id": str, "playlist_position": int | ""}``. The
            position is 1-based (page index + 1). Both values are empty strings
            when no playlist is found, and the position alone is empty when the
            playlist page does not list the video's index. The dict is also
            printed.
        """
        from urllib.parse import parse_qs, urlparse

        playlist_id = self._find_playlist_id(url, channel_name)
        playlist_position = ""

        if playlist_id:
            # Read the id from the "v" parameter so extra parameters (e.g. "&t=10s")
            # do not leak into it; fall back to stripping the prefix.
            video_id = parse_qs(urlparse(url).query).get("v", [""])[0]
            if not video_id:
                video_id = url.replace("https://www.youtube.com/watch?v=","")

            query_url = f"https://www.youtube.com/watch?v={video_id}&list={playlist_id}"
            page_source = self._get_page_source(query_url)
            # Raw f-string with escaped ids: the ids are matched literally and
            # "\d" is a regex token rather than an invalid string escape.
            index_pattern = (
                rf'"videoId":"{re.escape(video_id)}",'
                rf'"playlistId":"{re.escape(playlist_id)}","index":\s*(\d+)'
            )
            match = re.search(index_pattern, page_source)
            if match:
                playlist_position = int(match.group(1)) + 1

        data = {
            "playlist_id": playlist_id,
            "playlist_position": playlist_position
        }
        print(data)

        return data

In [21]:
# Look up the playlist and 1-based position of a single video by scraping
# YouTube pages (no API key involved).
# NOTE: this rebinds `crawler` and `url`, which the API crawler cell also assigns.
crawler = YoutubeRequestsCrawler()
url = "https://www.youtube.com/watch?v=mYuinFT5wgs"
channel_name = "數感實驗室Numeracy Lab"
# The result dict shows up twice in the output: the method prints it, and the
# cell then displays the returned value.
crawler.get_yt_playlist_with_video(url, channel_name)

===crawler init ===
{'playlist_id': 'PLzA1Q82AYGfbivssQveoOojDKG_TkBD3g', 'playlist_position': 7}


{'playlist_id': 'PLzA1Q82AYGfbivssQveoOojDKG_TkBD3g', 'playlist_position': 7}