In [136]:
from googleapiclient.discovery import build
import csv
import os
from dotenv import load_dotenv
from datetime import datetime
load_dotenv(verbose=True)

True

In [165]:
# --- YouTube Data API client settings ---
# The key is read from the environment; os.getenv returns None when API_KEY
# is not set (the import cell calls load_dotenv() before this cell runs).
DEVELOPER_KEY = os.getenv('API_KEY')
SERVICE_NAME = "youtube"
VERSION = "v3"

# --- Request settings ---
# Passed as maxResults to the API list() requests in this notebook.
MAX_RESULTS = 50

# --- Output ---
# One CSV per calendar day, named data/<YYYY-MM-DD>-data.csv.
# A single f-string with a date format spec replaces the previous mix of an
# f-prefix (with no placeholders) and %-formatting; the resulting path is
# identical.
DATA_FILE = f'data/{datetime.today():%Y-%m-%d}-data.csv'

# Playlists of interest, grouped by channel / publisher.
# Each entry is a YouTube playlist ID; the trailing comment is its title.
SKIENNA_PLAYLISTS = (
    "PLOtl7M3yp-DX32N0fVIyvn7ipWKNGmwpp",  # Analysis of Algorithms 2016
    "PLOtl7M3yp-DVBdLYatrltDJr56AKZ1qXo",  # Data Science - Fall 2016
)

MIT_PLAYLISTS = (
    "PLE18841CABEA24090",  # Structure and Interpretation
    "PLE7DDD91010BC51F8",  # Linear Algebra
    "PLUl4u3cNGP61Oq3tWYp6V_F-5jb5L2iHb",  # Introduction to Algorithms
    "PLUl4u3cNGP63WbdFxL8giv4yhgdMGaZNA",  # Introduction to Computer Science and Programming
    "PLUl4u3cNGP60ZaGv5SgpIk67YnH1WqCLI",  # Learn to Build your own video game with the Unity Game Engine and MS Kinect
    "PLF83B8D8C87426E44",  # Fundamentals of Biology
    "PLUl4u3cNGP619EG1wp0kT-7rDE_Az5TNd",  # Introduction to Computational Thinking and Data Science
    "PLUl4u3cNGP61-9PEhRognw5vryrSEVLPr",  # Quantum Physics I, 2013
)

THREE_BLUE_ONE_BROWN_PLAYLIST = (
    "PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab",  # Essence of Linear Algebra
    "PLZHQObOWTQDMsr9K-rj53DwVRMYO3t5Yr",  # Essence of Calculus
    "PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi",  # Neural networks
    "PLZHQObOWTQDNPOjrT6KVlfJuKtYTftqH6",  # Differential Equations
)

HARVARD_UNIVERSITY_PLAYLIST = (
    "PL2SOU6wwxB0v1kQTpqpuu5kEJo2i-iUyf",  # Algorithms for Big Data
)

COURSEA_PLAYLIST = (
    "PLVext98k2evjIFqVggHfvecnFu4tTJK_o",  # GCP Fundamentals - Google Cloud Platform Fundamentals: Core Infrastructure
    "PLVext98k2evi8mDNRo4MwIgVgSmwM3cS8",  # R Programming - Introduction to R by Johns Hopkins University
)

KHAN_ACADEMY_PLAYLIST = (
    "PLSQl0a2vh4HA50QhFIirlEZRXG4yjcoGM",  # Journey into cryptography
)

FREECODECAMP_PLAYLIST = (
    "PLWKjhJtqVAbmGw5fN5BQlwuug-8bDmabi",  # Introduction to Computer Science, Harvard's CS50
    "PLWKjhJtqVAbluXJKKbCIb4xd7fcRkpzoz",  # Introduction to Game Development
)

OTHERS_PLAYLIST = (
    "PL6cactdCCnTLkQah9GKzsJmiLbegy4dEk",  # Udemy Ultimate Web Development Tutorial
    "PLC3y8-rFHvwgg3vaYJgHGnModB54rxOk3",  # Codevolution - Introduction to React
)

# Flat tuple of every playlist ID above, in declaration order.
list_of_playlists = (
    *SKIENNA_PLAYLISTS,
    *MIT_PLAYLISTS,
    *THREE_BLUE_ONE_BROWN_PLAYLIST,
    *HARVARD_UNIVERSITY_PLAYLIST,
    *COURSEA_PLAYLIST,
    *KHAN_ACADEMY_PLAYLIST,
    *FREECODECAMP_PLAYLIST,
    *OTHERS_PLAYLIST,
)

In [137]:
# API client used by the request helpers in this notebook
# (get_playlist_item, get_video_statistics, get_playlist_details).
service = build(
    serviceName=SERVICE_NAME,
    version=VERSION,
    developerKey=DEVELOPER_KEY,
)

In [138]:
def get_playlist_item(playlist_id, npt=''):
    """Return every item of a playlist, following pagination to the end.

    Args:
        playlist_id: ID of the YouTube playlist to list.
        npt: Page token to start from. The default '' is sent as-is on the
            first request, which starts at the first page.

    Returns:
        A list with the playlist items of all pages, in the order the API
        returned them. The ``fields`` filter restricts each item to
        ``snippet.resourceId.videoId``.
    """
    items = []
    page_token = npt

    # Loop over pages instead of recursing: one request per page, no growing
    # call stack and no re-copying of the accumulated list on every page.
    while True:
        res = service.playlistItems().list(
            part="snippet",
            fields="nextPageToken,items(snippet(resourceId(videoId)))",
            playlistId=playlist_id,
            maxResults=MAX_RESULTS,
            pageToken=page_token
        ).execute()

        # Tolerate a response without an 'items' key rather than raising
        # KeyError.
        items.extend(res.get('items', []))

        # A missing (or empty) nextPageToken marks the last page.
        page_token = res.get('nextPageToken')
        if not page_token:
            return items

In [144]:
def get_video_statistics(video_id):
    """Look up snippet, statistics and duration data for a video.

    Sends a videos().list request filtered by ``video_id`` and returns the
    response of ``execute()`` unchanged. The ``fields`` filter limits each
    item to id, title, categoryId, publishedAt, statistics and duration.
    """
    request = service.videos().list(
        id=video_id,
        part="snippet,statistics,contentDetails",
        fields="items(id,snippet(title,categoryId,publishedAt),statistics,contentDetails(duration))",
        maxResults=MAX_RESULTS,
    )
    response = request.execute()
    return response

In [140]:
def get_playlist_details(playlist_id):
    """Look up snippet and contentDetails data for a playlist.

    Sends a playlists().list request filtered by ``playlist_id`` and returns
    the response of ``execute()`` unchanged. The ``fields`` filter limits the
    snippet to channelId, title, channelTitle and description.
    """
    request = service.playlists().list(
        id=playlist_id,
        part="snippet,contentDetails",
        fields="items(snippet(channelId, title, channelTitle, description), contentDetails)",
        maxResults=MAX_RESULTS,
    )
    response = request.execute()
    return response

In [141]:
def extract_video_id(data):
    """Collect the video IDs from an iterable of playlist items.

    Each element of ``data`` must provide ``['snippet']['resourceId']['videoId']``.
    Returns the IDs as a tuple, in input order.
    """
    video_ids = []
    for entry in data:
        resource = entry['snippet']['resourceId']
        video_ids.append(resource['videoId'])
    return tuple(video_ids)

In [163]:
def _video_row(video, playlist_id):
    """Map one item of a get_video_statistics() response onto the CSV columns."""
    snippet = video['snippet']
    return {
        'id': video['id'],
        'title': snippet['title'],
        'categoryId': snippet['categoryId'],
        'playlistId': playlist_id,
        'duration': video['contentDetails']['duration'],
        # viewCount is not guaranteed to be present; write an empty cell
        # instead of raising KeyError.
        'views': video.get('statistics', {}).get('viewCount', ''),
        'uploadedDate': snippet['publishedAt'],
    }


def scrape(playlist_id):
    """Append one CSV row per video of ``playlist_id`` to DATA_FILE.

    The header is written only when the file is empty, so repeated calls
    (one per playlist) accumulate into a single file. An empty row is written
    before each playlist's videos. Videos whose lookup returns no items are
    skipped.

    Args:
        playlist_id: ID of the YouTube playlist to scrape.

    Returns:
        None.
    """
    fieldnames = ['id', 'title', 'categoryId', 'playlistId', 'duration', 'views', 'uploadedDate']

    # List the playlist before opening the file, so a failure here does not
    # leave a header-only CSV behind.
    video_ids = extract_video_id(get_playlist_item(playlist_id))

    # open() does not create missing directories.
    out_dir = os.path.dirname(DATA_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # newline='' is what the csv module documents for files it writes to; on
    # write it behaves like the previous newline='\n' (no translation).
    with open(DATA_FILE, 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',')

        if os.stat(DATA_FILE).st_size == 0:
            writer.writeheader()

        # Empty row kept from the original output format.
        # NOTE(review): presumably a visual separator between playlists --
        # confirm that downstream readers expect it.
        writer.writerow({})

        for video_id in video_ids:
            items = get_video_statistics(video_id).get('items', [])
            if not items:
                # The playlist still references the video but the lookup
                # returned nothing; skip it instead of raising IndexError.
                continue
            writer.writerow(_video_row(items[0], playlist_id))

In [164]:
# Scrape each configured playlist in turn.
for playlist_id in list_of_playlists:
    scrape(playlist_id)