In [1]:
import sys
sys.path.insert(0, '../') # so we can access our config files

import os
import logging
import json
import time

from tqdm.notebook import tqdm
import pandas as pd
import random

# YouTube API Libraries
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from socket import error as SocketError

# Config Imports
from config import youtube

from src.data.youtubeDownloader import Downloader

In [2]:
# Show the value of the `default_output` environment variable; `os.environ.get`
# yields None when the variable is not set.
print(os.environ.get("default_output"))

None


In [3]:
# Instantiate the project's Downloader (imported from src.data.youtubeDownloader).
# NOTE(review): the recorded output suggests the constructor builds a YouTube API client — confirm in the class.
downloader = Downloader()

Successfully built YouTube API object


In [4]:
# Point the downloader at a video, then read the id back as the cell's displayed value.
downloader.setVideoId('dQw4w9WgXcQ')
downloader.getVideoId()

'dQw4w9WgXcQ'

In [5]:
# Metadata for the video id set on the downloader; the recorded output is a dict
# that includes 'title' and 'description' keys.
downloader.getVideoMetadata()

{'title': 'Rick Astley - Never Gonna Give You Up (Official Music Video)',
 'description': 'The official video for “Never Gonna Give You Up” by Rick Astley\nTaken from the album ‘Whenever You Need Somebody’ – deluxe 2CD and digital deluxe out 6th May 2022 Pre-order here – https://RickAstley.lnk.to/WYNS2022ID\n\n“Never Gonna Give You Up” was a global smash on its release in July 1987, topping the charts in 25 countries including Rick’s native UK and the US Billboard Hot 100.  It also won the Brit Award for Best single in 1988. Stock Aitken and Waterman wrote and produced the track which was the lead-off single and lead track from Rick’s debut LP “Whenever You Need Somebody”.  The album was itself a UK number one and would go on to sell over 15 million copies worldwide.\n\nThe legendary video was directed by Simon West – who later went on to make Hollywood blockbusters such as Con Air, Lara Croft – Tomb Raider and The Expendables 2.  The video passed the 1bn YouTube views milestone on 28 

In [6]:
# Comments for the video id set on the downloader; the recorded output is a dict
# whose 'comments' list holds one {comment_id: text} dict per comment or reply.
downloader.getVideoComments()

{'comments': [{'UgzarqjaaPC7TbFINNx4AaABAg': '1 BILLION views for Never Gonna Give You Up!\xa0 Amazing, crazy, wonderful! Rick ♥️'},
  {'UgzarqjaaPC7TbFINNx4AaABAg.9QM9WCCnud69iqodge5RgQ': 'Never Gonna Give You up'},
  {'UgzarqjaaPC7TbFINNx4AaABAg.9QM9WCCnud69iWXpOmb2MI': '@DVFT Ok'},
  {'UgzarqjaaPC7TbFINNx4AaABAg.9QM9WCCnud69iWXnJvfbi7': 'Rick'},
  {'UgzarqjaaPC7TbFINNx4AaABAg.9QM9WCCnud69hSA85aiyqo': 'because you Ricked'},
  {'UgzarqjaaPC7TbFINNx4AaABAg.9QM9WCCnud69gdz7QaHSLZ': 'Likes'},
  {'UgyB6YuysoRVBA-UzxF4AaABAg': 'When you get an ad and you get saved from the rickroll'},
  {'UgxkpLXW5uyroEF-u814AaABAg': 'But link sayd free robux :('},
  {'UgwschHh8R13reidMDV4AaABAg': 'Admítelo llegaste acá porque te Rickollearon'},
  {'Ugxh6o_f8Uhdxj8UzfN4AaABAg': 'Big fan'},
  {'UgxqdJ2XDBg2FNGYXzx4AaABAg': 'Hey Rick astley'},
  {'Ugz-lVi5tsXRVg_c9Cl4AaABAg': 'Aqui no Brasil escultando essa relíquia, que sempre'},
  {'Ugz0vqb5wEuN-AQwV794AaABAg': "Ain't no way I was rickrolled by a Life is S

In [7]:
# Transcript for the video id set on the downloader; the recorded output is a dict
# with 'cleaned_transcript' (joined text) and 'raw_transcript' (timed segments).
downloader.getVideoTranscript()

{'cleaned_transcript': "[Music] you know the rules [Music] gotta make you understand [Music] goodbye [Music] we've known each other for so long your heart's been [Music] going aching [Music] never gonna say goodbye [Music] never gonna make you gonna say cry [Music] i just want to tell you how i'm feeling [Music] [Music] never gonna is you down [Music]",
 'raw_transcript': [{'text': '[Music]', 'start': 0.0, 'duration': 26.359},
  {'text': 'you know the rules', 'start': 22.64, 'duration': 3.719},
  {'text': '[Music]', 'start': 28.33, 'duration': 16.31},
  {'text': 'gotta make you understand', 'start': 40.399, 'duration': 4.241},
  {'text': '[Music]', 'start': 44.92, 'duration': 11.75},
  {'text': 'goodbye', 'start': 54.64, 'duration': 6.079},
  {'text': '[Music]', 'start': 56.67, 'duration': 6.13},
  {'text': "we've known each other", 'start': 60.719, 'duration': 4.16},
  {'text': 'for so long', 'start': 62.8, 'duration': 2.9},
  {'text': "your heart's been", 'start': 64.879, 'duration':

In [8]:
# Build the notebook-level YouTube Data API client. getVideoMetadata and
# getVideoComments below read it as the global `youtube_api`.
youtube_api = build(youtube.YOUTUBE_API_SERVICE_NAME, youtube.YOUTUBE_API_VERSION, developerKey=youtube.YOUTUBE_DATA_API_KEY)

In [12]:
# NOTE(review): this repeats the `build(...)` call from the previous cell verbatim
# and simply rebinds `youtube_api`; one of the two cells looks redundant.
youtube_api = build(youtube.YOUTUBE_API_SERVICE_NAME, youtube.YOUTUBE_API_VERSION, developerKey=youtube.YOUTUBE_DATA_API_KEY)

In [13]:
def keyExists():
    """Report whether a YouTube Data API key has been configured.

    The check reads ``youtube.YOUTUBE_DATA_API_KEY`` from the imported config
    module; it does not inspect ``os.environ`` itself.

    Returns
    -------
    bool
        True when the configured key is anything other than ``None``,
        False otherwise.
    """

    # Identity comparison is the idiomatic None check and, unlike ``!= None``,
    # cannot be influenced by an object that overrides ``__eq__``/``__ne__``.
    return youtube.YOUTUBE_DATA_API_KEY is not None

In [14]:
### Get Video Title, Desc, Tags
def getVideoMetadata(video_id):
    """Download the title, description and tags of a video.

    Calls ``videos().list`` on the notebook-level ``youtube_api`` client and
    reads the ``snippet`` of the first returned item.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.

    Returns
    -------
    dict
        A dictionary with the keys ``"title"``, ``"description"`` and
        ``"tags"``. Any field missing from the snippet is returned as ``""``
        (so a video without tags still keeps its title and description).
        When the request fails, or the response has no item / no snippet,
        all three values are ``""``.
    """
    emptyMetadata = {"title": "", "description": "", "tags": ""}

    try:
        response = (
            youtube_api.videos()
            .list(part="id,snippet,contentDetails", id=video_id)
            .execute()
        )
    except (HttpError, SocketError) as error:
        print(
            "--- HTTP Error occurred while retrieving information for VideoID: {0}. [ERROR]: {1}".format(
                video_id, error
            )
        )
        # Return the same shape as every other path so callers can index the
        # result without first checking for None.
        return emptyMetadata

    try:
        videoSnippet = response["items"][0]["snippet"]
    except (KeyError, IndexError, TypeError):
        # Unknown/removed video (empty "items") or an item without a snippet.
        return emptyMetadata

    # Read each field independently: previously one absent key (typically
    # "tags") raised and caused the whole record to be blanked out.
    return {
        "title": videoSnippet.get("title", ""),
        "description": videoSnippet.get("description", ""),
        "tags": videoSnippet.get("tags", ""),
    }

In [15]:
# Top N comments
def _flattenCommentThread(item):
    """Return a thread's top-level comment followed by its replies, each as ``{id: text}``."""
    topComment = item["snippet"]["topLevelComment"]
    flattened = [{topComment["id"]: topComment["snippet"]["textOriginal"]}]

    # The "replies" key is only present on threads that have replies.
    if "replies" in item:
        flattened += [
            {reply["id"]: reply["snippet"]["textOriginal"]}
            for reply in item["replies"]["comments"]
        ]

    return flattened


def getVideoComments(video_id, num_comments):
    """Download video comments given an ID

    Pages through ``commentThreads().list`` on the notebook-level
    ``youtube_api`` client, 100 threads per request, and flattens every
    thread into its top-level comment followed by the replies included in
    the response.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.
    num_comments: int
        The comment budget. It is consumed 100 per fetched page, so
        ``ceil(num_comments / 100)`` pages are requested at most. Replies are
        returned in addition to the threads, so the result can hold more
        than ``num_comments`` entries.

    Returns
    -------
    dict
        ``{"comments": [...]}`` where each element is a single-entry dict
        mapping a comment id to its original text. If a request fails, the
        comments gathered from earlier pages are returned. If a page cannot
        be parsed, the list is empty.
    """

    counter = num_comments
    nextPageToken = None
    comments = []

    while counter > 0:
        requestArgs = {
            "part": "snippet,replies",
            "videoId": video_id,
            "maxResults": 100,
        }
        # The first request carries no page token; later ones continue from
        # the token handed back by the previous response.
        if nextPageToken:
            requestArgs["pageToken"] = nextPageToken

        try:
            response = youtube_api.commentThreads().list(**requestArgs).execute()
        except (HttpError, SocketError) as error:
            print(
                "--- HTTP Error occurred while retrieving information for VideoID: {0}. [ERROR]: {1}".format(
                    video_id, error
                )
            )
            # Stop paging: retrying here without consuming the budget would
            # loop forever (e.g. on videos with comments disabled).
            break

        try:
            pageComments = []
            for item in response["items"]:
                pageComments += _flattenCommentThread(item)
        except (KeyError, TypeError):
            # Response did not have the expected structure.
            return {"comments": []}

        # Each page grabs at max 100 comment threads
        counter -= 100
        comments += pageComments

        # No token means this was the last page; reusing a stale or empty
        # token would fetch an already-seen page and duplicate comments.
        nextPageToken = response.get("nextPageToken")
        if not nextPageToken:
            break

    return {"comments": comments}


In [16]:
### Get Video Transcript

def getVideoTranscript(video_id, transcriber=None):
    """Download video transcript given an ID

    Calls ``transcriber.get_transcript(video_id)`` and joins the ``'text'``
    field of every returned segment into a single space-separated string.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.
    transcriber : YouTubeTranscriptApi
        Our transcription object. Passed in to prevent constant constructor
        calls; a new ``YouTubeTranscriptApi()`` is created when omitted.

    Returns
    -------
    dict
        ``"cleaned_transcript"`` holds the joined text and ``"raw_transcript"``
        the segments exactly as returned by the transcriber. When the
        transcript cannot be fetched the values are ``""`` and ``{}``.
    """

    # If we did not pass in a transcriber
    if transcriber is None:
        transcriber = YouTubeTranscriptApi()

    try:
        raw_transcript = transcriber.get_transcript(video_id)
    except Exception:
        # Best-effort: any library/network failure yields an empty result.
        # ``Exception`` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit through, so a long scraping run can still be stopped.
        return {"cleaned_transcript": "", "raw_transcript": {}}

    cleaned_transcript = " ".join(phrase["text"] for phrase in raw_transcript)

    return {"cleaned_transcript": cleaned_transcript, "raw_transcript": raw_transcript}

In [17]:
# Create one transcriber up front so it can be passed to getVideoTranscript
# instead of having that function construct a new one on each call.
transcriber = YouTubeTranscriptApi()
# Video id used by the fetch cell below.
video_id = 'dQw4w9WgXcQ'

In [18]:
# Fetch metadata, up to 200 comment threads (two pages of 100) and the
# transcript for `video_id` using the helper functions defined above.
videoMetaData = getVideoMetadata(video_id)
# NOTE(review): the pauses between calls are presumably there to avoid hammering the APIs — confirm.
time.sleep(1)
videoComments = getVideoComments(video_id, 200)
time.sleep(2)
videoTranscript = getVideoTranscript(video_id, transcriber)

In [19]:
# Merge the three fetch results into one flat record. Every entry of
# videoComments["comments"] is a single-item {comment_id: text} dict, so the
# texts and the ids are split into two parallel lists.
video = dict(
    title=videoMetaData["title"],
    description=videoMetaData["description"],
    tags=videoMetaData["tags"],
    cleaned_transcript=videoTranscript["cleaned_transcript"],
    raw_transcript=videoTranscript["raw_transcript"],
    comments=[[*entry.values()][0] for entry in videoComments["comments"]],
    comment_ids=[[*entry][0] for entry in videoComments["comments"]],
)


In [20]:
# Display the record as a one-row table: the dict becomes a Series (keys as the
# index), then a single-column frame, and the transpose turns the keys into columns.
pd.DataFrame(pd.Series(video)).T

Unnamed: 0,title,description,tags,cleaned_transcript,raw_transcript,comments,comment_ids
0,Rick Astley - Never Gonna Give You Up (Officia...,The official video for “Never Gonna Give You U...,"[rick astley, Never Gonna Give You Up, nggyu, ...",[Music] you know the rules [Music] gotta make ...,"[{'text': '[Music]', 'start': 0.0, 'duration':...",[1 BILLION views for Never Gonna Give You Up! ...,"[UgzarqjaaPC7TbFINNx4AaABAg, UgzarqjaaPC7TbFIN..."
