In [None]:
import sys
sys.path.insert(0, '../') # so we can access our config files

import os
import logging
import json
import time

from tqdm.notebook import tqdm
import pandas as pd
import random

# YouTube API Libraries
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from socket import error as SocketError

# Config Imports
from config import youtube

In [None]:
# Print the value of the `default_output` environment variable ("None" when it is unset).
print(os.getenv("default_output"))

In [None]:
# Build the YouTube Data API client once; getVideoMetadata and getVideoComments
# read this module-level object instead of constructing their own.
youtube_api = build(
    youtube.YOUTUBE_API_SERVICE_NAME,
    youtube.YOUTUBE_API_VERSION,
    developerKey=youtube.YOUTUBE_DATA_API_KEY,
)

In [None]:
def keyExists():
    """Check whether a YouTube Data API key is configured.

    Returns
    -------
    bool
        True if ``youtube.YOUTUBE_DATA_API_KEY`` holds anything other than None,
        False otherwise.
    """

    # Identity comparison: `!= None` defers to the value's own __ne__/__eq__,
    # which is not what "is a key present" means.
    return youtube.YOUTUBE_DATA_API_KEY is not None

In [None]:
### Get Video Title, Desc, Tags
def getVideoMetadata(video_id):
    """Download video meta data given an ID

    Method queries the YouTube Data API through the module-level ``youtube_api``
    client and extracts the title, description and tags from the first returned
    item's ``snippet``.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.

    Returns
    -------
    dict
        A dictionary with the keys ``title``, ``description`` and ``tags``.
        Any field missing from the snippet is returned as ``""`` (so a video
        without tags still keeps its title and description). If the request
        fails, or the response has no item / no snippet, every field is ``""``.
    """
    # Single fallback shape so callers can always index the result.
    emptyMetadata = {"title": "", "description": "", "tags": ""}

    try:
        response = (
            youtube_api.videos()
            .list(part="id,snippet,contentDetails", id=video_id)
            .execute()
        )
    except (HttpError, SocketError) as error:
        print(
            "--- HTTP Error occurred while retrieving information for VideoID: {0}. [ERROR]: {1}".format(
                video_id, error
            )
        )
        # Previously fell through and returned None, which broke callers that
        # index the result directly.
        return emptyMetadata

    # Get Video Details
    try:
        videoSnippet = response["items"][0]["snippet"]
    except (KeyError, IndexError, TypeError):
        # No items (unknown video id) or an item without a snippet.
        return emptyMetadata

    # Look each field up independently: one absent field (commonly "tags") must
    # not wipe out the fields that are present.
    return {
        "title": videoSnippet.get("title", ""),
        "description": videoSnippet.get("description", ""),
        "tags": videoSnippet.get("tags", ""),
    }

In [None]:
# Top N comments
def _collectPageComments(response):
    """Flatten one commentThreads page into a list of one-item {comment_id: text} dicts."""
    pageComments = []

    for item in response["items"]:
        # Start with the top comment
        topLevelComment = item["snippet"]["topLevelComment"]
        pageComments.append(
            {topLevelComment["id"]: topLevelComment["snippet"]["textOriginal"]}
        )

        # Then every reply the API included for that thread
        if "replies" in item:
            pageComments += [
                {reply["id"]: reply["snippet"]["textOriginal"]}
                for reply in item["replies"]["comments"]
            ]

    return pageComments


def getVideoComments(video_id, num_comments):
    """Download video comments given an ID

    Method queries the YouTube Data API through the module-level ``youtube_api``
    client and walks the comment-thread pages of a video.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.
    num_comments: int
        Comment budget. Each fetched page (requested with ``maxResults=100``)
        consumes 100 of it, so this bounds the number of pages requested. Pages
        are not truncated and replies are included, so the result can hold more
        than ``num_comments`` entries.

    Returns
    -------
    dict
        ``{"comments": [...]}`` where each entry is a one-item dict mapping a
        comment id to its original text (top-level comment first, followed by
        its replies). If a request fails, the comments gathered so far are
        returned. If a page cannot be parsed, ``{"comments": []}`` is returned.
    """

    counter = num_comments
    nextPageToken = None
    comments = []

    while counter > 0:
        requestArgs = {
            "part": "snippet,replies",
            "videoId": video_id,
            "maxResults": 100,
        }
        # The first page is requested without a token; later pages continue
        # from the token of the previous response.
        if nextPageToken:
            requestArgs["pageToken"] = nextPageToken

        try:
            response = youtube_api.commentThreads().list(**requestArgs).execute()
        except (HttpError, SocketError) as error:
            print(
                "--- HTTP Error occurred while retrieving information for VideoID: {0}. [ERROR]: {1}".format(
                    video_id, error
                )
            )
            # Stop here: the budget is only consumed by successful pages, so
            # looping again would repeat the failing request forever.
            break

        try:
            comments += _collectPageComments(response)
        except (KeyError, TypeError):
            return {"comments": []}

        # Each page grabs at max 100 comment threads
        counter -= 100

        # Without a token there is no further page; requesting again would
        # reuse a stale token and duplicate comments.
        nextPageToken = response.get("nextPageToken")
        if not nextPageToken:
            break

    return {"comments": comments}


In [None]:
### Get Video Transcript

def getVideoTranscript(video_id, transcriber=None):
    """Download video transcript given an ID

    Method asks the transcriber for the transcript of ``video_id`` and joins the
    ``text`` of every returned phrase into one space-separated string.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.
    transcriber : YouTubeTranscriptApi
        Our transcription object. Passed in to prevent constant constructor calls.
        When omitted (None), a new ``YouTubeTranscriptApi`` is created.

    Returns
    -------
    dict
        ``cleaned_transcript`` (str) and ``raw_transcript`` (the value returned by
        ``transcriber.get_transcript``). If the transcript cannot be fetched the
        result is ``{"cleaned_transcript": "", "raw_transcript": {}}``.
    """

    # If we did not pass in a transcriber (identity check, so a transcriber object
    # with a custom __eq__ is never mistaken for "missing")
    if transcriber is None:
        transcriber = YouTubeTranscriptApi()

    try:
        raw_transcript = transcriber.get_transcript(video_id)
    except Exception:
        # Best effort: any fetch failure yields an empty transcript. Only
        # Exception is caught so KeyboardInterrupt/SystemExit still stop a run.
        return {"cleaned_transcript": "", "raw_transcript": {}}

    cleaned_transcript = " ".join(phrase["text"] for phrase in raw_transcript)

    return {"cleaned_transcript": cleaned_transcript, "raw_transcript": raw_transcript}

In [None]:
# Demo inputs: the video to look up, and one transcript client to reuse across calls.
video_id = "dQw4w9WgXcQ"
transcriber = YouTubeTranscriptApi()

In [None]:
# Fetch metadata, comments and transcript for the demo video, pausing between
# the requests (presumably to avoid hammering the APIs -- TODO confirm).
videoMetaData = getVideoMetadata(video_id)
time.sleep(1)
videoComments = getVideoComments(video_id, num_comments=200)
time.sleep(2)
videoTranscript = getVideoTranscript(video_id, transcriber=transcriber)

In [None]:
# Merge the three lookups into one flat record. Every entry of
# videoComments["comments"] is a one-item {comment_id: text} dict, so the texts
# and the ids are split into two parallel lists.
video = {
    **{field: videoMetaData[field] for field in ("title", "description", "tags")},
    **{
        field: videoTranscript[field]
        for field in ("cleaned_transcript", "raw_transcript")
    },
    "comments": [list(entry.values())[0] for entry in videoComments["comments"]],
    "comment_ids": [list(entry.keys())[0] for entry in videoComments["comments"]],
}


In [None]:
# Show the record as a one-row frame: the Series becomes a single column, then is transposed.
pd.Series(video).to_frame().T

In [7]:
import pandas as pd

In [8]:
# Load a previously saved batch of videos.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks
# like a saved index -- consider index_col=0 here or index=False when writing.
df = pd.read_csv(filepath_or_buffer='../data/raw/youtube/videos_0.csv')

In [9]:
# Preview the first rows (5 is the default shown explicitly).
df.head(n=5)

Unnamed: 0,title,description,tags,cleaned_transcript,raw_transcript,comments,comment_ids
0,Rick Astley - Never Gonna Give You Up (Officia...,The official video for “Never Gonna Give You U...,"['rick astley', 'Never Gonna Give You Up', 'ng...",[Music] you know the rules [Music] gotta make ...,"[{'text': '[Music]', 'start': 0.0, 'duration':...",['1 BILLION views for Never Gonna Give You Up!...,"['UgzarqjaaPC7TbFINNx4AaABAg', 'UgzarqjaaPC7Tb..."
1,Escaping the Rat Race: What School Failed to T...,Escaping the rat race isn’t about quitting a 9...,"['rat race', 'Personal Finance', 'production',...",in 2003 professional boxer and heavyweight cha...,"[{'text': 'in 2003', 'start': 2.639, 'duration...",['► 👇 𝐒𝐮𝐩𝐩𝐨𝐫𝐭 𝐓𝐡𝐞 𝐂𝐡𝐚𝐧𝐧𝐞𝐥 // 𝐁𝐞𝐜𝐨𝐦𝐞 𝐀 𝐏𝐚𝐭𝐫𝐨𝐧! ...,"['UgwupOprd7Pnqw-GYIB4AaABAg', 'UgwupOprd7Pnqw..."
2,William Ackman: Everything You Need to Know Ab...,Everything You Need to Know About Finance and ...,"['Big Think', 'BigThink', 'BigThink.com', 'Edu...","Hi, I'm Bill Ackman. I'm the CEO of Pershing S...","[{'text': ""Hi, I'm Bill Ackman."", 'start': 0.2...","['Want to get Smarter, Faster™?\r\nSubscribe f...","['UgxIFuz7iK8Khu4WTyF4AaABAg', 'UgxIFuz7iK8Khu..."


In [11]:
# Inspect the stored comments of the row labelled 1; the CSV round trip leaves
# the list as its string representation.
df.loc[1, 'comments']

'[\'► 👇 𝐒𝐮𝐩𝐩𝐨𝐫𝐭 𝐓𝐡𝐞 𝐂𝐡𝐚𝐧𝐧𝐞𝐥 // 𝐁𝐞𝐜𝐨𝐦𝐞 𝐀 𝐏𝐚𝐭𝐫𝐨𝐧! \\r\\nhttps://www.patreon.com/jamesvj\\r\\n\\r\\n► Follow Me for Behind the Scenes, Updates, and Teasers:\\r\\n📸 IG: james.v.j // https://bit.ly/2TOaBBC \\r\\n🐦 Twitter: @jamesvjani // https://bit.ly/2zYPrsT\', \'ALL financial problems comes from Government. The State educates you badly and takes your taxes and spends it on behalf of those self serving corrupt inept greedy with political power and their, even higher elite masters.\\nWhile Marxism in all its forms is 100% wrong as it IS the State that controls you. Thus you get zero, unless you are IN the political party. Proven by anyone that examines the chaos when it has been applied.\\nThe proof is their loyal technocrats on a very decent wage now acts on their policies of reduced free speech showned by this comment being shadow banned. Such truth ensures my comments always have less than 20 likes.\', "It sound reasoning but in the end  it\'s production value that is reducing over time