In [1]:
import sys
sys.path.insert(0, '../') # so we can access our config files

import os
import logging
import json
import time

from tqdm.notebook import tqdm
import pandas as pd
import random

# YouTube API Libraries
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from socket import error as SocketError

# Config Imports
from config import youtube

In [2]:
# Debug check: show the value of the `default_output` environment variable
# (os.environ.get returns None when the variable is not set).
print(os.environ.get("default_output"))

None


In [3]:
# Module-level YouTube Data API client, configured from the `youtube` config module.
youtube_api = build(
    youtube.YOUTUBE_API_SERVICE_NAME,
    youtube.YOUTUBE_API_VERSION,
    developerKey=youtube.YOUTUBE_DATA_API_KEY,
)

In [4]:
def keyExists():
    """Check whether a YouTube Data API key is configured.

    Reads ``youtube.YOUTUBE_DATA_API_KEY`` from the imported config module.

    Returns
    -------
    bool
        True when the configured key is anything other than ``None``,
        False otherwise.
    """
    # Identity comparison is the idiomatic None check and cannot be
    # influenced by a custom ``__eq__``/``__ne__`` on the stored value.
    return youtube.YOUTUBE_DATA_API_KEY is not None

In [5]:
### Get Video Title, Desc, Tags
def getVideoMetadata(video_id):
    """Download video metadata given an ID

    Queries the YouTube Data API (``videos().list``) through the module-level
    ``youtube_api`` client and extracts the title, description and tags from
    the ``snippet`` of the first returned item. The extracted values are also
    printed.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.

    Returns
    -------
    dict
        A dictionary with the keys ``title``, ``description`` and ``tags``.
        ``tags`` is an empty list when the snippet carries no ``tags`` entry.
        When the request fails, no item is returned, or the item has no usable
        snippet, every value is an empty string. A dictionary is returned on
        every path so callers can index the result unconditionally.
    """
    # Built per call so callers may mutate the returned fallback freely.
    emptyMetadata = {"title": "", "description": "", "tags": ""}

    try:
        response = (
            youtube_api.videos()
            .list(part="id,snippet,contentDetails", id=video_id)
            .execute()
        )
    except (HttpError, SocketError) as error:
        print(
            "--- HTTP Error occurred while retrieving information for VideoID: {0}. [ERROR]: {1}".format(
                video_id, error
            )
        )
        return emptyMetadata

    # Get Video Details
    try:
        videoSnippet = response["items"][0]["snippet"]
        title = videoSnippet["title"]
        description = videoSnippet["description"]
    except (KeyError, IndexError, TypeError):
        # Covers an empty "items" list (unknown/removed video) and items
        # that lack a snippet, title or description.
        print("Failed to get title and video")
        return emptyMetadata

    # A missing "tags" entry must not discard the title and description.
    tags = videoSnippet.get("tags", [])

    print(title)
    print(description)
    print(tags)

    return {
        "title": title,
        "description": description,
        "tags": tags,
    }

In [6]:
# Top N comments
def _collectThreadComments(thread):
    """Flatten one comment thread into ``{comment_id: text}`` entries, top-level comment first."""
    topComment = thread["snippet"]["topLevelComment"]
    entries = [{topComment["id"]: topComment["snippet"]["textOriginal"]}]

    # Replies are only read when the thread carries a "replies" entry.
    if "replies" in thread:
        entries += [
            {reply["id"]: reply["snippet"]["textOriginal"]}
            for reply in thread["replies"]["comments"]
        ]

    return entries


def getVideoComments(video_id, num_comments):
    """Download video comments given an ID

    Pages through ``commentThreads().list`` on the module-level ``youtube_api``
    client. Every request asks for up to 100 threads and is counted as 100
    against ``num_comments``, regardless of how many entries the page yields.
    Replies are collected together with their top-level comment, so the result
    can hold more entries than ``num_comments``.

    Paging stops when the budget is used up, when the response carries no
    ``nextPageToken``, or when a request fails.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.
    num_comments: int
        The comment budget; one page is requested per started block of 100.

    Returns
    -------
    dict
        ``{"comments": [...]}`` where each element is a single-entry
        ``{comment_id: text}`` dictionary. If a request fails, the comments
        gathered so far are returned. If a page cannot be parsed, the list
        is empty.
    """
    pageSize = 100
    remaining = num_comments
    nextPageToken = None
    comments = []

    while remaining > 0:
        requestArgs = {
            "part": "snippet,replies",
            "videoId": video_id,
            "maxResults": pageSize,
        }
        # Only follow-up pages carry a token; the first request omits it.
        if nextPageToken:
            requestArgs["pageToken"] = nextPageToken

        try:
            response = youtube_api.commentThreads().list(**requestArgs).execute()
        except (HttpError, SocketError) as error:
            print(
                "--- HTTP Error occurred while retrieving information for VideoID: {0}. [ERROR]: {1}".format(
                    video_id, error
                )
            )
            # Stop instead of retrying: the budget is untouched on failure,
            # so looping again would repeat the same failing request forever.
            break

        try:
            pageComments = []
            for item in response["items"]:
                pageComments += _collectThreadComments(item)
        except (KeyError, TypeError):
            # An unparseable page yields an empty result, as before.
            return {"comments": []}

        # Each page grabs at max 100 comment threads
        remaining -= pageSize
        comments += pageComments

        # No token means this was the last page; asking again would only
        # re-download a page that has already been collected.
        nextPageToken = response.get("nextPageToken")
        if not nextPageToken:
            break

    return {"comments": comments}


In [7]:
### Get Video Transcript

def getVideoTranscript(video_id, transcriber=None):
    """Download video transcript given an ID

    Calls ``transcriber.get_transcript(video_id)`` and joins the ``text``
    field of every returned phrase into one space-separated string.

    Parameters
    ----------
    video_id : str
        The video_id for the YouTube video, usually found at the end of the URL.
    transcriber : YouTubeTranscriptApi
        Our transcription object. Passed in to prevent constant constructor
        calls; a new ``YouTubeTranscriptApi`` is created when omitted.

    Returns
    -------
    dict
        ``cleaned_transcript`` holds the joined text and ``raw_transcript``
        the value returned by the transcriber. When the transcript cannot be
        fetched, they are an empty string and an empty dict respectively.
    """

    # If we did not pass in a transcriber
    if transcriber is None:
        transcriber = YouTubeTranscriptApi()

    try:
        raw_transcript = transcriber.get_transcript(video_id)
    except Exception:
        # Best effort: any fetch failure yields an empty result. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long scraping run can still be interrupted.
        return {"cleaned_transcript": "", "raw_transcript": {}}

    cleaned_transcript = " ".join(phrase["text"] for phrase in raw_transcript)

    return {"cleaned_transcript": cleaned_transcript, "raw_transcript": raw_transcript}

In [9]:
# Create the transcript client once so it can be passed to getVideoTranscript
# instead of being constructed on every call.
transcriber = YouTubeTranscriptApi()
# Video ID used by the fetch cell that follows.
video_id = '41hCQZAgVa0'

In [10]:
# Fetch metadata, up to a 200-comment budget, and the transcript for `video_id`.
# NOTE(review): the sleeps space out consecutive requests — presumably to stay
# clear of API rate limits; confirm the intent and the chosen delays.
videoMetaData = getVideoMetadata(video_id)
time.sleep(1)
videoComments = getVideoComments(video_id, 200)
time.sleep(2)
videoTranscript = getVideoTranscript(video_id, transcriber)

UFC 285: Post-Fight Press Conference
Following UFC 285, tune in to the Post-Fight Press Conference to hear the athletes take questions from the media.

Subscribe to get all the latest UFC content: http://bit.ly/2uJRzRR

Experience UFC live with UFC FIGHT PASS, the digital subscription service of the UFC. Visit https://ufcfightpass.com/

To order UFC Pay-Per-Views on ESPN+, visit https://bit.ly/2vNIBE8 (U.S. only)

To order UFC Pay-Per-Views, visit http://welcome.ufcfightpass.com/#PPV (Non U.S.)

Connect with UFC online and on Social:
🔴 Website: http://www.ufc.com
🔵 Twitter: http://www.twitter.com/ufc
🔵 Facebook: http://www.facebook.com/ufc
🔴 Instagram: http://www.instagram.com/ufc
🟡 Snapchat: UFC
🟣 Twitch: https://www.twitch.tv/ufc

Connect with UFC FIGHT PASS on Social:
🔵 Twitter: http://www.twitter.com/ufcfightpass
🔵 Facebook: http://www.facebook.com/ufcfightpass
🔴 Instagram: http://www.instagram.com/ufcfightpass

#UFC #UFC285
['UFC', 'Dana White', 'Post-fight Press Conference', 'Pre

In [18]:
# Display the metadata dictionary returned by getVideoMetadata.
videoMetaData

{'title': 'UFC 285: Post-Fight Press Conference',
 'description': 'Following UFC 285, tune in to the Post-Fight Press Conference to hear the athletes take questions from the media.\n\nSubscribe to get all the latest UFC content: http://bit.ly/2uJRzRR\n\nExperience UFC live with UFC FIGHT PASS, the digital subscription service of the UFC. Visit https://ufcfightpass.com/\n\nTo order UFC Pay-Per-Views on ESPN+, visit https://bit.ly/2vNIBE8 (U.S. only)\n\nTo order UFC Pay-Per-Views, visit http://welcome.ufcfightpass.com/#PPV (Non U.S.)\n\nConnect with UFC online and on Social:\n🔴 Website: http://www.ufc.com\n🔵 Twitter: http://www.twitter.com/ufc\n🔵 Facebook: http://www.facebook.com/ufc\n🔴 Instagram: http://www.instagram.com/ufc\n🟡 Snapchat: UFC\n🟣 Twitch: https://www.twitch.tv/ufc\n\nConnect with UFC FIGHT PASS on Social:\n🔵 Twitter: http://www.twitter.com/ufcfightpass\n🔵 Facebook: http://www.facebook.com/ufcfightpass\n🔴 Instagram: http://www.instagram.com/ufcfightpass\n\n#UFC #UFC285',
 '

In [69]:
# Ad-hoc videos().list request for a hard-coded video ID; the raw response is
# kept in `response` for inspection.
response = (
    youtube_api.videos()
    .list(part="id,snippet,contentDetails", id='aUUO7Ajvs1Y')
    .execute()
)

In [71]:
# Show the returned items; the recorded output is an empty list for this video ID.
response["items"]

[]

In [61]:
import re
import datetime

def parse_duration(duration_string):
    """Build a ``datetime.time`` from the H/M/S counts found in a duration string.

    The string is searched for a run of digits directly followed by ``H``,
    ``M`` and ``S`` respectively (e.g. ``"PT1H2M3S"`` — presumably an ISO 8601
    duration; confirm against the caller). A unit that is not found counts
    as 0.

    Parameters
    ----------
    duration_string : str
        The text to search for hour, minute and second counts.

    Returns
    -------
    datetime.time
        A time object carrying the extracted hour, minute and second.

    Raises
    ------
    ValueError
        Raised by ``datetime.time`` when a component is outside its valid
        range (for example an hour count above 23).
    """
    # One named-group pattern per unit, searched in hours/minutes/seconds order.
    unit_patterns = {
        'hours': r'(?P<hours>\d+)H',
        'minutes': r'(?P<minutes>\d+)M',
        'seconds': r'(?P<seconds>\d+)S',
    }

    components = {}
    for unit, pattern in unit_patterns.items():
        found = re.search(pattern, duration_string)
        components[unit] = int(found.group(unit)) if found else 0

    return datetime.time(
        hour=components['hours'],
        minute=components['minutes'],
        second=components['seconds'],
    )

In [35]:
# NOTE(review): the recorded output (execution count 35, video 'd07IMGz7Dkc') does not
# belong to the request made earlier in this notebook, whose `items` list is shown as
# empty; on an empty list this expression raises IndexError.
response["items"][0]

{'kind': 'youtube#video',
 'etag': 'Vb98YA1sdsKCZH24VCYKad4anaE',
 'id': 'd07IMGz7Dkc',
 'snippet': {'publishedAt': '2023-02-06T18:00:28Z',
  'channelId': 'UCemIPVbnuBzOmuXza99eH_w',
  'title': '5 Golden Rules of Real Estate Investing',
  'description': 'Subscribe for more videos: https://www.youtube.com/c/ThachNguyen\n\nThank you for watching!\n\nStart your real estate investing journey with mentorship from me and my team: beacons.ai/thachnguyen\n\nFor business inquiries, partnerships, sponsorships, and collaborations: thach518@gmail.com\n\nThach Nguyen has come a long way since arriving in America at age 4 with his parents, four brothers, and sister in 1975. Focus, drive, determination, and great mentors paved the way for him to go from a homeless refugee to a multi-millionaire.  \n\nThach is the CEO & Founder of Thach Real Estate Group and principal of Springboard to Wealth. He is not only a seasoned realtor, investor, coach, author, and speaker, but also a proud ambassador of the A

In [None]:
# Assemble one flat record per video. Each fetched comment is a single-entry
# {comment_id: text} mapping, so its first value is the text and its first key the ID.
video = {
    "title": videoMetaData["title"],
    "description": videoMetaData["description"],
    "tags": videoMetaData["tags"],
    "cleaned_transcript": videoTranscript["cleaned_transcript"],
    "raw_transcript": videoTranscript["raw_transcript"],
    "comments": [
        list(entry.values())[0]
        for entry in videoComments["comments"]
    ],
    "comment_ids": [
        list(entry)[0]
        for entry in videoComments["comments"]
    ],
}


In [None]:
# Show the record as a one-row table: one column per key of `video`.
pd.Series(video).to_frame().T

In [2]:
import pandas as pd

In [3]:
# Load the raw YouTube video CSV.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the file
# was written with its index; consider index_col=0 here or index=False when writing.
df = pd.read_csv('../data/raw/youtube/videos_0.csv')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,title,description,tags,cleaned_transcript,raw_transcript,comments,comment_ids
0,How TikTok Could Become a U.S. Company | WSJ,"TikTok is at a crossroads, as U.S. concerns ab...","['tik tok china', 'tiktok china', 'tiktok owne...",- [Narrator] TikTok is at a crossroads. Concer...,[{'text': '- [Narrator] TikTok is at a crossro...,"['I don’t trust oracle either but okay', 'It’s...","['UgwGZaDLpB4lHanki6V4AaABAg', 'Ugy5kzxqBohPn3..."
1,Rick Astley - Never Gonna Give You Up (Officia...,The official video for “Never Gonna Give You U...,"['rick astley', 'Never Gonna Give You Up', 'ng...",[Music] you know the rules [Music] gotta make ...,"[{'text': '[Music]', 'start': 0.0, 'duration':...",['1 BILLION views for Never Gonna Give You Up!...,"['UgzarqjaaPC7TbFINNx4AaABAg', 'UgzarqjaaPC7Tb..."


In [5]:
# Title of the row labelled 0.
df['title'][0]

'How TikTok Could Become a U.S. Company | WSJ'

In [7]:
# Tags of the row labelled 0. The recorded output is one string holding a list
# literal, not a list — NOTE(review): parse it before treating it as a sequence.
df['tags'][0]

"['tik tok china', 'tiktok china', 'tiktok owned by china', 'tiktok security', 'cfius', 'tiktok cfius', 'tiktok ownership', 'tiktok data', 'bytedance', 'tiktok divestiture', 'divestiture', 'tiktok oracle', 'tiktok', 'china', 'china privacy', 'china security', 'data privacy', 'china social media', 'social media privacy', 'tiktok us deal', 'tiktok us security deal', 'tiktok deal', 'china us', 'tiktok in us', 'data', 'privacy', 'tik tok', 'tiktok data privacy', 'business news', 'tiktok ban', 'tiktok user data', 'china news', 'tiktok news', 'techy']"

In [6]:
# Cleaned transcript text of the row labelled 0.
df['cleaned_transcript'][0]

'- [Narrator] TikTok is at a crossroads. Concerns about its Chinese ownership have been building since 2020. - TikTok\'s been in this\nlimbo for two years, and who knows how many more months or years it could stay in this limbo. - [Narrator] And a panel of US officials is torn on how to address them. If they can\'t come to an agreement, there\'s another possible solution, forcing TikTok to become a US company. Here\'s what it would\ntake to make that happen, and why that move poses\nsome big challenges. First, a brief review of how\nTikTok\'s data is managed, which is the main focus for US officials. - TikTok\'s this huge\ncompany, but it\'s owned by a Chinese parent\ncompany called ByteDance. They run several apps, but TikTok is probably their\nbest known one in the world. - [Narrator] ByteDance itself\nis headquartered in Beijing, and is owned by investors\nfrom around the globe. A Chinese state controlled entity owns 1% of a Beijing based\nsubsidiary of ByteDance. Those connections 