## Scrapping Comments

In [1]:
import re

In [2]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import pandas as pd

In [17]:
def getYoutubeVideoId(url : str = "") -> str:
    """
    Extract video id from youtube url
    Args
        url (str) : youtube url
    Return
        video_id (str) : video_id
    """
       # Check for various YouTube URL formats
    regex = "^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube(?:-nocookie)?\.com|youtu.be))(\/(?:[\w\-]+\?v=|embed\/|live\/|v\/)?)([\w\-]+)(\S+)?$"
    
    match = re.search(regex, url)
    
    if match:
        return match.group(5)  # Returns the video ID
    else:
        return None
    
# function to get comments from the youtube url
def scrapeYoutube(api_key : str, url : str, save_loc : str = "./data/comments.csv") -> None:
    """
    Go youtube url, scrape the comments and save it in save_loc
    Args
        api_key : youtube scrapper api key
        url : youtube url
        save_loc : location to save the comments   
    """
    assert type(api_key) == str, "api key should be string"
    assert type(url) == str, "url should be string"
    assert type(save_loc) == str, "save location should be string"

    # request youtube url
    video_id = getYoutubeVideoId(url)
    comments = getComments(video_id, api_key)
    save_loc = f"./data/comments_{video_id}.csv"
    # save comments
    saveComment(comments, save_loc)


def getComments(video_id : str, api_key : str, part : str = "snippet", max_results : int = 10000) -> list[dict]:
    """
    # Get comments from the url
    Arg\n
        video_id (str) : youtube video id
        part (str)
        max_results (int)
    Return\n
        comments (list) : list of comments
    """
    # get comments
    youtube = build("youtube", "v3", developerKey=api_key)
    try:
        # Retrieve comment thread using the youtube.commentThreads().list() method
        response = youtube.commentThreads().list(
            part=part,
            videoId=video_id,
            textFormat="plainText",
            maxResults=max_results
        ).execute()

        comments = []
        for item in response["items"]:
            comment_text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            likes = item["snippet"]["topLevelComment"]["snippet"]["likeCount"]
            comments.append({"comment": comment_text, "num_of_likes": likes})

        return comments
    
    except HttpError as error:
        print(f"An HTTP error {error.http_status} occurred:\n {error.content}")
        return None
    
def saveComment(comments : list[dict], save_loc : str):
    if comments:
        # Create a pandas dataframe from the comments list
        df = pd.DataFrame(comments)

        # Sort dataframe by number of likes in descending order
        df = df.sort_values(by=['num_of_likes'], ascending=False)

        # Print a preview of the first 10 rows
        print(df.head(10))

        # Export dataframe to a CSV file named "comments.csv"
        df.to_csv(save_loc, index=False)
    else:
        print("Error: Could not retrieve comments from video.")

In [19]:
getYoutubeVideoId("youtube.com/embed/DFYRQ_zQ-gk")

'DFYRQ_zQ-gk'

In [7]:
from dotenv import load_dotenv
import os

In [8]:
load_dotenv()
api_key = os.getenv("YOUTUBE_SCRAPE_KEY")

In [9]:
url = "https://www.youtube.com/watch?v=e-ORhEE9VVg"
video_id = getYoutubeVideoId(url)

In [10]:
comments = getComments(video_id, api_key)

In [11]:
df = pd.DataFrame(comments)
df.sort_values(by=['num_of_likes'], ascending=False)

Unnamed: 0,comment,num_of_likes
86,Anyone in October 2024?😎,5
40,Taylor swift perfect cinderela ❤,2
94,times may change but memories and emotions att...,2
3,Someone october 2024?,1
23,Who else is listening to this in the year 2052??,1
...,...,...
92,i thought no comments would be from this year ...,0
95,So amazing,0
96,"Taylor is my age, it says alot that i dont kno...",0
98,Oi,0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment       100 non-null    object
 1   num_of_likes  100 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [13]:
youtube = build("youtube", "v3", developerKey=api_key)
response = youtube.commentThreads().list(
    part="snippet",
    videoId=video_id,
    textFormat="plainText",
    maxResults=100
).execute()

In [14]:
youtube.commentThreads.list()

AttributeError: 'function' object has no attribute 'list'

In [None]:
def load_comments(match):
    for item in match["items"]:
        comment = item["snippet"]["topLevelComment"]
        author = comment["snippet"]["authorDisplayName"]
        text = comment["snippet"]["textDisplay"]
        print("Comment by {}: {}".format(author, text))
        if 'replies' in item.keys():
            for reply in item['replies']['comments']:
                rauthor = reply['snippet']['authorDisplayName']
                rtext = reply["snippet"]["textDisplay"]
            print("\n\tReply by {}: {}".format(rauthor, rtext), "\n")

def get_comment_threads(youtube, video_id):
    results = youtube.commentThreads().list(
        part="snippet",
        maxResults=100,
        videoId=video_id,
        textFormat="plainText"
    ).execute()
    return results

url = "https://www.youtube.com/watch?v=e-ORhEE9VVg"
video_id = getYoutubeVideoId(url)
youtube = build("youtube", "v3", developerKey=api_key)
match = get_comment_threads(youtube, video_id)
next_page_token = match["nextPageToken"]
load_comments(match)
iter = 0
while next_page_token or iter > 2:
    match = get_comment_threads(youtube, video_id)
    next_page_token = match["nextPageToken"]
    load_comments(match)
    iter += 1