# Import Libraries

In [12]:
import getpass
import os
from datetime import date

import pandas as pd
from googleapiclient.discovery import build

# Build YouTube Client

In [2]:
# Read the API key from a hidden prompt so it is not written into the notebook source.
YT_API_KEY =  getpass.getpass('Enter YouTube API key: ')

In [3]:
# ID of the video to scrape; used as the default `video_id` of get_comments_for_video.
VIDEO_ID = "UUGY64rPLxo"

In [4]:
# Build the 'youtube' v3 API client, authenticating with the key entered above.
YT_CLIENT = build('youtube', 'v3', developerKey=YT_API_KEY)

# API Call Functions

In [9]:
# Function to get replies for a specific comment
def get_replies(yt_client, video_id, comment_id):
    """Fetch every reply posted under one top-level comment.

    Pages through ``comments().list`` with ``parentId=comment_id`` until a
    response carries no ``nextPageToken``.

    Args:
        yt_client: Client object exposing ``comments().list(...).execute()``.
        video_id: ID of the video the thread belongs to. It is not sent to
            the API; it is only copied into each returned row.
        comment_id: ID of the top-level comment whose replies are wanted.

    Returns:
        A list of dicts with the keys ``Timestamp``, ``Username``,
        ``VideoID``, ``Comment`` and ``Date``, in the order the API
        returned the replies. ``Date`` is the reply's ``updatedAt`` value,
        falling back to ``publishedAt`` when that field is absent.
    """
    replies = []
    next_page_token = None

    while True:
        reply_response = yt_client.comments().list(
            part="snippet",
            parentId=comment_id,
            textFormat="plainText",
            # Request the documented maximum page size; the API default of
            # 20 would need five times as many requests on long threads.
            maxResults=100,
            pageToken=next_page_token,
        ).execute()

        # A response without an 'items' key is treated as an empty page.
        for item in reply_response.get('items', []):
            comment = item['snippet']
            replies.append({
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': comment['textDisplay'],
                'Date': comment.get('updatedAt', comment['publishedAt']),
            })

        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:  # no further pages
            break

    return replies

# Function to get all comments (including replies) for a single video
def get_comments_for_video(yt_client=YT_CLIENT, video_id=VIDEO_ID):
    """Collect all top-level comments and their replies for one video.

    Pages through ``commentThreads().list`` for ``video_id``. Each
    top-level comment is recorded, and when the thread reports a non-zero
    ``totalReplyCount`` its replies are fetched with ``get_replies`` and
    appended directly after it.

    Args:
        yt_client: Client object exposing ``commentThreads().list(...)``
            and whatever ``get_replies`` needs. Defaults to ``YT_CLIENT``
            as it was bound when this function was defined.
        video_id: ID of the video to read. Defaults to ``VIDEO_ID`` as it
            was bound when this function was defined.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Timestamp``,
        ``Username``, ``VideoID``, ``Comment`` and ``Date``. The columns
        are present even when the video has no comments.
    """
    columns = ['Timestamp', 'Username', 'VideoID', 'Comment', 'Date']
    all_comments = []
    next_page_token = None

    while True:
        comment_response = yt_client.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            # 100 is the largest page size the API documents for this
            # endpoint; the previous value of 10000 was out of range.
            maxResults=100,
        ).execute()

        # A response without an 'items' key is treated as an empty page.
        for item in comment_response.get('items', []):
            thread = item['snippet']
            top_level = thread['topLevelComment']
            top_comment = top_level['snippet']
            all_comments.append({
                'Timestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': top_comment['textDisplay'],
                'Date': top_comment.get('updatedAt', top_comment['publishedAt']),
            })

            # Skip the extra API round-trip for threads with no replies.
            if thread['totalReplyCount'] > 0:
                all_comments.extend(
                    get_replies(yt_client, video_id, top_level['id'])
                )

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:  # no further pages
            break

    # Fix the column list so an empty result still has the expected schema.
    return pd.DataFrame(all_comments, columns=columns)

# Execute API Call

In [10]:
# Fetch comments and replies using the function's defaults (YT_CLIENT, VIDEO_ID).
df = get_comments_for_video()
# Print the frame's columns, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23000 entries, 0 to 22999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Timestamp  23000 non-null  object
 1   Username   23000 non-null  object
 2   VideoID    23000 non-null  object
 3   Comment    23000 non-null  object
 4   Date       23000 non-null  object
dtypes: object(5)
memory usage: 898.6+ KB


# Save to CSV

In [15]:
dir_ = "dataset/"
# Create the output folder first: to_csv raises OSError when the target
# directory does not exist. exist_ok keeps re-running this cell harmless.
os.makedirs(dir_, exist_ok=True)
# One file per run date, e.g. dataset/comments_2024-01-31.csv
filename = dir_ + f"comments_{date.today()}.csv"
# index=False leaves the RangeIndex out of the file.
df.to_csv(filename, index=False)