# Import Libraries

In [1]:
import getpass
import os
from datetime import date

import pandas as pd
from googleapiclient.discovery import build

# Build YouTube Client

In [2]:
# Ask for the API key interactively so it is never written into the notebook.
YT_API_KEY = getpass.getpass(prompt='Enter YouTube API key: ')

In [3]:
# ID of the video whose comment threads are requested (sent as `videoId`).
VIDEO_ID = 'UUGY64rPLxo'

In [4]:
# Client for the 'youtube' service, version 'v3', authenticated with the key entered above.
YT_CLIENT = build(
    serviceName='youtube',
    version='v3',
    developerKey=YT_API_KEY,
)

# API Call Functions

In [5]:
def get_replies(yt_client, comment_id):
    """Fetch every reply to a single top-level comment.

    Parameters
    ----------
    yt_client
        Client object exposing ``comments().list(...).execute()``.
    comment_id
        ID of the parent (top-level) comment; sent as ``parentId``.

    Returns
    -------
    list of dict
        One record per reply with the keys ``ID``, ``ParentID``,
        ``Timestamp``, ``Username``, ``Comment``, ``LikeCount``,
        ``ReplyCount`` (always ``None`` for replies) and ``Date``
        (``updatedAt`` when present, otherwise ``publishedAt``).
    """
    replies = []
    next_page_token = None

    while True:
        reply_response = yt_client.comments().list(
            part="snippet",
            parentId=comment_id,
            textFormat="plainText",
            # Ask for the largest page size so long reply chains need fewer
            # requests; matches the page size used for comment threads.
            maxResults=100,
            pageToken=next_page_token
        ).execute()

        # A page without an 'items' key is treated as an empty page.
        for item in reply_response.get('items', []):
            comment = item['snippet']
            replies.append({
                'ID': item['id'],  # Unique ID for the reply
                'ParentID': comment_id,  # Parent comment ID
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'Comment': comment['textDisplay'],
                'LikeCount': comment.get('likeCount', 0),
                'ReplyCount': None,  # Replies carry no reply count of their own
                'Date': comment.get('updatedAt', comment['publishedAt'])
            })

        # Keep following nextPageToken until the response stops providing one.
        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies


def get_yt_video_comments(yt_client=YT_CLIENT, video_id=VIDEO_ID):
    """Collect all comments (top-level and replies) for a single video.

    Parameters
    ----------
    yt_client
        Client object exposing ``commentThreads().list(...).execute()``;
        it is also handed to ``get_replies`` for threads that have replies.
    video_id
        ID of the video; sent as ``videoId``.

    Returns
    -------
    pandas.DataFrame
        One row per comment with the columns ``ID``, ``ParentID``,
        ``Timestamp``, ``Username``, ``Comment``, ``LikeCount``,
        ``ReplyCount`` and ``Date``. Top-level comments have ``ParentID``
        set to ``None``; each is followed directly by its replies, whose
        ``ReplyCount`` is ``None``. The columns are present even when the
        video has no comments.
    """
    columns = [
        'ID', 'ParentID', 'Timestamp', 'Username',
        'Comment', 'LikeCount', 'ReplyCount', 'Date',
    ]
    all_comments = []
    next_page_token = None

    while True:
        comment_response = yt_client.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            maxResults=100
        ).execute()

        for item in comment_response.get('items', []):
            thread = item['snippet']
            top_level = thread['topLevelComment']
            top_comment = top_level['snippet']
            top_comment_id = top_level['id']
            # totalReplyCount belongs to the thread snippet, not to the
            # top-level comment's own snippet; reading it from the latter
            # always produced 0.
            reply_count = thread.get('totalReplyCount', 0)

            all_comments.append({
                'ID': top_comment_id,  # Unique ID for top-level comment
                'ParentID': None,  # No parent for top-level comment
                'Timestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'Comment': top_comment['textDisplay'],
                'LikeCount': top_comment.get('likeCount', 0),
                'ReplyCount': reply_count,
                'Date': top_comment.get('updatedAt', top_comment['publishedAt'])
            })

            # Only spend extra requests on threads that actually have replies.
            if reply_count > 0:
                all_comments.extend(get_replies(yt_client, top_comment_id))

        # Keep following nextPageToken until the response stops providing one.
        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            break

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(all_comments, columns=columns)


# Execute API Call

In [6]:
# Download every comment and reply for the configured video, then print a
# summary of the resulting frame (row count, columns, dtypes).
df = get_yt_video_comments(yt_client=YT_CLIENT, video_id=VIDEO_ID)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23005 entries, 0 to 23004
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          23005 non-null  object 
 1   ParentID    11090 non-null  object 
 2   Timestamp   23005 non-null  object 
 3   Username    23005 non-null  object 
 4   Comment     23005 non-null  object 
 5   LikeCount   23005 non-null  int64  
 6   ReplyCount  11915 non-null  float64
 7   Date        23005 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ MB


In [7]:
# Preview the first five rows of the collected comments.
df.head(n=5)

Unnamed: 0,ID,ParentID,Timestamp,Username,Comment,LikeCount,ReplyCount,Date
0,Ugw1etMquQZlMcsRYP14AaABAg,,2025-01-12T04:05:25Z,@Jevano_livia,Anjay,0,0.0,2025-01-12T04:05:25Z
1,UgxJqpTEcsVrR-EwpAp4AaABAg,,2025-01-12T01:46:00Z,@marshaldhiya12,yang kecanduan nonton sampe 2025 merapat,0,0.0,2025-01-12T01:46:00Z
2,UgzutXV_oszidT3jG2p4AaABAg,,2025-01-11T10:33:50Z,@ArsyLa-e9y,Seru denger nya❤,0,0.0,2025-01-11T10:33:50Z
3,UgxyAmycchpw6C8scw54AaABAg,,2025-01-10T08:30:27Z,@debuinbelitung7927,"Lirik lagu, :\nMalam ini rahasia ya\nKamu tak ...",0,0.0,2025-01-10T08:30:27Z
4,UgzTPB9_t8Kj6iD3IzR4AaABAg,,2025-01-08T10:23:36Z,@frizzz3896,muthe model seriusss cakepp juga😅,3,0.0,2025-01-08T10:23:36Z


# Save to CSV

In [8]:
# Write the comments to a date-stamped CSV file.
dir_ = "dataset/"
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no error if it is already there).
os.makedirs(dir_, exist_ok=True)
filename = dir_ + f"oshibe_spv_comments_{date.today()}.csv"
df.to_csv(filename, index=False)