# Import Libraries

In [1]:
import getpass
import os
from datetime import date

import pandas as pd
from googleapiclient.discovery import build

# Build YouTube Client

In [2]:
# Ask for the API key interactively (getpass does not echo the input) instead of
# hard-coding it in the notebook.
YT_API_KEY =  getpass.getpass('Enter YouTube API key: ')

In [3]:
# Build the 'youtube' v3 service client, authenticated with the developer key
# entered above. It is reused by every API call in this notebook.
YT_CLIENT = build('youtube', 'v3', developerKey=YT_API_KEY)

# Video ID

In [4]:
# ID of the video whose comments are collected; passed as `videoId` to the
# commentThreads request further down.
VIDEO_ID = "UUGY64rPLxo"

# API Call Functions

In [5]:
# Function to get replies for a specific comment
def get_replies(yt_client, comment_id):
    """Fetch every reply to one top-level comment, following pagination.

    Args:
        yt_client: API client object exposing ``comments().list(...).execute()``.
        comment_id: ID of the parent (top-level) comment whose replies are wanted.

    Returns:
        list[dict]: One record per reply with the keys ``ID``, ``ParentID``,
        ``Timestamp``, ``Username``, ``Comment``, ``LikeCount``, ``ReplyCount``
        (always ``None`` for replies) and ``Date`` (``updatedAt`` when present,
        otherwise ``publishedAt``). Empty list when there are no replies.
    """
    replies = []
    next_page_token = None

    while True:
        # Request the largest page the endpoint allows (100) instead of relying on
        # the much smaller default page size: same replies, far fewer requests.
        reply_response = yt_client.comments().list(
            part="snippet",
            parentId=comment_id,
            textFormat="plainText",
            maxResults=100,
            pageToken=next_page_token
        ).execute()

        # Tolerate a response without an 'items' key rather than raising KeyError.
        for item in reply_response.get('items', []):
            comment = item['snippet']
            replies.append({
                'ID': item['id'],  # Unique ID for the reply
                'ParentID': comment_id,  # Parent comment ID
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'Comment': comment['textDisplay'],
                'LikeCount': comment.get('likeCount', 0),
                'ReplyCount': None,  # Replies carry no reply count of their own
                'Date': comment.get('updatedAt', comment['publishedAt'])
            })

        # Keep going until the API stops handing back a continuation token
        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies


# Function to get all comments (including replies) for a single video
def get_yt_video_comments(yt_client=YT_CLIENT, video_id=VIDEO_ID):
    """Collect every top-level comment and its replies for one video.

    Comment threads are requested page by page (100 per page). Each top-level
    comment is recorded first, immediately followed by its replies (fetched via
    ``get_replies``) when the thread reports at least one reply.

    Note: the defaults are bound to the notebook's ``YT_CLIENT`` / ``VIDEO_ID``
    values at the moment this cell is executed; re-run the cell after changing them.

    Args:
        yt_client: API client exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video to read comments from.

    Returns:
        pandas.DataFrame: One row per comment or reply with the columns ``ID``,
        ``ParentID`` (``None`` for top-level comments), ``Timestamp``,
        ``Username``, ``Comment``, ``LikeCount``, ``ReplyCount`` (``None`` for
        replies) and ``Date``. When the video has no comments the frame is empty
        but still carries these columns.
    """
    columns = ['ID', 'ParentID', 'Timestamp', 'Username',
               'Comment', 'LikeCount', 'ReplyCount', 'Date']
    all_comments = []
    next_page_token = None

    while True:
        comment_response = yt_client.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            textFormat="plainText",
            maxResults=100
        ).execute()

        for item in comment_response.get('items', []):
            thread = item['snippet']
            top_comment = thread['topLevelComment']['snippet']
            top_comment_id = thread['topLevelComment']['id']
            # Read the reply count once, with a default, so the stored value and
            # the "has replies" check below can never disagree or raise KeyError.
            reply_count = thread.get('totalReplyCount', 0)

            all_comments.append({
                'ID': top_comment_id,  # Unique ID for top-level comment
                'ParentID': None,  # No parent for top-level comment
                'Timestamp': top_comment['publishedAt'],
                'Username': top_comment['authorDisplayName'],
                'Comment': top_comment['textDisplay'],
                'LikeCount': top_comment.get('likeCount', 0),
                'ReplyCount': reply_count,
                'Date': top_comment.get('updatedAt', top_comment['publishedAt'])
            })

            # Fetch replies if there are any
            if reply_count > 0:
                all_comments.extend(get_replies(yt_client, top_comment_id))

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            break

    if not all_comments:
        # Keep the schema stable so later cells (e.g. df['ParentID']) do not
        # fail with KeyError on a video without comments.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(all_comments)

    return df


# Get Dataset

In [6]:
# Pull all comments and replies for the configured video, then summarise the frame
df = get_yt_video_comments(yt_client=YT_CLIENT, video_id=VIDEO_ID)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23008 entries, 0 to 23007
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          23008 non-null  object 
 1   ParentID    11091 non-null  object 
 2   Timestamp   23008 non-null  object 
 3   Username    23008 non-null  object 
 4   Comment     23008 non-null  object 
 5   LikeCount   23008 non-null  int64  
 6   ReplyCount  11917 non-null  float64
 7   Date        23008 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ MB


In [10]:
# Convert data type of 'ReplyCount' to integer.
# Replies store None in this column, so it was loaded as float64; the nullable
# 'Int64' dtype keeps whole-number counts while leaving the missing values as <NA>.
df['ReplyCount'] = pd.to_numeric(df['ReplyCount'], errors='coerce').astype('Int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23008 entries, 0 to 23007
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          23008 non-null  object
 1   ParentID    11091 non-null  object
 2   Timestamp   23008 non-null  object
 3   Username    23008 non-null  object
 4   Comment     23008 non-null  object
 5   LikeCount   23008 non-null  int64 
 6   ReplyCount  11917 non-null  Int64 
 7   Date        23008 non-null  object
dtypes: Int64(1), int64(1), object(6)
memory usage: 1.4+ MB


In [11]:
# Top-level comments only (rows without a ParentID), ranked by likes and then
# by reply count, highest first; keep the first five.
top_comments = (
    df.loc[df['ParentID'].isna()]
    .sort_values(by=['LikeCount', 'ReplyCount'], ascending=False)
    .head()
)
top_comments

Unnamed: 0,ID,ParentID,Timestamp,Username,Comment,LikeCount,ReplyCount,Date
14134,UgzWvu72I8m9-U8pq8F4AaABAg,,2023-03-14T11:13:13Z,@onthebluesky,"Guys, lagu ini bukan tentang LGBT, tapi tentan...",19401,751,2023-03-14T11:13:13Z
13411,Ugxb2yPnhvOFAaF_b2d4AaABAg,,2023-03-14T13:56:21Z,@driezkh,Performance Videonya kaya memberitahu kita ten...,2319,70,2023-03-20T17:56:27Z
21823,UgzCYP-5eQOScO828UZ4AaABAg,,2023-03-13T13:16:02Z,@adanjir1923,Satu persatu member diberikan kesempatan buat...,1885,63,2023-03-13T13:16:02Z
16959,UgwQ0xdL1_z3bGf9UM94AaABAg,,2023-03-14T05:24:15Z,@Jkt48990,"fiks, kalau kedepan jkt48 release single MVnya...",1863,80,2023-03-14T05:24:15Z
2393,UgyO_jkZ191_KXd7EUR4AaABAg,,2023-10-21T18:45:55Z,@ahmadfikri5186,Malam ini rahasia ya\nKamu tak boleh bilang si...,967,13,2023-10-21T18:45:55Z


In [12]:
# Check Top Comment Replies
# Take the ID from the first row of `top_comments` (the most-liked top-level
# comment) instead of pasting it from an earlier output, so this cell stays
# correct when the data is re-fetched or a different video is analysed.
top_comment_replies = df[df['ParentID'] == top_comments['ID'].iloc[0]]
top_comment_replies.head()

Unnamed: 0,ID,ParentID,Timestamp,Username,Comment,LikeCount,ReplyCount,Date
14135,UgzWvu72I8m9-U8pq8F4AaABAg.9nEbUI92zYg9nEbdnwRgsp,UgzWvu72I8m9-U8pq8F4AaABAg,2023-03-14T11:14:39Z,@appharel,"Ya harusnya pake model cowok dong, ini mana co...",710,,2023-03-14T11:14:39Z
14136,UgzWvu72I8m9-U8pq8F4AaABAg.9nEbUI92zYg9nEbgS02exh,UgzWvu72I8m9-U8pq8F4AaABAg,2023-03-14T11:15:00Z,@ellaashel7846,@@appharelntar tambah ribut kalo pale cowo,2305,,2024-04-18T02:25:17Z
14137,UgzWvu72I8m9-U8pq8F4AaABAg.9nEbUI92zYg9nEbkSF3e-V,UgzWvu72I8m9-U8pq8F4AaABAg,2023-03-14T11:15:33Z,@seeesshhbro,@@appharel kameramennya cowo,754,,2023-03-14T11:15:33Z
14138,UgzWvu72I8m9-U8pq8F4AaABAg.9nEbUI92zYg9nEboljWhE4,UgzWvu72I8m9-U8pq8F4AaABAg,2023-03-14T11:16:08Z,@ahmadsusanto7834,​@@appharellu kena pnyakit apa si wkwkwkwk dmn...,642,,2024-04-18T02:25:17Z
14139,UgzWvu72I8m9-U8pq8F4AaABAg.9nEbUI92zYg9nEbt8Sx7Ij,UgzWvu72I8m9-U8pq8F4AaABAg,2023-03-14T11:16:44Z,@appharel,​@@seeesshhbro sambil ngocok ya pas ngerekam wkwk,95,,2023-03-14T11:16:44Z


# Save to CSV

In [13]:
dir_ = "dataset/"
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(dir_, exist_ok=True)
# Stamp the file name with today's date so successive exports do not overwrite each other
filename = dir_ + f"oshibe_spv_comments_{date.today()}.csv"
df.to_csv(filename, index=False)