In [None]:
# mastermind. The base of the code in this notebook comes from the repo below.
# https://github.com/analyticswithadam/Python/blob/main/Pull_all_Comments_and_Replies_for_YouTube_Playlists.ipynb

# Data scraping youtube

In [1]:
import datetime
import getpass
from pathlib import Path

import pandas as pd
from googleapiclient.discovery import build

In [76]:
api_key = getpass.getpass('Please enter your YouTube API key: ')
# You will need to obtain a YouTube Data API key first (search Google for how to get one).
playlist_ids = ['PL3-OIwNPoC3KQ4d8hMwGIQnBB4A3Dm3UO']
# Playlist IDs to scrape; these can be found in the README.

Please enter your YouTube API key:  ········


In [77]:
# Build the YouTube Data API v3 client, authenticated with the API key entered above
youtube = build('youtube', 'v3', developerKey=api_key)

In [78]:
# This function collects the IDs of all videos in the given YouTube playlists.
# For each playlist ID, only include the text following the '=' sign in the playlist's URL.


def get_all_video_ids_from_playlists(youtube, playlist_ids):
    """Return one flat list holding the video IDs of every given playlist.

    Args:
        youtube: API client exposing ``playlistItems().list(...).execute()``.
        playlist_ids: Iterable of playlist IDs to walk through, in order.

    Returns:
        A list of video ID strings, in the order the API returned them,
        with the playlists concatenated one after another.
    """
    collected_ids = []

    for current_playlist in playlist_ids:
        page_token = None
        more_pages = True

        # Keep requesting pages (50 items each) until no follow-up token is returned.
        while more_pages:
            response = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=current_playlist,
                maxResults=50,
                pageToken=page_token,
            ).execute()

            collected_ids.extend(
                entry['contentDetails']['videoId'] for entry in response['items']
            )

            page_token = response.get('nextPageToken')
            more_pages = page_token is not None

    return collected_ids

In [79]:
# Fetch all video IDs from the specified playlists (one flat list across every playlist)
video_ids = get_all_video_ids_from_playlists(youtube, playlist_ids)

In [80]:
# Date range (inclusive, formatted YYYY-MM-DD) used to keep only comments published within it.
# Set to the desired range.
START_DATE_STR = "2024-01-01"  
END_DATE_STR = "2024-11-04"

In [81]:
# Reference search:
# https://www.google.com/search?q=only+scrape+comments+between+a+certain+date+youtube+code
# Convert the date strings to datetime.date objects. Added to exclude comments posted after the election.
start_date = datetime.datetime.strptime(START_DATE_STR, "%Y-%m-%d").date()
end_date = datetime.datetime.strptime(END_DATE_STR, "%Y-%m-%d").date()

In [82]:
# GET ALL COMMENTS!!!

# Fetch all video IDs from the specified playlists
# NOTE(review): an identical call already populates video_ids in an earlier cell,
# so this repeats the playlist API requests.
video_ids = get_all_video_ids_from_playlists(youtube, playlist_ids)

# Function to get replies for a specific comment
def get_replies(youtube, parent_id, video_id):  # Added video_id as an argument
    """Fetch every reply posted under one top-level comment.

    Args:
        youtube: API client exposing ``comments().list(...).execute()``.
        parent_id: ID of the top-level comment whose replies are requested.
        video_id: ID of the video the comment belongs to; it is only copied
            into each returned record.

    Returns:
        A list of dicts with the keys ``Timestamp``, ``Username``, ``VideoID``,
        ``Comment`` and ``Date``. ``Date`` holds ``updatedAt`` when the API
        supplies it and ``publishedAt`` otherwise.
    """
    replies = []
    next_page_token = None

    while True:
        # comments.list accepts no `order` argument (that one belongs to
        # commentThreads.list); the API client raises TypeError on unknown
        # keyword arguments, so it must not be passed here.
        reply_response = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            textFormat="plainText",
            maxResults=100,
            pageToken=next_page_token
        ).execute()

        # A response without an 'items' key is treated as an empty page.
        for item in reply_response.get('items', []):
            comment = item['snippet']
            replies.append({
                'Timestamp': comment['publishedAt'],
                'Username': comment['authorDisplayName'],
                'VideoID': video_id,
                'Comment': comment['textDisplay'],
                'Date': comment.get('updatedAt', comment['publishedAt'])
            })

        next_page_token = reply_response.get('nextPageToken')
        if not next_page_token:
            break

    return replies

# Function to get all comments (including replies) for a single video
def get_comments_for_video(youtube, video_id, date_from=None, date_to=None):
    """Collect the top-level comments of one video that fall inside a date range.

    Comment threads are requested with ``order="time"`` and paging stops as soon
    as a comment published before the range start is seen. This early stop
    assumes the API returns threads newest-first, as the original code did.
    Replies are not collected.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video whose comments are fetched.
        date_from: First ``datetime.date`` to keep (inclusive). Defaults to the
            notebook-level ``start_date`` when omitted.
        date_to: Last ``datetime.date`` to keep (inclusive). Defaults to the
            notebook-level ``end_date`` when omitted.

    Returns:
        A list of dicts with the keys ``Timestamp``, ``Username``, ``VideoID``,
        ``Comment`` and ``Date``. ``Date`` holds ``updatedAt`` when the API
        supplies it and ``publishedAt`` otherwise. If a request or a record
        raises, the error is printed and the comments gathered so far are
        returned.
    """
    # Fall back to the notebook-level range so existing two-argument calls keep working.
    range_start = start_date if date_from is None else date_from
    range_end = end_date if date_to is None else date_to

    all_comments = []
    next_page_token = None

    while True:
        try:
            comment_response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                pageToken=next_page_token,
                textFormat="plainText",
                order="time",
                maxResults=100
            ).execute()

            # Explicit flag instead of re-reading the loop variable afterwards:
            # a page with no items would otherwise leave that variable unbound.
            reached_older_comments = False

            for item in comment_response.get('items', []):
                top_comment = item['snippet']['topLevelComment']['snippet']
                # Only the calendar date (text before the "T") is compared.
                comment_date = datetime.datetime.strptime(
                    top_comment["publishedAt"].split("T")[0], "%Y-%m-%d"
                ).date()

                if comment_date < range_start:
                    # Older than the range: stop scanning this page and stop paging.
                    reached_older_comments = True
                    break

                if comment_date <= range_end:
                    all_comments.append({
                        'Timestamp': top_comment['publishedAt'],
                        'Username': top_comment['authorDisplayName'],
                        'VideoID': video_id,
                        'Comment': top_comment['textDisplay'],
                        'Date': top_comment.get('updatedAt', top_comment['publishedAt'])
                    })

                # Reply fetching is currently disabled. To include replies:
                # if item['snippet']['totalReplyCount'] > 0:
                #     all_comments.extend(get_replies(youtube, item['snippet']['topLevelComment']['id'], video_id))

            next_page_token = comment_response.get('nextPageToken')
            if reached_older_comments or not next_page_token:
                break
        except Exception as e:
            # Best-effort scraping: report the problem and keep what was collected.
            print(f"An error occurred: {e}")
            break

    return all_comments

# Gather the comments of every video into one list of records
all_comments = []

for video_id in video_ids:
    all_comments += get_comments_for_video(youtube, video_id)

# Build the DataFrame from the collected records in one go
comments_df = pd.DataFrame(all_comments)

In [83]:
# Save the scraped comments. Use an appropriate CSV name and location.
output_csv = Path('./scrapped_date/2024Rep_debate_comments.csv')
# to_csv does not create missing folders, so make sure the target directory exists first.
output_csv.parent.mkdir(parents=True, exist_ok=True)
comments_df.to_csv(output_csv, index=False)
# TO APPEND TO CSV:
# new_df.to_csv('existing_data.csv', mode='a', index=False, header=False)

In [85]:
# Check how many observations were captured.
# Read the file back from the same path the save cell writes to, so a fresh
# run inspects the data that was just scraped rather than a file in the working directory.
df = pd.read_csv('./scrapped_date/2024Rep_debate_comments.csv')
df.shape

(51652, 5)

In [None]:
# In the first scraping (2016) we didn't remove comment replies,
# so a second cell was added to check the change in the number of comments without replies.
# NOTE(review): no visible cell writes '2024Rep_debate_comments2.csv' — confirm where this file comes from.
df = pd.read_csv('2024Rep_debate_comments2.csv')
df.shape