In [None]:
# Install the required packages if they are not already installed (uncomment the line below to run it)

#!pip install youtube-data-api


In [None]:
#set up packages
import os # For reading the API key from the environment
import random # for random numbers
import time # To put the system to sleep

import pandas as pd
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website
from dotenv import load_dotenv

from youtube_api import YouTubeDataAPI
from youtube_api.youtube_api_utils import *

In [None]:
# Load the YouTube Data API key from the environment instead of hard-coding it.
# load_dotenv() reads key/value pairs from a `.env` file into os.environ; that
# file is expected to contain a line such as:  api_key=<your YouTube Data API key>
load_dotenv()
youtube_api = os.environ.get("api_key")

# Fail early with a clear message rather than with an obscure error further down.
if not youtube_api:
    raise RuntimeError(
        "No YouTube API key found: set the 'api_key' environment variable "
        "or add it to a .env file."
    )

# API client shared by video_info() and get_comments() below.
yt = YouTubeDataAPI(youtube_api)

In [None]:
def video_info(search_term, max_res):
    '''
    Search YouTube and return the hits as a pandas DataFrame.

    search_term: str - the query to look up on YouTube
    max_res: int - the maximum number of search results to request

    The search is run through the module-level `yt` client, and whatever
    `yt.search` returns is passed unchanged to the DataFrame constructor.
    '''
    search_hits = yt.search(q=search_term, max_results=max_res)
    return pd.DataFrame(search_hits)


In [None]:
def get_comments (video_ids):
    """
    Fetches comments for a list of YouTube video IDs.

    Parameters:
    - video_ids (list): A list of YouTube video IDs for which comments will be fetched.

    Returns:
    - pandas.DataFrame: A DataFrame containing the comments for the specified videos.
      Columns include 'comment_id', 'comment_text', 'comment_like_count',
      'comment_publish_date', 'comment_date', and any additional fields provided by
      the YouTube API.

    Note:
    - The 'comment_publish_date' field is converted to a human-readable date format
      ('%Y-%m-%d %H:%M:%S UTC') and stored in the 'comment_date' column.
    - Comments for videos that encounter errors during retrieval are skipped.
      Any exception that occurs during retrieval is caught and ignored.

    Example:
    >>> video_ids = ['abc123', 'def456']
    >>> comments_df = get_comments(video_ids)
    """
    list_comments = []
    for video_id in video_ids:
    try:
        comments = yt.get_video_comments(video_id )
        list_comments.append(pd.DataFrame(comments))
    except Exception as e:
        pass
    # concat
    comments = pd.concat(list_comments)
    comments['comment_date'] = [datetime.utcfromtimestamp(i).strftime('%Y-%m-%d %H:%M:%S UTC') for i in comments['comment_publish_date']]
    return comments


In [None]:
def full_data_frame(list_of_games, max_res):
    """
    Collect YouTube comments for several search terms into one labelled DataFrame.

    For every entry of `list_of_games` the function runs `video_info` to find
    videos, passes their 'video_id' values to `get_comments`, and tags the
    resulting rows with the search term.

    Parameters:
    - list_of_games (list): Search terms (game titles) to collect comments for.
    - max_res (int): The maximum number of search results to retrieve per game.

    Returns:
    - pandas.DataFrame: the comment tables returned by `get_comments`, stacked
      in the order of `list_of_games`, plus a 'game' column holding the search
      term each row was collected for. If nothing could be collected, an empty
      DataFrame with only the 'game' column is returned.

    Note:
    - Progress is printed: each search term, followed by the ID of its first video.
    - A game whose search returns no videos, or whose video lookup / comment
      fetching raises, is reported with a printed message and skipped; the
      remaining games are still processed.

    Example:
    >>> list_of_games = ['Game1', 'Game2']
    >>> max_res = 10
    >>> comments_df = full_data_frame(list_of_games, max_res)
    """
    game_labels = []
    per_game_frames = []
    for game in list_of_games:
        print(game)
        try:
            videos = video_info(game, max_res)
            # An empty search result has no 'video_id' column to read from.
            if len(videos) == 0 or 'video_id' not in videos:
                print("No videos found for: {}".format(game))
                continue
            print(videos['video_id'].iloc[0])
            comments = get_comments(videos['video_id'])
        except Exception as e:
            # One failing search term must not abort the whole scrape.
            print("Skipping {}: {}".format(game, e))
            continue
        if len(comments) == 0:
            continue
        # Labels are appended together with their frame so both stay aligned row for row.
        game_labels.extend([game] * len(comments))
        per_game_frames.append(comments)

    # pd.concat raises ValueError on an empty list, so handle "nothing collected" explicitly.
    if not per_game_frames:
        return pd.DataFrame(columns=['game'])

    df = pd.concat(per_game_frames)
    df['game'] = game_labels
    return df


In [None]:
# Scrape parameters.
# game_list: YouTube search queries, one per USA match.
game_list = [
    "FIFA Women's World Cup 2023 Group Stage USA vs Vietnam",
    "FIFA Women's World Cup 2023 Group Stage USA vs Netherlands",
    "FIFA Women's World Cup 2023 Group Stage USA vs Portugal",
    "FIFA Women's World Cup 2023 Round of 16 USA vs Sweden",
]

# max_res: maximum number of search results requested per query.
max_res = 20

In [None]:
# Collect the comments for every query in game_list (at most max_res search results each)
comment_df = full_data_frame(list_of_games=game_list, max_res=max_res)

# Preview the first rows of the collected comments
comment_df.head()

In [None]:
# Write the collected comments to a CSV file (relative path, so it lands in the working directory)
comment_df.to_csv(path_or_buf='raw_data.csv')