In [None]:
import pandas as pd

#read in list of youtubers from file

def get_channels_names(file_path):
    """Load the list of YouTubers from a CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Location of a comma-separated file whose first row holds the
        column names.

    Returns
    -------
    pandas.DataFrame
        The parsed file, one row per data line.
    """
    return pd.read_csv(file_path, sep=",", header=0)

In [None]:
# Load the channel list from the project's data folder; the frame is expected
# to carry a "url" column, which get_channels_details_info reads later on.
youtubers = get_channels_names("data/Youtubers.csv")
youtubers

In [None]:
from googleapiclient.discovery import build
#building youtube service
def youtube_build_service(YOUTUBE_API_SERVICE_NAME,
                          YOUTUBE_API_VERSION,
                          KEY):
    """Create an API client through googleapiclient's ``build``.

    Parameters
    ----------
    YOUTUBE_API_SERVICE_NAME : str
        Service name forwarded as the first positional argument of ``build``.
    YOUTUBE_API_VERSION : str
        Service version forwarded as the second positional argument.
    KEY : str
        API key forwarded as ``developerKey``.

    Returns
    -------
    The object returned by ``build``.
    """
    service = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=KEY)
    return service

In [None]:
import os

YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# API keys are credentials and must never be stored in the notebook itself.
# They are read from the environment instead; any key that was previously
# committed in this file should be revoked and regenerated.
#   YOUTUBE_API_KEY    - key used to build the service below (required)
#   YOUTUBE_API_KEY_1/2 - optional spare keys (e.g. for when quota runs out)
api_key_1 = os.environ.get("YOUTUBE_API_KEY_1")
api_key_2 = os.environ.get("YOUTUBE_API_KEY_2")
api_key = os.environ.get("YOUTUBE_API_KEY")

if not api_key:
    # Fail here with a clear message rather than later with an opaque API error.
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable before running this notebook."
    )

youtube_service = youtube_build_service(YOUTUBE_API_SERVICE_NAME,
                                        YOUTUBE_API_VERSION,
                                        api_key)
youtube_service

In [None]:
import requests
from bs4 import BeautifulSoup
import re

#Fetch the channel page at the given URL and return the channel ID found in it.

def get_channel_id(channel_url):
    """Return the channel ID embedded in the page served at ``channel_url``.

    The page source is searched for an ``"externalId":"<id>"`` pair, which
    carries the channel ID regardless of how the channel link is structured.

    Parameters
    ----------
    channel_url : str
        Address of the channel page to download.

    Returns
    -------
    str
        The value stored under ``externalId``.

    Raises
    ------
    ValueError
        If the downloaded page contains no ``externalId`` entry.
    Exception
        Whatever ``requests`` raises for network failures or, through
        ``raise_for_status``, for HTTP error responses.
    """
    # A timeout keeps one unresponsive server from hanging the whole notebook.
    resp = requests.get(channel_url, timeout=30)
    resp.raise_for_status()

    # Capture the quoted value directly instead of scanning character by character.
    match = re.search(r'"externalId"\s*:\s*"([^"]+)"', resp.text)
    if match is None:
        raise ValueError(f'No "externalId" found in the page at {channel_url!r}')
    return match.group(1)


In [None]:
#Get statistics, snippet and contentDetails for Channel from YouTube API
##Documentation for returned values: https://developers.google.com/youtube/v3/docs/channels

def get_channel_details(youtube, **kwargs):
    """Run a ``channels().list`` request and return its executed response.

    The ``part`` argument is fixed to ``statistics,snippet,contentDetails``;
    every keyword argument is passed through to ``list`` unchanged.
    """
    request = youtube.channels().list(
        part="statistics,snippet,contentDetails",
        **kwargs
    )
    return request.execute()

In [None]:
#Get channel details for specified list of youtubers. 
##Return channel_id, channel_title, channel_subscriber_count, channel_video_count, channel_view_count for each youtuber

def get_channels_details_info(youtubers, youtube_service):
    """Collect basic channel information for every URL in ``youtubers["url"]``.

    Parameters
    ----------
    youtubers : mapping of column name to values (e.g. pandas.DataFrame)
        Must provide a "url" entry that can be iterated over.
    youtube_service
        API client handed to ``get_channel_details``.

    Returns
    -------
    list of dict
        One dictionary per URL with the keys ``channel_id``,
        ``channel_title``, ``channel_subscriber_count``,
        ``channel_video_count`` and ``channel_view_count``. A count that the
        response does not contain is reported as ``None``.

    Raises
    ------
    ValueError
        If the API response for a channel holds no items.
    """
    l_youtubers = []
    for channel_url in youtubers["url"]:
        # get the channel ID from the URL
        channel_id = get_channel_id(channel_url)
        # get the channel details
        response = get_channel_details(youtube_service, id=channel_id)

        items = response.get("items") or []
        if not items:
            # Name the offending channel instead of failing with a bare KeyError/IndexError.
            raise ValueError(
                f"No channel details returned for id {channel_id!r} (url: {channel_url})"
            )

        snippet = items[0]["snippet"]
        statistics = items[0]["statistics"]
        l_youtubers.append({
            "channel_id": channel_id,
            "channel_title": snippet["title"],
            # Statistics fields may be absent (e.g. a hidden subscriber count),
            # so they are read with .get() rather than indexed.
            "channel_subscriber_count": statistics.get("subscriberCount"),
            "channel_video_count": statistics.get("videoCount"),
            "channel_view_count": statistics.get("viewCount"),
        })

    return l_youtubers
    
  

In [None]:
#get channels info
##API CALL: QUOTA COUNT = 9

import os

CHANNELS_INFO_PATH = 'data/channels_info.csv'

# Reuse the saved CSV when it exists: a fresh "Restart & Run All" then neither
# fails on an undefined name nor spends API quota. Delete the file to refresh.
if os.path.exists(CHANNELS_INFO_PATH):
    # index_col=0 restores the index column written by to_csv below
    df = pd.read_csv(CHANNELS_INFO_PATH, index_col=0)
else:
    channels_info = get_channels_details_info(youtubers, youtube_service)
    df = pd.DataFrame(channels_info)
    #save to csv file
    df.to_csv(CHANNELS_INFO_PATH)

# the final merge cell refers to the channel frame by this name
df_channel_info = df
df

In [None]:
#Get video or channel information based on search parameter specified from YouTube API
##https://developers.google.com/youtube/v3/docs/search

def get_channel_videos(youtube, **kwargs):
    """Run a ``search().list`` request and return its executed response.

    All keyword arguments are passed to ``list`` unchanged; nothing is
    added by this function.
    """
    request = youtube.search().list(**kwargs)
    return request.execute()

In [None]:
#Get snippet, contentDetails, statistics for video from YouTube API
##snippet property contains the channelId, title, description, tags, and categoryId properties
##https://developers.google.com/youtube/v3/docs/videos/list

def get_video_details(youtube, **kwargs):
    """Run a ``videos().list`` request and return its executed response.

    The ``part`` argument is fixed to ``snippet,contentDetails,statistics``;
    every keyword argument is passed through to ``list`` unchanged.
    """
    request = youtube.videos().list(
        part="snippet,contentDetails,statistics",
        **kwargs
    )
    return request.execute()

In [None]:
#Takes video_response from get_video_details as argument. Parses response. 
# Returns dictionary containing : {
#         "Title": title,
#         "Channel Title": channel_title,
#         "Channel ID": channel_id
#         "Publish time": publish_time,
#         "Duration": duration_str,
#         "Number of comments": comment_count,
#         "Number of likes": like_count,
#         "Number of views": view_count
#     }


def _format_iso8601_duration(duration):
    """Turn an ISO 8601 duration such as 'PT5H50M15S' into 'H:MM:SS'."""
    match = re.fullmatch(
        r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        duration or "",
    )
    if match is None:
        # Unrecognised layout: hand the raw value back rather than guess.
        return duration
    days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    # Minutes and seconds are zero-padded so 'PT1H5S' reads '1:00:05' and cannot
    # be mistaken for one minute and five seconds.
    return f"{days * 24 + hours}:{minutes:02d}:{seconds:02d}"


def video_infos(video_response):
    """Extract the fields of interest from a ``get_video_details`` response.

    Only the first entry of ``video_response["items"]`` is read.

    Parameters
    ----------
    video_response : dict
        Response holding an "items" list whose entries carry "snippet",
        "statistics" and "contentDetails" dictionaries.

    Returns
    -------
    dict
        Keys: "Title", "Channel Title", "Channel ID", "Publish time",
        "Duration" (formatted as H:MM:SS), "Number of comments",
        "Number of likes", "Number of views". A value that is missing from
        the response is reported as ``None``.

    Raises
    ------
    ValueError
        If the response holds no items.
    """
    items = video_response.get("items") or []
    if not items:
        raise ValueError("video_response contains no items")

    # get the snippet, statistics & content details from the video response
    item = items[0]
    snippet = item["snippet"]
    statistics = item["statistics"]
    content_details = item["contentDetails"]

    return {
        "Title": snippet["title"],
        "Channel Title": snippet["channelTitle"],
        # needed to join video rows with the channel table
        "Channel ID": snippet.get("channelId"),
        "Publish time": snippet["publishedAt"],
        "Duration": _format_iso8601_duration(content_details.get("duration")),
        # Counts can be absent from the statistics (e.g. when comments or
        # likes are not exposed for a video), so they are read with .get().
        "Number of comments": statistics.get("commentCount"),
        "Number of likes": statistics.get("likeCount"),
        "Number of views": statistics.get("viewCount"),
    }

In [None]:
#Returns videos from specified channel. Takes youtube_service, channel_id, video limit (default 5) as arguments
##Quota (for running get_channel_videos) = 101 per channel with video limit 5
##API CALLS: get_channel_videos, get_video_details

def get_videos_from_channel(youtube_service, channel_id, videos_limit = 5, query = 'valorant'):
    """Search one channel for videos and return their details.

    Parameters
    ----------
    youtube_service
        API client handed to ``get_channel_videos`` and ``get_video_details``.
    channel_id : str
        Channel to search in.
    videos_limit : int, default 5
        Maximum number of videos to return.
    query : str, default 'valorant'
        Search term sent as the ``q`` parameter.

    Returns
    -------
    list of dict
        One ``video_infos`` dictionary per video, extended with the keys
        "video_id" and "url". Fewer than ``videos_limit`` entries are
        returned when the search runs out of results.
    """
    list_videos = []
    next_page_token = None

    while len(list_videos) < videos_limit:
        remaining = videos_limit - len(list_videos)
        #parameters to select the videos
        params = {
            'part': 'snippet',
            'q': query,
            'channelId': channel_id,
            'type': 'video',
            # ask only for what is still needed; 50 is the API's page-size cap
            'maxResults': min(remaining, 50),
        }

        if next_page_token:
            params['pageToken'] = next_page_token

        #getting channel videos based on parameters
        res = get_channel_videos(youtube_service, **params)
        channel_videos = res.get("items") or []

        for video in channel_videos[:remaining]:
            video_id = video["id"]["videoId"]
            # easily construct video URL by its ID
            video_url = f"https://www.youtube.com/watch?v={video_id}"

            video_response = get_video_details(youtube_service, id=video_id)

            print(f"================Video #{len(list_videos) + 1}================")
            # parse the response once and reuse it for both printing and storing
            dictionary_video = video_infos(video_response)
            print(dictionary_video)
            print(f"Video URL: {video_url}")
            print("="*40)

            dictionary_video["video_id"] = video_id
            dictionary_video["url"] = video_url
            list_videos.append(dictionary_video)
        print("*"*100)

        # Stop when the search is exhausted. Without this, the same page would
        # be requested again and again, duplicating videos and burning quota.
        next_page_token = res.get("nextPageToken")
        if not channel_videos or not next_page_token:
            break
    return list_videos


In [None]:
#Loop through channel info data frame and retrieve videos for all listed channels

# Gather a single video for every channel id in the channel frame.
videos_retrieved = []

for channel_id in df["channel_id"]:
    videos_retrieved += get_videos_from_channel(youtube_service, channel_id, videos_limit=1)

videos_retrieved

In [None]:
# Build the frame from the videos fetched above. With this line commented out,
# df_videos only existed in a kernel that had run it earlier, and a fresh run
# failed with a NameError here and in every later cell that uses it.
df_videos = pd.DataFrame(videos_retrieved)
#save to csv file
df_videos.to_csv('data/videos_info.csv')
df_videos


In [None]:
#Get snippet for commentThread, flexible definition of arguments
##Response documentation https://developers.google.com/youtube/v3/docs/commentThreads/list

def get_comments(youtube, **kwargs):
    """Run a ``commentThreads().list`` request and return its executed response.

    The ``part`` argument is fixed to ``snippet``; every keyword argument is
    passed through to ``list`` unchanged.
    """
    request = youtube.commentThreads().list(part="snippet", **kwargs)
    return request.execute()

In [None]:
# Returns a list of comment dictionaries, one per top-level comment:
# comments_dict = {
#                 "Comment ID": comment_id,
#                 "Comment": comment,
#                 "Likes": like_count,
#                 "Replies": reply_count,
#                 "Video ID": videoId
#                 }
#Arguments: videoId, total_comments (default 100), max_comment_per_page (default 100), order (default "time")

##QUOTA USAGE FOR 1 URL: 6

def get_comments_video(videoId, total_comments = 100, max_comment_per_page = 100 , order = "time", youtube = None):
    """Collect top-level comments of one video.

    Parameters
    ----------
    videoId : str
        Video whose comment threads are requested.
    total_comments : int, default 100
        Maximum number of comments to return.
    max_comment_per_page : int, default 100
        Page size requested from the API; clamped to the range 1-100.
    order : str, default "time"
        Sent as the ``order`` parameter of the request. Earlier versions
        ignored this argument and always requested 'relevance'; pass
        ``order="relevance"`` to get that ordering.
    youtube : optional
        API client handed to ``get_comments``. When omitted, the
        notebook-level ``youtube_service`` is used.

    Returns
    -------
    list of dict
        One dictionary per comment with the keys "Comment ID", "Comment",
        "Likes", "Replies" and "Video ID".
    """
    # Fall back to the notebook-level client when none is passed explicitly.
    service = youtube if youtube is not None else youtube_service

    #list to store comment dictionaries
    list_comments = []

    # Built once, outside the loop, so the page token added at the end of a
    # pass is still present on the next request. Rebuilding it per pass would
    # drop the token and fetch the first page over and over.
    params = {
        'videoId': videoId,
        'maxResults': max(1, min(max_comment_per_page, 100)),
        'order': order,
    }

    while len(list_comments) < total_comments:
        response = get_comments(service, **params)
        items = response.get("items")

        # if items is empty, break out of the loop
        if not items:
            break

        remaining = total_comments - len(list_comments)
        for item in items[:remaining]:
            top_level = item['snippet']['topLevelComment']
            list_comments.append({
                "Comment ID": top_level['id'],
                "Comment": top_level['snippet']['textDisplay'],
                "Likes": top_level['snippet']['likeCount'],
                "Replies": item['snippet']['totalReplyCount'],
                "Video ID": videoId,
            })

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            # no further page: end of comments
            break
        params["pageToken"] = next_page_token

    return list_comments
    

In [None]:
#save dataframe to .csv file
def save_file(file_name, file_content):
    """Write ``file_content`` to ``data/<file_name>.csv`` and return it as a frame.

    Parameters
    ----------
    file_name : str
        Base name of the CSV file, without folder or extension.
    file_content
        Anything accepted by the ``pandas.DataFrame`` constructor.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to disk.
    """
    frame = pd.DataFrame(file_content)
    frame.to_csv("".join(("data/", file_name, ".csv")))
    return frame

In [None]:
#create list of comments

# One flat list holding the comments of every retrieved video.
comments = []

for video_id in df_videos["video_id"]:
    comments += get_comments_video(video_id)

In [None]:
# Save the collected comments with save_file, which writes data/comments.csv
# and returns the same data as a DataFrame for the merge further down.
df_comments = save_file("comments", comments)
df_comments

In [None]:
#join comment, video and channel data on video and channel id
df_video_comment_data = pd.merge(df_videos, df_comments, how = 'outer', left_on = ['video_id'], right_on = ['Video ID'])

# Join on the channel id when the video rows carry one; otherwise fall back to
# the channel title, which both frames always contain.
if 'Channel ID' in df_video_comment_data.columns:
    video_channel_key, channel_key = 'Channel ID', 'channel_id'
else:
    video_channel_key, channel_key = 'Channel Title', 'channel_title'

# The channel frame is created earlier in the notebook under the name `df`.
df_video_comment_channel_data = pd.merge(df_video_comment_data, df, how = 'outer', left_on = [video_channel_key], right_on = [channel_key])

df_video_comment_channel_data.to_csv('data/comments_videos_channel_info.csv')
# kept as the last expression so the notebook actually displays the result
df_video_comment_channel_data