In [83]:
import pandas as pd

#read in list of youtubers from file

def get_channels_names(file_path):
    youtubers = pd.read_csv(file_path, sep = ",", header = 0)
    return youtubers

In [84]:
youtubers = get_channels_names("data/Youtubers_Half.csv")
youtubers

Unnamed: 0,channel_name,sub_count,url,Name,Type
0,Sykkuno,2.89M,https://www.youtube.com/@Sykkuno,Nicole,variety
1,iiTzTimmy,1.63M,https://www.youtube.com/@iiTzTimmy,Nicole,variety
2,Flights,918K,https://www.youtube.com/@Flightss,Nicole,all valorant
3,QuarterJade,434K,https://www.youtube.com/@QuarterJade/,Nicole,variety
4,red,329K,https://www.youtube.com/@RedValorant,Nicole,all valorant
5,Ziptie,251K,https://www.youtube.com/@ZipTie/,Nicole,all valorant


In [85]:
from googleapiclient.discovery import build

#building youtube service
def youtube_build_service(YOUTUBE_API_SERVICE_NAME, 
                          YOUTUBE_API_VERSION,
                          KEY):
    return build(YOUTUBE_API_SERVICE_NAME,
                 YOUTUBE_API_VERSION,
                 developerKey=KEY)

In [86]:
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
api_key = "YOUR_API_KEY"

youtube_service = youtube_build_service(YOUTUBE_API_SERVICE_NAME,
                                       YOUTUBE_API_VERSION, 
                                       api_key)
youtube_service

<googleapiclient.discovery.Resource at 0x7fe1ee623220>

In [87]:
import requests
from bs4 import BeautifulSoup
import re

#Parse channel URL to return channel ID. Return URL.

def get_channel_id(channel_url):
    url ="" 
    #getting json
    resp = requests.get(channel_url)
    data = BeautifulSoup(resp.text, "html.parser")
    #finding "externalId" that has the channel id no matter what is link structure
    data_s = str(data)
    
    search_url = re.search('"externalId":',data_s)
    start, end = search_url.span()
    #finding the url after the id, using index
    for i in range(end , end+100):
        if data_s[i] == ",":
            break
        url += data_s[i]
    url = url.split('"')[1]
    return url


In [88]:
#Get statistics, snippet and contentDetails for Channel from YouTube API
##Documentation for returned values: https://developers.google.com/youtube/v3/docs/channels

def get_channel_details(youtube, **kwargs):
    return youtube.channels().list(
        part="statistics,snippet,contentDetails",
        **kwargs
    ).execute()

In [89]:
#Get channel details for specified list of youtubers. 
##Return channel_id, channel_title, channel_subscriber_count, channel_video_count, channel_view_count for each youtuber

def get_channels_details_info(youtubers, youtube_service):
    dict_youtubers = {}
    l_youtubers = []
    for index in range(len(youtubers["url"])):
        # get the channel ID from the URL
        channel_id= get_channel_id(youtubers["url"].iloc[index])
        # get the channel details
        response = get_channel_details(youtube_service, id=channel_id)
        snippet = response["items"][0]["snippet"]
        statistics = response["items"][0]["statistics"]
        dict_youtubers = {
            "channel_id":channel_id,
            "channel_title" : snippet["title"],
            "channel_subscriber_count" : statistics["subscriberCount"],
            "channel_video_count" : statistics["videoCount"],
            "channel_view_count"  : statistics["viewCount"] 
        }
        l_youtubers.append(dict_youtubers)
        
    return l_youtubers

In [91]:
#get channels info
##API CALL: QUOTA COUNT = 9
channels_info = get_channels_details_info(youtubers, youtube_service)
df_channel_info = pd.DataFrame(channels_info)

#save to csv file
df_channel_info.to_csv('data/channels_info.csv')
df_channel_info


Unnamed: 0,channel_id,channel_title,channel_subscriber_count,channel_video_count,channel_view_count
0,UCRAEUAmW9kletIzOxhpLRFw,Sykkuno,2890000,640,371243211
1,UC5v2QgY2D5tlu8uws23MG4Q,iiTzTimmy,1630000,744,270583472
2,UCIfAlCwj-ZPZq5fqjpYDX3w,Flights,918000,56,96575008
3,UC_wSuaxwUYsJOBZDWwHIQZg,QuarterJade,434000,383,71185748
4,UCFJ1pr8iwWPeQjmeHnPhqvA,Red,329000,294,53077841
5,UCQ8VQZoYPeXF_q0E19UDGYQ,Ziptie,252000,237,80884332


In [92]:
#Get video or channel information based on search parameter specified from YouTube API
##https://developers.google.com/youtube/v3/docs/search

def get_channel_videos(youtube, **kwargs):
    return youtube.search().list(
        **kwargs
    ).execute()

In [93]:
#Get snippet, contentDetails, statistics for video from YouTube API
##snippet property contains the channelId, title, description, tags, and categoryId properties
##https://developers.google.com/youtube/v3/docs/videos/list

def get_video_details(youtube, **kwargs):
    return youtube.videos().list(
        part="snippet,contentDetails,statistics",
        **kwargs
    ).execute()

In [95]:
#Takes video_response from get_video_details as argument. Parses response. 
# Returns dictionary containing : {
#         "Title": title,
#         "Channel Title": channel_title,
#         "Channel ID": channel_id
#         "Publish time": publish_time,
#         "Duration": duration_str,
#         "Number of comments": comment_count,
#         "Number of likes": like_count,
#         "Number of views": view_count
#     }

def video_infos(video_response):
     
    items = video_response.get("items")[0]
    # get the snippet, statistics & content details from the video response
    snippet         = items["snippet"]
    statistics      = items["statistics"]
    content_details = items["contentDetails"]
    # get infos from the snippet
    channel_title = snippet["channelTitle"]
    channel_id = snippet["channelId"]
    title         = snippet["title"]
    publish_time  = snippet["publishedAt"]
    
    # get stats infos
    comment_count = statistics["commentCount"]
    like_count    = statistics["likeCount"]
    view_count    = statistics["viewCount"]
    # get duration from content details
    duration = content_details["duration"]
    
    # duration in the form of something like 'PT5H50M15S'
    # parsing it to be something like '5:50:15'
    parsed_duration = re.search(f"PT(\d+H)?(\d+M)?(\d+S)?", duration).groups()
    duration_str = ""
    for d in parsed_duration:
        if d:
            duration_str += f"{d[:-1]}:"
    duration_str = duration_str.strip(":")
    
    dict_video_info = {
        "Title": title,
        "Channel Title": channel_title,
        "Channel ID": channel_id,
        "Publish time": publish_time,
        "Duration": duration_str,
        "Number of comments": comment_count,
        "Number of likes": like_count,
        "Number of views": view_count
        
    }
    
    return dict_video_info

In [108]:
#Returns videos from specified channel. Takes youtube_service, channel_id, video limit (default 5) as arguments
##Quota (for running get_channel_videos) = 101 per channel with video limit 5
##API CALLS: get_channel_videos, get_video_details

def get_videos_from_channel(youtube_service, channel_id, videos_limit = 5):

    # counting number of videos grabbed
    n_videos = 0
    next_page_token = None
    list_videos = []

    while n_videos < videos_limit:
        #paramters to select the videos
        #only valorant related videos
        params = {
            'part': 'snippet',
            'q': 'valorant',
            'channelId': channel_id,
            'type': 'video',
        }
        
        if next_page_token:
            params['pageToken'] = next_page_token
        
        #getting channel videos based on parameters
        res = get_channel_videos(youtube_service, **params)

        #getting items
        channel_videos = res.get("items")
    
        for video in channel_videos:
            if n_videos >= videos_limit:
                break
                
            n_videos += 1
            video_id = video["id"]["videoId"]
            # easily construct video URL by its ID
            video_url = f"https://www.youtube.com/watch?v={video_id}"
            
            video_response = get_video_details(youtube_service, id=video_id)

            # get video details in dictionary
            dictionary_video = video_infos(video_response)
            dictionary_video["video_id"] = video_id
            dictionary_video["url"] = video_url 
            
            list_videos.append(dictionary_video)

        
        # if there is a next page, then add it to our parameters
        # to proceed to the next page
        if "nextPageToken" in res:
            next_page_token = res["nextPageToken"]
    return list_videos

In [106]:
#Loop through channel info data frame and retrieve videos for all listed channels
videos_retrieved = []
for channel_id in df_channel_info["channel_id"]:
     videos_retrieved.extend(get_videos_from_channel(youtube_service, channel_id))

videos_retrieved

next page
next page
next page
next page
next page
next page


[{'Title': 'We are the lowest-ranked team in this Valorant tournament but...',
  'Channel Title': 'Sykkuno',
  'Channel ID': 'UCRAEUAmW9kletIzOxhpLRFw',
  'Publish time': '2021-12-26T15:00:09Z',
  'Duration': '30:15',
  'Number of comments': '400',
  'Number of likes': '32336',
  'Number of views': '769838',
  'video_id': 'Kl2XzD5DMoY',
  'url': 'https://www.youtube.com/watch?v=Kl2XzD5DMoY'},
 {'Title': 'Winning at Valorant by only going C',
  'Channel Title': 'Sykkuno',
  'Channel ID': 'UCRAEUAmW9kletIzOxhpLRFw',
  'Publish time': '2022-11-20T18:47:39Z',
  'Duration': '11:19',
  'Number of comments': '192',
  'Number of likes': '6592',
  'Number of views': '92661',
  'video_id': 'rMMFFE3BM5k',
  'url': 'https://www.youtube.com/watch?v=rMMFFE3BM5k'},
 {'Title': "I couldn't believe I did this 1v5 ACE!",
  'Channel Title': 'Sykkuno',
  'Channel ID': 'UCRAEUAmW9kletIzOxhpLRFw',
  'Publish time': '2022-01-15T17:24:46Z',
  'Duration': '10:20',
  'Number of comments': '164',
  'Number of lik

In [110]:
df_videos = pd.DataFrame(videos_retrieved)
#save to csv file
df_videos.to_csv('data/videos_info.csv')
df_videos

Unnamed: 0,Title,Channel Title,Channel ID,Publish time,Duration,Number of comments,Number of likes,Number of views,video_id,url
0,We are the lowest-ranked team in this Valorant...,Sykkuno,UCRAEUAmW9kletIzOxhpLRFw,2021-12-26T15:00:09Z,30:15,400,32336,769838,Kl2XzD5DMoY,https://www.youtube.com/watch?v=Kl2XzD5DMoY
1,Winning at Valorant by only going C,Sykkuno,UCRAEUAmW9kletIzOxhpLRFw,2022-11-20T18:47:39Z,11:19,192,6592,92661,rMMFFE3BM5k,https://www.youtube.com/watch?v=rMMFFE3BM5k
2,I couldn't believe I did this 1v5 ACE!,Sykkuno,UCRAEUAmW9kletIzOxhpLRFw,2022-01-15T17:24:46Z,10:20,164,15988,338205,hHyUOwruyZE,https://www.youtube.com/watch?v=hHyUOwruyZE
3,I CAN'T BELIEVE I DID THAT!! - Valorant with f...,Sykkuno,UCRAEUAmW9kletIzOxhpLRFw,2020-08-11T17:00:11Z,10:1,296,27636,777388,lKItvDga7Eo,https://www.youtube.com/watch?v=lKItvDga7Eo
4,League of Legends player plays VALORANT,Sykkuno,UCRAEUAmW9kletIzOxhpLRFw,2020-04-11T14:34:54Z,11:34,192,5196,108752,9VYcATksKwU,https://www.youtube.com/watch?v=9VYcATksKwU
5,I DROPPED A 41 KILL GAME IN RADIANT (Valorant),iiTzTimmy,UC5v2QgY2D5tlu8uws23MG4Q,2022-11-27T23:00:32Z,13:17,50,1867,58301,nHywtfQIwgw,https://www.youtube.com/watch?v=nHywtfQIwgw
6,SHOULD I GO PRO IN VALORANT?,iiTzTimmy,UC5v2QgY2D5tlu8uws23MG4Q,2022-11-25T23:00:04Z,8:8,95,1933,54429,2_7Pro9m07E,https://www.youtube.com/watch?v=2_7Pro9m07E
7,VALORANT FOR 65 HOURS: The Film | Solo Iron-Ra...,iiTzTimmy,UC5v2QgY2D5tlu8uws23MG4Q,2021-11-30T22:00:02Z,1:24:14,3691,131714,5103967,6YESOZ_tGWo,https://www.youtube.com/watch?v=6YESOZ_tGWo
8,PLAYING AGAINST THE MOST STACKED RADIANT TEAM ...,iiTzTimmy,UC5v2QgY2D5tlu8uws23MG4Q,2022-11-26T23:00:30Z,10:42,32,1592,43897,e8REhf2Feek,https://www.youtube.com/watch?v=e8REhf2Feek
9,IITZTIMMY VS. SEN TENZ,iiTzTimmy,UC5v2QgY2D5tlu8uws23MG4Q,2022-08-26T00:35:31Z,13:39,172,6315,206728,4FgntfRhdQ4,https://www.youtube.com/watch?v=4FgntfRhdQ4


In [49]:
#Get snippet for commentThread, flexible definition of arguments
##Response documentation https://developers.google.com/youtube/v3/docs/commentThreads/list

def get_comments(youtube, **kwargs):
    return youtube.commentThreads().list(
        part="snippet",
        **kwargs
    ).execute()

In [116]:
# #Returns a list of comment info :comments_dict = {
#                 "Comment ID":comment_id, 
#                 "Comment": comment,
#                 "Likes": like_count,
#                 "Replies": reply_count
#                 "Channel ID": channel_id
#                 "Video ID": videoId
#                 }
#Arguments: video_id, total_comments (default 100), max_comment_per_page (default 100), order (default by time))

##QUOTA USAGE FOR 1 URL: 6

def get_comments_video(videoId, total_comments = 100, max_comment_per_page = 100 , order = "time"):
    #count comments retrieved
    comments_nb = 0 

    #list to store comment dictionary
    list_comments = []
    #comment dictionary for storing comment data
    comments_dict = {}
    
    #while comment count less than total comment value get comments
    while comments_nb <total_comments:
       
        params = {
                'videoId': videoId, 
                'maxResults': max_comment_per_page,
                'order': 'relevance', # default is 'time' (newest)
            }

        response = get_comments(youtube_service, **params)

        items = response.get("items")


        # if items is empty, breakout of the loop
        if not items:
            break
        
        for item in items:
            #if comments_nb exceeds total_comments, break
            if comments_nb >= total_comments:
                break 
            #collect comment text, comment id, reply count, like count, channel id
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comment_id = item['snippet']['topLevelComment']['id']
            reply_count = item['snippet']['totalReplyCount']
            like_count = item['snippet']['topLevelComment']['snippet']['likeCount']

            #create dictionary with collected data and video idea
            comments_dict = {
                "Comment ID":comment_id, 
                "Comment": comment,
                "Likes": like_count,
                "Replies": reply_count,
                "Video ID": videoId
                }
            #increase comments_nb
            comments_nb+=1
            #add list_comments to comments_dict
            list_comments.append(comments_dict)

        #if nextPageToken exists, check next page    
        if "nextPageToken" in response:
            # if there is a next page
            # add next page token to the params we pass to the function
            params["pageToken"] =  response["nextPageToken"]
        else:
            # must be end of comments!!!!
            break
   
    return list_comments

In [117]:
#save dataframe to .csv file
def save_file(file_name, file_content):
    df_save = pd.DataFrame(file_content)
    df_save.to_csv("data/"+file_name+".csv")
    return df_save

In [118]:
comments = []

for video_id in df_videos["video_id"]:
    comments.extend(get_comments_video(video_id))

import json
with open('data/comments.json', 'w') as f:
    json.dump(comments, f)

next
next
next
next
next
next
next
next
next
next
next
next
next
next
next
next
next
next


In [119]:
#save to csv file
df_comments = save_file("comments", comments)
df_comments

Unnamed: 0,Comment ID,Comment,Likes,Replies,Video ID
0,Ugx-MyFNGwiUKZIWI1t4AaABAg,When you&#39;re the lowest team but you&#39;re...,3716,27,Kl2XzD5DMoY
1,UgzWDQhfUdW3Si6omLl4AaABAg,"Their synergy was so good, and all of them pla...",3532,3,Kl2XzD5DMoY
2,UgzSYS98RNc4FGJBkHJ4AaABAg,I can relate to this vid<br>My team: we’re win...,2003,3,Kl2XzD5DMoY
3,Ugy-kpso2H9PZV7xo-94AaABAg,Team Scarra had like the highest average in pl...,1571,19,Kl2XzD5DMoY
4,UgyrpTZOumJ9VvMrfhR4AaABAg,Sykkuno be like : &quot;We are the lowest rank...,1448,0,Kl2XzD5DMoY
...,...,...,...,...,...
2600,UgxQ5p9xXS7WLN58bD14AaABAg,"So, with ziptie and &quot;PRX Esoteric&quot; w...",0,0,VVg_51Pmtos
2601,Ugx_0-Qa_A1FstlxtJB4AaABAg,OMG cant wait to play Yoru again!,0,1,VVg_51Pmtos
2602,UgwfLFmSHqRL38qCmW54AaABAg,World&#39;s most cleverest and 999999 iq yoru ...,0,0,VVg_51Pmtos
2603,Ugwi-IB1Q_sEQq8bSW54AaABAg,I&#39;m pausing the video to try and find the ...,0,0,VVg_51Pmtos


In [122]:
#join comment data to video data on video id
df_video_comment_data = pd.merge(df_videos, df_comments, how = 'outer', left_on = ['video_id'], right_on = ['Video ID'])
df_video_comment_data

df_video_comment_data.to_csv('data/comments_videos_info.csv')
