<a href="https://colab.research.google.com/github/ajay-sachin/youtube/blob/main/Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**YOUTUBE DATA SCRAPPING WITH PYTHON**

Import the libraries

In [1]:
import numpy as np
import pandas as pd
from googleapiclient.discovery import build

Data Creation with API

In [2]:
api_key = "AIzaSyABi2XonLSZIeHvo35TDZhO7UgHpsXWrts"
channel_id = ['UCl23mvQ3321L7zO6JyzhVmg',  #mumbai_indians
              'UC2J_VKrAzOEJuQvFFtj3KUw',  #chennai_super_kings
              'UCCq1xDJMBRF61kiOgU90_kw',  #royal_challengers_bangalore
]
youtube = build('youtube', 'v3', developerKey=api_key)

Channel info

In [3]:
def get_channel_info(youtube, channel_id):
  all_data= []
  request = youtube.channels().list(
      part='snippet, content_details, statistics',
      id = ','.join(channel_id))
  response = request.execute()
  for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    description = response['items'][i]['snippet']['description'],
                    publishdate = response['items'][i]['snippet']['publishedAt'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'],
                    thumbnail =  response['items'][i]['snippet']['thumbnails']['high']['url']
                    )
        all_data.append(data)
  return pd.DataFrame(all_data)


In [4]:
channel_data = get_channel_info(youtube, channel_id)
channel_data.dtypes


channelName    object
description    object
publishdate    object
subscribers    object
views          object
totalVideos    object
playlistId     object
thumbnail      object
dtype: object

In [5]:
#convert to numerical format
numerical_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numerical_cols] = channel_data[numerical_cols].apply(pd.to_numeric, errors='coerce')

GETTING VIDEO INFO OF CHANNELS

In [6]:
def get_video_ids(youtube, playlist_id):

    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()

    video_ids = []

    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])

    next_page_token = response.get('nextPageToken')
    more_pages = True

    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()

            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])

            next_page_token = response.get('nextPageToken')

    return video_ids[:10] #getting 10 videos...

In [7]:
def get_video_details(youtube, video_ids):

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    return all_video_info

In [8]:
video_df = pd.DataFrame()
#comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():

    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)

    # get video data
    video_data = get_video_details(youtube, video_ids)
    # get comment data
    #comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether

    video_df = video_df.append(video_data)
    #comments_df = comments_df.append(comments_data, ignore_index=True)

Getting video information from channel: Mumbai Indians


  video_df = video_df.append(video_data)


Getting video information from channel: Chennai Super Kings


  video_df = video_df.append(video_data)


Getting video information from channel: Royal Challengers Bangalore


  video_df = video_df.append(video_data)


In [9]:
cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[cols] = video_df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

Getting comment data

In [10]:
def get_comments_in_videos(youtube, video_ids):
    all_comments = []

    for video_id in video_ids:
        data = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults='100',
            textFormat="plainText").execute()

        for i in data["items"]:
            commentsData = dict(

            name = i["snippet"]['topLevelComment']["snippet"]["authorDisplayName"],
            comment = i["snippet"]['topLevelComment']["snippet"]["textDisplay"],
            published_at = i["snippet"]['topLevelComment']["snippet"]['publishedAt'],
            likes = i["snippet"]['topLevelComment']["snippet"]['likeCount'],
            replies = i["snippet"]['totalReplyCount'])

            all_comments.append(commentsData)

            totalReplyCount = i["snippet"]['totalReplyCount']

            if totalReplyCount > 0:

                parent = i["snippet"]['topLevelComment']["id"]

                data2 = youtube.comments().list(part='snippet', maxResults='100', parentId=parent,
                                                textFormat="plainText").execute()

                for i in data2["items"]:
                    commentsData = dict(
                    name = i["snippet"]["authorDisplayName"],
                    comment = i["snippet"]["textDisplay"],
                    published_at = i["snippet"]['publishedAt'],
                    likes = i["snippet"]['likeCount'],
                    replies = "")

                    all_comments.append(commentsData)

        while ("nextPageToken" in data):

            data = youtube.commentThreads().list(part='snippet', videoId=video_id, pageToken=data["nextPageToken"],
                                                maxResults='100', textFormat="plainText").execute()

            for i in data["items"]:
                commentsData = dict(
                name = i["snippet"]['topLevelComment']["snippet"]["authorDisplayName"],
                comment = i["snippet"]['topLevelComment']["snippet"]["textDisplay"],
                published_at = i["snippet"]['topLevelComment']["snippet"]['publishedAt'],
                likes = i["snippet"]['topLevelComment']["snippet"]['likeCount'],
                replies = i["snippet"]['totalReplyCount'])

                all_comments.append(commentsData)

                totalReplyCount = i["snippet"]['totalReplyCount']

                if totalReplyCount > 0:

                    parent = i["snippet"]['topLevelComment']["id"]

                    data2 = youtube.comments().list(part='snippet', maxResults='100', parentId=parent,
                                                    textFormat="plainText").execute()

                    for i in data2["items"]:
                        commentsData = dict(
                        name = i["snippet"]["authorDisplayName"],
                        comment = i["snippet"]["textDisplay"],
                        published_at = i["snippet"]['publishedAt'],
                        likes = i["snippet"]['likeCount'],
                        replies = '')

                        all_comments.append(commentsData)



    return all_comments

In [11]:
comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():

    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)

    # get video data

    # get comment data
    comments_data = get_comments_in_videos(youtube, video_ids)

    # append video data together and comment data toghether


    comments_df = comments_df.append(comments_data, ignore_index=True)

Getting video information from channel: Mumbai Indians


  comments_df = comments_df.append(comments_data, ignore_index=True)


Getting video information from channel: Chennai Super Kings


  comments_df = comments_df.append(comments_data, ignore_index=True)


Getting video information from channel: Royal Challengers Bangalore


  comments_df = comments_df.append(comments_data, ignore_index=True)


In [12]:
comments_df

Unnamed: 0,name,comment,published_at,likes,replies
0,Game ka kida,Hiii❤❤❤,2023-10-06T06:07:17Z,0,0
1,Haris Perwez,Happy Family 💙🙌🤌,2023-10-06T05:37:23Z,0,0
2,Sachin Kr,First like and comment 😊😊😊,2023-10-06T05:30:29Z,0,0
3,Raheem Hasan,Bavuma.....\nMere ko kya \nMain to reservation...,2023-10-06T06:26:56Z,1,0
4,pro pri YT,Nepal World Cup jeet sakti hai😊 lekin Virat Ko...,2023-10-06T05:21:16Z,1,0
...,...,...,...,...,...
958,Pushpa Umesh,ಜೈ ಆರ್ಸಿಬಿ ❤👑 ಮೊದಲನೇ ಕಾಮೆಂಟ್,2023-09-28T06:26:46Z,2,0
959,karan gonte,First viewer me,2023-09-28T06:26:45Z,0,0
960,Nevar Mind,First comment,2023-09-28T06:26:44Z,0,0
961,Rs.creator,Hello sir,2023-09-28T06:26:40Z,0,0


Save into CSV file

Channel details

In [None]:

channel_data.to_csv('/content/drive/MyDrive/Ajay_Sachin/Project/channalStats.csv', index=False, header=True)

video details

In [None]:
video_df.to_csv('/content/drive/MyDrive/Ajay_Sachin/Project/videoDetails.csv', index=False, header=True)

comment details

In [None]:
comments_df.to_csv('/content/drive/MyDrive/Ajay_Sachin/Project/allComments.csv', index=False, header=True)