# YouTube creator page scraper

## Part 1 Basic steps
In the first part we import the required packages and load the data that the scraper needs in order to work.

#### Step 1
First we import the packages required for the scraper to work

In [1]:
import time
import selenium.webdriver
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
from tqdm import tqdm

#### Step 2 
Next we import the dataset with the YouTubers' names or channel codes, which the scraper needs in order to work.

In [2]:
# Read the comma-separated creator list from disk and display it.
dataset_thesis = pd.read_csv(
    r'creators_test.csv',
    sep=",",
    low_memory=False,
)
dataset_thesis

Unnamed: 0.1,Unnamed: 0,channelId
0,1,UCIv0wH_CdWgttwX6B-Cyt8w
1,2,UCEU5ZK7DwN9ppqPFJiGah3A
2,3,UC8CX0LD98EDXl4UYX1MDCXg
3,4,UCF_fDSgPpBQuh1MsUTgIARQ
4,5,UCUtmzHuW43bFbUP-Xl0TTKA


In [3]:
# Select the column with the channel IDs (all rows) and display it.
dataset_code = dataset_thesis.loc[:, 'channelId']
dataset_code

0    UCIv0wH_CdWgttwX6B-Cyt8w
1    UCEU5ZK7DwN9ppqPFJiGah3A
2    UC8CX0LD98EDXl4UYX1MDCXg
3    UCF_fDSgPpBQuh1MsUTgIARQ
4    UCUtmzHuW43bFbUP-Xl0TTKA
Name: channelId, dtype: object

## Part 2 Scraper videos page
This scraper scrapes the "videos" page, which contains the account name, the number of subscribers and the number of videos posted on that account.

#### Step 3
In the first extraction step we extract the YouTube pages, which can be accessed by adding **'channel'** after the regular "youtube.com".

In [1]:
def extract_youtube_channel(codes):
    """Scrape the "videos" page of every YouTube channel in *codes*.

    For each channel code a new Chrome session is started, the cookie
    consent button is clicked when it is present, and the page is scrolled
    down until the document height stops changing. The resulting HTML is
    then parsed and the browser is closed again.

    Parameters
    ----------
    codes : iterable of str
        Channel codes; each one is inserted into
        ``https://www.youtube.com/channel/<code>/videos``.

    Returns
    -------
    list of dict
        One dict per channel with the keys ``"num_video"`` (number of video
        links found in the parsed page) and ``"channel_id"`` (content of the
        page's ``channelId`` meta tag, or ``None`` when it cannot be found).
    """
    # Local import keeps this cell independent of the notebook's import cell.
    # find_elements(By..., ...) works on Selenium 3 and 4, whereas the
    # find_elements_by_* helpers no longer exist in Selenium 4.
    from selenium.webdriver.common.by import By

    consent_button_class = "VfPpkd-dgl2Hf-ppHlrf-sM5MNb"
    consent_button_index = 5  # position of the accept button among the matches
    video_link_class = "yt-simple-endpoint style-scope ytd-grid-video-renderer"
    scroll_height = 5000

    def num_video(soup):
        """Return the number of video links in the parsed page."""
        return len(soup.find_all(class_=video_link_class))

    def channel_id(soup):
        """Return the channelId meta tag content, or None when it is absent."""
        try:
            return soup.find("meta", itemprop="channelId")["content"]
        except (TypeError, KeyError):
            # find() returned None, or the tag has no "content" attribute.
            return None

    youtube_channel = []
    for code in tqdm(codes):
        driver = selenium.webdriver.Chrome()
        try:
            driver.get("https://www.youtube.com/channel/" + code + "/videos")

            time.sleep(5)

            # Accept the cookies. When the consent dialog is not shown there
            # are not enough matching buttons; skip the click instead of
            # aborting the whole run with an IndexError.
            buttons = driver.find_elements(By.CLASS_NAME, consent_button_class)
            if len(buttons) > consent_button_index:
                buttons[consent_button_index].click()

            # Keep scrolling until the document height no longer changes.
            while True:
                document_height_before = driver.execute_script(
                    "return document.documentElement.scrollHeight")
                driver.execute_script(
                    f"window.scrollTo(0, {document_height_before + scroll_height});")
                time.sleep(1.5)
                document_height_after = driver.execute_script(
                    "return document.documentElement.scrollHeight")
                if document_height_after == document_height_before:
                    break

            res = driver.page_source.encode('utf-8')
            soup = BeautifulSoup(res, "html.parser")
        finally:
            # Always close the browser; otherwise every channel leaves a
            # Chrome process behind.
            driver.quit()

        youtube_channel.append({"num_video": num_video(soup),
                                "channel_id": channel_id(soup)})

        sleep(1)

    return youtube_channel

# Run the "videos" page scraper on all channel codes and display the records.
youtube_channel = extract_youtube_channel(codes=dataset_code)
youtube_channel

NameError: name 'dataset_code' is not defined

In [None]:
# Put the scraped records in a DataFrame, save it as CSV without the index
# column, and display it.
youtube_channel_dataframe_thesis_p1 = pd.DataFrame(data=youtube_channel)
youtube_channel_dataframe_thesis_p1.to_csv(
    "creator_characteristics_p1.csv",
    index=False,
)
youtube_channel_dataframe_thesis_p1

With the first scraper we extracted the number of videos. The second scraper will be used to collect the total number of views and the date on which somebody became a member of YouTube.

## Part 3 Scraper about page
This scraper scrapes the "about" page, which contains the account name, the number of subscribers, the member-since date and the total number of views the account has achieved.

#### Step 4
In the second extraction step, in Part 3, we extract the YouTube pages, which can be accessed by adding **'channel'** after the regular "youtube.com".

In [None]:
def extract_youtube_channel_p2(codes):
    """Scrape the "about" page of every YouTube channel in *codes*.

    For each channel code a new Chrome session is started, the cookie
    consent button is clicked when it is present, the page HTML is parsed
    and the browser is closed again.

    The label texts that are stripped from the values (" abonnees",
    " weergaven", "Lid geworden op") are Dutch, so they are only removed
    when YouTube renders the page in Dutch.

    Parameters
    ----------
    codes : iterable of str
        Channel codes; each one is inserted into
        ``https://www.youtube.com/channel/<code>/about``.

    Returns
    -------
    list of dict
        One dict per channel with the keys ``"creator"``, ``"subscribers"``,
        ``"views"``, ``"member_since"`` and ``"channel_id"``. Every value is
        the text found on the page, or ``None`` when it cannot be found.
    """
    # Local import keeps this cell independent of the notebook's import cell.
    # find_elements(By..., ...) works on Selenium 3 and 4, whereas the
    # find_elements_by_* helpers no longer exist in Selenium 4.
    from selenium.webdriver.common.by import By

    consent_button_class = "VfPpkd-dgl2Hf-ppHlrf-sM5MNb"
    consent_button_index = 5  # position of the accept button among the matches

    def element_text(soup, element_id):
        """Return the text of the element with id *element_id*, or None."""
        element = soup.find(attrs={"id": element_id})
        if element is None:
            return None
        return element.get_text()

    def right_column_line(soup, index, label):
        """Return line *index* of the right column with *label* removed."""
        text = element_text(soup, "right-column")
        if text is None:
            return None
        lines = text.split('\n')
        if index >= len(lines):
            return None
        return lines[index].replace(label, "")

    def name(soup):
        """Return the channel name without newlines, or None."""
        text = element_text(soup, "text-container")
        if text is None:
            return None
        return text.replace("\n", "")

    def subscribe(soup):
        """Return the subscriber count text without its label, or None."""
        text = element_text(soup, "subscriber-count")
        if text is None:
            return None
        return text.replace("\xa0", "").replace(" abonnees", "")

    def views(soup):
        """Return the total views text without its label, or None."""
        return right_column_line(soup, 3, " weergaven")

    def member_since(soup):
        """Return the member-since text without its label, or None."""
        return right_column_line(soup, 2, "Lid geworden op")

    def channel_id(soup):
        """Return the channelId meta tag content, or None when it is absent."""
        try:
            return soup.find("meta", itemprop="channelId")["content"]
        except (TypeError, KeyError):
            # find() returned None, or the tag has no "content" attribute.
            return None

    youtube_channel_p2 = []
    for code in tqdm(codes):
        driver = selenium.webdriver.Chrome()
        try:
            driver.get("https://www.youtube.com/channel/" + code + "/about")

            time.sleep(5)

            # Accept the cookies. When the consent dialog is not shown there
            # are not enough matching buttons; skip the click instead of
            # aborting the whole run with an IndexError.
            buttons = driver.find_elements(By.CLASS_NAME, consent_button_class)
            if len(buttons) > consent_button_index:
                buttons[consent_button_index].click()

            res = driver.page_source.encode('utf-8')
            soup = BeautifulSoup(res, "html.parser")
        finally:
            # Always close the browser; otherwise every channel leaves a
            # Chrome process behind.
            driver.quit()

        youtube_channel_p2.append({"creator": name(soup),
                                   "subscribers": subscribe(soup),
                                   "views": views(soup),
                                   "member_since": member_since(soup),
                                   "channel_id": channel_id(soup)})

        sleep(1)

    return youtube_channel_p2

# Run the "about" page scraper on all channel codes and display the records.
youtube_channel_p2 = extract_youtube_channel_p2(codes=dataset_code)
youtube_channel_p2

In [None]:
# Put the "about" page records in a DataFrame, save it as CSV without the
# index column, and display it.
youtube_channel_p2_dataframe_thesis = pd.DataFrame(data=youtube_channel_p2)
youtube_channel_p2_dataframe_thesis.to_csv(
    "creator_characteristics_part_channel_p2.csv",
    index=False,
)
youtube_channel_p2_dataframe_thesis

## Part 4 Merge
In this last part of the scraper we merge the two datasets we created earlier into one complete dataset.

In [None]:
# Join the "videos" and "about" results on channel_id and save the result.
# Rows without a channel_id (the scrapers store None when the meta tag is
# not found) are dropped first: pandas matches null keys with each other, so
# failed rows from both tables would otherwise be paired up as if they
# belonged to the same channel.
complete_dataframe_thesis = youtube_channel_dataframe_thesis_p1.dropna(
    subset=["channel_id"]
).merge(
    youtube_channel_p2_dataframe_thesis.dropna(subset=["channel_id"]),
    how="inner",
    on=["channel_id"],
)
complete_dataframe_thesis.to_csv("creator_characteristics_complete.csv", index=False)
complete_dataframe_thesis