In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import re
import datetime as dt
from webdriver_manager.chrome import ChromeDriverManager



# Default YouTube search terms, one per tracked coin. Ambiguous tickers carry
# a "crypto" suffix so the search stays on-topic.
youtube_query_list = [
    'stellar crypto',
    'ripple',
    'cosmos crypto',
    'dot crypto',
    'dogecoin',
    'bitcoin',
    'ethereum',
    'litecoin',
    'algorand',
    'cardano',
    'tezos',
    'monero',
    'dai crypto',
    'filecoin',
    'tron crypto',
    'eos crypto',
    'chainlink',
]

# Helper function (format numeric fields)
def clean_youtube_data(input_df):
    """Return a cleaned copy of a raw scraped-video DataFrame.

    Rows whose ``views`` text contains ``'watching'`` (live streams, which
    report concurrent viewers instead of a view count) are removed and the
    index is reset. Rows with a missing ``views`` value are kept. The
    remaining columns are then normalised, addressed by *name* so the result
    does not depend on column order:

    * ``channel_subs``, ``views``, ``num_likes``, ``num_dislikes``,
      ``num_comments`` -- the first whitespace-separated token is parsed as a
      count; thousands separators and a trailing ``K``/``M`` are understood
      (``"1.76M subscribers"`` -> ``1760000``).
    * ``channel_name``, ``title``, ``description`` -- newlines are removed and
      surrounding whitespace is stripped.
    * ``date_posted`` -- the last three tokens are parsed with
      ``'%b %d, %Y'`` into a ``datetime.date``.

    Cleaning is best-effort: a value that is not a string, or that cannot be
    parsed (for example ``"Dislike"`` in ``num_dislikes``), is left exactly as
    it was. Columns that are absent are skipped.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw scrape output. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, a fresh ``0..n-1`` index and the
        cleaned values.
    """
    if 'views' in input_df.columns:
        # Test each cell individually instead of using ``.str`` so that NaN
        # cells (and an all-NaN float column) are handled without dropping
        # the row or raising.
        is_live = input_df['views'].map(_is_live_view_count).astype(bool)
        df = input_df[~is_live].reset_index(drop=True)
    else:
        df = input_df.reset_index(drop=True)

    for column, cleaner in _FIELD_CLEANERS:
        if column in df.columns:
            # ``fn=cleaner`` binds the current cleaner at definition time.
            df[column] = df[column].map(
                lambda value, fn=cleaner: _clean_value(value, fn)
            )

    return df


def _is_live_view_count(value):
    """Return True when *value* is view text of a live stream ("... watching")."""
    return isinstance(value, str) and 'watching' in value


def _clean_value(value, cleaner):
    """Apply *cleaner* to a string cell; return the cell untouched on failure."""
    if not isinstance(value, str):
        return value
    try:
        return cleaner(value)
    except (ValueError, IndexError):
        return value


def _parse_count(text):
    """Parse the leading token of *text* ("212,799 views", "9K", "1.2M") as an int."""
    token = text.split()[0].replace(',', '')  # IndexError on blank text
    multiplier = _COUNT_MULTIPLIERS.get(token[-1])
    if multiplier is None:
        return int(token)
    # round() rather than int(): float products such as 4.35 * 100 can land
    # just below the intended integer and would otherwise be truncated.
    return int(round(float(token[:-1]) * multiplier))


def _collapse_text(text):
    """Remove newlines from *text* and strip surrounding whitespace."""
    return text.replace('\n', '').strip()


def _parse_posted_date(text):
    """Parse the trailing "Mon D, YYYY" of *text* into a ``datetime.date``."""
    tokens = text.split()
    if len(tokens) < 3:
        raise ValueError('no date found in %r' % (text,))
    return dt.datetime.strptime(' '.join(tokens[-3:]), '%b %d, %Y').date()


# Suffixes YouTube uses to abbreviate large counts.
_COUNT_MULTIPLIERS = {'K': 1000, 'M': 1000000}

# (column name, cleaner) pairs applied by clean_youtube_data.
_FIELD_CLEANERS = (
    ('channel_subs', _parse_count),
    ('channel_name', _collapse_text),
    ('title', _collapse_text),
    ('date_posted', _parse_posted_date),
    ('views', _parse_count),
    ('num_likes', _parse_count),
    ('num_dislikes', _parse_count),
    ('num_comments', _parse_count),
    ('description', _collapse_text),
)

# Scrape function (coin queries)
def scrape_youtube(search_query_list, num_vids):
    """Scrape YouTube search results for each query and return cleaned rows.

    For every query a Chrome session is started, the results page is loaded,
    and up to ``num_vids`` results after the detected ad entries are opened
    one by one by clicking them. Channel, subscriber, date, view, comment,
    like/dislike and description text is collected from each watch page. The
    combined rows are passed through ``clean_youtube_data`` and a boolean
    ``num_comments_found`` column is added.

    Parameters
    ----------
    search_query_list : iterable of str
        Search terms; each is URL-encoded before being put in the URL.
    num_vids : int
        Maximum number of non-ad results to open per query.

    Returns
    -------
    pandas.DataFrame
        One row per scraped video, with the columns listed in ``columns``
        below plus ``num_comments_found``. Fields that could not be located
        hold ``''`` or NaN. A result that cannot be clicked is skipped.

    Notes
    -----
    The CSS class names, ids and index offsets used here come from YouTube's
    page markup and cannot be checked from this code; a missing element
    yields a placeholder rather than an exception.
    """
    from urllib.parse import quote_plus

    # Access URL to scrape
    template_url = 'https://www.youtube.com/results?search_query={}&sp=CAMSBAgEEAE%253D'

    # Output columns. Keep this order stable: downstream code may address
    # columns by position.
    columns = [
        "timestamp_scrape",  # timestamp of scrape
        "date_scrape",       # date of scrape
        "search_word",       # search query
        "channel_name",      # name of posting account
        "channel_subs",      # number of subscribers on posting account
        "title",             # title of video
        "date_posted",       # date video was posted
        "url",               # URL of video
        "description",       # description of video
        "views",             # number of video views
        "num_comments",      # number of video comments
        "num_likes",         # number of video likes
        "num_dislikes",      # number of video dislikes
    ]

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--window-size=1920,1920")
    chrome_options.add_argument("--mute-audio")

    ad_badge_xpath = '//*[@id="ad-badge-container"]/ytd-badge-supported-renderer/div/span'
    channel_link_attrs = {"class": "yt-simple-endpoint style-scope yt-formatted-string"}
    primary_info_attrs = {"class": "style-scope ytd-video-primary-info-renderer"}
    toggle_button_attrs = {"class": "style-scope ytd-toggle-button-renderer style-text"}

    def text_at(nodes, position, default):
        """Return ``nodes[position].get_text()``, or *default* if out of range."""
        try:
            return nodes[position].get_text()
        except IndexError:
            return default

    # Resolve (and if necessary download) the driver binary once for all queries.
    driver_path = ChromeDriverManager().install()

    # One dict per video, so the columns can never get out of step with each
    # other when a field is missing.
    records = []

    for search_query in search_query_list:
        # NOTE(review): keyword form works on Selenium 3 and 4.0-4.9; newer
        # releases expect ``service=Service(driver_path)`` -- confirm the
        # installed version.
        driver = webdriver.Chrome(executable_path=driver_path, options=chrome_options)
        try:
            driver.get(template_url.format(quote_plus(search_query)))
            time.sleep(2)

            # Ad entries are assumed to precede the organic results, so that
            # many leading titles are skipped.
            num_ads = len(driver.find_elements(By.XPATH, ad_badge_xpath))
            videos = driver.find_elements(By.ID, 'video-title')[num_ads:num_vids + num_ads]

            # vid_count advances even for skipped videos; channel_index below
            # depends on that.
            for vid_count, video in enumerate(videos, start=1):
                # Read link attributes *before* clicking: once the browser has
                # navigated, the results-page element may be stale.
                try:
                    video_url = video.get_attribute('href')
                except StaleElementReferenceException:
                    video_url = ''
                try:
                    video_title = video.get_attribute('text')
                except StaleElementReferenceException:
                    video_title = ''

                try:
                    video.click()
                    time.sleep(5)
                except WebDriverException:
                    continue

                soup = BeautifulSoup(driver.page_source, 'lxml')

                # Offset that skips over channels of previously opened videos
                # still present in the parsed page.
                channel_index = vid_count * 2 - 1
                channel_name = text_at(soup.find_all('a', channel_link_attrs), channel_index, '')
                channel_subs = text_at(
                    soup.find_all('yt-formatted-string', id="owner-sub-count"), 0, '')
                date_posted = text_at(
                    soup.find_all('yt-formatted-string', primary_info_attrs), 1, '')

                try:
                    view_element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'view-count')))
                    views = view_element.text
                except (TimeoutException, StaleElementReferenceException):
                    views = np.nan

                # NOTE(review): ``soup`` was captured before this wait, so the
                # wait only gates on the live page; it does not refresh the
                # parsed snapshot.
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, 'count')))
                    num_comments = soup.find_all('h2', id='count')[0].get_text().split()[0]
                except (TimeoutException, IndexError):
                    num_comments = np.nan

                toggle_labels = soup.find_all('yt-formatted-string', toggle_button_attrs)
                num_likes = text_at(toggle_labels, 1, np.nan)
                num_dislikes = text_at(toggle_labels, 2, np.nan)

                description = text_at(soup.find_all('div', id='description'), 0, '')

                records.append({
                    "timestamp_scrape": pd.to_datetime('now'),
                    "date_scrape": dt.date.today(),
                    "search_word": search_query,
                    "channel_name": channel_name,
                    "channel_subs": channel_subs,
                    "title": video_title,
                    "date_posted": date_posted,
                    "url": video_url,
                    "description": description,
                    "views": views,
                    "num_comments": num_comments,
                    "num_likes": num_likes,
                    "num_dislikes": num_dislikes,
                })

                driver.back()
        finally:
            # Always release the browser, even if scraping this query failed.
            driver.quit()

    youtube_data = clean_youtube_data(pd.DataFrame(records, columns=columns))

    for column in ('channel_subs', 'views', 'num_comments'):
        youtube_data[column] = youtube_data[column].replace([''], np.nan)

    # Coerce first so leftover unparsed strings count as "not found" instead
    # of raising on the comparison.
    comment_counts = pd.to_numeric(youtube_data['num_comments'], errors='coerce')
    youtube_data['num_comments_found'] = (comment_counts > 0)

    return youtube_data

In [2]:
# Search terms for this run (passed to scrape_youtube below).
search_list = ["polkadot", "solana"]

In [3]:
# Scrape up to 3 non-ad results per query in search_list; as the last
# expression of the cell, the returned DataFrame is displayed as its output.
scrape_youtube(search_list, 3)



Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
There is no [mac64] chromedriver for browser 96.0.4664 in cache
Get LATEST driver version for 96.0.4664
Trying to download new driver from https://chromedriver.storage.googleapis.com/96.0.4664.45/chromedriver_mac64.zip
Driver has been saved in cache [/Users/ArpanBagui/.wdm/drivers/chromedriver/mac64/96.0.4664.45]


Current google-chrome version is 96.0.4664
Get LATEST driver version for 96.0.4664
Driver [/Users/ArpanBagui/.wdm/drivers/chromedriver/mac64/96.0.4664.45/chromedriver] found in cache


Unnamed: 0,timestamp_scrape,date_scrape,search_word,channel_name,channel_subs,title,date_posted,url,description,views,num_comments,num_likes,num_dislikes,num_comments_found
0,2021-12-18 20:01:02.895301,2021-12-18,polkadot,Coin Bureau,1760000,Polkadot Parachain Auctions!! ULTIMATE 101 GUI...,2021-11-19,https://www.youtube.com/watch?v=qYr_QLOjL8w,📲 Insider Info in my Socials 👉 https://guy.coi...,212799.0,401.0,9000,Dislike,True
1,2021-12-18 20:01:09.448894,2021-12-18,polkadot,Crypto Moonlight,85500,Everdot - First Ever Reflection Token Rewardin...,2021-11-21,https://www.youtube.com/watch?v=AwuvJebAenk,🔹New Project:💲 🔹Check out there website - http...,131275.0,115.0,11000,Dislike,True
2,2021-12-18 20:01:15.611632,2021-12-18,polkadot,Max Maher,670000,Polkadot Projects Will Make Millionaires (100X),2021-12-13,https://www.youtube.com/watch?v=oNdPEqMO4yM,🔗Join the Patreon: https://www.patreon.com/max...,122921.0,,4400,Dislike,False
3,2021-12-18 20:01:30.085615,2021-12-18,solana,Coin Bureau,1760000,"Solana: Could SOL Hit $1,000?! DEEP DIVE!! ☀️",2021-12-02,https://www.youtube.com/watch?v=enAoz-87D7A,"388,529 views • Dec 2, 2021 • 📲 Insider Info i...",,812.0,19000,Dislike,True
4,2021-12-18 20:01:36.782566,2021-12-18,solana,EllioTrades Crypto,567000,CRYPTO'S BEST OPPORTUNITY IS LIKE BUYING SOLAN...,2021-11-27,https://www.youtube.com/watch?v=8msZ3PkNiAk,"312,860 views • Nov 27, 2021 • ✅ Follow EllioT...",,912.0,13000,Dislike,True
5,2021-12-18 20:01:43.003405,2021-12-18,solana,BitBoy Crypto,1430000,2 Hottest Solana And Ethereum Projects (Ready ...,2021-12-01,https://www.youtube.com/watch?v=8UCOi4nLl2g,"233,287 views • Dec 1, 2021 • Welcome to NFT U...",,160.0,3800,Dislike,True
