In [2]:
pip install selenium

Collecting selenium
  Using cached selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Using cached attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Using cached h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Using cached selenium-4.25.0-py3-none-any.whl (9.7 MB)
Downloading trio-0.27.0-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.7/481.7 kB[0m [31m3.1 M

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import re

def scrape_youtube_video(url):
    # Set up Selenium WebDriver options
    options = Options()
    options.add_argument("--headless")  # Run headless Chrome
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--mute-audio")
    
    # Initialize the WebDriver (make sure to specify the path to chromedriver if necessary)
    driver = webdriver.Chrome(options=options)

    # Open the YouTube video URL
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    # Scroll to load dynamic content
    driver.execute_script("window.scrollTo(0, 600);")  # Scroll to load description and likes
    time.sleep(2)

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    video_data = {}

    # Scrape the title
    try:
        title = soup.find('h1', {'class': 'title'}).text.strip()
    except:
        title = None
    video_data['title'] = title

    # Scrape the description
    try:
        # Expand the description if necessary
        show_more = driver.find_element(By.XPATH, "//tp-yt-paper-button[@id='expand']")
        driver.execute_script("arguments[0].click();", show_more)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        description = soup.find('yt-formatted-string', {'class': 'content', 'slot': 'content'}).text.strip()
    except:
        description = None
    video_data['description'] = description

    # Scrape the number of views
    try:
        views = soup.find('span', class_='view-count').text.strip()
    except:
        views = None
    video_data['views'] = views

    # Scrape the published date
    try:
        date_published = soup.find('div', {'id': 'date'}).find('yt-formatted-string').text.strip()
    except:
        date_published = None
    video_data['date_published'] = date_published

    # Scrape the number of likes
    try:
        # Likes are dynamically loaded; need to scroll and wait
        driver.execute_script("window.scrollTo(0, 800);")
        time.sleep(2)
        like_button = driver.find_element(By.XPATH, "//ytd-toggle-button-renderer[1]//a")
        likes = like_button.get_attribute('aria-label')
    except:
        likes = None
    video_data['likes'] = likes

    # Scrape the uploader information
    try:
        channel_name = soup.find('yt-formatted-string', {'class': 'ytd-channel-name'}).find('a').text.strip()
    except:
        channel_name = None
    video_data['channel_name'] = channel_name

    try:
        subscriber_count = soup.find('yt-formatted-string', {'id': 'owner-sub-count'}).text.strip()
    except:
        subscriber_count = None
    video_data['subscriber_count'] = subscriber_count

    # Scrape the video duration
    try:
        duration = soup.find('span', {'class': 'ytp-time-duration'}).text.strip()
    except:
        duration = None
    video_data['duration'] = duration

    # Scrape tags (keywords) from page source
    try:
        page_source = driver.page_source
        keywords_match = re.search('"keywords":\[(.*?)\]', page_source)
        if keywords_match:
            keywords = keywords_match.group(1).replace('"', '').split(',')
        else:
            keywords = None
    except:
        keywords = None
    video_data['tags'] = keywords

    # Scrape comments
    try:
        # Scroll to the comments section
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(5)  # Wait for comments to load
        # Scroll multiple times to load more comments
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_elems = soup.find_all('yt-formatted-string', {'id': 'content-text'})
        comments = [comment.text.strip() for comment in comment_elems]
    except:
        comments = []
    video_data['comments'] = comments

    # Scrape the number of comments
    try:
        comment_count = soup.find('h2', {'id': 'count'}).find('yt-formatted-string').text.strip()
    except:
        comment_count = None
    video_data['comment_count'] = comment_count

    # Scrape the transcript
    try:
        # Click on the "More actions" button (the three dots)
        more_actions = driver.find_element(By.XPATH, "//button[@aria-label='More actions']")
        driver.execute_script("arguments[0].click();", more_actions)
        time.sleep(1)
        # Click on "Open transcript"
        open_transcript = driver.find_element(By.XPATH, "//yt-formatted-string[text()='Show transcript']")
        driver.execute_script("arguments[0].click();", open_transcript)
        time.sleep(2)
        # Extract the transcript
        transcript_elements = driver.find_elements(By.XPATH, "//ytd-transcript-renderer//div[@id='body']//div[@class='cue-group style-scope ytd-transcript-body-renderer']//div[@class='cue style-scope ytd-transcript-body-renderer']")
        transcript = ' '.join([elem.text.strip() for elem in transcript_elements])
    except:
        transcript = None
    video_data['transcript'] = transcript

    # Close the driver
    driver.quit()

    return video_data

# Example usage
if __name__ == "__main__":
    url = input("Enter YouTube video URL: ")
    data = scrape_youtube_video(url)
    for key, value in data.items():
        if key == 'comments':
            print(f"{key}: {len(value)} comments scraped\n")
        else:
            print(f"{key}: {value}\n")


  keywords_match = re.search('"keywords":\[(.*?)\]', page_source)


Enter YouTube video URL:  https://youtu.be/9D0bGia4QrI?si=TEXzWYIN3YpZqzJy


title: 

description: None

views: 758,263 views

date_published: None

likes: Like this comment along with 384 other people

channel_name: Sherlock Holmes Stories Magpie Audio

subscriber_count: 150K subscribers

duration: 50:42

tags: ['scandal in bohemia', 'Scandal', 'Bohemia', 'Adventures of Sherlock Holmes', 'Sherlock Holmes', 'Holmes', 'Homes', 'Watson', 'detective', 'Greg Wagland', 'Wagland', 'Magpie Audio', 'unabridged']

comments: 0 comments scraped

comment_count: 273 Comments

transcript: None



In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import re

def scrape_youtube_video(url):
    # Set up Chrome options and service
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Runs Chrome in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    service = Service(ChromeDriverManager().install())

    # Set up WebDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Open YouTube video
    driver.get(url)

    # Wait for page to load
    time.sleep(5)

    # Scroll to load dynamic content
    driver.execute_script("window.scrollTo(0, 600);")
    time.sleep(2)

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    video_data = {}

    # Scrape the title
    try:
        title = soup.find('h1', {'class': 'title'}).text.strip()
    except:
        title = None
    video_data['title'] = title

    # Scrape the description
    try:
        # Expand the description if necessary
        more_button = driver.find_element(By.XPATH, '//*[@id="expand"]')
        driver.execute_script("arguments[0].click();", more_button)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        description = soup.find('yt-formatted-string', {'class': 'content', 'slot': 'content'}).text.strip()
    except:
        description = None
    video_data['description'] = description

    # Scrape the number of views
    try:
        views = soup.find('span', class_='view-count').text.strip()
    except:
        views = None
    video_data['views'] = views

    # Scrape the published date
    try:
        date_published = soup.find('div', {'id': 'date'}).find('yt-formatted-string').text.strip()
    except:
        date_published = None
    video_data['date_published'] = date_published

    # Scrape the number of likes
    try:
        like_button = driver.find_element(By.XPATH, "//ytd-toggle-button-renderer[1]//a")
        likes = like_button.get_attribute('aria-label')
    except:
        likes = None
    video_data['likes'] = likes

    # Scrape the uploader information
    try:
        channel_name = soup.find('yt-formatted-string', {'class': 'ytd-channel-name'}).find('a').text.strip()
    except:
        channel_name = None
    video_data['channel_name'] = channel_name

    try:
        subscriber_count = soup.find('yt-formatted-string', {'id': 'owner-sub-count'}).text.strip()
    except:
        subscriber_count = None
    video_data['subscriber_count'] = subscriber_count

    # Scrape the video duration
    try:
        duration = soup.find('span', {'class': 'ytp-time-duration'}).text.strip()
    except:
        duration = None
    video_data['duration'] = duration

    # Scrape tags (keywords) from page source
    try:
        page_source = driver.page_source
        keywords_match = re.search('"keywords":\[(.*?)\]', page_source)
        if keywords_match:
            keywords = keywords_match.group(1).replace('"', '').split(',')
        else:
            keywords = None
    except:
        keywords = None
    video_data['tags'] = keywords

    # Scrape comments
    try:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(5)  # Wait for comments to load
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_elems = soup.find_all('yt-formatted-string', {'id': 'content-text'})
        comments = [comment.text.strip() for comment in comment_elems]
    except:
        comments = []
    video_data['comments'] = comments

    # Scrape the number of comments
    try:
        comment_count = soup.find('h2', {'id': 'count'}).find('yt-formatted-string').text.strip()
    except:
        comment_count = None
    video_data['comment_count'] = comment_count

    # Scrape the transcript
    try:
        # Open the transcript menu
        more_actions = driver.find_element(By.XPATH, "//button[@aria-label='More actions']")
        driver.execute_script("arguments[0].click();", more_actions)
        time.sleep(1)

        # Click on "Open transcript"
        open_transcript = driver.find_element(By.XPATH, "//yt-formatted-string[text()='Show transcript']")
        driver.execute_script("arguments[0].click();", open_transcript)
        time.sleep(2)

        # Extract the transcript
        transcript = ""
        index = 1
        while True:
            try:
                # Locate each transcript segment
                parent_xpath = f'//*[@id="segments-container"]/ytd-transcript-segment-renderer[{index}]'
                child_xpath = f'{parent_xpath}/div/yt-formatted-string'
                text_element = driver.find_element(By.XPATH, child_xpath)
                transcript += text_element.text + "\n"
                index += 1
            except:
                break
    except:
        transcript = None
    video_data['transcript'] = transcript

    # Close the driver
    driver.quit()

    return video_data

# Example usage
if __name__ == "__main__":
    url = input("Enter YouTube video URL: ")
    data = scrape_youtube_video(url)
    for key, value in data.items():
        if key == 'comments':
            print(f"{key}: {len(value)} comments scraped\n")
        else:
            print(f"{key}: {value}\n")


  keywords_match = re.search('"keywords":\[(.*?)\]', page_source)


Enter YouTube video URL:  https://youtu.be/9D0bGia4QrI?si=TEXzWYIN3YpZqzJy


title: 

description: None

views: 758,266 views

date_published: None

likes: Like this comment along with 384 other people

channel_name: Sherlock Holmes Stories Magpie Audio

subscriber_count: 150K subscribers

duration: 50:42

tags: ['scandal in bohemia', 'Scandal', 'Bohemia', 'Adventures of Sherlock Holmes', 'Sherlock Holmes', 'Holmes', 'Homes', 'Watson', 'detective', 'Greg Wagland', 'Wagland', 'Magpie Audio', 'unabridged']

comments: 0 comments scraped

comment_count: 273 Comments

transcript: None



In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import re

def scrape_youtube_video(url):
    # Set up Chrome options and service
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Runs Chrome in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--mute-audio")
    service = Service(ChromeDriverManager().install())

    # Set up WebDriver
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Open YouTube video
    driver.get(url)

    # Wait for page to load
    time.sleep(5)

    # Scroll to load dynamic content
    driver.execute_script("window.scrollTo(0, 600);")  # Scroll to load description and likes
    time.sleep(2)

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    video_data = {}

    # Scrape the title
    try:
        # Try to get title from meta tag
        title = soup.find('meta', property='og:title')['content']
    except:
        try:
            # Fallback to h1
            title = soup.find('h1').text.strip()
        except:
            title = None
    video_data['title'] = title

    # Scrape the description
    try:
        # Expand the description if necessary
        more_button = driver.find_element(By.XPATH, "//tp-yt-paper-button[@id='expand']")
        driver.execute_script("arguments[0].click();", more_button)
        time.sleep(2)
        # Update soup after clicking
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        description = soup.find('yt-formatted-string', {'class': 'content', 'slot': 'content'}).text.strip()
    except:
        description = None
    video_data['description'] = description

    # Scrape the number of views
    try:
        # Get views from meta tag
        views = soup.find('meta', itemprop='interactionCount')['content']
        # Format views with commas
        views = f"{int(views):,} views"
    except:
        try:
            # Fallback to span
            views = soup.find('span', class_='view-count').text.strip()
        except:
            views = None
    video_data['views'] = views

    # Scrape the published date
    try:
        date_published = soup.find('meta', itemprop='datePublished')['content']
    except:
        try:
            date_published = soup.find('div', {'id': 'date'}).find('yt-formatted-string').text.strip()
        except:
            date_published = None
    video_data['date_published'] = date_published

    # Scrape the number of likes
    try:
        # Like button's aria-label contains likes
        like_button = driver.find_element(By.XPATH, "//ytd-toggle-button-renderer[1]//yt-formatted-string")
        likes = like_button.get_attribute('aria-label')
        # Extract number from string
        likes_number = re.search(r'([\d,]+)', likes)
        if likes_number:
            likes = likes_number.group(1)
    except:
        likes = None
    video_data['likes'] = likes

    # Scrape the uploader information
    try:
        channel_name = soup.find('yt-formatted-string', {'class': 'ytd-channel-name'}).find('a').text.strip()
    except:
        channel_name = None
    video_data['channel_name'] = channel_name

    try:
        subscriber_count = soup.find('yt-formatted-string', {'id': 'owner-sub-count'}).text.strip()
    except:
        subscriber_count = None
    video_data['subscriber_count'] = subscriber_count

    # Scrape the video duration
    try:
        duration = soup.find('meta', itemprop='duration')['content']
        # Convert duration from ISO 8601 to HH:MM:SS
        match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration)
        hours = int(match.group(1)) if match.group(1) else 0
        minutes = int(match.group(2)) if match.group(2) else 0
        seconds = int(match.group(3)) if match.group(3) else 0
        duration = f"{hours}:{minutes:02}:{seconds:02}" if hours > 0 else f"{minutes}:{seconds:02}"
    except:
        duration = None
    video_data['duration'] = duration

    # Scrape tags (keywords) from page source
    try:
        page_source = driver.page_source
        # Correct regex with escaped brackets
        keywords_match = re.search(r'"keywords":\[(.*?)\]', page_source)
        if keywords_match:
            keywords = keywords_match.group(1).replace('"', '').split(',')
            keywords = [tag.strip() for tag in keywords]
        else:
            keywords = None
    except:
        keywords = None
    video_data['tags'] = keywords

    # Scrape comments
    try:
        # Scroll to the comments section
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(5)  # Wait for comments to load

        # Scroll multiple times to load more comments
        last_height = driver.execute_script("return document.documentElement.scrollHeight")
        scroll_attempt = 0
        while scroll_attempt < 5:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)
            new_height = driver.execute_script("return document.documentElement.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
            scroll_attempt += 1

        # Update soup after scrolling
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        comment_elems = soup.find_all('yt-formatted-string', {'id': 'content-text'})
        comments = [comment.text.strip() for comment in comment_elems]
    except:
        comments = []
    video_data['comments'] = comments

    # Scrape the number of comments
    try:
        # Get comment count from meta tag
        comment_count = soup.find('yt-formatted-string', {'id': 'count'}).text.strip()
    except:
        comment_count = None
    video_data['comment_count'] = comment_count

    # Scrape the transcript using the user's working code
    try:
        # Click on the "More actions" button (the three dots)
        more_actions = driver.find_element(By.XPATH, "//button[@aria-label='More actions']")
        driver.execute_script("arguments[0].click();", more_actions)
        time.sleep(1)

        # Click on "Show transcript"
        open_transcript = driver.find_element(By.XPATH, "//yt-formatted-string[text()='Show transcript']")
        driver.execute_script("arguments[0].click();", open_transcript)
        time.sleep(2)

        # Initialize an empty string to store the full transcript
        transcript = ""

        # Start from the first parent container and loop through all transcript segments
        index = 1
        while True:
            try:
                # Format the parent and child XPath using the index
                parent_xpath = f'//*[@id="segments-container"]/ytd-transcript-segment-renderer[{index}]'
                child_xpath = f'{parent_xpath}/div/yt-formatted-string'

                # Locate the child element that contains the transcript text
                text_element = driver.find_element(By.XPATH, child_xpath)
                transcript += text_element.text + "\n"

                # Move to the next parent container
                index += 1
            except:
                # Break the loop if no more segments are found
                break
    except:
        transcript = None
    video_data['transcript'] = transcript

    # Close the driver
    driver.quit()

    return video_data

# Example usage
if __name__ == "__main__":
    url = input("Enter YouTube video URL: ")
    data = scrape_youtube_video(url)
    for key, value in data.items():
        if key == 'comments':
            print(f"{key}: {len(value)} comments scraped\n")
        else:
            print(f"{key}: {value}\n")


Enter YouTube video URL:  https://youtu.be/9D0bGia4QrI?si=TEXzWYIN3YpZqzJy


title: 1 A Scandal in Bohemia from The Adventures of Sherlock Holmes  (1892) Audiobook

description: None

views: 758,269 views

date_published: 2017-12-16T14:30:01-08:00

likes: None

channel_name: Sherlock Holmes Stories Magpie Audio

subscriber_count: 150K subscribers

duration: 50:43

tags: ['scandal in bohemia', 'Scandal', 'Bohemia', 'Adventures of Sherlock Holmes', 'Sherlock Holmes', 'Holmes', 'Homes', 'Watson', 'detective', 'Greg Wagland', 'Wagland', 'Magpie Audio', 'unabridged']

comments: 0 comments scraped

comment_count: None

transcript: None

