In [1]:
from bs4 import BeautifulSoup
import requests
import time, os
from collections import defaultdict
from selenium import webdriver

In [18]:
import os
import time
from selenium import webdriver
from bs4 import BeautifulSoup

def get_video_links_ted():
    """Collect video IDs from the TED channel's YouTube "videos" page.

    A Chrome window is driven through Selenium: the page is scrolled to the
    bottom, given two seconds to load, and the rendered HTML is parsed with
    BeautifulSoup. This repeats until three consecutive scrolls leave the
    number of unique links unchanged.

    The browser is closed even if scraping raises; the exception itself is
    still propagated to the caller.

    Returns:
        list: Unique ``href`` values of the matched video anchors with the
        ``/watch?v=`` prefix stripped. The list is built from a set, so its
        order is arbitrary.
    """
    # Initialize WebDriver (Windows)
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")  # Start maximized
    driver = webdriver.Chrome(options=options)

    # Class attribute value the video anchors are matched on.
    anchor_class = 'yt-simple-endpoint focus-on-expand style-scope ytd-rich-grid-media'

    links = []
    prev_count = 0
    scroll_attempts = 0

    try:
        driver.get('https://www.youtube.com/user/TEDtalksDirector/videos')

        print("📢 Started scraping TED Talks YouTube channel...")

        while True:
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)  # Allow time for loading

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            contents_div = soup.find('div', id='contents')

            # The container may not be rendered yet (or the layout changed);
            # treat that as "no anchors" instead of crashing on None.
            if contents_div is None:
                anchors = []
            else:
                anchors = contents_div.find_all('a', attrs={'class': anchor_class})

            # Skip anchors without an href so .replace() is never called on None.
            links = list({
                anchor.get('href').replace('/watch?v=', '')
                for anchor in anchors
                if anchor.get('href')
            })

            # Check if new videos were loaded
            if len(links) == prev_count:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

            prev_count = len(links)

            # Stop if no new videos are loaded after 3 attempts
            if scroll_attempts >= 3:
                print("🚀 No more new videos found. Stopping scrolling...")
                break
    finally:
        # Always close the browser, even when scraping fails part-way through.
        driver.quit()

    print(f"✅ Scraping completed! Total videos found: {len(links)}")

    return links

# Launch the scrape and keep the returned list in the notebook namespace.
links_video_ted = get_video_links_ted()

# Quick sanity check: show the first five entries only.
print(f"🔗 Sample links: {links_video_ted[0:5]}")


📢 Started scraping TED Talks YouTube channel...
🚀 No more new videos found. Stopping scrolling...
✅ Scraping completed! Total videos found: 1272
🔗 Sample links: ['4SCrXqbhmCY', 'tWZmunAvlMM', 'vO5Rio_skIU', 'Rk5C149J9C0', 'KNEGWrD08f8']


In [20]:
import time
import json
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# Progress file: save_links() writes the scraped links here and
# load_existing_links() reads them back so a later run can resume.
# The path is relative, so it resolves against the current working directory.
SAVE_FILE = "scraped_links.json"  # File to store progress

def load_existing_links(path=None):
    """Load previously scraped links from a JSON file to resume progress.

    Args:
        path: File to read. Defaults to the module-level ``SAVE_FILE`` when
            omitted, which keeps existing zero-argument calls working.

    Returns:
        set: The string entries stored in the file. An empty set is returned
        when the file does not exist, cannot be decoded as JSON, or does not
        contain a JSON list.
    """
    if path is None:
        path = SAVE_FILE

    if not os.path.exists(path):
        return set()

    try:
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print("⚠️ Corrupted JSON file, starting fresh...")
        return set()

    # Valid JSON that is not a list (null, a string, an object, ...) would
    # either crash set() or silently produce garbage such as single characters.
    if not isinstance(data, list):
        print("⚠️ Corrupted JSON file, starting fresh...")
        return set()

    # Keep only string entries; anything else cannot be a link and might not
    # even be hashable.
    return {item for item in data if isinstance(item, str)}

def save_links(links, path=None):
    """Save scraped links to a JSON file without risking the previous copy.

    The data is first written to ``<path>.tmp`` and then moved over the real
    file with ``os.replace``, so an interruption or a serialization error in
    the middle of a save leaves the last complete file in place.

    Args:
        links: Iterable of links; it is stored as a JSON list.
        path: Destination file. Defaults to the module-level ``SAVE_FILE``
            when omitted, which keeps existing one-argument calls working.

    Raises:
        TypeError: If an entry cannot be serialized to JSON.
        OSError: If the file cannot be written or replaced.
    """
    if path is None:
        path = SAVE_FILE

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as file:
            json.dump(list(links), file, indent=4)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original file
        # has not been touched at this point.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def get_video_links_ted():
    """Scrape TED channel video IDs, resuming from and saving to the progress file.

    Previously saved links are loaded with ``load_existing_links()``. A Chrome
    window is then paged down three times per round, the rendered HTML is
    parsed with BeautifulSoup, and links not seen before are added to the set
    and written out with ``save_links()``. After each round a "Show more"
    element is clicked if one can be found.

    Scrolling stops once the number of links present on the page has not
    changed for ``max_attempts`` consecutive rounds. Errors raised while
    scraping are printed rather than re-raised; the browser is closed and the
    progress file is written in every case.

    Returns:
        set: All known links (loaded plus newly scraped), each being the
        anchor ``href`` with the ``/watch?v=`` prefix stripped.
    """
    # Load previously scraped links to avoid duplicates
    scraped_links = load_existing_links()

    # Initialize WebDriver (Windows)
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")  # Start maximized
    driver = webdriver.Chrome(options=options)

    # Class attribute value the video anchors are matched on.
    anchor_class = 'yt-simple-endpoint focus-on-expand style-scope ytd-rich-grid-media'

    prev_page_count = 0
    scroll_attempts = 0
    max_attempts = 8  # Number of times we try before stopping

    try:
        # Navigation happens inside the try so a failed page load still
        # closes the browser in the finally block.
        driver.get('https://www.youtube.com/user/TEDtalksDirector/videos')
        print(f"📢 Started scraping TED Talks YouTube channel... (Resuming from {len(scraped_links)} videos)")

        while True:
            # Scroll down multiple times slowly
            for _ in range(3):
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(1.5)

            # Parse page content
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            contents_div = soup.find('div', id='contents')

            # The container may not be rendered yet; treat that as an empty
            # page instead of crashing on None.
            if contents_div is None:
                anchors = []
            else:
                anchors = contents_div.find_all('a', attrs={'class': anchor_class})

            # Skip anchors without an href so .replace() is never called on None.
            page_links = {
                anchor.get('href').replace('/watch?v=', '')
                for anchor in anchors
                if anchor.get('href')
            }

            new_links = page_links - scraped_links
            scraped_links.update(new_links)  # Add new links

            # Save progress whenever this round produced at least one new link
            if new_links:
                save_links(scraped_links)
                print(f"✅ Saved progress: {len(scraped_links)} videos scraped so far...")

            # Detect a stalled page by what is currently rendered, not by the
            # saved set: when resuming, many rounds only re-discover links that
            # are already saved even though the page is still growing.
            if len(page_links) == prev_page_count:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

            prev_page_count = len(page_links)

            # Try clicking "Show More" button if present
            try:
                show_more = driver.find_element(By.XPATH, '//yt-formatted-string[text()="Show more"]')
                show_more.click()
                time.sleep(3)  # Give time to load more videos
            except Exception:
                # Best effort: the button is usually absent. Catching Exception
                # (not a bare except) keeps Ctrl+C / kernel interrupt working.
                pass

            # Stop if no new videos are loaded after max_attempts
            if scroll_attempts >= max_attempts:
                print("🚀 No more new videos found. Stopping scrolling...")
                break

    except Exception as e:
        print(f"❌ Error occurred: {e}")

    finally:
        try:
            driver.quit()  # Ensure the browser is closed
        finally:
            # Runs even if quitting the browser fails.
            save_links(scraped_links)  # Save final progress
            print(f"✅ Scraping completed! Total videos found: {len(scraped_links)}")

    return scraped_links

# Launch the resumable scrape and keep the returned set in the notebook namespace.
links_vid = get_video_links_ted()

# Quick sanity check: show five entries (a set has no defined order).
print(f"🔗 Sample links: {[*links_vid][:5]}")


📢 Started scraping TED Talks YouTube channel... (Resuming from 0 videos)
✅ Saved progress: 28 videos scraped so far...
✅ Saved progress: 60 videos scraped so far...
✅ Saved progress: 88 videos scraped so far...
✅ Saved progress: 120 videos scraped so far...
✅ Saved progress: 148 videos scraped so far...
✅ Saved progress: 180 videos scraped so far...
✅ Saved progress: 208 videos scraped so far...
✅ Saved progress: 240 videos scraped so far...
✅ Saved progress: 268 videos scraped so far...
✅ Saved progress: 300 videos scraped so far...
✅ Saved progress: 328 videos scraped so far...
✅ Saved progress: 360 videos scraped so far...
✅ Saved progress: 388 videos scraped so far...
✅ Saved progress: 420 videos scraped so far...
✅ Saved progress: 448 videos scraped so far...
✅ Saved progress: 480 videos scraped so far...
✅ Saved progress: 508 videos scraped so far...
✅ Saved progress: 540 videos scraped so far...
✅ Saved progress: 564 videos scraped so far...
✅ Saved progress: 572 videos scraped 

In [None]:
import time
import json
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# Progress file shared by load_existing_links() (read) and save_links() (write).
# Being a relative path, it is looked up in the current working directory.
SAVE_FILE = "scraped_links.json"  # Store progress

def load_existing_links(path=None):
    """Load previously scraped links to avoid duplicates.

    Args:
        path: File to read. Defaults to the module-level ``SAVE_FILE`` when
            omitted, which keeps existing zero-argument calls working.

    Returns:
        set: The string entries stored in the file. An empty set is returned
        when the file does not exist, cannot be decoded as JSON, or does not
        contain a JSON list.
    """
    if path is None:
        path = SAVE_FILE

    if not os.path.exists(path):
        return set()

    try:
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print("⚠️ JSON file corrupted. Restarting fresh...")
        return set()

    # Valid JSON that is not a list (null, a string, an object, ...) would
    # either crash set() or silently produce garbage such as single characters.
    if not isinstance(data, list):
        print("⚠️ JSON file corrupted. Restarting fresh...")
        return set()

    # Keep only string entries; anything else cannot be a link and might not
    # even be hashable.
    return {item for item in data if isinstance(item, str)}

def save_links(links, path=None):
    """Save links to a JSON file without risking the previous copy.

    The data is first written to ``<path>.tmp`` and then moved over the real
    file with ``os.replace``, so an interruption or a serialization error in
    the middle of a save leaves the last complete file in place.

    Args:
        links: Iterable of links; it is stored as a JSON list.
        path: Destination file. Defaults to the module-level ``SAVE_FILE``
            when omitted, which keeps existing one-argument calls working.

    Raises:
        TypeError: If an entry cannot be serialized to JSON.
        OSError: If the file cannot be written or replaced.
    """
    if path is None:
        path = SAVE_FILE

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as file:
            json.dump(list(links), file, indent=4)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original file
        # has not been touched at this point.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def get_video_links_ted():
    """Resume scraping TED channel video IDs with the grid sorted by "Popular".

    Previously saved links are loaded with ``load_existing_links()``. After
    opening the channel page the "Popular" element is clicked (a failure to do
    so is only reported). The page is then paged down five times per round,
    parsed with BeautifulSoup, and links not seen before are added to the set
    and written out with ``save_links()``. After each round a "Show more"
    element is clicked if one can be found.

    Scrolling stops once the number of links present on the page has not
    changed for ``max_attempts`` consecutive rounds. Errors raised while
    scraping are printed rather than re-raised; the browser is closed and the
    progress file is written in every case.

    Returns:
        set: All known links (loaded plus newly scraped), each being the
        anchor ``href`` with the ``/watch?v=`` prefix stripped.
    """
    # Load previously scraped links
    scraped_links = load_existing_links()

    # Initialize WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(options=options)

    # Class attribute value the video anchors are matched on.
    anchor_class = 'yt-simple-endpoint focus-on-expand style-scope ytd-rich-grid-media'

    prev_page_count = 0
    scroll_attempts = 0
    max_attempts = 10  # Try 10 times before stopping

    try:
        # Navigation and sorting happen inside the try so that any failure
        # here still closes the browser in the finally block.
        driver.get('https://www.youtube.com/user/TEDtalksDirector/videos')
        print(f"📢 Resuming scraping... (Already have {len(scraped_links)} videos)")

        # Click the "Popular" button
        try:
            time.sleep(2)  # Wait for page to load
            popular_button = driver.find_element(By.XPATH, '//yt-formatted-string[@title="Popular"]')
            popular_button.click()
            time.sleep(3)  # Allow sorting to take effect
            print("🔥 Clicked on 'Popular' button successfully!")
        except Exception as e:
            print(f"⚠️ Couldn't click on 'Popular' button: {e}")

        while True:
            # Scroll further to reach older videos
            for _ in range(5):
                ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(1.5)

            # Parse updated page content
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            contents_div = soup.find('div', id='contents')

            # The container may not be rendered yet; treat that as an empty
            # page instead of crashing on None.
            if contents_div is None:
                anchors = []
            else:
                anchors = contents_div.find_all('a', attrs={'class': anchor_class})

            # Skip anchors without an href so .replace() is never called on None.
            page_links = {
                anchor.get('href').replace('/watch?v=', '')
                for anchor in anchors
                if anchor.get('href')
            }

            new_links = page_links - scraped_links
            scraped_links.update(new_links)

            # Save progress whenever this round produced at least one new link
            if new_links:
                save_links(scraped_links)
                print(f"✅ Progress saved: {len(scraped_links)} videos scraped so far...")

            # Detect a stalled page by what is currently rendered, not by the
            # saved set: when resuming, many rounds only re-discover links that
            # are already saved even though the page is still growing.
            if len(page_links) == prev_page_count:
                scroll_attempts += 1
            else:
                scroll_attempts = 0  # Reset counter if the page grew

            prev_page_count = len(page_links)

            # Attempt clicking "Show More" if available
            try:
                show_more = driver.find_element(By.XPATH, '//yt-formatted-string[text()="Show more"]')
                show_more.click()
                time.sleep(3)
            except Exception:
                # Best effort: the button is usually absent. Catching Exception
                # (not a bare except) keeps Ctrl+C / kernel interrupt working.
                pass

            if scroll_attempts >= max_attempts:
                print("🚀 No more new videos found. Stopping scrolling...")
                break

    except Exception as e:
        print(f"❌ Error: {e}")

    finally:
        try:
            driver.quit()  # Close browser
        finally:
            # Runs even if quitting the browser fails.
            save_links(scraped_links)  # Save final progress
            print(f"✅ Scraping completed! Total videos found: {len(scraped_links)}")

    return scraped_links

# Continue the scrape from the saved progress and keep the returned set.
links_vid = get_video_links_ted()

# Confirmation output: five entries taken from the end of the listed set.
print(f"🔗 Sample links: {[*links_vid][-5:]}")  # a set has no defined order


📢 Resuming scraping... (Already have 2060 videos)
🔥 Clicked on 'Popular' button successfully!
✅ Progress saved: 2115 videos scraped so far...
✅ Progress saved: 2140 videos scraped so far...
✅ Progress saved: 2166 videos scraped so far...
✅ Progress saved: 2220 videos scraped so far...
✅ Progress saved: 2244 videos scraped so far...
✅ Progress saved: 2269 videos scraped so far...
✅ Progress saved: 2294 videos scraped so far...
✅ Progress saved: 2341 videos scraped so far...
✅ Progress saved: 2368 videos scraped so far...
✅ Progress saved: 2390 videos scraped so far...
✅ Progress saved: 2414 videos scraped so far...
✅ Progress saved: 2463 videos scraped so far...
✅ Progress saved: 2486 videos scraped so far...
✅ Progress saved: 2511 videos scraped so far...
✅ Progress saved: 2531 videos scraped so far...
✅ Progress saved: 2577 videos scraped so far...
✅ Progress saved: 2599 videos scraped so far...
✅ Progress saved: 2622 videos scraped so far...
✅ Progress saved: 2651 videos scraped so f

In [24]:
import time
import json
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# File in which scraping progress is kept: written by save_links() and read
# back by load_existing_links(). Relative to the current working directory.
SAVE_FILE = "scraped_links.json"  # Store progress

def load_existing_links(path=None):
    """Load previously scraped links to avoid duplicates.

    Args:
        path: File to read. Defaults to the module-level ``SAVE_FILE`` when
            omitted, which keeps existing zero-argument calls working.

    Returns:
        set: The string entries stored in the file. An empty set is returned
        when the file does not exist, cannot be decoded as JSON, or does not
        contain a JSON list.
    """
    if path is None:
        path = SAVE_FILE

    if not os.path.exists(path):
        return set()

    try:
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print("⚠️ JSON file corrupted. Restarting fresh...")
        return set()

    # Valid JSON that is not a list (null, a string, an object, ...) would
    # either crash set() or silently produce garbage such as single characters.
    if not isinstance(data, list):
        print("⚠️ JSON file corrupted. Restarting fresh...")
        return set()

    # Keep only string entries; anything else cannot be a link and might not
    # even be hashable.
    return {item for item in data if isinstance(item, str)}

def save_links(links, path=None):
    """Save links to a JSON file without risking the previous copy.

    The data is first written to ``<path>.tmp`` and then moved over the real
    file with ``os.replace``, so an interruption or a serialization error in
    the middle of a save leaves the last complete file in place.

    Args:
        links: Iterable of links; it is stored as a JSON list.
        path: Destination file. Defaults to the module-level ``SAVE_FILE``
            when omitted, which keeps existing one-argument calls working.

    Raises:
        TypeError: If an entry cannot be serialized to JSON.
        OSError: If the file cannot be written or replaced.
    """
    if path is None:
        path = SAVE_FILE

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as file:
            json.dump(list(links), file, indent=4)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; the original file
        # has not been touched at this point.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def get_video_links_ted():
    """Resume scraping TED channel video IDs, stopping when the page stops growing.

    Previously saved links are loaded with ``load_existing_links()``. After
    opening the channel page the "Popular" element is clicked (a failure to do
    so is only reported). Each round scrolls to the bottom of the document,
    waits two seconds, parses the rendered HTML with BeautifulSoup, and adds
    links not seen before to the set, writing it out with ``save_links()``.

    A round counts as stalled when it yields no new link and the document
    scroll height is unchanged; scraping stops after ``max_stalled_rounds``
    consecutive stalled rounds. Errors raised while scraping are printed
    rather than re-raised; the browser is closed and the progress file is
    written in every case.

    Returns:
        set: All known links (loaded plus newly scraped), each being the
        anchor ``href`` with the ``/watch?v=`` prefix stripped.
    """
    # Load previously scraped links
    scraped_links = load_existing_links()

    # Initialize WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(options=options)

    # Class attribute value the video anchors are matched on.
    anchor_class = 'yt-simple-endpoint focus-on-expand style-scope ytd-rich-grid-media'

    last_scroll_height = 0
    stalled_rounds = 0
    # A single quiet round is not proof of the end of the page: one slow load
    # (longer than the 2 s wait) would otherwise end the scrape early.
    max_stalled_rounds = 3

    try:
        # Navigation and sorting happen inside the try so that any failure
        # here still closes the browser in the finally block.
        driver.get('https://www.youtube.com/user/TEDtalksDirector/videos')
        print(f"📢 Resuming scraping... (Already have {len(scraped_links)} videos)")

        # Click the "Popular" button
        try:
            time.sleep(2)  # Wait for page to load
            popular_button = driver.find_element(By.XPATH, '//yt-formatted-string[@title="Popular"]')
            popular_button.click()
            time.sleep(3)  # Allow sorting to take effect
            print("🔥 Clicked on 'Popular' button successfully!")
        except Exception as e:
            print(f"⚠️ Couldn't click on 'Popular' button: {e}")

        while True:
            # Scroll down
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(2)

            # Get new page height
            new_scroll_height = driver.execute_script("return document.documentElement.scrollHeight")

            # Parse updated page content
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            contents_div = soup.find('div', id='contents')

            # The container may not be rendered yet; treat that as an empty
            # page instead of crashing on None.
            if contents_div is None:
                anchors = []
            else:
                anchors = contents_div.find_all('a', attrs={'class': anchor_class})

            # Skip anchors without an href so .replace() is never called on None.
            page_links = {
                anchor.get('href').replace('/watch?v=', '')
                for anchor in anchors
                if anchor.get('href')
            }

            new_links = page_links - scraped_links
            scraped_links.update(new_links)

            # Save progress whenever this round produced at least one new link
            if new_links:
                save_links(scraped_links)
                print(f"✅ Progress saved: {len(scraped_links)} videos scraped so far...")

            # Stalled round: nothing new and the document did not get taller.
            if not new_links and new_scroll_height == last_scroll_height:
                stalled_rounds += 1
            else:
                stalled_rounds = 0

            if stalled_rounds >= max_stalled_rounds:
                print("🚀 Reached the end of the page. Stopping scraping...")
                break

            last_scroll_height = new_scroll_height  # Update last scroll height

    except Exception as e:
        print(f"❌ Error: {e}")

    finally:
        try:
            driver.quit()  # Close browser
        finally:
            # Runs even if quitting the browser fails.
            save_links(scraped_links)  # Save final progress
            print(f"✅ Scraping completed! Total videos found: {len(scraped_links)}")

    return scraped_links

# Pick the scrape up from the saved progress; the result is the full set of links.
links_vid = get_video_links_ted()

# Show five entries from the end of the listed set as a confirmation.
print(f"🔗 Sample links: {[*links_vid][-5:]}")  # a set has no defined order


📢 Resuming scraping... (Already have 2836 videos)
🔥 Clicked on 'Popular' button successfully!
✅ Progress saved: 2843 videos scraped so far...
✅ Progress saved: 2846 videos scraped so far...
✅ Progress saved: 2852 videos scraped so far...
✅ Progress saved: 2856 videos scraped so far...
✅ Progress saved: 2860 videos scraped so far...
✅ Progress saved: 2865 videos scraped so far...
✅ Progress saved: 2868 videos scraped so far...
✅ Progress saved: 2875 videos scraped so far...
✅ Progress saved: 2879 videos scraped so far...
✅ Progress saved: 2882 videos scraped so far...
✅ Progress saved: 2886 videos scraped so far...
✅ Progress saved: 2893 videos scraped so far...
✅ Progress saved: 2896 videos scraped so far...
✅ Progress saved: 2901 videos scraped so far...
✅ Progress saved: 2903 videos scraped so far...
✅ Progress saved: 2908 videos scraped so far...
✅ Progress saved: 2915 videos scraped so far...
✅ Progress saved: 2918 videos scraped so far...
✅ Progress saved: 2922 videos scraped so f