In [3]:
import os
import requests
from bs4 import BeautifulSoup
import yt_dlp
from urllib.parse import urljoin

# Create downloads folder if it doesn't exist.
# exist_ok=True makes this a single race-free call: there is no window
# between "check" and "create" in which another process could create the
# directory and make makedirs() raise FileExistsError.
download_folder = 'downloads'
os.makedirs(download_folder, exist_ok=True)

# Base URL of the page to crawl
base_url = 'https://socialdance.stanford.edu/Syllabi/'

# Function to download PDF
def download_pdf(pdf_url, folder):
    """Download the PDF at *pdf_url* and save it inside *folder*.

    The file is named after the last path segment of the URL; any query
    string or fragment is ignored. Network errors and non-2xx HTTP
    responses are reported on stdout and nothing is written, so a server
    error page is never saved under a ``.pdf`` name.

    Args:
        pdf_url: Absolute URL of the PDF to fetch.
        folder: Directory to save into; created if it does not exist.
    """
    # Local import: the file header only imports ``urljoin``.
    from urllib.parse import urlparse

    try:
        # A timeout keeps one stalled server from hanging the whole crawl.
        response = requests.get(pdf_url, timeout=30)
        # Without this a 404/500 body would be written out as the "PDF".
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Failed to download PDF: {pdf_url} ({exc})')
        return

    # Use only the URL path so '?x=1' or '#page=2' never ends up in the
    # file name; fall back to a fixed name if the path ends in '/'.
    file_name = os.path.basename(urlparse(pdf_url).path) or 'download.pdf'
    os.makedirs(folder, exist_ok=True)
    pdf_name = os.path.join(folder, file_name)
    with open(pdf_name, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print(f'Downloaded PDF: {pdf_name}')

# Helper: reduce arbitrary text to a conservative file-name stem
def _safe_filename_stem(text):
    """Keep only alphanumerics, spaces and underscores; drop trailing spaces."""
    return "".join(c for c in text if c.isalnum() or c in (' ', '_')).rstrip()


# Function to download YouTube video using yt-dlp
def download_youtube_video(video_url, video_title, folder):
    """Download *video_url* with yt-dlp to ``<folder>/<sanitized title>.mp4``.

    A failed download (yt-dlp ``DownloadError``) is reported on stdout
    instead of propagating, so one unavailable video does not stop the
    caller from processing the remaining links.

    Args:
        video_url: URL of the video page.
        video_title: Human-readable title used to build the file name.
        folder: Directory the video is written into.
    """
    # Sanitize the title to create a valid filename
    valid_title = _safe_filename_stem(video_title or '')
    if not valid_title:
        # Empty or punctuation-only link text would make every such video
        # collide on '<folder>/.mp4'; derive a distinct stem from the last
        # URL segment instead (it carries the video id).
        last_segment = video_url.rstrip('/').rsplit('/', 1)[-1]
        valid_title = _safe_filename_stem(last_segment) or 'video'
    video_path = os.path.join(folder, valid_title + '.mp4')

    ydl_opts = {
        'outtmpl': video_path,
        # Prefer an MP4 stream so the content matches the '.mp4' name;
        # fall back to plain 'best' when no MP4 is offered.
        'format': 'best[ext=mp4]/best',
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
    except yt_dlp.utils.DownloadError as exc:
        print(f'Failed to download YouTube video: {video_url} ({exc})')
        return

    print(f'Downloaded YouTube video: {video_path}')

# URLs of the pages to crawl
urls = [
    #'https://socialdance.stanford.edu/Syllabi/Pinewoods2024.html',
    "https://socialdance.stanford.edu/Syllabi/PinewoodsFolkDays2024.html",
]

# Targets already handled, so a file linked from several rows (or reached
# through both an outer and a nested table row) is only fetched once.
seen_links = set()

for url in urls:
    # Send a GET request to the URL; skip the page if it cannot be fetched
    # rather than parsing an error body or aborting the remaining pages.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'Failed to fetch page: {url} ({exc})')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # Loop through the table rows to find links
    for row in soup.find_all('tr'):
        # href=True skips anchors such as <a name="...">, which have no
        # href and would otherwise crash on href.lower().
        link = row.find('a', href=True)
        if link is None:
            continue

        href = link['href']
        link_text = link.get_text(strip=True)
        # Relative links are relative to the page they appear on, so
        # resolve against that page's URL.
        full_url = urljoin(url, href)
        if full_url in seen_links:
            continue

        if href.lower().endswith('.pdf'):
            seen_links.add(full_url)
            download_pdf(full_url, download_folder)
        elif 'youtube.com' in href or 'youtu.be' in href:
            seen_links.add(full_url)
            download_youtube_video(full_url, link_text, download_folder)


Downloaded PDF: downloads\The_Lambeth_Walk.pdf
Downloaded PDF: downloads\Palais_Glide_Hot_Pretzels.pdf
[youtube] Extracting URL: https://www.youtube.com/watch?v=hyp4ZlQoVS8
[youtube] hyp4ZlQoVS8: Downloading webpage
[youtube] hyp4ZlQoVS8: Downloading ios player API JSON
[youtube] hyp4ZlQoVS8: Downloading m3u8 information
[info] hyp4ZlQoVS8: Downloading 1 format(s): 18
[download] Destination: downloads\1938 film of the Palais Glide and Lambeth Walk.mp4
[download] 100% of   10.55MiB in 00:00:21 at 502.41KiB/s 
Downloaded YouTube video: downloads\1938 film of the Palais Glide and Lambeth Walk.mp4
Downloaded PDF: downloads\Big_Apple_5_Figures.PDF
Downloaded PDF: downloads\One-Step.pdf
Downloaded PDF: downloads\Ragtime_Era_Animal_Dances.pdf
Downloaded PDF: downloads\Chalypso.pdf
[youtube] Extracting URL: https://www.youtube.com/watch?v=giVUuw5e-pg
[youtube] giVUuw5e-pg: Downloading webpage
[youtube] giVUuw5e-pg: Downloading ios player API JSON
[youtube] giVUuw5e-pg: Downloading m3u8 informa



[youtube] 2DwK57Dea7Y: Downloading ios player API JSON
[youtube] 2DwK57Dea7Y: Downloading player 1f8742dc
[youtube] 2DwK57Dea7Y: Downloading web player API JSON
[youtube] 2DwK57Dea7Y: Downloading m3u8 information
[info] 2DwK57Dea7Y: Downloading 1 format(s): 18
[download] Destination: downloads\Music for 1950s Teen Cha Cha.mp4
[download] 100% of    7.07MiB in 00:00:12 at 560.50KiB/s 
Downloaded YouTube video: downloads\Music for 1950s Teen Cha Cha.mp4
[youtube] Extracting URL: https://youtu.be/oE4GMAm4E7k
[youtube] oE4GMAm4E7k: Downloading webpage
[youtube] oE4GMAm4E7k: Downloading ios player API JSON
[youtube] oE4GMAm4E7k: Downloading m3u8 information
[info] oE4GMAm4E7k: Downloading 1 format(s): 18
[download] Destination: downloads\Music for the Shadow Waltz Mixer.mp4
[download] 100% of    6.91MiB in 00:00:13 at 536.96KiB/s 
Downloaded YouTube video: downloads\Music for the Shadow Waltz Mixer.mp4
[youtube] Extracting URL: https://youtu.be/U6G-gwjseYI
[youtube] U6G-gwjseYI: Downloading 