In [5]:
import requests
from bs4 import BeautifulSoup
import csv
from pytube import Search
import pandas as pd
from yt_dlp import YoutubeDL

In [6]:
def _cell_text(cols, idx):
    """Return the stripped text of ``cols[idx]``, or ``''`` when the column is absent."""
    if idx is None or len(cols) <= idx:
        return ''
    return cols[idx].get_text(strip=True)


def scrape_composer_works(url, composer_name, timeout=30):
    """Scrape a list of works from the ``wikitable`` tables of a web page.

    For every ``<table class="wikitable">`` on the page, the ``<th>`` cells of
    the first row are searched (case-insensitively, by substring) for the
    "title", "key" and "date" columns. Each following row that has a title
    cell and a non-empty date cell becomes one record.

    Parameters
    ----------
    url : str
        Address of the page to download.
    composer_name : str
        Value stored in the ``composer`` column of every record.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``composer``, ``Piece Name`` and ``year``.
        When no usable row is found, "No valid pieces found" is printed and
        ``None`` is returned, so callers must check for ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails, times out, or the server answers with an
        HTTP error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as a catalogue.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    composer_pieces = []

    for table in soup.find_all('table', class_='wikitable'):
        rows = table.find_all('tr')
        if not rows:
            # A table without rows has no header row to inspect.
            continue

        # Locate the Title, Key and Date columns; the last matching header wins.
        title_idx = key_idx = date_idx = None
        for idx, header in enumerate(rows[0].find_all('th')):
            header_text = header.get_text(strip=True).lower()
            if "title" in header_text:
                title_idx = idx
            elif "key" in header_text:
                key_idx = idx
            elif "date" in header_text:
                date_idx = idx

        if title_idx is None:
            # Without a title column no row of this table can yield a record.
            continue

        # NOTE(review): header positions are counted over <th> cells and data
        # positions over <td> cells; this assumes data rows contain no <th>
        # row-header cells and no colspan/rowspan -- confirm for the target page.
        for row in rows[1:]:
            cols = row.find_all('td')
            if len(cols) <= title_idx:
                continue

            date = _cell_text(cols, date_idx)
            if not date:
                # Undated entries are deliberately left out of the data set.
                continue

            title = cols[title_idx].get_text(strip=True)
            key = _cell_text(cols, key_idx)
            # Format title with key if key exists
            if key:
                title = f"{title} (in {key})"

            composer_pieces.append({
                "composer": composer_name,
                "Piece Name": title,
                "year": date
            })

    # Check if any pieces were found
    if not composer_pieces:
        print("No valid pieces found")
        return None

    return pd.DataFrame(composer_pieces)

In [7]:
def search_youtube(query):
    """Return a YouTube watch URL for the first search hit of ``query``.

    The lookup is best-effort: if the search fails, yields no entries, or the
    first entry carries no video id, an empty string is returned instead of
    raising.

    Parameters
    ----------
    query : str
        Free-text search terms.

    Returns
    -------
    str
        ``"https://youtube.com/watch?v=<id>"`` for the first result, or ``""``
        when no link could be determined.
    """
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': True,
        'default_search': 'ytsearch1:'
    }
    with YoutubeDL(ydl_opts) as ydl:
        try:
            # The "ytsearch1:" prefix asks for a single search result.
            result = ydl.extract_info(f"ytsearch1:{query}", download=False)
            if result and 'entries' in result and result['entries']:
                first_entry = result['entries'][0]
                video_id = first_entry.get('id') if first_entry else None
                if video_id:
                    return f"https://youtube.com/watch?v={video_id}"
        except Exception:
            # Deliberately best-effort, but only for ordinary errors: a bare
            # ``except`` would also swallow KeyboardInterrupt/SystemExit and
            # make a long batch of searches impossible to interrupt.
            pass
    return ""

def add_youtube_links(df, composer_name):
    """Add a ``Youtube Link`` column with one search result per piece.

    For every value in the ``Piece Name`` column, ``search_youtube`` is called
    with ``"<piece name> <composer_name>"`` and the returned URL (or ``""``)
    is stored in the new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``Piece Name`` column. It is modified in place.
    composer_name : str
        Appended to each piece name to form the search query.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now including the ``Youtube Link`` column.
    """
    # Build the column from a plain list rather than a row-wise ``df.apply``:
    # on an empty frame ``apply(axis=1)`` returns a DataFrame instead of a
    # Series, which cannot be assigned to a single column.
    df['Youtube Link'] = [
        search_youtube(f"{piece_name} {composer_name}")
        for piece_name in df['Piece Name']
    ]
    return df

In [8]:
url = "https://imslp.org/wiki/List_of_works_by_Ludwig_van_Beethoven" # Get IMSLP catalog for a composer
composer_name = "Ludwig van Beethoven" # Specify name of composer

# Creation of data set including composer, piece name, year, and link to recording
beethoven_df = scrape_composer_works(url, composer_name)
# scrape_composer_works returns None when it finds no pieces; stop here with a
# clear message instead of failing later with an opaque TypeError.
if beethoven_df is None:
    raise RuntimeError(f"No works could be scraped for {composer_name} from {url}")
beethoven_df = add_youtube_links(beethoven_df, composer_name)

In [15]:
# Save your df to a csv: replace the placeholder with the desired output file
# name. The row index is left out of the file.
beethoven_df.to_csv(
    path_or_buf='specify name',
    index=False,
)