In [1]:
import requests
from bs4 import BeautifulSoup
import time

In [None]:
def get_issue_urls():
    """Return the issue page URLs for issues 1 through 16, in ascending order."""
    template = "https://worksinprogress.co/issue-{}"
    issue_pages = []
    for number in range(1, 17):  # upper bound is exclusive, so this stops at issue 16
        issue_pages.append(template.format(number))
    return issue_pages

def get_essay_urls_from_issue(issue_url):
    """Download one issue page and return the essay links found in its article cards.

    Args:
        issue_url: Absolute URL of the issue page to fetch.

    Returns:
        A list of absolute essay URLs in the order the selector matched them.
        An empty list is returned when the server does not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(issue_url, timeout=30)
    if response.status_code != 200:
        # Never scrape an error page; report it and let the caller move on.
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    article_links = soup.select('article.article-card header.article-card__head a')

    essay_urls = []
    for link in article_links:
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to record
        # urljoin leaves absolute links untouched and resolves every relative
        # form (with or without a leading slash) against the page address.
        essay_urls.append(urljoin(issue_url, href))

    return essay_urls

def scrape_all_essay_urls():
    """Visit every issue page in turn and return all essay URLs as one flat list."""
    collected = []

    for page in get_issue_urls():
        print(f"Scraping essays from {page}")
        collected += get_essay_urls_from_issue(page)
        time.sleep(1)  # pause between requests to stay polite to the server

    return collected

if __name__ == "__main__":
    # Run the crawl, echo what was found, then persist the list.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('work_in_progress_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_archive_url():
    """Return the archive page URL that this cell scrapes."""
    archive_page = "https://www.worksinprogress.news/archive"
    return archive_page

def get_essay_urls_from_archive(archive_url):
    """Download an archive page and return the post links present in its HTML.

    Only markup delivered in the HTTP response is inspected; ``requests`` does
    not execute JavaScript, so posts added by scripts after load are not seen.

    Args:
        archive_url: Absolute URL of the archive page to fetch.

    Returns:
        A list of absolute post URLs in the order the selector matched them.
        An empty list is returned when the server does not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    # Without a timeout a stalled connection would hang the cell indefinitely.
    response = requests.get(archive_url, timeout=30)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    article_links = soup.select('a[data-testid="post-preview-title"]')

    essay_urls = []
    for link in article_links:
        href = link.get('href')
        if href:  # skip anchors that carry no target
            # Resolves relative links against the page; absolute ones pass through.
            essay_urls.append(urljoin(archive_url, href))

    return essay_urls

def scrape_all_essay_urls():
    """Announce the archive page being scraped and return the URLs found on it."""
    target = get_archive_url()
    print(f"Scraping essays from {target}")
    return get_essay_urls_from_archive(target)

if __name__ == "__main__":
    # Scrape the archive, echo the results, then store them on disk.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('works_in_progress_substack_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

In [None]:
def get_archive_url():
    """Return the archive page URL that the browser-driven scraper opens."""
    archive_page = "https://www.worksinprogress.news/archive"
    return archive_page

def scroll_to_bottom(driver, pause=2, max_scrolls=None):
    """Scroll the page down repeatedly until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(js)``; the callers in this
            notebook pass a Selenium Chrome driver.
        pause: Seconds to sleep after each scroll before re-measuring the
            page. Defaults to 2, the value that was previously hard-coded.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original behaviour of scrolling until the
            height is stable, which never ends on a page that grows forever.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        time.sleep(pause)  # give newly requested content time to be appended
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # nothing was added by the last scroll
        last_height = new_height

def get_essay_urls_from_archive(archive_url):
    """Open the archive in headless Chrome, scroll it fully, and return post links.

    Args:
        archive_url: Absolute URL of the archive page to open.

    Returns:
        A list of absolute post URLs in the order the selector matched them.

    Raises:
        Whatever the Selenium wait raises when no post link appears within
        10 seconds; the browser is shut down before the error propagates.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(archive_url)

        # Wait for the first post link before scrolling: if scrolling starts on
        # a page that has not rendered yet, the height never changes and the
        # scroll loop would stop immediately.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[data-testid="post-preview-title"]'))
        )

        # Scroll to load all articles
        scroll_to_bottom(driver)

        # Snapshot the DOM while the browser is still alive.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even when the wait or the scroll fails;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

    essay_urls = []
    for link in soup.select('a[data-testid="post-preview-title"]'):
        href = link.get('href')
        if href:  # skip anchors that carry no target
            essay_urls.append(urljoin(archive_url, href))

    return essay_urls

def scrape_all_essay_urls():
    """Announce the archive page being scraped and return the URLs found on it."""
    target = get_archive_url()
    print(f"Scraping essays from {target}")
    return get_essay_urls_from_archive(target)

if __name__ == "__main__":
    # Scrape the archive, echo the results, then store them on disk.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('works_in_progress_substack_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_issue_urls():
    """Return the issue page URLs for issues 1 through 16, in ascending order."""
    template = "https://worksinprogress.co/issue-{}"
    # range's upper bound is exclusive, hence 17 to include issue 16.
    return list(map(template.format, range(1, 17)))

def get_essay_urls_from_issue(issue_url):
    """Download one issue page and return spotlight and grid essay links.

    Args:
        issue_url: Absolute URL of the issue page to fetch.

    Returns:
        A list of absolute essay URLs, spotlight matches first and grid
        matches after them. Duplicates are not removed. An empty list is
        returned when the server does not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(issue_url, timeout=30)
    if response.status_code != 200:
        # Never scrape an error page; report it and let the caller move on.
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): these class names use a single underscore
    # ("spotlight-card_title", "article-card_head") while the other cells in
    # this notebook use a double underscore ("article-card__head"). Confirm
    # against the live markup; if the site uses the double-underscore form,
    # these selectors match nothing.
    spotlight_links = soup.select('article.spotlight-card .spotlight-card_title > a')
    grid_links = soup.select('article.article-card header.article-card_head > a')

    essay_urls = []
    for link in spotlight_links + grid_links:
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to record
        # Resolves relative links against the page; absolute ones pass through.
        essay_urls.append(urljoin(issue_url, href))

    return essay_urls

def scrape_all_essay_urls():
    """Walk through every issue page and gather all essay URLs into one list."""
    gathered = []

    for issue_page in get_issue_urls():
        print(f"Scraping essays from {issue_page}")
        found = get_essay_urls_from_issue(issue_page)
        gathered.extend(found)
        time.sleep(1)  # one-second gap between requests to go easy on the server

    return gathered

if __name__ == "__main__":
    # Run the crawl, echo what was found, then persist the list.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('work_in_progress_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_issue_urls():
    """Return the issue page URLs for issues 1 through 16, in ascending order."""
    pattern = "https://worksinprogress.co/issue-{}"
    issue_numbers = range(1, 17)  # exclusive upper bound: last issue is 16
    return [pattern.format(issue_number) for issue_number in issue_numbers]

def get_essay_urls_from_issue(issue_url):
    """Download one issue page and return its unique essay links.

    Args:
        issue_url: Absolute URL of the issue page to fetch.

    Returns:
        A list of absolute essay URLs without duplicates, spotlight matches
        first and grid matches after them. An empty list is returned when the
        server does not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    print(f"Fetching {issue_url}")
    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(issue_url, timeout=30)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    grid_links = soup.select('article.article-card header.article-card__head a')
    spotlight_links = soup.select('article.spotlight-card .spotlight-card__title a')

    essay_urls = []
    seen = set()  # constant-time duplicate check; the list keeps the order
    for link in spotlight_links + grid_links:
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to record
        # Resolves relative links against the page; absolute ones pass through.
        url = urljoin(issue_url, href)
        if url not in seen:
            seen.add(url)
            essay_urls.append(url)

    print(f"Found {len(essay_urls)} unique essay URLs in this issue")
    return essay_urls

def scrape_all_essay_urls():
    """Visit every issue page in turn and return all essay URLs as one flat list."""
    collected = []

    for page in get_issue_urls():
        print(f"\nScraping essays from {page}")
        collected += get_essay_urls_from_issue(page)
        time.sleep(1)  # pause between requests to stay polite to the server

    return collected

if __name__ == "__main__":
    # Run the crawl, echo what was found, then persist the list.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('work_in_progress_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [None]:
import requests
from bs4 import BeautifulSoup

def check_url(url):
    """Fetch a page and print a short diagnostic dump of what came back.

    Prints the HTTP status code. On a 200 response it also prints the first
    1000 characters of the prettified document and the first 500 characters of
    every ``<article>`` element; on any other status it prints a failure note.

    Args:
        url: Address of the page to inspect.

    Returns:
        None. All results are written to standard output.
    """
    print(f"Attempting to access: {url}")
    # Without a timeout an unresponsive host would hang this diagnostic cell.
    response = requests.get(url, timeout=30)
    print(f"Status code: {response.status_code}")

    if response.status_code != 200:
        print("Failed to access the page. Please check the URL.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    print("\nFirst 1000 characters of the page content:")
    print(soup.prettify()[:1000])

    print("\nAll 'article' tags found:")
    for i, article in enumerate(soup.find_all('article'), 1):
        print(f"\nArticle {i}:")
        print(article.prettify()[:500])  # keep each dump short enough to read

if __name__ == "__main__":
    # Addresses to probe: the site root and the first issue page.
    base_url = "https://worksinprogress.co"
    issue_url = "https://worksinprogress.co/issue-1"

    # Probe them in the same order as before: root first, then the issue.
    check_url(base_url)
    check_url(issue_url)

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_issue_urls():
    """Return the issue page URLs for issues 1 through 16, in ascending order."""
    template = "https://worksinprogress.co/issue-{}"
    issue_pages = []
    number = 1
    while number <= 16:
        issue_pages.append(template.format(number))
        number += 1
    return issue_pages

def get_essay_urls_from_issue(issue_url):
    """Download one issue page and return its unique essay links.

    Args:
        issue_url: Absolute URL of the issue page to fetch.

    Returns:
        A list of absolute essay URLs without duplicates, regular article
        matches first and spotlight matches after them. An empty list is
        returned when the server does not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    print(f"Scraping essays from {issue_url}")
    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(issue_url, timeout=30)
    if response.status_code != 200:
        # Never scrape an error page; report it and let the caller move on.
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    regular_links = soup.select('article.article-card header.article-card__head a')
    spotlight_links = soup.select('article.spotlight-card a.spotlight-card__link')

    essay_urls = []
    seen = set()  # constant-time duplicate check; the list keeps the order
    for link in regular_links + spotlight_links:
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to record
        # Resolves relative links against the page; absolute ones pass through.
        url = urljoin(issue_url, href)
        if url not in seen:
            seen.add(url)
            essay_urls.append(url)

    print(f"Found {len(essay_urls)} unique essay URLs in this issue")
    return essay_urls

def scrape_all_essay_urls():
    """Walk through every issue page and gather all essay URLs into one list."""
    gathered = []

    for issue_page in get_issue_urls():
        gathered += get_essay_urls_from_issue(issue_page)
        time.sleep(1)  # one-second gap between requests to go easy on the server

    return gathered

if __name__ == "__main__":
    # Run the crawl, echo what was found, then persist the list.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('work_in_progress_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_issue_urls(first_issue=1, last_issue=16):
    """Return the issue page URLs for an inclusive range of issue numbers.

    Args:
        first_issue: Number of the first issue to include. Defaults to 1.
        last_issue: Number of the last issue to include. Defaults to 16, the
            value that was previously hard-coded, so a call without arguments
            behaves exactly as before.

    Returns:
        A list of issue page URLs in ascending issue order; empty when
        ``last_issue`` is smaller than ``first_issue``.
    """
    base_url = "https://worksinprogress.co/issue-{}"
    # range's upper bound is exclusive, hence the +1 to include last_issue.
    return [base_url.format(i) for i in range(first_issue, last_issue + 1)]

def get_essay_urls_from_issue(issue_url):
    """Download one issue page and return its unique essay links.

    Only links whose href contains ``/issue/`` and does not contain
    ``our-authors`` are kept.

    Args:
        issue_url: Absolute URL of the issue page to fetch.

    Returns:
        A list of absolute essay URLs without duplicates, in the order the
        selector matched them. An empty list is returned when the server does
        not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    print(f"Scraping essays from {issue_url}")
    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(issue_url, timeout=30)
    if response.status_code != 200:
        # Never scrape an error page; report it and let the caller move on.
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # One selector list covers both the regular cards and the spotlight cards.
    all_links = soup.select('article.article-card header.article-card__head a, article.spotlight-card .spotlight-card__title a')

    essay_urls = []
    seen = set()  # constant-time duplicate check; the list keeps the order
    for link in all_links:
        href = link.get('href')
        if not href:
            continue  # anchor without a target: nothing to record
        # Exclude author links and ensure we're only getting article links
        if '/issue/' not in href or 'our-authors' in href:
            continue
        # Resolves relative links against the page; absolute ones pass through.
        url = urljoin(issue_url, href)
        if url not in seen:
            seen.add(url)
            essay_urls.append(url)

    print(f"Found {len(essay_urls)} unique essay URLs in this issue")
    return essay_urls

def scrape_all_essay_urls():
    """Visit every issue page in turn and return all essay URLs as one flat list."""
    collected = []

    for page in get_issue_urls():
        found = get_essay_urls_from_issue(page)
        collected.extend(found)
        time.sleep(1)  # pause between requests to stay polite to the server

    return collected

if __name__ == "__main__":
    # Run the crawl, echo what was found, then persist the list.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('work_in_progress_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [None]:
import requests
from bs4 import BeautifulSoup
import time

def get_archive_url():
    """Return the archive page URL that this cell scrapes."""
    archive_page = "https://instituteforprogress.substack.com/archive"
    return archive_page

def get_essay_urls_from_archive(archive_url):
    """Download an archive page and return the post links present in its HTML.

    Only markup delivered in the HTTP response is inspected; ``requests`` does
    not execute JavaScript, so posts added by scripts after load are not seen.

    Args:
        archive_url: Absolute URL of the archive page to fetch.

    Returns:
        A list of absolute post URLs in the order the selector matched them.
        An empty list is returned when the server does not answer with HTTP 200.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    # Without a timeout a stalled connection would hang the cell indefinitely.
    response = requests.get(archive_url, timeout=30)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    article_links = soup.select('a[data-testid="post-preview-title"]')

    essay_urls = []
    for link in article_links:
        href = link.get('href')
        if href:  # skip anchors that carry no target
            # Resolves relative links against the page; absolute ones pass through.
            essay_urls.append(urljoin(archive_url, href))

    return essay_urls

def scrape_all_essay_urls():
    """Announce the archive page being scraped and return the URLs found on it."""
    target = get_archive_url()
    print(f"Scraping essays from {target}")
    return get_essay_urls_from_archive(target)

if __name__ == "__main__":
    # Scrape the archive, echo the results, then store them on disk.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('institute_for_progress_substack_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

In [30]:
def get_archive_url():
    """Return the archive page URL that the browser-driven scraper opens."""
    archive_page = "https://nathanpmyoung.substack.com/archive"
    return archive_page

def scroll_to_bottom(driver):
    """Keep scrolling to the end of the page until its height stops changing.

    Args:
        driver: Object exposing ``execute_script(js)``; the caller in this
            notebook passes a Selenium Chrome driver.
    """
    measure_height = "return document.body.scrollHeight"
    previous = driver.execute_script(measure_height)
    still_growing = True
    while still_growing:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # let newly requested content arrive before measuring
        current = driver.execute_script(measure_height)
        still_growing = current != previous
        previous = current

def get_essay_urls_from_archive(archive_url):
    """Open the archive in headless Chrome, scroll it fully, and return post links.

    Args:
        archive_url: Absolute URL of the archive page to open.

    Returns:
        A list of absolute post URLs in the order the selector matched them.

    Raises:
        Whatever the Selenium wait raises when no post link appears within
        10 seconds; the browser is shut down before the error propagates.
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(archive_url)

        # Wait for the first post link before scrolling: if scrolling starts on
        # a page that has not rendered yet, the height never changes and the
        # scroll loop would stop immediately.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[data-testid="post-preview-title"]'))
        )

        # Scroll to load all articles
        scroll_to_bottom(driver)

        # Snapshot the DOM while the browser is still alive.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even when the wait or the scroll fails;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

    essay_urls = []
    for link in soup.select('a[data-testid="post-preview-title"]'):
        href = link.get('href')
        if href:  # skip anchors that carry no target
            essay_urls.append(urljoin(archive_url, href))

    return essay_urls

def scrape_all_essay_urls():
    """Announce the archive page being scraped and return the URLs found on it."""
    target = get_archive_url()
    print(f"Scraping essays from {target}")
    return get_essay_urls_from_archive(target)

if __name__ == "__main__":
    # Scrape the archive, echo the results, then store them on disk.
    essay_urls = scrape_all_essay_urls()

    print(f"\nTotal essays found: {len(essay_urls)}")
    for url in essay_urls:
        print(url)

    # One URL per line; the file is overwritten on every run.
    with open('predictive_text_substack_urls.txt', 'w') as f:
        f.writelines(f"{entry}\n" for entry in essay_urls)

Scraping essays from https://nathanpmyoung.substack.com/archive

Total essays found: 69
https://nathanpmyoung.substack.com/p/pantheon-is-really-good
https://nathanpmyoung.substack.com/cp/147682155
https://nathanpmyoung.substack.com/p/truth-seeking-projects-im-interested
https://nathanpmyoung.substack.com/p/forecasting-is-mostly-vibes-so-is
https://nathanpmyoung.substack.com/p/ai-and-integrity
https://nathanpmyoung.substack.com/p/questions-are-too-cheap
https://nathanpmyoung.substack.com/cp/144402757
https://nathanpmyoung.substack.com/p/advice-for-high-schoolers
https://nathanpmyoung.substack.com/p/that-carlsmith-blog-series-in-1-page
https://nathanpmyoung.substack.com/p/my-understanding-of-truth
https://nathanpmyoung.substack.com/p/lets-fix-project-overruns
https://nathanpmyoung.substack.com/p/up-for-debate
https://nathanpmyoung.substack.com/p/theseus-and-the-minotaur
https://nathanpmyoung.substack.com/p/5-of-a-kind
https://nathanpmyoung.substack.com/p/be-more-katja
https://nathanpmyou