In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

In [2]:
def get_all_news_urls(base_url, suffix:str, max_pages:int=None):
    news_urls = []
    page = 1
    continue_search = True
    
    try:
        while continue_search:
            if page == 1:
                current_url = base_url
            else:
                current_url = f"{base_url}/page/{page}"
            
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            }
            
            response = requests.get(current_url, headers=headers)
            
            if response.status_code != 200:
                print(f"Reached end at page {page-1}")
                break
                
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            
            found_on_page = 0
            
            for link in links:
                href = link.get('href')
                if href:
                    full_url = urljoin(current_url, href)
                    if suffix in full_url:
                        if full_url not in news_urls:
                            news_urls.append(str(full_url))
                            found_on_page += 1
            
            print(f"Page {page}: Found {found_on_page} new URLs")
            
            # Check max_pages condition
            if max_pages is not None and page >= max_pages:  # Modified this condition
                print(f"Reached maximum pages limit: {max_pages}")
                break
            if found_on_page == 0:
                print("No new URLs found on this page")
                break
                
            page += 1
            time.sleep(1)
            
        return news_urls
        
    except requests.RequestException as e:
        print(f"Error: {e}")
        return news_urls

def main(base_url:str, suffix:str, max_pages:int=None):
    print(f"In main(), max_pages = {max_pages}")  # Debug print
    urls = get_all_news_urls(base_url, suffix, max_pages=max_pages)  # Explicitly named parameter
    
    print("\nResults Summary:")
    print(f"Total URLs found: {len(urls)}")
    print("\nAll URLs:")
    print(urls)
    for url in urls[:max_pages]:
        print(url)

if __name__ == "__main__":
    main('https://aldailynews.com', '', max_pages=10)

In main(), max_pages = 10
Page 1: Found 36 new URLs
Page 2: Found 3 new URLs
Page 3: Found 3 new URLs
Page 4: Found 3 new URLs
Page 5: Found 3 new URLs
Page 6: Found 3 new URLs
Page 7: Found 3 new URLs
Page 8: Found 3 new URLs
Page 9: Found 3 new URLs
Page 10: Found 3 new URLs
Reached maximum pages limit: 10

Results Summary:
Total URLs found: 63

All URLs:
['https://aldailynews.com#daily-news', 'https://aldailynews.com/', 'https://aldailynews.com/login', 'https://aldailynews.com/logout/?redirect_to=https%3A%2F%2Faldailynews.com&_wpnonce=3b2e12d755', 'https://aldailynews.com/membership-account/subscription/', 'https://aldailynews.com/membership-account/', 'https://aldailynews.com/category/inside-alabama-politics/', 'https://aldailynews.com/category/new-member-profiles/', 'https://aldailynews.com/category/commentary/', 'https://aldailynews.com/category/podcasts-and-videos/', 'https://aldailynews.com/category/education/', 'https://aldailynews.com/category/legislature-2025/', 'https://ald