In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

In [3]:
def get_all_news_urls(base_url, suffix:str, max_pages:int=None):
    news_urls = []
    page = 1
    continue_search = True
    
    try:
        while continue_search:
            if page == 1:
                current_url = base_url
            else:
                current_url = f"{base_url}/page/{page}"
            
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            }
            
            response = requests.get(current_url, headers=headers)
            
            if response.status_code != 200:
                print(f"Reached end at page {page-1}")
                break
                
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            
            found_on_page = 0
            
            for link in links:
                href = link.get('href')
                if href:
                    full_url = urljoin(current_url, href)
                    if suffix in full_url:
                        if full_url not in news_urls:
                            news_urls.append(str(full_url))
                            found_on_page += 1
            
            print(f"Page {page}: Found {found_on_page} new URLs")
            
            # Check max_pages condition
            if max_pages is not None and page >= max_pages:  # Modified this condition
                print(f"Reached maximum pages limit: {max_pages}")
                break
            if found_on_page == 0:
                print("No new URLs found on this page")
                break
                
            page += 1
            time.sleep(1)
            
        return news_urls
        
    except requests.RequestException as e:
        print(f"Error: {e}")
        return news_urls

def main(base_url:str, suffix:str, max_pages:int=None):
    print(f"In main(), max_pages = {max_pages}")  # Debug print
    urls = get_all_news_urls(base_url, suffix, max_pages=max_pages)  # Explicitly named parameter
    
    print("\nResults Summary:")
    print(f"Total URLs found: {len(urls)}")
    print("\nAll URLs:")
    print(urls)
    for url in urls[:max_pages]:
        print(url)

if __name__ == "__main__":
    main('https://1819news.com/', '/news/item', max_pages=10)

In main(), max_pages = 10
Page 1: Found 27 new URLs
Reached end at page 1

Results Summary:
Total URLs found: 27

All URLs:
['https://1819news.com/news/item/ag-marshall-floats-using-state-consumer-protection-laws-as-accountability-mechanism-for-covid-19-vaccine-harm-misinformation', 'https://1819news.com/news/item/u-s-rep-rogers-pushes-to-end-relationship-with-bloated-bureaucratic-supranational-united-nations', 'https://1819news.com/news/item/defendants-in-former-mobile-police-chief-paul-prines-lawsuit-ask-for-dismissal-offer-response-to-allegations', 'https://1819news.com/news/item/sean-of-the-south-of-life-and-heinz-ketchup', 'https://1819news.com/news/item/fbi-director-kash-patel-sending-500-agents-to-huntsvilles-redstone-arsenal', 'https://1819news.com/news/item/tuberville-britt-urge-new-fbi-director-kash-patel-to-fill-1-000-employee-slots-at-redstone-arsenal', 'https://1819news.com/news/item/alabamas-randy-owen-and-wife-kelly-celebrate-50th-wedding-anniversary', 'https://1819news.