In [1]:
import requests
from bs4 import BeautifulSoup
from typing import List
import time

def is_recipe_page(url: str) -> bool:
    """
    Decide whether a URL points at a recipe page.

    The page is fetched and parsed, and it counts as a recipe only when
    both the text 'print recipe' and the text 'pin recipe' occur
    (case-insensitively) inside some text node of the document.

    Args:
        url (str): Address of the page; surrounding whitespace is stripped
            before the request is made.

    Returns:
        bool: True when both marker texts are present. False when either
        is missing or when anything goes wrong while fetching or parsing
        (the error is printed rather than raised).
    """
    def _mentions(phrase):
        # Build a predicate for soup.find(string=...) that matches text
        # nodes containing `phrase`, ignoring case.
        def _predicate(text):
            if not text:
                return False
            return phrase in text.lower()
        return _predicate

    markers = ('print recipe', 'pin recipe')
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        page = requests.get(url.strip(), headers=request_headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        page.raise_for_status()

        document = BeautifulSoup(page.text, 'html.parser')

        # One lookup per marker; every marker must be found.
        matches = [document.find(string=_mentions(marker)) for marker in markers]
        return all(matches)

    except Exception as err:
        print(f"Error processing {url}: {str(err)}")
        return False

def process_url_file(input_file: str, output_file: str, delay: float = 1.0) -> None:
    """
    Process a file of URLs and write valid recipe URLs to a new file.

    The input file is read as one URL per line. Surrounding whitespace is
    stripped and blank lines are ignored, so they neither count as URLs
    nor trigger a request. Each remaining URL is checked with
    is_recipe_page(); URLs that pass are written to the output file one
    per line, in input order. Progress and a final summary are printed.

    Each valid URL is written and flushed as soon as it is found, so an
    interrupted run keeps the results collected so far.

    Errors (for example a missing input file) are printed, not raised.

    Args:
        input_file (str): Path to input file containing URLs
        output_file (str): Path to output file for valid recipe URLs
        delay (float): Seconds to wait between consecutive requests.
            Values <= 0 disable the pause. Defaults to 1.0.
    """
    print(f"Reading URLs from {input_file}...")

    try:
        # 'utf-8-sig' reads plain UTF-8/ASCII and also drops a leading BOM,
        # which would otherwise end up glued to the first URL.
        with open(input_file, 'r', encoding='utf-8-sig') as f:
            stripped_lines = (line.strip() for line in f)
            urls = [url for url in stripped_lines if url]

        total_urls = len(urls)
        print(f"Found {total_urls} URLs to process")

        valid_count = 0

        # The output file is opened only after the input was read
        # successfully, so a bad input path does not clobber old results.
        with open(output_file, 'w', encoding='utf-8') as out:
            for i, url in enumerate(urls, 1):
                print(f"\nProcessing URL {i}/{total_urls}: {url}")

                if is_recipe_page(url):
                    out.write(f"{url}\n")
                    # Flush right away so partial results survive an abort.
                    out.flush()
                    valid_count += 1
                    print("✓ Valid recipe page")
                else:
                    print("✗ Not a recipe page")

                # Be nice to the server, but do not wait after the last URL.
                if i < total_urls and delay > 0:
                    time.sleep(delay)

        print("\nProcessing complete!")
        print(f"Total URLs processed: {total_urls}")
        print(f"Valid recipe URLs found: {valid_count}")
        print(f"Valid URLs written to: {output_file}")

    except Exception as e:
        # Script-level boundary: report the problem instead of crashing.
        print(f"Error: {e}")

def main():
    """
    Run the recipe-URL filter with the default file locations.

    Reads URLs from "recipe_urls.txt" and writes the ones recognised as
    recipe pages to "valid_recipe_urls.txt".
    """
    source_path = "recipe_urls.txt"  # one URL per line
    destination_path = "valid_recipe_urls.txt"  # receives the valid URLs

    process_url_file(input_file=source_path, output_file=destination_path)

# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Reading URLs from recipe_urls.txt...
Found 1280 URLs to process

Processing URL 1/1280: https://pinchofyum.com/bang-bang-salmon-with-avocado-cucumber-salsa
✓ Valid recipe page

Processing URL 2/1280: https://pinchofyum.com/scallion-pancake-with-eggs
✓ Valid recipe page

Processing URL 3/1280: https://pinchofyum.com/homemade-alfredo-sauce
✓ Valid recipe page

Processing URL 4/1280: https://pinchofyum.com/sticky-gochujang-tofu-with-herbs-and-peanuts
✓ Valid recipe page

Processing URL 5/1280: https://pinchofyum.com/chicken-tacos
✓ Valid recipe page

Processing URL 6/1280: https://pinchofyum.com/garlic-and-black-pepper-beef-skewers
✓ Valid recipe page

Processing URL 7/1280: https://pinchofyum.com/chicken-with-coconut-kale
✓ Valid recipe page

Processing URL 8/1280: https://pinchofyum.com/ricotta-meatballs-with-the-crispy-topping
✓ Valid recipe page

Processing URL 9/1280: https://pinchofyum.com/welcome-to-the-sos-series
✗ Not a recipe page

Processing URL 10/1280: https://pinchofyum.com/