In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch one page, parse it, and collect its heading and paragraph text.
url = "https://example.com"

# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() surfaces 4xx/5xx answers instead of silently parsing
# an error page as if it were the real content.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Retrieve the HTML content
html_content = response.text

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Extract the desired data with find()/find_all().
# find() returns None when the page has no <h1>, so guard before reading
# .text; title is None in that case.
heading = soup.find("h1")
title = heading.text if heading is not None else None
paragraphs = [p.text for p in soup.find_all("p")]

# Store the data in a desired format
data = {
    "title": title,
    "paragraphs": paragraphs
}

# Example: Print the scraped data
print(data)


## Types of HTML parsers

In [None]:
from bs4 import BeautifulSoup

# A tiny document that is parsed once with each parser backend.
html_content = "<html><body><h1>Hello, BeautifulSoup!</h1></body></html>"

# Build the soup with "html.parser", then "lxml", then "html5lib".
# `soup` is rebound on every pass, so it ends up holding the html5lib result.
for parser_name in ("html.parser", "lxml", "html5lib"):
    soup = BeautifulSoup(html_content, parser_name)


## Using try-except blocks for error handling:

In [3]:
import requests

# Fetch `url` and report failures, most specific exception first.
# HTTPError is a subclass of RequestException, so its handler must come
# first; listed after the base class it could never run.
try:
    # The timeout makes a stalled server raise instead of blocking forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Process the response or extract data here
except requests.exceptions.HTTPError as e:
    # Handle HTTP errors, such as 404 or 500 status codes
    print("HTTP error:", e)
except requests.exceptions.RequestException as e:
    # Handle every other request failure: connection errors, timeouts, invalid URLs
    print("Request error:", e)


## Logging errors:

In [4]:
import logging
import requests

# Only ERROR and above are emitted by the root logger.
logging.basicConfig(level=logging.ERROR)

# Fetch `url` and log failures, most specific exception first.
# HTTPError derives from RequestException, so it has to be caught before
# the base class or its handler is unreachable.
try:
    # The timeout makes a stalled server raise instead of blocking forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Process the response or extract data here
except requests.exceptions.HTTPError as e:
    # The server answered, but with a 4xx/5xx status
    logging.error("HTTP error: %s", e)
except requests.exceptions.RequestException as e:
    # Any other request failure (connection error, timeout, invalid URL, ...)
    logging.error("Request error: %s", e)


## Retrying failed requests with exponential backoff:

In [5]:
import requests
import time

MAX_RETRIES = 3
RETRY_DELAY = 2  # base delay in seconds; doubled after every failed attempt

# Try the request up to MAX_RETRIES times, waiting RETRY_DELAY * 2**attempt
# seconds between attempts (2s, 4s, ...) so the wait grows exponentially.
for attempt in range(MAX_RETRIES):
    try:
        # Without a timeout a hung connection would block and never be retried.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Process the response or extract data here
        break  # Break the loop if the request is successful
    except requests.exceptions.RequestException as e:
        print("Request error:", e)
        # Back off before the next attempt; sleeping after the last one
        # would only delay the failure message.
        if attempt < MAX_RETRIES - 1:
            time.sleep(RETRY_DELAY * 2 ** attempt)
else:
    # The for/else branch runs only when the loop finished without `break`.
    print("Max retries exceeded, request failed")


## Implementing timeouts:

In [6]:
import requests

# Passing timeout=5 makes requests raise (a RequestException subclass)
# rather than wait indefinitely when the server does not respond.
try:
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    # Process the response or extract data here
except requests.RequestException as exc:
    # requests.RequestException is the same class as requests.exceptions.RequestException
    print("Request error:", exc)


## How to simulate user authentication using the requests library in Python

In [7]:
import os

import requests

# URL of the login page
login_url = 'https://example.com/login'

# User credentials: read from the environment so real secrets never end up
# in the notebook; the placeholders are only a fallback for the demo.
username = os.environ.get('SCRAPER_USERNAME', 'your_username')
password = os.environ.get('SCRAPER_PASSWORD', 'your_password')

# Seconds to wait for each request before giving up
request_timeout = 10

# The context manager closes the session (and its pooled connections) even
# when one of the requests below raises.
with requests.Session() as session:
    # Send a GET request to the login page to retrieve necessary cookies and tokens
    response = session.get(login_url, timeout=request_timeout)

    # Extract necessary form data (such as CSRF token) from the response
    # ...

    # Prepare the login payload with the required form data
    payload = {
        'username': username,
        'password': password,
        'csrf_token': 'your_csrf_token'
    }

    # Send a POST request to the login page with the login payload
    response = session.post(login_url, data=payload, timeout=request_timeout)

    # Check if authentication was successful (based on the response)
    if response.status_code == 200:
        print("Login successful")

        # Only an authenticated session can usefully access protected pages,
        # so this request is skipped when the login failed.
        response = session.get('https://example.com/protected_page', timeout=request_timeout)

        # Process the response and extract the desired data
        # ...
    else:
        print("Login failed")


Login failed


## Ethical considerations when web scraping with Python

In [8]:
import requests
import time
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

# URL of the website you intend to scrape
url = 'https://example.com/'

# Identify the scraper honestly; the same name is used for the robots.txt check
user_agent = 'Your User Agent'
headers = {'User-Agent': user_agent}

delay_between_requests = 2  # Minimum pause (seconds) between two requests
max_requests = 3            # Upper bound so the demo loop always terminates
request_timeout = 10        # Seconds to wait for each request before giving up

# Build the robots.txt URL from the site's scheme and host only
robots_url = urlparse(url)._replace(
    path='/robots.txt', params='', query='', fragment=''
).geturl()
response = requests.get(robots_url, headers=headers, timeout=request_timeout)

# Check if the website has a robots.txt file
if response.status_code == 200:
    robots_txt = response.text

    # Parse the rules properly. A substring test such as
    # "'Disallow: /' not in robots_txt" also matches path-specific rules
    # like "Disallow: /private" and would wrongly report a full ban.
    robots = RobotFileParser()
    robots.parse(robots_txt.splitlines())

    # Check if the website allows this user agent to fetch the target URL
    if robots.can_fetch(user_agent, url):
        # Honour a Crawl-delay directive when it asks for a longer pause
        crawl_delay = robots.crawl_delay(user_agent)
        if crawl_delay is not None:
            delay_between_requests = max(delay_between_requests, crawl_delay)

        # Start scraping the website
        for request_number in range(max_requests):
            # Pause before every request except the first, so a failed
            # request is followed by the same delay as a successful one.
            if request_number:
                time.sleep(delay_between_requests)

            try:
                # Send a GET request to the website
                response = requests.get(url, headers=headers, timeout=request_timeout)
                # Process the response or extract the desired data
                # ...
            except requests.exceptions.RequestException as e:
                # Handle request exceptions, such as connection errors
                print("Request error:", e)

            # Break the loop early if scraping is complete or a condition is met
            # ...

        # Ensure the scraped data is used responsibly and legally
        # ...

    else:
        print("Scraping not allowed by the website's robots.txt file")
else:
    print("Website does not have a robots.txt file")


Website does not have a robots.txt file


## Implementing monitoring and maintenance features in a Python web scraping script

In [9]:
import requests
import logging
import time

# Set up logging.
# basicConfig() does nothing when the root logger already has handlers, which
# is the case once any earlier cell has configured logging. force=True
# (Python 3.8+) replaces those handlers so INFO records really reach
# scraping.log.
logging.basicConfig(
    filename='scraping.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Target URL to scrape
url = 'https://example.com/'

# Function to scrape the website
def scrape_website(target_url=None, timeout=10):
    """Fetch a page once and log whether the request succeeded.

    Args:
        target_url: Address to fetch. Defaults to the module-level ``url``.
        timeout: Seconds to wait for the server before requests raises;
            without it a stalled connection would block the job forever.

    Raises:
        requests.exceptions.RequestException: Re-raised after being logged,
            so the caller can decide how to react.
    """
    if target_url is None:
        target_url = url

    try:
        # Send a GET request to the website
        response = requests.get(target_url, timeout=timeout)

        # Check if the request was successful
        if response.status_code == 200:
            # Process the response or extract the desired data
            # ...

            # Log a success message
            logging.info('Scraping successful')
        else:
            # Log an error message; %s formatting is deferred to the logger
            logging.error('Request failed with status code: %s', response.status_code)

    except requests.exceptions.RequestException as e:
        # Log the exception message
        logging.error('Request error: %s', e)
        # Raise the exception to be handled at a higher level
        raise

# Function to periodically scrape the website
def run_scraping_job(interval_seconds=3600, max_runs=None):
    """Call scrape_website() repeatedly, pausing between runs.

    Args:
        interval_seconds: Pause between two consecutive runs. The default
            of 3600 scrapes once per hour.
        max_runs: Stop after this many runs. ``None`` (the default) keeps
            the job running until it is interrupted.
    """
    completed_runs = 0
    while max_runs is None or completed_runs < max_runs:
        # Wait for the interval before every run except the first
        if completed_runs:
            time.sleep(interval_seconds)

        try:
            # Scrape the website
            scrape_website()

        except Exception as e:
            # One failed run must not stop the job; log it with its
            # traceback and carry on with the next scheduled run.
            logging.exception('An error occurred: %s', e)
            # Implement additional error handling or notifications if needed

        completed_runs += 1

# Start the periodic job when this code is executed directly.
if __name__ == "__main__":
    try:
        run_scraping_job()
    except KeyboardInterrupt:
        # Stopping the job by hand is expected; just record it in the log.
        logging.info("Script interrupted by user")


: 

## Implementing proxies or rotating IP addresses in Python using the Requests library

In [1]:
import requests

# Page that will be requested through the proxies
url = "https://example.com/"

# Proxy pool, in rotation order (append further proxy URLs as needed)
proxies = [
    "http://proxy1.example.com:8080",
    "http://proxy2.example.com:8080",
    "http://proxy3.example.com:8080",
]

# Function to scrape the website using a proxy
def scrape_website_with_proxy():
    """Fetch ``url`` through the next proxy in the module-level ``proxies`` list.

    The proxy at the front of the list is used for the request and then moved
    to the back, whether or not the request succeeded, so the pool keeps its
    size and the next call uses a different proxy. Request errors are printed,
    not raised. With an empty pool a message is printed and nothing is requested.
    """
    # pop(0) on an empty list would raise an IndexError
    if not proxies:
        print('No proxies available')
        return

    # Select a proxy from the list
    proxy = proxies.pop(0)

    try:
        # Send a GET request to the website using the proxy; the timeout
        # stops a dead proxy from blocking the call indefinitely.
        response = requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=10)

        # Process the response or extract the desired data
        # ...

    except requests.exceptions.RequestException as e:
        # Handle request exceptions
        print('Request error:', e)

    finally:
        # Add the used proxy back to the list for rotation. Doing this in
        # `finally` keeps a failing proxy in the pool instead of silently
        # dropping it.
        proxies.append(proxy)

# Run a single proxied request when this code is executed directly.
if __name__ == "__main__":
    scrape_website_with_proxy()


Request error: HTTPSConnectionPool(host='example.com', port=443): Max retries exceeded with url: / (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002E5E05D2050>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')))
