In [4]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# Function to extract emails from a web page
def extract_emails(text):
    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    emails = re.findall(email_pattern, text)
    return emails

# Function to extract all URLs from a base page
def extract_urls(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    urls = set()
    # Find all <a> tags to get href links
    for link in soup.find_all('a', href=True):
        full_url = urljoin(base_url, link['href'])
        urls.add(full_url)
    return urls

# Function to scrape data (emails and other parameters)
def scrape_data(base_url):
    urls = extract_urls(base_url)
    all_emails = set()
    for url in urls:
        try:
            print(f'Scraping URL: {url}')
            response = requests.get(url)
            if response.status_code == 200:
                page_text = response.text
                # Extract emails from the page content
                emails = extract_emails(page_text)
                all_emails.update(emails)
        except Exception as e:
            print(f'Error scraping {url}: {e}')
    return all_emails

# Main execution
if __name__ == "__main__":
    base_url = 'https://vitbhopal.ac.in/'  # Target base URL
    emails = scrape_data(base_url)
    print(f"Emails found: {emails}")


Scraping URL: https://www.instagram.com/p/CjmWnnVIxZI/
Scraping URL: https://vitbhopal.ac.in/controller-of-examinations/
Scraping URL: https://scontent.cdninstagram.com/v/t51.29350-15/313728665_514974173855862_6038399911470391137_n.jpg?_nc_cat=104&ccb=1-7&_nc_sid=8ae9d6&_nc_ohc=4VDnHTD5KKgAX_cKjIl&_nc_ht=scontent.cdninstagram.com&edm=ANo9K5cEAAAA&oh=00_AfDpsCYSwi4zpq2MfaHRGrVAz5g3SOFPP339BxXSUPibrA&oe=636BAF64
Scraping URL: https://vitbhopal.ac.in/placement-overview/
Scraping URL: https://vitbhopal.ac.in/internship-placements/
Scraping URL: https://scontent.cdninstagram.com/v/t51.29350-15/311810332_180614894496450_3581879533604932487_n.jpg?_nc_cat=109&ccb=1-7&_nc_sid=8ae9d6&_nc_ohc=OvZmS6KVk2EAX_FOHdp&_nc_ht=scontent.cdninstagram.com&edm=ANo9K5cEAAAA&oh=00_AfAi5ZiuhAOJnxUrSfBldTPogTahKbjn4V-P4vvaH6yFZg&oe=636AB410
Scraping URL: https://scontent.cdninstagram.com/v/t51.29350-15/311372797_147998754603025_9141007060745456813_n.jpg?_nc_cat=107&ccb=1-7&_nc_sid=8ae9d6&_nc_ohc=qnqWY9RDn1gAX--M