In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

import os

In [10]:
# URL to scrape
base_url = "https://www.usf.edu/business/"

In [11]:
# Make a GET request to fetch the raw HTML content
response = requests.get(base_url)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [14]:
# Find the nav tag with class 'siteNav'
nav_tag = soup.find('nav', class_='siteNav')

# List to store the final links
links = []

# If the nav tag is found, extract hrefs from anchor tags and concatenate with base URL
if nav_tag:
    links.extend([urljoin(base_url, a['href']) for a in nav_tag.find_all('a', href=True)])
else:
    print("No nav tag with class 'siteNav' found.")

In [16]:
links

['https://www.usf.edu/business/undergraduate/index.aspx',
 'https://www.usf.edu/business/graduate/index.aspx',
 'https://www.usf.edu/business/schools/index.aspx',
 'https://www.usf.edu/business/centers/index.aspx',
 'https://www.usf.edu/business/about/index.aspx',
 'https://www.usf.edu/business/resources/index.aspx']

In [5]:
# Iterate over each link from the previous step
for link in links[:]:  # using a slice to iterate over a copy of the list
    # Fetch content of the page
    response = requests.get(link)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the div with id 'content' and class 'content'
    content_div = soup.find('div', id='content', class_='content')
    
    # If the div is found, extract hrefs from anchor tags, concatenate with base URL, and exclude '#'
    if content_div:
        links.extend([urljoin(link, a['href']) for a in content_div.find_all('a', href=True) if a['href'] != '#'])

In [6]:
links

['https://www.usf.edu/business/undergraduate/index.aspx',
 'https://www.usf.edu/business/graduate/index.aspx',
 'https://www.usf.edu/business/schools/index.aspx',
 'https://www.usf.edu/business/centers/index.aspx',
 'https://www.usf.edu/business/about/index.aspx',
 'https://www.usf.edu/business/resources/index.aspx',
 'https://www.usf.edu/business/undergraduate/index.aspx#content-title',
 'https://www.usf.edu/business',
 'https://www.usf.edu/business/undergraduate',
 'https://www.usf.edu/business/undergraduate/index.aspx',
 'https://www.usf.edu/business/undergraduate/requirements.aspx',
 'https://www.usf.edu/business/undergraduate/majors.aspx',
 'https://www.usf.edu/business/undergraduate/minors.aspx',
 'https://www.usf.edu/business/undergraduate/certificates.aspx',
 'https://www.usf.edu/business/undergraduate/special-programs.aspx',
 'https://www.usf.edu/business/undergraduate/advising/index.aspx',
 'https://www.usf.edu/business/student-success/index.aspx',
 'https://www.usf.edu/busin

In [7]:
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()

# List to store the extracted text content, one element per page
text_contents = []

for link in links:
    driver.get(link)
    
    # Click on all anchors with href="#"
    anchors_to_click = driver.find_elements(By.CSS_SELECTOR, '#content.content a[href="#"]')
    for anchor in anchors_to_click:
        try:
            anchor.click()
            # Introducing a short sleep to ensure any dynamic content loads after click
            time.sleep(2)
        except:
            pass  # Handle any exceptions that arise from trying to click the anchor (like it being hidden)

    # Locate the desired div and extract its text content
    try:
        content_div = driver.find_element(By.CSS_SELECTOR, '#content.content .mainContent_well.u-flexItem--largeExtra')
        # Appending the extracted text as a new element in the list
        text_contents.append(content_div.text)
    except:
        pass  # If the desired div isn't found, move on to the next link

driver.quit()

In [8]:
text_contents

["OVERVIEW\nView Program Brochure\nAt the USF Muma College of Business, the business world is part of the classroom and USF's undergraduate curriculum provides rich opportunities to bridge theory and practice, just as the metropolitan location provides opportunities for student internships, part-time jobs, and cooperative education experiences relevant to students' career goals.\nTwo aids in bridging theory and practice are embedded into the curriculum – the Business & Workplace Skills and Best Practices Certificate with Sandler Inc providing students with the workplace skills to launch their careers, and the Citizen Data Science Certificate training students in Tableau data visualization.\nBACHELOR OF ARTS/SCIENCE\nThe two undergraduate degrees offered, the Bachelor of Arts or the Bachelor of Science, are broad enough in nature to give students an understanding of the arts, humanities, and sciences, while narrow enough to provide a thorough understanding of business and management ten

# Ensure the 'Scrapped Pages' directory exists
output_folder = "Scrapped Pages"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Loop through each text content in 'text_contents'
for index, content in enumerate(text_contents, 1):
    # Define the filename using the index to ensure a unique name for each file
    filename = os.path.join(output_folder, f"page_{index}.txt")
    
    # Write the content to the file
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

print(f"Saved {len(text_contents)} pages to '{output_folder}' directory.")

In [7]:
# Define the URL of the webpage to scrape
base_url = "https://www.usf.edu"
url = "https://www.usf.edu/business/about/bios/"

# Create a folder called "BIOS" if it doesn't exist
if not os.path.exists("BIOS"):
    os.makedirs("BIOS")

# Send an HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the div with class="mainContent u-flex--large u-wrapper u-clearfix"
    main_content_div = soup.find('div', class_='mainContent u-flex--large u-wrapper u-clearfix')
    
    # Find all anchor tags within the specified div
    anchor_tags = main_content_div.find_all('a')
    
    # Extract href attributes that start with "/" and are not "mailto:"
    hrefs = [a['href'] for a in anchor_tags if a.has_attr('href') and a['href'].startswith('/') and not a['href'].startswith('mailto:')]
    
    # Concatenate the hrefs with the base URL to get full URLs
    full_urls = [base_url + href for href in hrefs]
    
    # Visit each full URL and save the page content in a text file
    for index, full_url in enumerate(full_urls, start=1):
        page_response = requests.get(full_url)
        
        if page_response.status_code == 200:
            page_soup = BeautifulSoup(page_response.content, 'html.parser')
            
            # Find the div with class="mainContent_well u-flexItem--largeExtra"
            well_div = page_soup.find('div', class_='mainContent_well u-flexItem--largeExtra')
            
            # Extract and save the text data from the div in a text file
            if well_div:
                text_data = well_div.get_text(strip=True)
                filename = f"BIOS/page_{index}.txt"
                with open(filename, "w", encoding="utf-8") as file:
                    file.write(text_data)
                print(f"Saved page {index} to {filename}")
            else:
                print("Div with class 'mainContent_well u-flexItem--largeExtra' not found on:", full_url)
        else:
            print("Failed to retrieve the page at:", full_url)
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

Saved page 1 to BIOS/page_1.txt
Saved page 2 to BIOS/page_2.txt
Saved page 3 to BIOS/page_3.txt
Saved page 4 to BIOS/page_4.txt
Saved page 5 to BIOS/page_5.txt
Saved page 6 to BIOS/page_6.txt
Saved page 7 to BIOS/page_7.txt
Saved page 8 to BIOS/page_8.txt
Saved page 9 to BIOS/page_9.txt
Saved page 10 to BIOS/page_10.txt
Saved page 11 to BIOS/page_11.txt
Saved page 12 to BIOS/page_12.txt
Saved page 13 to BIOS/page_13.txt
Saved page 14 to BIOS/page_14.txt
Saved page 15 to BIOS/page_15.txt
Saved page 16 to BIOS/page_16.txt
Saved page 17 to BIOS/page_17.txt
Saved page 18 to BIOS/page_18.txt
Saved page 19 to BIOS/page_19.txt
Saved page 20 to BIOS/page_20.txt
Saved page 21 to BIOS/page_21.txt
Saved page 22 to BIOS/page_22.txt
Saved page 23 to BIOS/page_23.txt
Saved page 24 to BIOS/page_24.txt
Saved page 25 to BIOS/page_25.txt
Saved page 26 to BIOS/page_26.txt
Saved page 27 to BIOS/page_27.txt
Saved page 28 to BIOS/page_28.txt
Saved page 29 to BIOS/page_29.txt
Saved page 30 to BIOS/page_30.tx

Saved page 235 to BIOS/page_235.txt
Saved page 236 to BIOS/page_236.txt
Saved page 237 to BIOS/page_237.txt
Saved page 238 to BIOS/page_238.txt
Saved page 239 to BIOS/page_239.txt
Saved page 240 to BIOS/page_240.txt
Saved page 241 to BIOS/page_241.txt
Saved page 242 to BIOS/page_242.txt
Saved page 243 to BIOS/page_243.txt
Saved page 244 to BIOS/page_244.txt
Saved page 245 to BIOS/page_245.txt
Saved page 246 to BIOS/page_246.txt
Saved page 247 to BIOS/page_247.txt
Saved page 248 to BIOS/page_248.txt
Saved page 249 to BIOS/page_249.txt
Saved page 250 to BIOS/page_250.txt
Saved page 251 to BIOS/page_251.txt
Saved page 252 to BIOS/page_252.txt
Saved page 253 to BIOS/page_253.txt
Saved page 254 to BIOS/page_254.txt
Saved page 255 to BIOS/page_255.txt
Saved page 256 to BIOS/page_256.txt
Saved page 257 to BIOS/page_257.txt
Saved page 258 to BIOS/page_258.txt
Saved page 259 to BIOS/page_259.txt
Saved page 260 to BIOS/page_260.txt
Saved page 261 to BIOS/page_261.txt
Saved page 262 to BIOS/page_