In [1]:
import time
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import spacy
from urllib.parse import urljoin, urlparse
import time


import pandas as pd
from selenium import webdriver
import time
from urllib.parse import urlparse, urljoin
from Uniscraper.Uniscraper import uniscraper
def generate_url_list(school_info, max_links=20):
    """
    This function returns the subdomain links visible from a food bank or wellness programs homepage.
    Params:
        school_info: DataFrame with 'school_name' and 'url' columns
        max_links: max size of the list being returned for each school
    Returns:
        result_df: DataFrame with 'school_name' and 'url' columns
    """
    all_links = []  

    
    for index, row in school_info.iterrows():
        #get school name and url to base/starting page
        school_name = row["school_name"] 
        url = row["url"]  

        
        driver = webdriver.Chrome() 
        driver.get(url)
        time.sleep(1)

        # parsing url to ensure consistency and proper formatting
        parsed_url = urlparse(url)
        #takes elements such as scheme and netloc to create valid base domain
        base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

        # set data structure used to avoid duplicates
        links = set()

        #looping though each sublink
        for a in driver.find_elements(By.TAG_NAME, "a"):
            href = a.get_attribute("href")
            if href: # if link exists
                # joining to ensure only focused websites are being generated
                full_link = urljoin(base_domain, href)
                #adding to list of links if it has base domain 
                if full_link.startswith(base_domain) and full_link not in links:
                    links.add(full_link)
                    if len(links) >= max_links: #stopping point after max_links
                        break

        
        driver.quit()

        # all links for a school will have school name but different urls
        for link in links:
            all_links.append({"school_name": school_name, "url": link})

    # conver to dataframe (table)
    result_df = pd.DataFrame(all_links)

    return result_df



# Example DataFrame to hold the school names and their corresponding homepage URLs
school_info = pd.DataFrame({
    "school_name": ["UNC", "UGA","UC Davis", "UCLA", "PSU"], # Add your school names here
    "url": ["https://dos.unc.edu/student-support/basicneeds/",
            "https://well-being.uga.edu/basic-needs/#:~:text=YOUR%20BASIC%20WELL-BEING%20NEEDS&text=Access%20to%20essential%20resources%20is,to%20students%20at%20no%20cost",
            "https://financialaid.ucdavis.edu/wellness-outreach/basic-needs",
            "https://bewellbruin.ucla.edu/resource/ucla-basic-needs",
            "https://studentaffairs.psu.edu/basic-needs-support"
            ]  
})

result = generate_url_list(school_info)

# Load NLP model (helps categorize text automatically)
nlp = spacy.load("en_core_web_sm")

# Define university URLs to scrape
university_urls = result["url"]
school_names = result["school_name"]

# Define keywords to extract relevant content
keywords = [
    "Food Security", "Housing Stability", "Financial Assistance", "Healthcare Services", "Mental Health Support",
    "Transportation Access", "Personal Care Items", "Childcare Support", "Technology Access", "Clothing & Weather Essentials",
    "Academic Support", "Community & Belonging", "School Supplies", "Cooking Supplies", "Cleaning Supplies",
    "Nutrition Education", "Financial Literacy", "Legal Support", "Crisis Intervention", "Laundry Access",
    "Career Resources", "Substance Abuse Support", "Financial Counseling", "Emergency Housing", 
    "Immigration & International Student Support", "Communication Services", "Domestic Violence Resources"
]

# Initialize WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Initialize list to store extracted data
data = []

def clean_text(text):
    """Removes excessive spaces, newlines, and special characters from text."""
    return re.sub(r'\s+', ' ', text).strip()

def extract_relevant_text(url):
    """Extracts relevant content based on predefined keywords."""
    driver.get(url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    text = clean_text(soup.get_text().lower())
    extracted_info = {"URL": url, "Text": text}
    
    for keyword in keywords:
        extracted_info[keyword] = "Yes" if keyword.lower() in text else "No"
    
    return extracted_info

def extract_contact_info(text):
    """Extracts email and phone numbers from the scraped text."""
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    phones = re.findall(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)
    return {"Emails": ", ".join(set(emails)), "Phone Numbers": ", ".join(set(phones))}

def categorize_services(text):
    """Uses NLP to categorize extracted text into relevant service categories."""
    doc = nlp(text)
    categories = {key: "No" for key in keywords}
    
    for sent in doc.sents:
        for key in keywords:
            if key.lower() in sent.text.lower():
                categories[key] = "Yes"
    return categories

print("Starting enhanced web scraping...")
for url in university_urls:
    try:
        print(f"Scraping: {url}")
        extracted_data = extract_relevant_text(url)
        contact_info = extract_contact_info(extracted_data["Text"])
        categorized_data = categorize_services(extracted_data["Text"])
        
        # Merge all extracted data
        final_data = {**extracted_data, **contact_info, **categorized_data}
        data.append(final_data)
    except Exception as e:
        print(f"Error scraping {url}: {e}")

driver.quit()

# Convert to DataFrame and save as CSV
df = pd.DataFrame(data)
df.drop(columns=["Text"], inplace=True) 
df["school_name"] = result["school_name"] # Remove raw text to keep CSV clean

df


Starting enhanced web scraping...
Scraping: https://dos.unc.edu/student-support/basicneeds/financial-security/
Scraping: https://dos.unc.edu/events
Scraping: https://dos.unc.edu/student-support/basicneeds/food-security/
Scraping: https://dos.unc.edu/student-support/basicneeds/
Scraping: https://dos.unc.edu/about-us/
Scraping: https://dos.unc.edu/about-us/meet-with-dos/
Scraping: https://dos.unc.edu/student-support/care-referral-form/
Scraping: https://dos.unc.edu/for-faculty-staff/
Scraping: https://dos.unc.edu/carolina-veterans-resource-center/
Scraping: https://dos.unc.edu/
Scraping: https://dos.unc.edu/student-support/basicneeds/#brx-footer
Scraping: https://dos.unc.edu/about-us/our-staff/
Scraping: https://dos.unc.edu/urgent-concerns/
Scraping: https://dos.unc.edu/student-support/basicneeds/housing-security/
Scraping: https://dos.unc.edu/student-support/basicneeds/communityresources/
Scraping: https://dos.unc.edu/student-support/basicneeds/#brx-content
Scraping: https://dos.unc.edu

Unnamed: 0,URL,Food Security,Housing Stability,Financial Assistance,Healthcare Services,Mental Health Support,Transportation Access,Personal Care Items,Childcare Support,Technology Access,...,Career Resources,Substance Abuse Support,Financial Counseling,Emergency Housing,Immigration & International Student Support,Communication Services,Domestic Violence Resources,Emails,Phone Numbers,school_name
0,https://dos.unc.edu/student-support/basicneeds...,Yes,No,Yes,No,No,No,No,No,No,...,No,No,No,No,No,No,No,"help@studentaid.unc.edu., dos@unc.edu, centerc...","919-962-9640, 919-966-4042",UNC
1,https://dos.unc.edu/events,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,"dos@unc.edu, centercvrc@unc.edu","919-962-9640, 919-966-4042",UNC
2,https://dos.unc.edu/student-support/basicneeds...,Yes,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,"dos@unc.edu, carolinacupboard@gmail.com., cent...","919-962-9640, 919-966-4042",UNC
3,https://dos.unc.edu/student-support/basicneeds/,Yes,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,"dos@unc.edu, centercvrc@unc.edu","919-962-9640, 919-966-4042",UNC
4,https://dos.unc.edu/about-us/,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,"dos@unc.edu, centercvrc@unc.edu","919-962-9640, 919-966-4042",UNC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,https://studentaffairs.psu.edu/find-support#he...,No,No,Yes,No,No,No,No,No,No,...,No,No,No,No,No,No,No,,,PSU
93,https://studentaffairs.psu.edu/get-involved-0#...,No,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,,,PSU
94,https://studentaffairs.psu.edu/find-support#ad...,No,No,Yes,No,No,No,No,No,No,...,No,No,No,No,No,No,No,,,PSU
95,https://studentaffairs.psu.edu/find-support#me...,No,No,Yes,No,No,No,No,No,No,...,No,No,No,No,No,No,No,,,PSU
