In [2]:
import time
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import spacy

# spaCy English pipeline; categorize_services() uses it to split text into sentences.
nlp = spacy.load("en_core_web_sm")

# Basic-needs landing pages to visit, one per university.
university_urls = [
    "https://mbc.studentlife.umich.edu",
    "https://basicneeds.ucsb.edu",
    "https://www.berkeley.edu/basicneeds/",
    "https://basicneeds.ucsd.edu/",
    "https://basicneeds.osu.edu/",
]

# Service categories to look for. Each one becomes a "Yes"/"No" column in the
# output and is matched as a case-insensitive substring of the page text.
keywords = [
    "Food Security",
    "Housing Stability",
    "Financial Assistance",
    "Healthcare Services",
    "Mental Health Support",
    "Transportation Access",
    "Personal Care Items",
    "Childcare Support",
    "Technology Access",
    "Clothing & Weather Essentials",
    "Academic Support",
    "Community & Belonging",
    "School Supplies",
    "Cooking Supplies",
    "Cleaning Supplies",
    "Nutrition Education",
    "Financial Literacy",
    "Legal Support",
    "Crisis Intervention",
    "Laundry Access",
    "Career Resources",
    "Substance Abuse Support",
    "Financial Counseling",
    "Emergency Housing",
    "Immigration & International Student Support",
    "Communication Services",
    "Domestic Violence Resources",
]

# Start a Chrome session; webdriver_manager supplies the chromedriver binary path.
_chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(_chromedriver_path))

# Accumulates one merged result dict per successfully scraped URL.
data = []

def clean_text(text):
    """Collapse each run of whitespace into a single space and trim both ends.

    Only whitespace (spaces, tabs, newlines, ...) is affected; every other
    character is returned unchanged.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def extract_relevant_text(url):
    """Load *url* in the shared browser and flag which keywords its text mentions.

    Args:
        url: Address of the page to open with the module-level ``driver``.

    Returns:
        A dict with ``"URL"`` (the address), ``"Text"`` (the page's visible
        text, lower-cased and whitespace-normalised) and one ``"Yes"``/``"No"``
        entry per item in ``keywords``.

    Raises:
        Whatever ``driver.get`` raises when the page cannot be loaded; the
        exception is not handled here.
    """
    driver.get(url)
    # Fixed pause before reading the DOM, presumably to let scripted content render.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Without a separator, text from adjacent elements is concatenated directly
    # ("...Pantry</li><li>Email..." -> "pantryemail"), which breaks keyword
    # matching and the e-mail/phone patterns applied to this text later.
    # clean_text() collapses the extra spaces this introduces.
    text = clean_text(soup.get_text(separator=" ").lower())
    extracted_info = {"URL": url, "Text": text}

    # Case-insensitive substring match: text is already lower-cased.
    extracted_info.update(
        (keyword, "Yes" if keyword.lower() in text else "No")
        for keyword in keywords
    )

    return extracted_info

def extract_contact_info(text):
    """Extract the distinct e-mail addresses and phone numbers found in *text*.

    Args:
        text: Text to search.

    Returns:
        A dict with two keys, ``"Emails"`` and ``"Phone Numbers"``. Each value
        is a ``", "``-joined string of the unique matches in order of first
        appearance, or an empty string when nothing matched.
    """
    emails = []
    for match in re.findall(r'[\w\.-]+@[\w\.-]+', text):
        # The pattern allows '.' and '-' anywhere, so an address at the end of
        # a sentence is captured with its trailing punctuation; trim it.
        email = match.rstrip(".-")
        # Skip matches whose domain consisted only of punctuation.
        if not email.endswith("@"):
            emails.append(email)
    phones = re.findall(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)

    # dict.fromkeys de-duplicates while keeping insertion order, so the output
    # is stable between runs (joining a set is not).
    return {
        "Emails": ", ".join(dict.fromkeys(emails)),
        "Phone Numbers": ", ".join(dict.fromkeys(phones)),
    }

def categorize_services(text):
    """Mark each keyword "Yes" if any sentence of *text* contains it.

    The text is run through the module-level spaCy pipeline ``nlp`` and split
    into sentences; a keyword counts as present when it occurs, ignoring
    case, as a substring of at least one sentence.

    Returns:
        A dict mapping every entry of ``keywords`` to ``"Yes"`` or ``"No"``.
    """
    doc = nlp(text)
    # Lower-case each keyword once instead of once per sentence.
    needles = {key: key.lower() for key in keywords}
    categories = dict.fromkeys(keywords, "No")

    for sent in doc.sents:
        sentence = sent.text.lower()
        for key, needle in needles.items():
            if needle in sentence:
                categories[key] = "Yes"
    return categories

print("Starting enhanced web scraping...")
try:
    for url in university_urls:
        # A failure on one site is reported and must not stop the remaining ones.
        try:
            print(f"Scraping: {url}")
            extracted_data = extract_relevant_text(url)
            contact_info = extract_contact_info(extracted_data["Text"])
            categorized_data = categorize_services(extracted_data["Text"])

            # Merge all extracted data; on duplicate keys the later dicts win,
            # so the sentence-level keyword flags replace the page-level ones.
            final_data = {**extracted_data, **contact_info, **categorized_data}
            data.append(final_data)
        except Exception as e:
            print(f"Error scraping {url}: {e}")
finally:
    # Always close the browser, even if the loop is interrupted (e.g. Ctrl-C).
    driver.quit()

# Convert to DataFrame and save as CSV
df = pd.DataFrame(data)
# Remove raw text to keep CSV clean. errors="ignore" covers the case where
# every URL failed and the empty frame has no "Text" column to drop.
df.drop(columns=["Text"], inplace=True, errors="ignore")
df.to_csv("enhanced_college_basic_needs_data.csv", index=False)
print("Web scraping completed! Data saved to enhanced_college_basic_needs_data.csv")

Starting enhanced web scraping...
Scraping: https://mbc.studentlife.umich.edu
Scraping: https://basicneeds.ucsb.edu
Scraping: https://www.berkeley.edu/basicneeds/
Scraping: https://basicneeds.ucsd.edu/
Scraping: https://basicneeds.osu.edu/
Error scraping https://basicneeds.osu.edu/: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=133.0.6943.142)
Stacktrace:
0   chromedriver                        0x00000001028bf708 chromedriver + 5969672
1   chromedriver                        0x00000001028b732a chromedriver + 5935914
2   chromedriver                        0x0000000102373650 chromedriver + 415312
3   chromedriver                        0x000000010236a5e0 chromedriver + 378336
4   chromedriver                        0x000000010235a7a8 chromedriver + 313256
5   chromedriver                        0x000000010235c49d chromedriver + 320669
6   chromedriver                        0x000000010235ab2b chromedriver + 314155
7   chromedriver                        0x00