Before running the demo, download the spaCy English model (`en_core_web_sm`).

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import time
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import spacy
from urllib.parse import urljoin, urlparse
import time


def generate_url_list(school_info, max_links=20):
    """
    Return the same-site links visible on each school's food bank / wellness
    programs homepage.

    Params:
        school_info: DataFrame with 'school_name' and 'url' columns
        max_links: max number of links kept for each school
    Returns:
        result_df: DataFrame with 'school_name' and 'url' columns, one row per
            (school, link) pair. Both columns exist even when no link was
            collected, so callers can always index result_df["url"].
    Raises:
        Whatever Selenium raises while loading a page; the browser opened for
        that page is closed before the exception propagates.
    """
    all_links = []

    for _, row in school_info.iterrows():
        # school name and the base/starting page for that school
        school_name = row["school_name"]
        url = row["url"]

        # scheme + netloc form the prefix every kept link must start with
        parsed_url = urlparse(url)
        base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}"

        # dict keys de-duplicate while keeping the order links appear on the
        # page, so the output is the same from run to run
        links = {}

        driver = webdriver.Chrome()
        try:
            driver.get(url)
            time.sleep(1)

            for a in driver.find_elements(By.TAG_NAME, "a"):
                href = a.get_attribute("href")
                if not href:  # anchor without a target
                    continue
                full_link = urljoin(base_domain, href)
                # keep only links that stay on the starting page's domain
                if full_link.startswith(base_domain):
                    links[full_link] = None
                    if len(links) >= max_links:  # stopping point after max_links
                        break
        finally:
            # always release the browser, even when the page failed to load
            driver.quit()

        # all links for a school share the school name but have different urls
        all_links.extend({"school_name": school_name, "url": link} for link in links)

    # explicit columns keep the schema stable for an empty result
    result_df = pd.DataFrame(all_links, columns=["school_name", "url"])

    return result_df



# Small example input: five schools, each with the landing page of its
# basic-needs / wellness site. generate_url_list expects exactly these two columns.
school_info = pd.DataFrame({
    "school_name": ["UNC", "UGA","UC Davis", "UCLA", "PSU"], 
    "url": ["https://dos.unc.edu/student-support/basicneeds/",
            "https://well-being.uga.edu/basic-needs/",
            "https://financialaid.ucdavis.edu/wellness-outreach/basic-needs",
            "https://bewellbruin.ucla.edu/resource/ucla-basic-needs",
            "https://studentaffairs.psu.edu/basic-needs-support"
            ]  
})


# One row per (school, same-site link) found on each landing page.
# This opens a browser per school, so it is the slow, network-bound step.
result = generate_url_list(school_info)

# Load the spaCy pipeline; extract_relevant_text uses it for sentence splitting.
nlp = spacy.load("en_core_web_sm")

# Parallel Series taken from `result` (same index): page urls and the school each belongs to.
university_urls = result["url"]
school_names = result["school_name"]


# Phrases searched for (case-insensitively) in every scraped page.
# Each phrase is also used as a key/column name in the extracted records.
keywords = [
    "Food Security", "Housing Stability", "Financial Assistance", "Healthcare Services", "Mental Health Support",
    "Transportation Access", "Personal Care Items", "Childcare Support", "Technology Access", "Clothing & Weather Essentials",
    "Academic Support", "Community & Belonging", "School Supplies", "Cooking Supplies", "Cleaning Supplies",
    "Nutrition Education", "Financial Literacy", "Legal Support", "Crisis Intervention", "Laundry Access",
    "Career Resources", "Substance Abuse Support", "Financial Counseling", "Emergency Housing", 
    "Immigration & International Student Support", "Communication Services", "Domestic Violence Resources"
]


# Single browser session read as a global by extract_relevant_text;
# it is closed after the scraping loop further down.
driver = webdriver.Chrome()

# Accumulates one dict per successfully scraped page.
data = []

def clean_text(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Updated extraction: keyword search with one sentence of context on each side.
def _collect_keyword_context(sentences, keyword_list, max_total):
    """Build the {keyword: context text or "No"} mapping for one page.

    Params:
        sentences: list of sentence strings, in page order
        keyword_list: phrases to look for (matched case-insensitively)
        max_total: stop searching once this many occurrences were recorded
            across all keywords combined
    Returns:
        dict with one entry for EVERY keyword in keyword_list. A keyword with
        no recorded occurrence (including keywords never reached because the
        cap was hit first) maps to "No".
    """
    # Pre-fill so every page yields the same set of keys/columns.
    results = {keyword: "No" for keyword in keyword_list}
    total_found = 0

    for keyword in keyword_list:
        needle = keyword.lower()
        # The match below ignores case, so the highlight must ignore case too;
        # a plain str.replace(keyword, ...) only hit exact-case spellings.
        pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)
        replacement = keyword.upper()
        found = []

        for i, sentence in enumerate(sentences):
            if needle in sentence.lower():
                before = sentences[i - 1] if i > 0 else " "  # previous sentence
                after = sentences[i + 1] if i < len(sentences) - 1 else "N/A"  # next sentence
                # callable replacement: the keyword is inserted literally
                highlighted = pattern.sub(lambda _match: replacement, sentence)
                found.append(f"Occurrence X: {before} {highlighted} {after} \n")
                total_found += 1
            if total_found >= max_total:
                break

        # All occurrences of one keyword share a cell, separated by newlines.
        if found:
            results[keyword] = "\n".join(found)
        if total_found >= max_total:
            break

    return results


def extract_relevant_text(url, limit = 10):
    """Load a page and return its text plus keyword occurrences with sentence context.

    Reads the module-level `driver`, `nlp` and `keywords`.

    Params:
        url: page to load in the shared browser session
        limit: NOTE(review): accepted for backward compatibility but not
            consulted; the per-page cap stays at 5 occurrences as before.
            Confirm whether the cap was meant to follow this argument.
    Returns:
        dict with "URL", "Text" (whitespace-normalised page text) and one key
        per keyword holding either the matched contexts or "No".
    """
    max_occurrences_per_page = 5

    driver.get(url)
    time.sleep(3)  # give the page time to render before reading its source
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    text = clean_text(soup.get_text())

    # Sentence segmentation with spaCy
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]

    extracted_info = {"URL": url, "Text": text}
    extracted_info.update(
        _collect_keyword_context(sentences, keywords, max_occurrences_per_page)
    )
    return extracted_info


def extract_contact_info(text):
    """Extract email addresses and phone numbers from scraped text.

    Params:
        text: page text to search
    Returns:
        dict with "Emails" and "Phone Numbers", each a ", "-joined string of
        the distinct matches in order of first appearance (empty string when
        nothing matched).
    """
    emails = []
    for candidate in re.findall(r'[\w\.-]+@[\w\.-]+', text):
        # The pattern also swallows sentence punctuation ("a@b.edu."), which
        # made the same address show up twice; trim it off both ends.
        address = candidate.strip(".-")
        if address and not address.startswith("@") and not address.endswith("@"):
            emails.append(address)

    phones = re.findall(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)

    # dict.fromkeys de-duplicates like set() but keeps a stable order.
    return {
        "Emails": ", ".join(dict.fromkeys(emails)),
        "Phone Numbers": ", ".join(dict.fromkeys(phones)),
    }


print("Starting enhanced web scraping...")

# School for each row actually appended to `data`. Kept in step with `data`
# so a page that fails to scrape cannot shift later rows onto the wrong school.
scraped_school_names = []

try:
    for school_name, url in zip(school_names, university_urls):
        try:
            extracted_data = extract_relevant_text(url)
            contact_info = extract_contact_info(extracted_data["Text"])

            # Merge keyword context and contact details into one record
            data.append({**extracted_data, **contact_info})
            scraped_school_names.append(school_name)
        except Exception as e:
            # best effort: report the page and carry on with the next one
            print(f"Error scraping {url}: {e}")
finally:
    # Runs on KeyboardInterrupt too, so the browser is never left open.
    driver.quit()

# Convert to DataFrame; drop the raw page text to keep the CSV clean
df = pd.DataFrame(data).drop(columns=["Text"])
df["school_name"] = scraped_school_names

#df


#Aggregate keyword occurrences into a single row per school
def merge_occurrences(series):
    """Combine one school's values from several pages into a single numbered string.

    Missing values and the placeholder "No" are ignored, and repeated values are
    used once. Returns "No" when nothing else remains. Otherwise every non-blank
    line is kept and each "Occurrence X:" marker is replaced by a running
    number starting at 1.
    """
    kept = [value for value in series.dropna().unique() if value != "No"]
    if len(kept) == 0:
        return "No"

    # A value may hold several newline-separated occurrences; flatten them
    # and drop the blank separator lines.
    lines = [line for line in "\n".join(kept).split("\n") if line.strip()]

    return "\n".join(
        line.replace("Occurrence X:", f"Occurrence {number}:") + "\n"
        for number, line in enumerate(lines, start=1)
    )


#Count total mentions per school
def count_mentions(series):
    """Return how many times the text "Occurrence" appears across all values of a string Series."""
    per_value = series.str.count("Occurrence")
    total = per_value.sum()
    return total

# Collapse the per-page rows into one row per school: every keyword column and
# both contact columns are merged with merge_occurrences.
agg_dict = {column: merge_occurrences for column in [*keywords, "Emails", "Phone Numbers"]}

df_grouped = df.groupby("school_name").agg(agg_dict).reset_index()

# Total keyword mentions per school.
# Series.map is used per column instead of the deprecated DataFrame.applymap;
# non-string cells (e.g. NaN) count as zero.
mentions_per_page = (
    df[keywords]
    .apply(lambda column: column.map(
        lambda cell: cell.count("Occurrence") if isinstance(cell, str) else 0
    ))
    .sum(axis=1)
)
mentions_per_school = mentions_per_page.groupby(df["school_name"]).sum()

# Attach by school name rather than by position so the totals cannot drift
# out of line with the grouped rows.
df_grouped["Total Mentions"] = df_grouped["school_name"].map(mentions_per_school)

df_grouped

Starting enhanced web scraping...


KeyboardInterrupt: 

In [None]:
# Optional: per-page results (one row per scraped url), disabled by default.
#df.to_csv("first_word_search.csv", index=False)
# Write the per-school summary to the working directory, without the index column.
df_grouped.to_csv("condensed_word_search.csv", index=False)