In [1]:
from urllib.parse import urlparse, urljoin

def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme) and '.pdf' not in url and '.jpg' not in url and '.docx' not in url

In [2]:
import os

current_directory = os.getcwd()
final_directory = os.path.join(current_directory, r'dataset')
if not os.path.exists(final_directory):
    os.makedirs(final_directory)

In [3]:
import string

URL_SEP = '--->'

def write_content_to_file(url, url_soup):
    text = ''
    for div in url_soup.findAll('div', {'class': 'content-area'}):
        for line in div.strings:
            line = line.strip()
            line = line.translate(str.maketrans('', '', string.punctuation))
            if not line.isspace() and line:
                text += url + URL_SEP + line + '\n'
    if text:
        file_name = url.replace('/','_')
        file = open('dataset/' + file_name, 'w+')
        file.write(text)
        file.close()

In [4]:
import requests
from bs4 import BeautifulSoup

whitelist = ['https://coronavirus.usc', 
#              'https://about.usc.edu', # uncomment for including this (has lot of unncessary links)
             'https://policy.usc.edu', 
             'https://sites.google.com/usc.edu', 
             'https://studenthealth.usc.edu']

urls = set()

def find_nested_urls(url):
    response = requests.get(url, headers = {'user-agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(response.text, 'html.parser')
    write_content_to_file(url, soup)

    for link in soup.find_all('a'):
        nested_url = link.get('href')
        if nested_url not in urls and is_valid(nested_url) and any(xs in nested_url for xs in whitelist) :
            print(nested_url)
            urls.add(nested_url)
            find_nested_urls(nested_url)


In [5]:
find_nested_urls("https://studenthealth.usc.edu/")

https://studenthealth.usc.edu/
https://studenthealth.usc.edu/getting-started/
https://studenthealth.usc.edu/health-requirements-for-new-students/
https://studenthealth.usc.edu/getting-started/introductory-videos/
https://studenthealth.usc.edu/myshr/
https://studenthealth.usc.edu/getting-started/us-based/
https://studenthealth.usc.edu/getting-started/international/
https://studenthealth.usc.edu/health-professions/
https://studenthealth.usc.edu/getting-started/parents/
https://studenthealth.usc.edu/our-services/
https://studenthealth.usc.edu/medical-care/
https://studenthealth.usc.edu/medical-care/patient-education/
https://studenthealth.usc.edu/counseling/
https://studenthealth.usc.edu/workshops-and-programs/
https://studenthealth.usc.edu/counseling/group-counseling/
https://sites.google.com/usc.edu/bipocmentalhealth
https://studenthealth.usc.edu/counseling/community-centered-services-embedded-counselors-and-liaisons/
https://studenthealth.usc.edu/sexual-assault/
https://studenthealth.u

https://policy.usc.edu/tag/faculty/
https://policy.usc.edu/usc-collections-or-gifts-in-kind-acquisitions/
https://policy.usc.edu/category/all-policy-topics/facilities-and-equipment/
https://policy.usc.edu/usc-collections-loans/
https://policy.usc.edu/usc-collections-deaccession/
https://policy.usc.edu/signage/
https://policy.usc.edu/equipment-policy/
https://policy.usc.edu/weekday-football-game-protocol/
https://policy.usc.edu/facilities-and-construction-projects/
https://policy.usc.edu/galen-event-booking/
https://policy.usc.edu/space-planning-and-management/
https://policy.usc.edu/information-security-logging-and-monitoring/
https://policy.usc.edu/category/all-policy-topics/information-technology/
https://policy.usc.edu/asset-management/
https://policy.usc.edu/third-party-security-risk-management/
https://policy.usc.edu/information-security-awareness-training/
https://policy.usc.edu/data-protection/
https://policy.usc.edu/secure-systems-development/
https://policy.usc.edu/cloud-secur

https://policy.usc.edu/hipaa-pat-606-resolution-of-patient-complaints/
https://policy.usc.edu/hipaa-pat-605-patient-requests-to-receive-confidential-communications/
https://policy.usc.edu/hipaa-clin-206-minimum-security-standards-for-ephi-for-keck/
https://policy.usc.edu/hipaa-res-301-uses-and-disclosures-of-protected-health-information-for-research-purposes/
https://policy.usc.edu/hipaa-clin-203-special-privacy-considerations/
https://policy.usc.edu/hipaa-clin-202-personal-representatives-of-patients/
https://policy.usc.edu/injury-and-illness-prevention/
https://policy.usc.edu/campus-access/
https://policy.usc.edu/biorepositories/
https://policy.usc.edu/hipaa-clin-201-use-of-protected-helath-information-for-treatment-payment-and-health-care-operations/
https://policy.usc.edu/tag/staff/page/5/
https://policy.usc.edu/hipaa-bus-701-business-associates/
https://policy.usc.edu/hipaa-clin-204-facility-directory/
https://policy.usc.edu/hipaa-clin-200-notice-of-privacy-practices-1/
https://po

https://studenthealth.usc.edu/directory/broderick-leaks-phd/
https://studenthealth.usc.edu/directory/jo-jo-lee-lpcc/
https://studenthealth.usc.edu/directory/diana-linares-lcsw/
https://studenthealth.usc.edu/directory/milena-lukic-ms-lmft/
https://studenthealth.usc.edu/directory/magali-martinez-lmft/
https://studenthealth.usc.edu/directory/pratik-mehta-md/
https://studenthealth.usc.edu/directory/tiffany-nakamura-lpcc/
https://studenthealth.usc.edu/directory/erika-nanes/
https://studenthealth.usc.edu/directory/parissa-nili/
https://studenthealth.usc.edu/directory/yong-sue-park/
https://studenthealth.usc.edu/directory/cristina-perez-lcsw/
https://studenthealth.usc.edu/directory/psychologist-clinical-assistant-professor-and-faculty-of-the-department-of-psychiatry-and-behavioral-sciences-at-keck-school-of-medicine-of-usc/
https://studenthealth.usc.edu/directory/kemarvin-pitts-msw/
https://studenthealth.usc.edu/directory/melissa-pottash/
https://studenthealth.usc.edu/directory/susan-ramirez/

https://coronavirus.usc.edu/2020/04/29/4-29-seroprevalence-study-at-usc-student-health/
https://coronavirus.usc.edu/2020/04/15/4-15-message-to-trojan-community-from-president-folt/
https://coronavirus.usc.edu/2020/03/30/join-usc-in-helping-those-impacted-by-covid-19/
https://coronavirus.usc.edu/2020/03/19/keck-employee-screening-protocol/
https://coronavirus.usc.edu/2020/03/16/notification-covid-19-eastern-los-angeles-facilities/
https://coronavirus.usc.edu/2020/03/11/keck-medicine-usc-policy-and-information-update/
https://coronavirus.usc.edu/category/keck/page/3/
https://coronavirus.usc.edu/2020/03/10/travel-advisory-and-policy-keck-medicine-of-usc-and-keck-school-medicine-usc/
https://coronavirus.usc.edu/2020/03/06/process-update-for-keck-medicine-usc/
https://coronavirus.usc.edu/2020/03/04/community-update-of-covid-19/
https://coronavirus.usc.edu/2020/02/28/planning-and-preparedness-for-covid-19-no-cases-at-usc/
https://coronavirus.usc.edu/2020/02/05/update-on-travel-entry-restrict

https://coronavirus.usc.edu/2021/08/02/8-2-covid-19-safety-updates-for-in-person-activities/
https://studenthealth.usc.edu/vaccine-clinic 
https://coronavirus.usc.edu/category/student/page/2/
https://coronavirus.usc.edu/2021/07/15/7240/
https://studenthealth.usc.edu/ask-a-doc-and-vaccine-facts/
https://coronavirus.usc.edu/2021/05/06/5-6-health-advisory-on-covid-19-and-advancing-into-the-yellow-tier/
https://studenthealth.usc.edu/early-walk-ins-accepted-for-covid-19-vaccination/
https://coronavirus.usc.edu/2021/03/23/3-23-an-important-update-from-provost-zukoski/
https://policy.usc.edu/student/
https://coronavirus.usc.edu/2021/03/18/3-18-reminder-on-covid-19-safety-and-spring-travel/
https://coronavirus.usc.edu/category/student/page/3/
https://coronavirus.usc.edu/2021/01/20/1-20-reminder-trojan-check-begins-friday-jan-22/
https://studenthealth.usc.edu/flu-compliance-process-for-faculty-and-staff-november-1-2020/
https://studenthealth.usc.edu/spring-2021-testing-requirements/
https://cor

https://coronavirus.usc.edu/2020/03/27/health-in-the-workplace/
https://coronavirus.usc.edu/2020/03/25/message-to-staff-regarding-zoom-security-changes/
https://coronavirus.usc.edu/category/staff/page/9/
https://coronavirus.usc.edu/category/staff/page/10/
https://coronavirus.usc.edu/2020/03/13/covid-19-information-for-employees/
https://coronavirus.usc.edu/category/staff/page/11/
https://coronavirus.usc.edu/2020/03/02/academic-planning-for-covid-19/
https://coronavirus.usc.edu/2020/01/31/2019-novel-coronavirus-travel-advisory/
https://coronavirus.usc.edu/category/staff/page/12/
https://coronavirus.usc.edu/2021/06/29/6-29-masking-policy-updates/
https://coronavirus.usc.edu/category/restart/page/2/
https://coronavirus.usc.edu/category/restart/page/3/
https://coronavirus.usc.edu/category/provost/page/2/
https://coronavirus.usc.edu/category/provost/page/3/
https://coronavirus.usc.edu/category/provost/page/4/
https://coronavirus.usc.edu/category/faculty/page/2/
https://coronavirus.usc.edu/c

https://studenthealth.usc.edu/myshr-scheduled-maintenance/
https://studenthealth.usc.edu/covid-19-vaccinations-come-to-upc/
https://studenthealth.usc.edu/category/news/page/7/
https://studenthealth.usc.edu/q-a-sessions-with-usc-student-health/
https://studenthealth.usc.edu/revised-quarantine-period-in-california-10-days/
https://studenthealth.usc.edu/spring-2021-covid-19-screening-strategy-update/
https://studenthealth.usc.edu/local-service-for-pre-travel-testing-for-flights-to-china/
https://studenthealth.usc.edu/covid-19-health-advisory-11-6/
https://studenthealth.usc.edu/questions-about-insurance/
https://studenthealth.usc.edu/covid-19-antibody-study/
https://studenthealth.usc.edu/healthy-minds-survey-on-student-mental-health/
https://studenthealth.usc.edu/category/news/page/8/
https://studenthealth.usc.edu/surveillance-pop-testing-locations/
https://studenthealth.usc.edu/8-24-community-health-advisory/
https://coronavirus.usc.edu/frequently-asked-questions/
https://studenthealth.us

https://studenthealth.usc.edu/peace-with-food-2/
https://studenthealth.usc.edu/mindful-self-compassion/
https://studenthealth.usc.edu/increase-your-emotional-intelligence/
https://studenthealth.usc.edu/making-anxiety-your-friend/
https://studenthealth.usc.edu/graduate-student-support-group/
https://studenthealth.usc.edu/gender-spectrum-2/
https://studenthealth.usc.edu/effective-life-skills-regulate-communicate-tolerate/
https://studenthealth.usc.edu/category/counseling-groups/page/3/
https://studenthealth.usc.edu/circle-of-hope-2/
https://studenthealth.usc.edu/building-social-confidence/
https://studenthealth.usc.edu/bipolar-support-group/
https://studenthealth.usc.edu/assertiveness-skills-group/
https://studenthealth.usc.edu/rise-and-shine-for-graduate-women-cultivating-self-love-healthy-relationships-and-leadership/
https://studenthealth.usc.edu/circle-of-hope/
https://studenthealth.usc.edu/make-an-appointment/
https://studenthealth.usc.edu/mySHR/
https://studenthealth.usc.edu/have-a

In [6]:
len(urls)

1048