# Diversity Statements Scraper

In [3]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common webscraping libaries
from bs4 import BeautifulSoup as bs
import requests

### Get list of universities

In [4]:
input_path = "../data/input/pilot_websites.csv"
uni_list = pd.read_csv(input_path)
uni_list = uni_list.melt(id_vars=['University'], 
              value_vars=[x for x in uni_list.columns if "WebPage" in x or "Admissions" in x], 
              var_name='url_type', value_name='url')
uni_list["url_type"],uni_list["url_rank"] = uni_list["url_type"].str[:-3], uni_list["url_type"].str[-3:]
uni_list = uni_list[["University","url_type","url_rank","url"]].dropna(subset=["url"])
print(uni_list.shape)
uni_list = uni_list.drop_duplicates(subset="url")
print(uni_list.shape)
uni_list

(27, 4)
(20, 4)


Unnamed: 0,University,url_type,url_rank,url
0,Louisiana State University and Agricultural & ...,WebPageLink,1.1,https://www.lsu.edu/diversity/about_us/mission...
1,University of California-San Diego,WebPageLink,1.1,https://ucsd.edu/campus-life/diversity/index.html
2,Saint Louis University,WebPageLink,1.1,https://www.slu.edu/about/key-facts/diversity/...
3,SUNY College of Environmental Science and Fore...,WebPageLink,1.1,https://www.esf.edu/ide/
4,Stony Brook University,WebPageLink,1.1,https://www.stonybrook.edu/diversity/
5,George Washington University,WebPageLink,1.1,https://diversity.gwu.edu/vision-and-mission
6,University of New Mexico-Main Campus,WebPageLink,1.1,https://diverse.unm.edu/about/mission.html
7,Duke University,WebPageLink,1.1,https://oie.duke.edu/mission
8,University of La Verne,WebPageLink,1.1,https://laverne.edu/diversity/
9,Loyola University Chicago,WebPageLink,1.1,https://www.luc.edu/diversityandinclusion/abou...


In [8]:
from markdownify import markdownify
import time
from os.path import dirname 
import re

def attribute_finder(tag,kw,v=False):
    """
        Returns True if the param :kw: is in the tag's class or ID, and false otherwise
    """
    
    if 'class' in tag.attrs:
        if kw in tag.attrs['class'] or any([kw in x for x in tag.attrs['class']]):
            return True
    if 'id' in tag.attrs:
        if kw in tag.attrs['id'] or any([kw in x for x in tag.attrs['id']]):
            return True
    return False

def clean_webpage(url):
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    soup = bs(r.text).find("body")
    # Get rid of all JS, style forms, footers, headers and navs
    for script in soup(["script",'style' ,"footer","header","nav","noscript"]):
        script.decompose()    # rip it out
    # If the tag contains the word 'nav', 'invis', or 'hidden'
    nav = soup.findAll(lambda tag : attribute_finder(tag,'nav'))
    print('nav',len(nav))
    hidden = soup.findAll(lambda tag : attribute_finder(tag,'hidden')) + soup.findAll(lambda tag : attribute_finder(tag,'hide'))
    print('hidden',len(hidden))
    invis = soup.findAll(lambda tag : attribute_finder(tag,'invis'))
    print('invis',len(invis))
    skip = []#soup.findAll(lambda tag : attribute_finder(tag,'skip'))
    print('skip',len(skip))
    sidebar = []
#     sidebar = soup.findAll(lambda tag : attribute_finder(tag,'sidebar'))
#     print('sidebar',len(sidebar))
    footer = soup.findAll(lambda tag : attribute_finder(tag,'footer'))
    print('footer',len(footer))
    header = soup.findAll(lambda tag : attribute_finder(tag,'header'))
    print('header',len(header))
    hero = soup.findAll(lambda tag : attribute_finder(tag,'hero'))
    print('hero',len(hero))
    shortcut = soup.findAll(lambda tag : attribute_finder(tag,'shortcut'))
    print('shortcut',len(shortcut))
    for tag in nav+hidden+invis+skip+sidebar+footer+header+hero+shortcut:
        tag.decompose()
    return soup
    
def save_images(row,soup):
    uni,url,url_type = row["University"],row["url"],row["url_type"]
    base_url = dirname(url)
    print(url,'->',base_url)

    # Get images
    img_tags = soup.find_all('img')
    urls = [img.get('src') for img in img_tags if img.get('src')]
    
    for url in urls:
        filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
        if not filename:
            print("Regex didn't match with the url: {}".format(url))
            return
        fp = f"../data/output/images/{uni}-"+url.replace("/","_")
        with open(fp, 'wb') as f:
            if 'http' not in url:
                if not url[0] == "/": 
                    url = f"/{url}"
                # sometimes an image source can be relative 
                # if it is provide the base url which also happens 
                # to be the site variable atm. 
                url = '{}{}'.format(base_url, url) 
                print(url)
            response = requests.get(url)
            f.write(response.content)
    
def scrape_webpage(row):
    uni,url,url_type = row["University"],row["url"],row["url_type"]
    print(uni,url)
    soup = clean_webpage(url)
    save_images(row,soup)
    for script in soup(["img"]):
        script.decompose()    # rip it out
    str_soup = str(soup)
    overview = re.sub(r'\n\s*\n', '\n\n', markdownify(str_soup))
    # Add a linebreak
    bl = "\n** **\n\n"
    overview = f"{url_type} – {url} \n\n {overview} {bl}"
    with open(f"../data/output/text_files/{uni.strip()}.md", "a", encoding='utf-8') as file:
        file.write(str(overview))
    time.sleep(1)
    return re.sub(r'\n\s*\n','\n\n',soup.get_text())

In [9]:
uni_list['text'] = uni_list.apply(scrape_webpage,axis=1)
uni_list.to_csv(f"../data/output/pilot_websites.csv")

Louisiana State University and Agricultural & Mechanical College https://www.lsu.edu/diversity/about_us/mission_vision.php
nav 16
hidden 2
invis 0
skip 0
footer 1
header 1
hero 0
shortcut 0
https://www.lsu.edu/diversity/about_us/mission_vision.php -> https://www.lsu.edu/diversity/about_us
University of California-San Diego https://ucsd.edu/campus-life/diversity/index.html
nav 0
hidden 1
invis 0
skip 0
footer 0
header 0
hero 1
shortcut 0
https://ucsd.edu/campus-life/diversity/index.html -> https://ucsd.edu/campus-life/diversity
Saint Louis University https://www.slu.edu/about/key-facts/diversity/index.php
nav 0
hidden 2
invis 0
skip 0
footer 0
header 0
hero 0
shortcut 0
https://www.slu.edu/about/key-facts/diversity/index.php -> https://www.slu.edu/about/key-facts/diversity
https://www.slu.edu/about/key-facts/diversity/about/key-facts/diversity/img/diversity_inclusion-min.jpg
SUNY College of Environmental Science and Forestry https://www.esf.edu/ide/
nav 4
hidden 16
invis 0
skip 0
footer