In [7]:
from urllib.request import urlopen  # b_soup_1.py
from bs4 import BeautifulSoup
import csv

Before running this notebook, run the following commands in the appropriate conda environment:
>  \> conda install spacy  
>  \> python -m spacy download en   

(The second command must be run as an administrator.)

Alternatively, run the first command above and then replace the contents of the cell below with:

```python
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
```

In [14]:
import spacy
# Load the English pipeline by its package name. The 'en' shortcut link needs
# admin rights to create and no longer exists in spaCy v3, whereas the
# installed model package can always be loaded by name.
nlp = spacy.load('en_core_web_sm')

In [71]:
# driver
def do_that_thing(skills_list):
    """
    Returns a dict mapping each skill in the skill_list
    to a list of course names associated with that skill.

    The course list is scraped with get_course_links(), every course
    description is fetched with get_course_description() and lemmatized
    with the module-level spaCy pipeline ``nlp``. A course is associated
    with a skill when the lower-cased skill equals one of the lower-cased
    lemmas of its description (stop words, punctuation and whitespace
    tokens are ignored).

    Params
    -------
    skills_list
        An iterable of skill strings. Each skill is compared against
        individual lemmas, so only single-token skills can match.

    Return
    -------
    <dict> {skill: [course_name, ...], ...}
        Every skill gets its own list; skills found in no description
        map to an empty list.
    """
    # first get the links for all Heinz courses
    course_dict = get_course_links()

    # then scrape the description text for each link
    course_descriptions = {
        name: get_course_description(link)
        for name, link in course_dict.items()
    }

    # time for a little nlp (to lemmatize the descriptions)
    # we'll store the lemmas of each description as a set, since they are
    # only ever used for membership tests
    parsed_descriptions = {}
    for name, text in course_descriptions.items():
        lemmas = set()
        for token in nlp(text):
            # drop punctuation and whitespace tokens
            if token.pos_ in ('PUNCT', 'SPACE'):
                continue
            lemma = token.lemma_.lower()
            # drop stopwords
            if not nlp.vocab[lemma].is_stop:
                lemmas.add(lemma)
        parsed_descriptions[name] = lemmas

    # give each skill its OWN list: dict.fromkeys(keys, []) would make every
    # key share one list object, so all skills would report the same courses
    skills_to_courses = {skill: [] for skill in skills_list}

    for skill, courses in skills_to_courses.items():
        wanted = skill.lower()
        # now check every course for that skill
        for course_name, lemmas in parsed_descriptions.items():
            # if that skill appears in the description, add to the map
            if wanted in lemmas:
                courses.append(course_name)

    # return the final mapping
    return skills_to_courses

In [31]:
def get_course_links():
    """
    Get the urls of the Heinz College course description
    pages, so we can scrape each of those iteratively.

    Takes no parameters; the course list page is always fetched from
    the Heinz API.

    Return
    -------
    <dict> {course_name: course_link, ...}
        course_name: contents of the second <td> of a course row
        course_link: string containing link to course description page
        If two rows share a name, the link of the later row wins.
    """
    html_base = 'https://api.heinz.cmu.edu'

    # close the HTTP response as soon as the page has been read
    with urlopen(html_base + '/courses_api/course_list/') as html:
        soup = BeautifulSoup(html.read(), "lxml")

    course_links = {}

    # Each course link is contained in a <tr> element with class="clickable-row"
    for row in soup.find_all('tr', {'class': 'clickable-row'}):
        name = row.find_all('td')[1].string
        # data-href holds a path relative to the API host
        course_links[name] = html_base + row.get('data-href')

    return course_links

In [63]:
def get_course_description(link):
    """
    Scrape the description text from a course description page.

    Params
    -------
    link
        url of the course description page

    Return
    -------
    <str>
        Text of the third <p> element on the page, with newline and
        carriage-return characters removed.

    Raises
    -------
    IndexError
        If the page contains fewer than three <p> elements.
    """
    # Retrieve the html (and close the response once it has been read)
    with urlopen(link) as html:
        soup = BeautifulSoup(html.read(), "lxml")

    # Recover the description text
    text = soup.find_all('p')[2].get_text()
    clean = text.replace('\n', '').replace('\r', '')

    # the cleaned text was previously computed but never returned
    return clean

In [9]:
# Scratch: fetch and parse the course list page by hand.
# The response is closed as soon as its body has been read.
with urlopen('https://api.heinz.cmu.edu/courses_api/course_list/') as html:
    soup = BeautifulSoup(html.read(), "lxml")

In [10]:
# Scratch: collect the clickable course rows from the parsed course list.
# (`bsyc` was never defined; the parsed page is bound to `soup`.)
course_rows = soup.find_all('tr', {'class': 'clickable-row'})

In [32]:
# Build the course name -> url mapping, then display one entry as a spot check
course_links = get_course_links()
course_links['Heinz Journal']

In [34]:
# Scratch: fetch and parse a single course description page.
# The response is closed as soon as its body has been read.
with urlopen(course_links['Heinz Journal']) as html:
    soup = BeautifulSoup(html.read(), "lxml")

In [49]:
# Text of the third <p> element of the page currently parsed into `soup`
text = soup.find_all('p')[2].get_text()

In [None]:
# Smoke test: build the skill map for three sample skills and hand it to
# write_to_csv (presumably writes the mapping to a CSV file — verify).
# NOTE(review): get_skill_map and write_to_csv are not defined in the visible
# part of this notebook — confirm they exist before running this cell.
test_list = ['Management', 'software', 'knowledge']
results = get_skill_map(test_list)
write_to_csv(results)