## The University of Montana Crawler

Imports.

In [1]:
import pandas as pd
import numpy as np
import re
import urllib.request #handles urls
import urllib.parse 
import linkGrabber #extracts urls
import json #encodes/decodes json 
import csv 
import requests #downloads a webpage to scrape
from bs4 import BeautifulSoup, NavigableString, Tag #beautifulsoup pulls data from HTML
import nltk #NLP tasks
from nltk import word_tokenize
from nltk.stem import PorterStemmer #removes word endings
stemmer = PorterStemmer()

Keyword preprocessing, the URL list for the relevant catalog years (2018-19 and 2017-18), and the list of department codes.

In [2]:
#keyword preprocessing
def preprocess(keyword):
    """Lowercase and tokenize a keyword, then return the stem of its last token.

    Every token is a candidate, but only the final one survives: a
    multi-word keyword is reduced to the Porter stem of its last word.
    If tokenization produces no tokens, the (empty) token list itself is
    returned.
    """
    tokens = word_tokenize(keyword.lower())  # lowercase, then tokenize
    if not tokens:
        return tokens
    # NOTE(review): only the last token's stem is kept -- confirm this is
    # the intended handling of multi-word keywords.
    return stemmer.stem(tokens[-1])

#course catalog URLs - 2 academic years
# Each URL ends with '/' so a department code can be appended directly.
urls = ['http://catalog.umt.edu/courses/',
        'http://catalog.umt.edu/past-catalogs/2017-2018/courses/']

#list of all the departments to search through
# Every code carries a trailing '/' so that url + dept always has the same
# shape ('bmis' and 'writ' were previously missing it).
departments = ['actg/','act/','actv/','amgt/','aast/',
               'ahma/','ahms/','ahrc/','ahhs/','ahat/',
               'anty/','aasc/','arab/','artz/','arth/',
               'astr/','atep/','bch/','biol/','bioe/',
               'bioo/','bios/','biob/','bioh/','biom/',
               'bmed/','bfin/','bgen/','bmgt/','bmis/',
               'bmkt/','cswa/','cas/','chmy/','chin/',
               'clas/','ccs/','coa/','comx/','csd/',
               'chth/','capp/','csci/','cstn/','coun/',
               'cp/','crwr/','cjus/','cula/','c_i/',
               'danc/','dst/','ddsn/','edec/','erth/',
               'ecns/','libm/','edsp/','edu/','edld/',
               'eele/','etec/','ecp/','ent/','enli/',
               'lit/','easl/','eli/','ensc/','enst/',
               'fme/','film/','wild/','fors/','frch/',
               'egen/','gphy/','geo/','grmn/','gh/','gbld/',
               'gyd/','gs/','grk/','hth/','hhp/','hee/','hit/',
               'heo/','hprv/','hsta/','hstr/','honr/','htr/',
               'hfd/','its/','ids/','irsh/','itln/','jpns/',
               'jrnl/','kin/','latn/','law/','leg/','lsci/',
               'ling/','mis/','mans/','m/','mba/','mart/',
               'mar/','mch/','msl/','mclg/','musi/','muse/',
               'must/','nasx/','nrsm/','nrsg/','nutr/','ptrm/',
               'phar/','phl/','pt/','phsx/','psci/','psyx/',
               'pubh/','ahxr/','rlst/','russ/','scn/','sw/',
               'soci/','ssea/','spns/','stat/','ahst/','srvy/',
               'nrgy/','thtr/','wldg/','wgss/','writ/']

Creation of the normative and technical keyword lists, the same as in the example crawler.

In [3]:
#import keywords
# keywords.csv is read with the columns 'Keyword', 'Technical/Normative'
# ('T' or 'N') and 'Include' ('Y' marks keywords to use).
keywords = pd.read_csv("keywords.csv")
included = keywords['Include'] == 'Y'
technical = keywords[(keywords['Technical/Normative'] == 'T') & included].Keyword
normative = keywords[(keywords['Technical/Normative'] == 'N') & included].Keyword
normative = [preprocess(i) for i in normative]
technical = [preprocess(i) for i in technical]

#replace keywords of interest
# Trim the trailing letter the stemmer leaves on these stems so the shorter
# form is what gets searched for in course titles.
normative_fixes = {
    'privaci': 'privac',
    'democraci': 'democra',
    'equiti': 'equit',
    'histori': 'histor',
    'justice': 'justic',
    'liberti': 'libert',
    'philosophi': 'philosoph',
    'societi': 'societ',
    'polici': 'polic',
}
for old, new in normative_fixes.items():
    normative = [w.replace(old, new) for w in normative]

# Anchor the short acronyms with '^' (they are later used as regex patterns).
# Only keywords that ARE the acronym are anchored: a plain substring replace
# would also inject '^' into the middle of any longer stem containing the
# same letters.
acronyms = {'ai', 'cs', 'ict', 'ml', 'nlp'}
technical = ['^' + w if w in acronyms else w for w in technical]

print(normative)
print(technical)

['account', 'critic', 'democra', 'discrimin', 'equal', 'equit', 'ethic', 'fair', 'femin', 'gender', 'govern', 'histor', 'inequ', 'justic', 'law', 'legal', 'libert', 'moral', 'norm', 'philosoph', 'polit', 'power', 'privac', 'race', 'religi', 'respons', 'right', 'secur', 'social', 'societ', 'surveil', 'transpar', 'valu', 'polic']
['^ai', 'algorithm', 'analyt', 'intellig', 'automat', 'code', 'comput', '^cs', 'cyber', 'data', 'digit', '^ict', 'inform', 'intelligen', 'internet', 'machin', '^ml', 'process', '^nlp', 'platform', 'program', 'robot', 'softwar', 'system', 'technolog']


Extraction process for University of Montana:
1. Loop through each year's catalog.
2. Loop through each department's page by concatenating the department code to the catalog url. Montana is just like Oregon: all the courses are listed on each department's page.
3. On the department page, make a list of all the courses by selecting the CSS class `courseblock`. This creates a list where each element is a list containing the full course title and description.
4. Loop through all the keywords in the normative list and check to see if the keyword can be found in the full course title.
5. If the keyword is in the title, then assign every element of the data columns that can be located.

Data columns are defined in the same way as below and have the same anatomy for each course:
* The course title - in between '-' and the first occurrence of '.' in the full title: `title`
* The department and course number - before the first occurrence of '-' in the full title: `dept_num`
* The course description - the text after the first '.' in the third list element for the course: `description`
* The number of credits for the course - between the first and the last occurrence of '.' in the full title: `credits`
* The course instructor - school does not list in catalog: `instructor`
* The link to the course syllabus (if applicable) - school does not list in catalog: `syllabus`
* The university the course is extracted from - all from the same university: `university`
* The term that the course is offered during (fall, spring, summer / year) - For most classes, the text following 'offered' in the description then add the year by matching to the url it came from: `term`
* The keyword that triggered the extraction (this is for auditing purposes): `keyword`

In [4]:
#init dfs
# Empty result table; one column per extracted course attribute.
montana = pd.DataFrame(
    columns=[
        'title', 'dept_num', 'description', 'credits', 'instructor',
        'syllabus', 'university', 'term', 'keyword',
    ]
)

# One accumulator list per column, filled in parallel by the extraction loop.
titles, dept_nums, descs = [], [], []
credit, profs, syllabi = [], [], []
uni, term, keyword = [], [], []

The extraction process. The process to create the table is kept the same as in the example crawler, just as a loop on its own after all the titles, credits, etc. have been gathered.

In [None]:
#looping through each years catalog
for url in urls:
    # Every course scraped from one catalog gets the same academic-year label.
    if url == 'http://catalog.umt.edu/courses/':
        year_label = ' in 2018-19'
    else:
        year_label = ' in 2017-18'
    #looping through all the departments pages to process individual course's information
    for dept in departments:
        page_link = url + dept
        page_response = requests.get(page_link)
        soup = BeautifulSoup(page_response.content, 'html.parser')
        # One list of text lines per element with the CSS class "courseblock".
        courses = [p.get_text().split('\n') for p in soup.select(".courseblock")]
        for crs in courses:
            # The full title is read from line 1 and the description from
            # line 2; skip blocks too short to contain both instead of
            # raising IndexError.
            if len(crs) < 3:
                continue
            title = crs[1]
            descr = crs[2]
            title_lower = title.lower()
            first_dot = descr.find('.')
            # A course is recorded once per normative keyword found in its
            # title, so the triggering keyword can be audited.
            for word in normative:
                if word not in title_lower:
                    continue
                titles.append(title[title.find('-')+2:title.find('.')])
                dept_nums.append(title[:title.find('-')])
                desc_text = descr[first_dot+1:]
                descs.append(desc_text if desc_text != '' else 'No description available.')
                credit.append(title[title.find('.')+2:title.rfind('.')])
                profs.append('Not Listed')
                syllabi.append('Not Listed')
                uni.append('The University of Montana')
                # Text from "Offered" up to the first '.', plus the catalog year.
                term.append(descr[descr.find("Offered"):first_dot] + year_label)
                keyword.append(word)

# Build all gathered rows in one step and add them to the table.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
new_rows = pd.DataFrame(
    list(zip(titles, dept_nums, descs, credit, profs, syllabi, uni, term, keyword)),
    columns=['title', 'dept_num', 'description', 'credits', 'instructor',
             'syllabus', 'university', 'term', 'keyword'])
montana = pd.concat([montana, new_rows], ignore_index=True)


Post-filtering of courses. Code is identical to that of the example crawler.

In [None]:
# Courses triggered by 'privac' or 'secur' are set aside so they can be kept
# regardless of the technical-keyword filter applied afterwards.
exception_keywords = ['privac', 'secur']
exceptions = montana[montana['keyword'].isin(exception_keywords)]
exceptions

In [None]:
#loop through technical keyword list, extract relevant titles
# Collect the matches for EVERY technical keyword. Overwriting a single
# frame inside the loop keeps only the rows for the last keyword.
matches = []
for word in technical:
    # each keyword is used as a case-insensitive regex against the title
    hits = montana[montana['title'].str.contains(word, flags = re.IGNORECASE)].copy()
    #join keyword cols
    # .copy() above makes this assignment safe (no SettingWithCopyWarning)
    hits["keyword"] = hits["keyword"].map(str) + "," + word
    matches.append(hits)

# A title matching several technical keywords appears once per keyword.
# With no technical keywords, fall back to an empty frame with the same columns.
df = pd.concat(matches) if matches else montana.iloc[0:0].copy()

df

In [None]:
#combine dfs
# Final table: the technically-filtered courses followed by the
# privacy/security exceptions (original row indices are retained).
frames = [df, exceptions]
montana = pd.concat(frames)
montana

Export the results to CSV.

In [None]:
#export as csv
# Written to the current working directory; the row index is included.
output_path = '39-The University of Montana.csv'
montana.to_csv(output_path)