## Johnson University Catalog Crawler

Imports

In [1]:
import pandas as pd
import numpy as np
import re
import urllib.request #handles urls
import urllib.parse 
import linkGrabber #extracts urls
import json #encodes/decodes json 
import csv 
import requests #downloads a webpage to scrape
from bs4 import BeautifulSoup, NavigableString, Tag #beautifulsoup pulls data from HTML
import nltk #NLP tasks
from nltk import word_tokenize
from nltk.stem import PorterStemmer #removes word endings
stemmer = PorterStemmer()

Since Johnson's catalog is a search page with all the features being javascript interactive modules, I condensed the orinal list urls to just the singular url search page. Even for different terms, the url stays the same so condensing saves time with the loop later on.

In [2]:
#keyword preprocessing
def preprocess(keyword):
    keyword = keyword.lower() #lowercase
    keyword = word_tokenize(keyword) #tokenize
    for word in keyword:
        keyword = stemmer.stem(word) #stem 
    return (keyword)

#course catalog URL
url = 'https://my.jcsu.edu/ICS/Academics/'

#list of all the departments to search through 
departments = ['AES/','ACC/','XAC/','ACE/','AAS/','AAP/',
               'AER/','ARA/','ART/','BAF/','BCT/','BRC/',
               'BIL/','BIO/','RTV/','BUS/','XCA/','CW/',
               'CRE/','CHE/','CHI/','COM/','CSE/','CSC/',
               'COO/','CRM/','DRA/','ECO/','EDU/',
               'ELE/','EGR/','ENN/','ENT/','ETH/','FLF/',
               'FLS/','FL/','FRE/','FR/','GEN/','GEO/',
               'GER/','GBA/','HLT/','HCP/','HIS/','WBA/',
               'HON/','CDR/','HDR/','AEN/','CEN/','HEN/',
               'HLA/','HLS/','CPO/','HPO/','HRH/','HUM/',
               'IDS/','ITA/','JP/','JOU/','JGA/','JGD/',
               'ENG/','LAT/','LA/','LS/','LY/','MGT/',
               'XMG/','MAR/','MKT/','XMK/','MPL/','XMT/',
               'MTH/','MED/','MSC/','MUS/','XHD/','NUR/',
               'ORT/','HED/','PHI/','PEH/','PED/','PHS/',
               'PHY/','PSC/','XXX/','PLC/','LAW/','POL/',
               'RPO/','PSY/','PLS/','PUR/','RT/','RDG/',
               'REL/','NSC/','RHC/','RUS/','SCE/','SPL/',
               'SMS/','SSC/','SWK/','SOC/','SPA/','SPE/',
               'SPM/','LPD/','INS/','SUS/','TEL/','ZZZ/',
               'URB/','VBA/','VPD/','VPF/','VPG/','VPM/',
               'VPS/','VPT/','VPA/','WEL/']

Keyword preprocessing occurs in exactly the same manner as for the example crawler.

In [3]:
#import keywords
keywords = pd.read_csv("keywords.csv")
technical = keywords[(keywords['Technical/Normative']=='T') & (keywords['Include']=='Y')].Keyword
normative = keywords[(keywords['Technical/Normative']=='N') & (keywords['Include']=='Y')].Keyword
normative = [preprocess(i) for i in normative]
technical = [preprocess(i) for i in technical] 

#replace keywords of interest
normative = [w.replace('privaci', 'privac') for w in normative]
normative = [w.replace('democraci', 'democra') for w in normative]
normative = [w.replace('equiti', 'equit') for w in normative]
normative = [w.replace('histori', 'histor') for w in normative]
normative = [w.replace('justice', 'justic') for w in normative]
normative = [w.replace('liberti', 'libert') for w in normative]
normative = [w.replace('philosophi', 'philosoph') for w in normative]
normative = [w.replace('societi', 'societ') for w in normative]
normative = [w.replace('polici', 'polic') for w in normative]

technical = [w.replace('ai', '^ai') for w in technical]
technical = [w.replace('cs', '^cs') for w in technical]
technical = [w.replace('ict', '^ict') for w in technical]
technical = [w.replace('ml', '^ml') for w in technical]
technical = [w.replace('nlp', '^nlp') for w in technical]

print(normative)
print(technical)

['account', 'critic', 'democra', 'discrimin', 'equal', 'equit', 'ethic', 'fair', 'femin', 'gender', 'govern', 'histor', 'inequ', 'justic', 'law', 'legal', 'libert', 'moral', 'norm', 'philosoph', 'polit', 'power', 'privac', 'race', 'religi', 'respons', 'right', 'secur', 'social', 'societ', 'surveil', 'transpar', 'valu', 'polic']
['^ai', 'algorithm', 'analyt', 'intellig', 'automat', 'code', 'comput', '^cs', 'cyber', 'data', 'digit', '^ict', 'inform', 'intelligen', 'internet', 'machin', '^ml', 'process', '^nlp', 'platform', 'program', 'robot', 'softwar', 'system', 'technolog']


The same idea as the example is followed here:
1. First, we want to find and extract all courses that contain any instance of a normative keyword.
2. Then, we want search within these courses to see if it also contains a technical keyword.

The items of interest remain:
* The course title: `title`
* The department and course number: `dept_num`
* The course description: `description`
* The number of credits for the course: `credits`
* The course instructor: `instructor`
* The link to the course syllabus (if applicable): `syllabus`
* The university the course is extracted from: `university`
* The term that the course is offered during (fall, spring, summer / year): `term`
* The keyword that triggered the extraction (this is for auditing purposes): `keyword`

In [4]:
#init dfs
johnson = pd.DataFrame(columns=['title','dept_num','description','credits','instructor',
                                'syllabus','university','term','keyword','URL'])
titles = []
dept_nums = []
descs = []
credit = []
profs = []
syllabi = []
uni = []
term = []     
keyword = []
URL = []

Part 1 is represented by the loop.

In [5]:
#loop through all normative words and extract relevant elements 
for dept in departments:
        page_link = url + dept
        page_response = requests.get(page_link)
        soup = BeautifulSoup(page_response.content, 'html.parser')
        courses = [p.get_text() for p in soup.find_all('a')]
        links = [r.get('href') for r in soup.find_all('a')]
        links = links[:courses.index('Privacy policy')]
        courses = courses[:courses.index('Privacy policy')]
        for title_pos in range(len(courses)):
            for word in normative:
                if word in courses[title_pos]:
                    classes = links[title_pos][links[title_pos].index('Academics'):]
                    page_response2 = requests.get('https://my.jcsu.edu/ICS/'+classes)
                    soup2 = BeautifulSoup(page_response2.content, 'html.parser')
                    courses2 = [j.get_text() for j in soup2.find_all('a')]
                    links2 = [h.get('href') for h in soup2.find_all('a')]
                    match = links[title_pos][links[title_pos].index('Academics'):]
                    match = match[14:len(match)-1]
                    match = match.replace('_',' ')
                    links2 = links2[:courses2.index('My Account Info')]
                    courses2 = courses2[:courses2.index('My Account Info')]
                    for course_pos in range(len(courses2)):
                        if match in courses2[course_pos]:
                            if any(yr in links2[course_pos] for yr in ['2017','2018','2019']):
                                page_response3 = requests.get('https://my.jcsu.edu'+links2[course_pos])
                                soup3 = BeautifulSoup(page_response3.content, 'html.parser')
                                titles.append(courses[title_pos])
                                termInfo = [x.get_text().split('\n') for x in soup3.find_all('div', attrs={'id': 'TermInfo'})]
                                termInfo = termInfo[0]
                                faculty = [y.get_text().split('\n') for y in soup3.find_all('div', attrs={'id': 'Faculty'})]
                                faculty = faculty[0]
                                courseDescrip = [z.get_text().split('\n') for z in soup3.find_all('div', attrs={'id': 'CourseDescription'})]
                                courseDescrip = courseDescrip[0]
                                term.append(termInfo[5])
                                dept_nums.append(termInfo[2][termInfo[2].index('(')+1:])
                                profs.append(faculty[6])
                                syllabi.append('None')
                                uni.append('Johnson C Smith University')
                                keyword.append(word)
                                descs.append(courseDescrip[2])
                                credit.append(courseDescrip[2])
                                URL.append('https://my.jcsu.edu'+links2[course_pos])

for a,b,c,d,e,f,g,h,i,j in zip(titles,dept_nums,descs,credit,profs,syllabi,uni,term,keyword,URL):
    johnson = johnson.append({'title': a, 
                              'dept_num': b,
                              'description': c,
                              'credits': d,
                              'instructor': e,
                              'syllabus': f,
                              'university': g,
                              'term': h,
                              'keyword': i,
                              'URL': j}, ignore_index=True)


Now that we've extracted all courses containing a normative keyword of interest, we need to filter our courses to only return titles that contain a normative AND a technical keyword. This is the case for all words except instances of our preprocessed `privac` and `secur`, for which we want to return all courses, even if they don't contain two keywords. To do this, we'll split the courses into two data frames, apply our respective conditions, and then merge them back together. 

In [6]:
exceptions = johnson.loc[(johnson['keyword']=='privac') | (johnson['keyword'] =='secur')]
exceptions

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword,URL


In [7]:
#loop through technical keyword list, extract relevant titles
for word in technical:
    df = johnson[johnson['title'].str.contains(word, flags = re.IGNORECASE)]
    df['keyword2'] = word
    
#join keyword cols
df["keyword"] = df["keyword"].map(str) + "," + df["keyword2"]
df = df.drop(columns="keyword2")

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword,URL


NOTE: the above cell is likely not the best nor most simple way to execute this step! Feel free to take special liberties here. It's probably wise to pick out a few titles that you know should be returned manually, then check to see if the script is working as desired. 

In [8]:
#combine dfs 
johnson = pd.concat([df, exceptions])
johnson

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword,URL


Lastly, we want to export our csv. Ideally, all csv files should be written to the courses directory in our repository. 

In [9]:
#export as csv
johnson.to_csv('95-Johnson C Smith University.csv')