## Johnson University Catalog Crawler

Imports

In [1]:
import pandas as pd
import numpy as np
import re
import urllib.request #handles urls
import urllib.parse 
import linkGrabber #extracts urls
import json #encodes/decodes json 
import csv 
import requests #downloads a webpage to scrape
from bs4 import BeautifulSoup, NavigableString, Tag #beautifulsoup pulls data from HTML
import nltk #NLP tasks
from nltk import word_tokenize
from nltk.stem import PorterStemmer #removes word endings
stemmer = PorterStemmer()

Since Johnson's catalog is a search page with all the features being javascript interactive modules, I condensed the orinal list urls to just the singular url search page. Even for different terms, the url stays the same so condensing saves time with the loop later on.

In [2]:
#keyword preprocessing
def preprocess(keyword):
    keyword = keyword.lower() #lowercase
    keyword = word_tokenize(keyword) #tokenize
    for word in keyword:
        keyword = stemmer.stem(word) #stem 
    return (keyword)

#course catalog URL
url = 'https://my.jcsu.edu/ICS/Academics/'

#list of all the departments to search through 
departments = ['AES/','ACC/','XAC/','ACE/','AAS/','AAP/',
               'AER/','ARA/','ART/','BAF/','BCT/','BRC/',
               'BIL/','BIO/','RTV/','BUS/','XCA/','CW/',
               'CRE/','CHE/','CHI/','COM/','CSE/','CSC/',
               'CON/','COO/','CRM/','DRA/','ECO/','EDU/',
               'ELE/','EGR/','ENN/','ENT/','ETH/','FLF/',
               'FLS/','FL/','FRE/','FR/','GEN/','GEO/',
               'GER/','GBA/','HLT/','HCP/','HIS/','WBA/',
               'HON/','CDR/','HDR/','AEN/','CEN/','HEN/',
               'HLA/','HLS/','CPO/','HPO/','HRH/','HUM/',
               'IDS/','ITA/','JP/','JOU/','JGA/','JGD/',
               'ENG/','LAT/','LA/','LS/','LY/','MGT/',
               'XMG/','MAR/','MKT/','XMK/','MPL/','XMT/',
               'MTH/','MED/','MSC/','MUS/','XHD/','NUR/',
               'ORT/','HED/','PHI/','PEH/','PED/','PHS/',
               'PHY/','PSC/','XXX/','PLC/','LAW/','POL/',
               'RPO/','PSY/','PLS/','PUR/','RT/','RDG/',
               'REL/','NSC/','RHC/','RUS/','SCE/','SPL/',
               'SMS/','SSC/','SWK/','SOC/','SPA/','SPE/',
               'SPM/','LPD/','INS/','SUS/','TEL/','ZZZ/',
               'URB/','VBA/','VPD/','VPF/','VPG/','VPM/',
               'VPS/','VPT/','VPA/','WEL/']

Keyword preprocessing occurs in exactly the same manner as for the example crawler.

In [3]:
#import keywords
keywords = pd.read_csv("keywords.csv")
technical = keywords[(keywords['Technical/Normative']=='T') & (keywords['Include']=='Y')].Keyword
normative = keywords[(keywords['Technical/Normative']=='N') & (keywords['Include']=='Y')].Keyword
normative = [preprocess(i) for i in normative]
technical = [preprocess(i) for i in technical] 

#replace keywords of interest
normative = [w.replace('privaci', 'privac') for w in normative]
normative = [w.replace('democraci', 'democra') for w in normative]
normative = [w.replace('equiti', 'equit') for w in normative]
normative = [w.replace('histori', 'histor') for w in normative]
normative = [w.replace('justice', 'justic') for w in normative]
normative = [w.replace('liberti', 'libert') for w in normative]
normative = [w.replace('philosophi', 'philosoph') for w in normative]
normative = [w.replace('societi', 'societ') for w in normative]
normative = [w.replace('polici', 'polic') for w in normative]

technical = [w.replace('ai', '^ai') for w in technical]
technical = [w.replace('cs', '^cs') for w in technical]
technical = [w.replace('ict', '^ict') for w in technical]
technical = [w.replace('ml', '^ml') for w in technical]
technical = [w.replace('nlp', '^nlp') for w in technical]

print(normative)
print(technical)

['account', 'critic', 'democra', 'discrimin', 'equal', 'equit', 'ethic', 'fair', 'femin', 'gender', 'govern', 'histor', 'inequ', 'justic', 'law', 'legal', 'libert', 'moral', 'norm', 'philosoph', 'polit', 'power', 'privac', 'race', 'religi', 'respons', 'right', 'secur', 'social', 'societ', 'surveil', 'transpar', 'valu', 'polic']
['^ai', 'algorithm', 'analyt', 'intellig', 'automat', 'code', 'comput', '^cs', 'cyber', 'data', 'digit', '^ict', 'inform', 'intelligen', 'internet', 'machin', '^ml', 'process', '^nlp', 'platform', 'program', 'robot', 'softwar', 'system', 'technolog']


The same idea as the example is followed here:
1. First, we want to find and extract all courses that contain any instance of a normative keyword.
2. Then, we want search within these courses to see if it also contains a technical keyword.

The items of interest remain:
* The course title: `title`
* The department and course number: `dept_num`
* The course description: `description`
* The number of credits for the course: `credits`
* The course instructor: `instructor`
* The link to the course syllabus (if applicable): `syllabus`
* The university the course is extracted from: `university`
* The term that the course is offered during (fall, spring, summer / year): `term`
* The keyword that triggered the extraction (this is for auditing purposes): `keyword`

In [4]:
#init dfs
johnson = pd.DataFrame(columns=['title','dept_num','description','credits','instructor',
                                'syllabus','university','term','keyword','URL'])
titles = []
dept_nums = []
descs = []
credit = []
profs = []
syllabi = []
uni = []
term = []     
keyword = []
URL = []

Part 1 is represented by the loop.

In [5]:
#loop through all normative words and extract relevant elements 
for dept in departments:
        page_link = url + dept
        page_response = requests.get(page_link)
        soup = BeautifulSoup(page_response.content, 'html.parser')
        courses = [p.get_text() for p in soup.find_all('a')]
        links = [r.get('href') for r in soup.find_all('a')]
        for title_pos in range(len(courses)):
            for word in normative:
                if word in courses[title_pos]:
                    print('\n', courses[title_pos], '\n', links[title_pos][links[title_pos].index('Academics'):])
                    
                    titles.append(courses[title_pos])
                    classes = links[title_pos][links[title_pos].index('Academics'):]
                    page_response2 = requests.get('https://my.jcsu.edu/ICS/'+classes)
                    soup2 = BeautifulSoup(page_response2.content, 'html.parser')
                    courses2 = [j.get_text() for j in soup2.find_all('a')]
                    links2 = [h.get('href') for h in soup2.find_all('a')]
                    print('\n', courses2, '\n', links2, '\n', len(courses2)==len(links2))
                    
                    match = links[title_pos][links[title_pos].index('Academics'):]
                    match = match[14:len(match)-1]
                    match = match.replace('_',' ')
                    for course_pos in range(len(courses2)):
                        if match in courses2[course_pos]:
                            if any(yr in ['2017','2018','2019'] for yr in links2[course_pos]):
                                page_response3 = requests.get('https://my.jcsu.edu/ICS'+links2[course_pos][1:])
                                soup3 = BeautifulSoup(page_response3.content, 'html.parser')
                                information = [j.get_text() for j in soup3.find_all('a')]

# for a,b,c,d,e,f,g,h,i,j in zip(titles,dept_nums,descs,credit,profs,syllabi,uni,term,keyword,URL):
#         johnson = johnson.append({'title': a, 
#                                     'dept_num': b,
#                                     'description': c,
#                                     'credits': d,
#                                     'instructor': e,
#                                     'syllabus': f,
#                                     'university': g,
#                                     'term': h,
#                                     'keyword': i,
#                                     'URL': j}, ignore_index=True)



 National Security Affairs Prep II 
 Academics/AER/AER__322/

 ['Skip To Content', '', '', '', '', '', '', 'Johnson C. Smith University', 'Search', 'I forgot my password', 'Home', 'Admissions', 'Alumni', 'QEP', 'JCSU Bookstore', 'Support Services', 'Academics', 'Academics', 'Air Force ROTC - AER', 'National Security Affairs Prep II', 'Main Page', 'National Security Affairs Prep II', 'Main Page', 'JCSU IT Help Desk', 'IT Help Desk Assistance', 'My Account Info', 'JCSU eMarket', 'JCSU Canvas', 'Campus Email', 'James B. Duke Library', 'JCSU.edu', 'Golden Bull Sports', 'JCSU Bookstore', 'National Security Affairs Prep II', '\n\n', 'Privacy Policy', 'About Us', 'Contact Us', 'Campus Directory'] 
 ['#', 'https://www.facebook.com/smithites', 'https://twitter.com/JCSUniversity', 'https://www.youtube.com/user/JohnsonCSmithU', 'http://www.linkedin.com/company/johnson-c.-smith-university?utm_campaign=Argyle+Social-2013-01&utm_medium=Argyle+Social&utm_source=General+Use&utm_term=2013-01-07-17-47-


 Program Planning and Evaluation 
 Academics/HED/HED__334/

 ['Skip To Content', '', '', '', '', '', '', 'Johnson C. Smith University', 'Search', 'I forgot my password', 'Home', 'Admissions', 'Alumni', 'QEP', 'JCSU Bookstore', 'Support Services', 'Academics', 'Academics', 'Personal Health - HED', 'Program Planning and Evaluation', 'Main Page', 'Program Planning and Evaluation', 'Main Page', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'HED  334 A - Program Planning and Evaluation', 'JCSU IT Help Desk', 'IT Help


 Measurement & Evaluation in HP 
 Academics/PED/PED__341/

 ['Skip To Content', '', '', '', '', '', '', 'Johnson C. Smith University', 'Search', 'I forgot my password', 'Home', 'Admissions', 'Alumni', 'QEP', 'JCSU Bookstore', 'Support Services', 'Academics', 'Academics', 'Physical Education - PED', 'Measurement & Evaluation in HP', 'Main Page', 'Measurement & Evaluation in HP', 'Main Page', 'PED  341 A - Measurement & Evaluation in HP', 'PED  341 A - Measurement & Evaluation in HP', 'JCSU IT Help Desk', 'IT Help Desk Assistance', 'My Account Info', 'JCSU eMarket', 'JCSU Canvas', 'Campus Email', 'James B. Duke Library', 'JCSU.edu', 'Golden Bull Sports', 'JCSU Bookstore', 'Measurement & Evaluation in HP', '\n\n', 'PED  341 A - Measurement & Evaluation in HP', 'PED  341 A - Measurement & Evaluation in HP', 'Privacy Policy', 'About Us', 'Contact Us', 'Campus Directory'] 
 ['#', 'https://www.facebook.com/smithites', 'https://twitter.com/JCSUniversity', 'https://www.youtube.com/user/Johnson


 ['Skip To Content', '', '', '', '', '', '', 'Johnson C. Smith University', 'Search', 'I forgot my password', 'Home', 'Admissions', 'Alumni', 'QEP', 'JCSU Bookstore', 'Support Services', 'Academics', 'Academics', 'Psychology - PSY', 'Abnormal Psychology', 'Main Page', 'Abnormal Psychology', 'Main Page', 'PSY  432 - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 S - Abnormal Psychology', 'PSY  432 S - Abnormal Psychology', 'JCSU IT Help Desk', 'IT Help Desk Assistance', 'My Account Info', 'JCSU eMarket', 'JCSU Canvas', 'Campus Email', 'James B. Duke Library', 'JCSU.edu', 'Golden Bull Sports', 'JCSU Bookstore', 'Abnormal Psychology', '\n\n', 'PSY  432 - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 A - Abnormal Psychology', 'PSY  432 S - Abnormal Psychology', 'PSY  43

Now that we've extracted all courses containing a normative keyword of interest, we need to filter our courses to only return titles that contain a normative AND a technical keyword. This is the case for all words except instances of our preprocessed `privac` and `secur`, for which we want to return all courses, even if they don't contain two keywords. To do this, we'll split the courses into two data frames, apply our respective conditions, and then merge them back together. 

In [185]:
exceptions = johnson.loc[(johnson['keyword']=='privac') | (johnson['keyword'] =='secur')]
exceptions

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
20,Security and Privacy Concepts in the Wild,CS 5435,This course will impart a technical and social...,3,"Juels, A",1 available,cornell university,Fall 2017,privac
25,Fixed-Income Securities,AEM 4260,Focuses on fixed-income securities including c...,4,"Bogan, V",none,cornell university,Fall 2017,secur
49,Privacy in the Digital Age,CS 5436,This course introduces students to privacy tec...,3-4,"Nissenbaum, H",none,cornell university,Spring 2018,privac
54,Practitioner's Overview of Securities Markets ...,AEM 3060,A broad overview of various aspects of the Fix...,1,"Edwards, A",1 available,cornell university,Spring 2018,secur
93,Security and Privacy Concepts in the Wild,CS 5435,This course will impart a technical and social...,3,"Juels, A",1 available,cornell university,Fall 2018,privac
98,National Security Affairs / Preparation for Ac...,AIRS 4401,This course is designed for college seniors an...,3,"Heath, M",none,cornell university,Fall 2018,secur
123,"Internet Law, Privacy and Security",LAW 6568,"This is a survey course in Internet law, with ...",3,"Grimmelmann, J",1 available,cornell university,Spring 2019,privac
128,Practitioner's Overview of Securities Markets ...,AEM 3060,A broad overview of various aspects of the Fix...,1,"Edwards, A",1 available,cornell university,Spring 2019,secur


In [187]:
#loop through technical keyword list, extract relevant titles
for word in technical:
    df = johnson[johnson['title'].str.contains(word, flags = re.IGNORECASE)]
    df['keyword2'] = word
    
#join keyword cols
df["keyword"] = df["keyword"].map(str) + "," + df["keyword2"]
df = df.drop(columns="keyword2")

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
38,"Gendering Religion, Science and Technology",AMST 2621,"There are several ""just-so stories"" about scie...",4,"Rock-Singer, C",none,cornell university,Spring 2018,"gender,technolog"


NOTE: the above cell is likely not the best nor most simple way to execute this step! Feel free to take special liberties here. It's probably wise to pick out a few titles that you know should be returned manually, then check to see if the script is working as desired. 

In [188]:
#combine dfs 
johnson = pd.concat([df, exceptions])
johnson

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
38,"Gendering Religion, Science and Technology",AMST 2621,"There are several ""just-so stories"" about scie...",4,"Rock-Singer, C",none,cornell university,Spring 2018,"gender,technolog"
20,Security and Privacy Concepts in the Wild,CS 5435,This course will impart a technical and social...,3,"Juels, A",1 available,cornell university,Fall 2017,privac
25,Fixed-Income Securities,AEM 4260,Focuses on fixed-income securities including c...,4,"Bogan, V",none,cornell university,Fall 2017,secur
49,Privacy in the Digital Age,CS 5436,This course introduces students to privacy tec...,3-4,"Nissenbaum, H",none,cornell university,Spring 2018,privac
54,Practitioner's Overview of Securities Markets ...,AEM 3060,A broad overview of various aspects of the Fix...,1,"Edwards, A",1 available,cornell university,Spring 2018,secur
93,Security and Privacy Concepts in the Wild,CS 5435,This course will impart a technical and social...,3,"Juels, A",1 available,cornell university,Fall 2018,privac
98,National Security Affairs / Preparation for Ac...,AIRS 4401,This course is designed for college seniors an...,3,"Heath, M",none,cornell university,Fall 2018,secur
123,"Internet Law, Privacy and Security",LAW 6568,"This is a survey course in Internet law, with ...",3,"Grimmelmann, J",1 available,cornell university,Spring 2019,privac
128,Practitioner's Overview of Securities Markets ...,AEM 3060,A broad overview of various aspects of the Fix...,1,"Edwards, A",1 available,cornell university,Spring 2019,secur


Lastly, we want to export our csv. Ideally, all csv files should be written to the courses directory in our repository. 

In [189]:
#export as csv
johnson.to_csv('95-Johnson C Smith University.csv')