## Georgia Southern University Crawler

Imports.

In [1]:
import pandas as pd
import numpy as np
import re
import urllib.request #handles urls
import urllib.parse 
import linkGrabber #extracts urls
import json #encodes/decodes json 
import csv 
import requests #downloads a webpage to scrape
from bs4 import BeautifulSoup, NavigableString, Tag #beautifulsoup pulls data from HTML
import nltk #NLP tasks
from nltk import word_tokenize
from nltk.stem import PorterStemmer #removes word endings
stemmer = PorterStemmer()

Keyword preprocessing and url list of relevant catalog years; 2019-20 and 2018-19. Also a list of departments.

In [2]:
#keyword preprocessing
def preprocess(keyword):
    keyword = keyword.lower() #lowercase
    keyword = word_tokenize(keyword) #tokenize
    for word in keyword:
        keyword = stemmer.stem(word) #stem 
    return (keyword)

#course catalog URLs - 2 academic years 
urls = ['https://catalog.georgiasouthern.edu/academics/course-descriptions/',
        'https://catalog.georgiasouthern.edu/archive/2018-2019/academics/course-descriptions/']

#list of all the departments to search through 
departments = ['aast/','acct/','aded/','afas/','anth/',
               'apan/','arab/','arch/','art/','artg/',
               'arth/','arts/','astr/','bchm/','biol/',
               'bios/','bkin/','busa/','ceng/','chbe/',
               'chem/','chfd/','chin/','cied/','cism/',
               'coed/','cohe/','coml/','comm/','coms/',
               'coop/','coun/','crju/','csci/','csds/',
               'cvis/','ddts/','ece/','eceg/','econ/',
               'edat/','edci/','edet/','edld/','edms/',
               'edmt/','edrd/','edsc/','educ/','eduf',
               'edur/','eele/','eeng/','eexe/','egc/',
               'elem/','emba/','engl/','engr/','envh/',
               'envs/','epid/','eprs/','epsf/','epy/',
               'esed/','esl/','espy/','ethc/','euro/',
               'facs/','film/','finc/','fmad/','forl',
               'foun/','frct/','frec/','fren/','frer/',
               'frit/','frlt/','frms/','fye/','gcm/',
               'geog/','geol/','geph/','gero/','gnst/',
               'grmn/','gsou/','gsu/','gwst/','hadm/',
               'hist/','hitc/','hlpr/','hlth/','hnrm/',
               'hons/','hsca/','hscc/','hscf/','hscg/',
               'hscp/','hspm/','humn/','ids/','inds/',
               'ints/','ipse/','irsh/','isci/','it/',
               'itec/','itw/','japn/','kins/','last/',
               'latn/','lead/','lesp/','ling/','logt/',
               'lscm/','lstd/','lwso/','maed/','math/',
               'medt/','meng/','metr/','mfge/','mged/',
               'mgms/','mgnt/','mgse/','mhsa/','mktg/',
               'mmfp/','mmj/','msci/','msed/','musa/',
               'musc/','muse/','nsci/','ntfs/','nucm/',
               'nurs/','ocea/','ontl/','oscm/','pbad/',
               'pbhs/','pbis/','peat/','pebc/','peci/',
               'peec/','pehm/','phil/','phld/','phsc/',
               'phth/','phys/','pols/','prca/','psyc/',
               'psyg/','pubh/','radr/','rads/','rdsc/',
               'read/','recr/','reli/','rels/','resp/',
               'rhab/','rlc/','rthr/','sabr/','sced/',
               'scie/','seac/','segc/','serd/','slpa/',
               'smed/','smgt/','soar/','soci/','sono/',
               'span/','sped/','ssci/','stat/','sust/',
               'tcgt/','tcld/','tcm/','tfg/','tget/',
               'thea/','tmae/','tmfg/','tsec/','tsle/',
               'wbit/','wbus/','wgss/','wgst/','wlst/',
               'wmac/','wmba/','writ/','xreg/']

Creation of normative and technical keywords lists, the same as in example crawler.

In [3]:
#import keywords
keywords = pd.read_csv("keywords.csv")
technical = keywords[(keywords['Technical/Normative']=='T') & (keywords['Include']=='Y')].Keyword
normative = keywords[(keywords['Technical/Normative']=='N') & (keywords['Include']=='Y')].Keyword
normative = [preprocess(i) for i in normative]
technical = [preprocess(i) for i in technical] 

#replace keywords of interest
normative = [w.replace('privaci', 'privac') for w in normative]
normative = [w.replace('democraci', 'democra') for w in normative]
normative = [w.replace('equiti', 'equit') for w in normative]
normative = [w.replace('histori', 'histor') for w in normative]
normative = [w.replace('justice', 'justic') for w in normative]
normative = [w.replace('liberti', 'libert') for w in normative]
normative = [w.replace('philosophi', 'philosoph') for w in normative]
normative = [w.replace('societi', 'societ') for w in normative]
normative = [w.replace('polici', 'polic') for w in normative]

technical = [w.replace('ai', '^ai') for w in technical]
technical = [w.replace('cs', '^cs') for w in technical]
technical = [w.replace('ict', '^ict') for w in technical]
technical = [w.replace('ml', '^ml') for w in technical]
technical = [w.replace('nlp', '^nlp') for w in technical]

print(normative)
print(technical)

['account', 'critic', 'democra', 'discrimin', 'equal', 'equit', 'ethic', 'fair', 'femin', 'gender', 'govern', 'histor', 'inequ', 'justic', 'law', 'legal', 'libert', 'moral', 'norm', 'philosoph', 'polit', 'power', 'privac', 'race', 'religi', 'respons', 'right', 'secur', 'social', 'societ', 'surveil', 'transpar', 'valu', 'polic']
['^ai', 'algorithm', 'analyt', 'intellig', 'automat', 'code', 'comput', '^cs', 'cyber', 'data', 'digit', '^ict', 'inform', 'intelligen', 'internet', 'machin', '^ml', 'process', '^nlp', 'platform', 'program', 'robot', 'softwar', 'system', 'technolog']


Extraction process for Georgia Southern University:
1. Loop through each years' catalog.
2. Loop through each of the departments' pages by concatenating the department code to the catalog url
3. On the department page, make a list of all the course titles, credits, and descriptions
4. Loop through all the keywords in the normative list and check to see if the keyword can be found in the course title
5. If the keyword is in the title, then assign every element of the data columns that can be located.

Data columns are defined in the same way as below and have the same anatomy for each course:
* The course title - in the title list, the string sequence after the last instance of \xa0: `title`
* The department and course number - in the title list, the string sequence before the first instance of \xa0: `dept_num`
* The course description - items in the description list: `description`
* The number of credits for the course - items in the credit list: `credits`
* The course instructor - school does not list in catalog: `instructor`
* The link to the course syllabus (if applicable) - school does not list in catalog: `syllabus`
* The university the course is extracted from - all from the same university: `university`
* The term that the course is offered during (fall, spring, summer / year) - Only matched by year through url: `term`
* The keyword that triggered the extraction (this is for auditing purposes): `keyword`

In [4]:
#init dfs
georgia = pd.DataFrame(columns=['title','dept_num','description','credits','instructor',
                                'syllabus','university','term','keyword'])
titles = []
dept_nums = []
descs = []
credit = []
profs = []
syllabi = []
uni = []
term = []     
keyword = []

The extraction process. The process to create the table is kept the same as the example crawler, just as a loop on it's own after all the titles, credits, etc. are all gathered.

In [5]:
#looping through each years catalog
for url in urls:
    #looping through all the departments pages to process individual course's information
    for dept in departments:
        page_link = url + dept
        page_response = requests.get(page_link)
        soup = BeautifulSoup(page_response.content, 'html.parser')
        courses = [p.get_text() for p in soup.select(".courseblocktitle")]
        hours = [p.get_text() for p in soup.select(".noindent.courseblockhours")]
        descrip = [p.get_text() for p in soup.select(".courseblockdesc")]
        for crs in range(len(courses)):
            title = courses[crs]
            for word in normative:
                if word in title.lower():
                    titles.append(title[title.rfind('\xa0')+1:])
                    dept_nums.append(title[:title.rfind('\xa0')-1])
                    if(descrip[crs]==''): descs.append('No description available.')
                    else: descs.append(descrip[crs][descrip[crs].find('\n')+1:])
                    credit.append(hours[crs])
                    profs.append('Not Listed')
                    syllabi.append('Not Listed')
                    uni.append('Georgia Southern University')
                    if(url=='https://catalog.georgiasouthern.edu/academics/course-descriptions/'): 
                        term.append('2019-20')
                    else: term.append('2018-19')
                    keyword.append(word)
            
for a,b,c,d,e,f,g,h,i in zip(titles,dept_nums,descs,credit,profs,syllabi,uni,term,keyword):
    georgia = georgia.append({'title': a, 
                              'dept_num': b,
                              'description': c,
                              'credits': d,
                              'instructor': e,
                              'syllabus': f,
                              'university': g,
                              'term': h,
                              'keyword': i}, ignore_index=True)


Post filtering of course. Code is identical to that of example crawler.

In [6]:
exceptions = georgia.loc[(georgia['keyword']=='privac') | (georgia['keyword'] =='secur')]
exceptions

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
79,Enterprise Infrastructure and Security,CISM 3134,An overview of the technology and management o...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
80,Principles of Enterprise Information Systems ...,CISM 3331,"An introduction to the various policy, adminis...","3 Credit Hours. 0,2 Lecture Hours. 0,1 Lab...",Not Listed,Not Listed,Georgia Southern University,2019-20,secur
120,"Indust, Commer & Private Secur",CRJU 3220,"History, development, and analysis of private...",3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
133,Issues in Homeland Security,CRJU 3931,"Explores the legal, practical, and ethical cha...",3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
163,Software Security and Secure Coding,CSCI 5380,This course covers methodological framework fo...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
164,Computer Security,CSCI 5431,Computer security theory and practice fundamen...,"3 Credit Hours. 0,2 Lecture Hours. 0,2 Lab...",Not Listed,Not Listed,Georgia Southern University,2019-20,secur
165,Computer Security \n,CSCI 5431G,Computer security theory and practice fundamen...,3 Credit Hours. 2 Lecture Hours. 2 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
166,Software Security and Secure Coding,CSCI 7380,This course covers methodological framework fo...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
167,Data and Database Security,CSCI 7433,This course covers data protection approac...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
169,Network and Computer Security,CSCI 7536,An overview of the fundamentals of network and...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur


In [7]:
#loop through technical keyword list, extract relevant titles
for word in technical:
    df = georgia[georgia['title'].str.contains(word, flags = re.IGNORECASE)]
    df['keyword2'] = word
    
#join keyword cols
df["keyword"] = df["keyword"].map(str) + "," + df["keyword2"]
df = df.drop(columns="keyword2")

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
251,Science and Technology Policy,EURO 4330,The purpose of this course is to introduce the...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,"polic,technolog"
560,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,"societ,technolog"
561,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,"valu,technolog"
1092,Science and Technology Policy,EURO 4330,The purpose of this course is to introduce the...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2018-19,"polic,technolog"
1386,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2018-19,"societ,technolog"
1387,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2018-19,"valu,technolog"


In [8]:
#combine dfs 
georgia = pd.concat([df, exceptions])
georgia

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
251,Science and Technology Policy,EURO 4330,The purpose of this course is to introduce the...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,"polic,technolog"
560,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,"societ,technolog"
561,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,"valu,technolog"
1092,Science and Technology Policy,EURO 4330,The purpose of this course is to introduce the...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2018-19,"polic,technolog"
1386,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2018-19,"societ,technolog"
1387,"Technology, Society and Human Values",PHIL 3200,A philisophical exploration of the formative ...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2018-19,"valu,technolog"
79,Enterprise Infrastructure and Security,CISM 3134,An overview of the technology and management o...,3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
80,Principles of Enterprise Information Systems ...,CISM 3331,"An introduction to the various policy, adminis...","3 Credit Hours. 0,2 Lecture Hours. 0,1 Lab...",Not Listed,Not Listed,Georgia Southern University,2019-20,secur
120,"Indust, Commer & Private Secur",CRJU 3220,"History, development, and analysis of private...",3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur
133,Issues in Homeland Security,CRJU 3931,"Explores the legal, practical, and ethical cha...",3 Credit Hours. 3 Lecture Hours. 0 Lab Hours.,Not Listed,Not Listed,Georgia Southern University,2019-20,secur


Exporting of code to csv.

In [9]:
#export as csv
georgia.to_csv('47-Georgia Southern University.csv')