## University of California - Los Angeles Crawler

Imports.

In [1]:
import pandas as pd
import numpy as np
import re
import urllib.request #handles urls
import urllib.parse 
import linkGrabber #extracts urls
import json #encodes/decodes json 
import csv 
import requests #downloads a webpage to scrape
from bs4 import BeautifulSoup, NavigableString, Tag #beautifulsoup pulls data from HTML
import nltk #NLP tasks
from nltk import word_tokenize
from nltk.stem import PorterStemmer #removes word endings
stemmer = PorterStemmer()

Keyword preprocessing and url list of relevant catalog years; 2019-20 only since 18-19 cannot be found. Also the backend for the search url.

In [2]:
#keyword preprocessing
def preprocess(keyword):
    keyword = keyword.lower() #lowercase
    keyword = word_tokenize(keyword) #tokenize
    for word in keyword:
        keyword = stemmer.stem(word) #stem 
    return (keyword)

#course catalog URLs - 2 academic years 
urls = ['https://www.registrar.ucla.edu/Academics/Course-Descriptions/Course-Details?qry=']

#list of all the departments to search through 
search = '&funsel=1'

Creation of normative and technical keywords lists, the same as in example crawler.

In [3]:
#import keywords
keywords = pd.read_csv("keywords.csv")
technical = keywords[(keywords['Technical/Normative']=='T') & (keywords['Include']=='Y')].Keyword
normative = keywords[(keywords['Technical/Normative']=='N') & (keywords['Include']=='Y')].Keyword
normative = [preprocess(i) for i in normative]
technical = [preprocess(i) for i in technical] 

#replace keywords of interest
normative = [w.replace('privaci', 'privac') for w in normative]
normative = [w.replace('democraci', 'democra') for w in normative]
normative = [w.replace('equiti', 'equit') for w in normative]
normative = [w.replace('histori', 'histor') for w in normative]
normative = [w.replace('justice', 'justic') for w in normative]
normative = [w.replace('liberti', 'libert') for w in normative]
normative = [w.replace('philosophi', 'philosoph') for w in normative]
normative = [w.replace('societi', 'societ') for w in normative]
normative = [w.replace('polici', 'polic') for w in normative]

technical = [w.replace('ai', '^ai') for w in technical]
technical = [w.replace('cs', '^cs') for w in technical]
technical = [w.replace('ict', '^ict') for w in technical]
technical = [w.replace('ml', '^ml') for w in technical]
technical = [w.replace('nlp', '^nlp') for w in technical]

print(normative)
print(technical)

['account', 'critic', 'democra', 'discrimin', 'equal', 'equit', 'ethic', 'fair', 'femin', 'gender', 'govern', 'histor', 'inequ', 'justic', 'law', 'legal', 'libert', 'moral', 'norm', 'philosoph', 'polit', 'power', 'privac', 'race', 'religi', 'respons', 'right', 'secur', 'social', 'societ', 'surveil', 'transpar', 'valu', 'polic']
['^ai', 'algorithm', 'analyt', 'intellig', 'automat', 'code', 'comput', '^cs', 'cyber', 'data', 'digit', '^ict', 'inform', 'intelligen', 'internet', 'machin', '^ml', 'process', '^nlp', 'platform', 'program', 'robot', 'softwar', 'system', 'technolog']


Extraction process for University of California Los Angeles:
1. Loop through each years' catalog.
2. Append keyword to url, then append the rest of the search string to make search page url, note that the search function for UCLA searches descriptions as well as titles.
3. Make a list of all the courses by selecting the class id media-body and loop through them.
4. Check to make sure the keyword is in the course title.
5. If the keyword is in the title, then assign every element of the data columns that can be located.

Data columns are defined in the same way as below and have the same anatomy for each course:
* The course title - after the first instance of '.' in the title, which is the second element in the course list: `title`
* The department and course number - before the first occurance of '.' in the title: `dept_num`
* The course description - the fourth list element for the course: `description`
* The number of credits for the course - the third element in the course list: `credits`
* The course instructor - school does not list in catalog: `instructor`
* The link to the course syllabus (if applicable) - school does not list in catalog: `syllabus`
* The university the course is extracted from - all from the same university: `university`
* The term that the course is offered during (fall, spring, summer / year) - there is only one year found: `term`
* The keyword that triggered the extraction (this is for auditing purposes): `keyword`

In [4]:
#init dfs
angeles = pd.DataFrame(columns=['title','dept_num','description','credits','instructor',
                                'syllabus','university','term','keyword'])
titles = []
dept_nums = []
descs = []
credit = []
profs = []
syllabi = []
uni = []
term = []     
keyword = []

The extraction process. The process to create the table is kept the same as the example crawler, just as a loop on it's own after all the titles, credits, etc. are all gathered.

In [5]:
#looping through each years catalog
for url in urls:
    for word in normative:
        page_link = url + word + search
        page_response = requests.get(page_link)
        soup = BeautifulSoup(page_response.content, 'html.parser')
        courses = [p.get_text().split('\n') for p in soup.select(".media-body")]
        for crs in courses:
            title = crs[1]
            if word in title.lower():
                titles.append(title[title.find('.')+2:])
                dept_nums.append(title[:title.find('.')])
                if(crs[3]==''): descs.append('No description available.')
                else: descs.append(crs[3])
                credit.append(crs[2])
                profs.append('Not Listed')
                syllabi.append('Not Listed')
                uni.append('University of California - Los Angeles')
                term.append('2019-20')
                keyword.append(word)
            
for a,b,c,d,e,f,g,h,i in zip(titles,dept_nums,descs,credit,profs,syllabi,uni,term,keyword):
    angeles = angeles.append({'title': a, 
                              'dept_num': b,
                              'description': c,
                              'credits': d,
                              'instructor': e,
                              'syllabus': f,
                              'university': g,
                              'term': h,
                              'keyword': i}, ignore_index=True)


Post filtering of course. Code is identical to that of example crawler.

In [6]:
exceptions = angeles.loc[(angeles['keyword']=='privac') | (angeles['keyword'] =='secur')]
exceptions

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
2301,Privacy versus National Security,165,"Seminar, four hours. Designed for College Hono...",Units: 5,Not Listed,Not Listed,University of California - Los Angeles,2019-20,privac
2302,"Privacy, Data and Technology",483,No description available.,Units: 1,Not Listed,Not Listed,University of California - Los Angeles,2019-20,privac
2576,National Security Affairs/Preparation for Acti...,140A,"Lecture, three hours. Requisites: courses 1A, ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2577,National Security Affairs/Preparation for Acti...,140B,"Lecture, three hours. Requisites: courses 1A, ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2578,National Security Affairs/Preparation for Acti...,140C,"Lecture, three hours. Requisites: courses 1A, ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2579,Hunger and Food Insecurity as Public Health Is...,233,"Lecture, three hours. Designed for graduate st...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2580,Public Health and National Security at U.S.-Me...,440,"Lecture, two hours; discussion, one hour; rese...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2581,Introduction to Computer Security,136,"Lecture, four hours; discussion, two hours; ou...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2582,Computer Security,236,"Lecture, four hours; outside study, eight hour...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur
2583,Seminar: Practical Aspects of Biosafety and Bi...,213,"Seminar/discussion, two hours. Preparation: on...",Units: 2,Not Listed,Not Listed,University of California - Los Angeles,2019-20,secur


In [7]:
#loop through technical keyword list, extract relevant titles
for word in technical:
    df = angeles[angeles['title'].str.contains(word, flags = re.IGNORECASE)]
    df['keyword2'] = word
    
#join keyword cols
df["keyword"] = df["keyword"].map(str) + "," + df["keyword2"]
df = df.drop(columns="keyword2")

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
13,Information Technology in Accounting,142A,"(Formerly numbered 142.) Lecture, seven and on...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"account,technolog"
14,"Communication Technology, Programming, and Acc...",142B,"Lecture, six hours. Preparation: intermediate ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"account,technolog"
215,Ethics and Impact of Technology on Society,181EW,"Lecture, five hours; discussion, three hours; ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"ethic,technolog"
540,"Ancient and Historic Metals: Corrosion, Techno...",C180,"Seminar, four hours; laboratory, four hours. O...",Units: 6,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
542,"Ancient and Historic Metals: Corrosion, Techno...",C280,"Seminar, four hours; laboratory, four hours. O...",Units: 6,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
780,Introduction to Historical Practice: Variable ...,97I,"Seminar, three hours. Discussion classes of no...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
943,Variable Topics Historiography Proseminar: Sci...,187I,"Seminar, three hours. Proseminar on historiogr...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
958,Capstone Seminar: History--Science/Technology,191I,"Seminar, three hours. Designed for seniors. Li...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
987,Advanced Historiography: Science/Technology,200O,"Seminar, three hours. May be repeated for credit.",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
1013,Topics in History: Science/Technology,201O,"Seminar, three hours. Graduate course involvin...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"


In [8]:
#combine dfs 
angeles = pd.concat([df, exceptions])
angeles

Unnamed: 0,title,dept_num,description,credits,instructor,syllabus,university,term,keyword
13,Information Technology in Accounting,142A,"(Formerly numbered 142.) Lecture, seven and on...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"account,technolog"
14,"Communication Technology, Programming, and Acc...",142B,"Lecture, six hours. Preparation: intermediate ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"account,technolog"
215,Ethics and Impact of Technology on Society,181EW,"Lecture, five hours; discussion, three hours; ...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"ethic,technolog"
540,"Ancient and Historic Metals: Corrosion, Techno...",C180,"Seminar, four hours; laboratory, four hours. O...",Units: 6,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
542,"Ancient and Historic Metals: Corrosion, Techno...",C280,"Seminar, four hours; laboratory, four hours. O...",Units: 6,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
780,Introduction to Historical Practice: Variable ...,97I,"Seminar, three hours. Discussion classes of no...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
943,Variable Topics Historiography Proseminar: Sci...,187I,"Seminar, three hours. Proseminar on historiogr...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
958,Capstone Seminar: History--Science/Technology,191I,"Seminar, three hours. Designed for seniors. Li...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
987,Advanced Historiography: Science/Technology,200O,"Seminar, three hours. May be repeated for credit.",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"
1013,Topics in History: Science/Technology,201O,"Seminar, three hours. Graduate course involvin...",Units: 4,Not Listed,Not Listed,University of California - Los Angeles,2019-20,"histor,technolog"


Exporting of code to csv.

In [9]:
#export as csv
angeles.to_csv('27 - University of California Los Angeles.csv')