Projects/SFU webscraper/SFU_webscraper.Rmd

---
title: "SFU Course Outline Webscraper"
output: html_notebook
---

```{r setup, include=FALSE}
# Default chunk option for the whole notebook: echo = TRUE.
knitr::opts_chunk$set(echo = TRUE)
```

#
In this project, I wrote a function that scrapes and cleans data from SFU's course outline webpages by parsing their HTML source, and displays the result in a table.

The classes used are: STAT 270, STAT 100, STAT 240, and STAT 203.

The websites used are:
http://www.sfu.ca/outlines.html?2019/spring/stat/100/d100, https://www.sfu.ca/outlines.html?2019/spring/stat/203/d100, https://www.sfu.ca/outlines.html?2019/spring/stat/240/d100, and https://www.sfu.ca/outlines.html?2019/spring/stat/270/d100

```{r}
# Outline pages to scrape, one URL per course.
course_url <- c(
  "http://www.sfu.ca/outlines.html?2019/spring/stat/100/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/203/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/240/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/270/d100"
)

# Scrape SFU course-outline pages into a table with one row per page.
#
# Arguments:
#   course_url  Character vector of page locations. Each element is handed to
#               readLines(), so both URLs and local file paths are accepted.
#
# Returns:
#   A data.frame with the columns Class.Number, Delivery.Method,
#   Course.Name.And.Number, Title, Instructor, Course.Times.and.Locations,
#   Textbook and Exam.Time.and.Location, or NULL when `course_url` is empty.
#
# The extraction is purely line-based: each field is found by locating an
# anchor line (e.g. "<h3", "READING", "Exam Times") and reading fixed offsets
# from it. A field whose anchor or pattern is not found becomes NA (the exam
# field becomes "") instead of aborting the whole run.
courses <- function(course_url) {
  # Drop HTML tags, optionally leaving `replacement` where each tag was.
  strip_tags <- function(x, replacement = "") {
    gsub("<[^>]+>", replacement, x)
  }

  # regmatches() yields character(0) on no match; data.frame() needs length 1.
  first_or_na <- function(x) {
    if (length(x) == 0L) NA_character_ else x[1]
  }

  # Extract every field from a single outline page as a one-row data.frame.
  scrape_one <- function(page_url) {
    course_page <- readLines(page_url)

    # Index of the last / first line matching `pattern`, NA when absent.
    last_hit <- function(pattern) {
      hits <- grep(pattern, course_page)
      if (length(hits) == 0L) NA_integer_ else hits[length(hits)]
    }
    first_hit <- function(pattern) {
      hits <- grep(pattern, course_page)
      if (length(hits) == 0L) NA_integer_ else hits[1]
    }

    # Lines at `start + offsets`; all NA when the anchor line was not found.
    lines_at <- function(start, offsets) {
      if (is.na(start)) {
        return(rep(NA_character_, length(offsets)))
      }
      course_page[start + offsets]
    }

    # Class number and delivery method: text of the first two <h3> lines.
    details <- strip_tags(course_page[grep("<h3", course_page)])
    details <- gsub("^\\s+|\\s+$", "", details)
    classnum <- gsub("[^[:digit:]]", "", details[1])
    delivmethod <- first_or_na(regmatches(
      details[2],
      regexpr("[[:alpha:]]+\\s[[:alpha:]]+$", details[2])
    ))

    # Course number: the second <h1> line, e.g. "STAT 100".
    h1_text <- strip_tags(course_page[grep("<h1", course_page)][2])
    h1_text <- gsub("^\\s+|\\s+$", "", h1_text)
    coursenum <- first_or_na(regmatches(
      h1_text,
      regexpr("[[:upper:]]{2,}\\s[[:digit:]]+\\b", h1_text)
    ))

    # Course title: the line right after the last <h2 id="title"> line.
    title_lines <- lines_at(last_hit('<h2 id="title">'), 0:1)
    coursetitle <- gsub("^\\s+", "", title_lines[2])

    # Instructor: the line right after the last "Instructor:" heading.
    instructor_lines <- strip_tags(
      lines_at(last_hit("<h4>Instructor:</h4>"), -1:1)
    )
    instructorname <- gsub("^\\s+", "", instructor_lines[3])

    # Class times: the second line that closes an <h4> or <p> element.
    closing_lines <- course_page[grepl("</h4>|</p>", course_page)]
    classtimes <- strip_tags(closing_lines[2], " ")
    classtimes <- gsub("(^\\s+| $)", "", classtimes)
    classtimes <- gsub(" [[:punct:]][[:alnum:]]+[[:punct:]]", "", classtimes)

    # Textbook: four lines below the first "READING" line. The first hit is
    # selected explicitly; passing several hits to `:` only used the first
    # one anyway, but with a warning.
    reading_lines <- strip_tags(lines_at(first_hit("READING"), 0:5), " ")
    textbook <- gsub("(^\\s+| $)", "", reading_lines[5])
    textbook <- gsub("[&]+[[:alnum:]]{3,}[;]", "", textbook) # HTML entities
    textbook <- gsub("(\\s{2,})", " ", textbook)

    # Exam: eight lines starting at the last "Exam Times" line, minus the
    # 2nd, 5th and 6th.
    exam_lines <- lines_at(last_hit("Exam Times"), 0:7)[-c(2, 5, 6)]
    exam_lines <- strip_tags(exam_lines, " ")
    exam_lines <- gsub("(^\\s+| $)", "", exam_lines)
    exam_lines <- gsub(" [[:punct:]][[:alnum:]]+[[:punct:]]", "", exam_lines)
    # NOTE(review): "[PM]" is a character class (P or M), not the literal
    # "PM"; it is kept as is because it also matches times ending in "AM".
    times <- regmatches(exam_lines, regexpr(".+[PM]\\b", exam_lines))
    place <- regmatches(
      exam_lines,
      regexpr(
        "[[:upper:]]+\\s[[:digit:]]{4,}(\\s|[[:punct:]])+[[:alnum:]]+$",
        exam_lines
      )
    )
    exam_parts <- c(times[1], place[1], exam_lines[4], times[2], place[2])
    exam <- paste0(exam_parts[!is.na(exam_parts)], collapse = " ")

    data.frame(
      Class.Number = classnum,
      Delivery.Method = delivmethod,
      Course.Name.And.Number = coursenum,
      Title = coursetitle,
      Instructor = instructorname,
      Course.Times.and.Locations = classtimes,
      Textbook = textbook,
      Exam.Time.and.Location = exam
    )
  }

  # Build all rows first and bind once; unname() keeps default row names.
  # do.call(rbind, list()) is NULL, so empty input returns NULL.
  rows <- lapply(course_url, scrape_one)
  do.call(rbind, unname(rows))
}

# Scrape every page listed in course_url; the resulting table is the chunk's output.
courses(course_url)
```