/
SFU_webscraper.Rmd
98 lines (80 loc) · 3.72 KB
/
SFU_webscraper.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
---
title: "SFU Course Outline Webscraper"
output: html_notebook
---
```{r setup, include=FALSE}
# Default chunk option for the rest of the notebook: show each chunk's source
# code alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
# Overview
In this project, I wrote a function that scrapes SFU's course outline webpages by parsing their HTML source, cleans the extracted data, and displays it in a table.
The classes used are: STAT 270, STAT 100, STAT 240, and STAT 203.
The websites used are:
http://www.sfu.ca/outlines.html?2019/spring/stat/100/d100, https://www.sfu.ca/outlines.html?2019/spring/stat/203/d100, https://www.sfu.ca/outlines.html?2019/spring/stat/240/d100, and https://www.sfu.ca/outlines.html?2019/spring/stat/270/d100
```{r}
# Outline pages to scrape, one URL per course section.
course_url <- c(
  "http://www.sfu.ca/outlines.html?2019/spring/stat/100/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/203/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/240/d100",
  "https://www.sfu.ca/outlines.html?2019/spring/stat/270/d100"
)
#' Scrape SFU course outline pages into a table
#'
#' Reads each outline page line by line with `readLines()` and extracts fields
#' from the raw HTML using regular expressions. Several fields are located by
#' a fixed line offset from a marker line, so the extraction depends on the
#' line layout of the page.
#'
#' @param course_url Character vector of page locations. Each element is
#'   passed to `readLines()`, so URLs and local file paths both work.
#' @return A data frame with one row per element of `course_url` and the
#'   columns `Class.Number`, `Delivery.Method`, `Course.Name.And.Number`,
#'   `Title`, `Instructor`, `Course.Times.and.Locations`, `Textbook` and
#'   `Exam.Time.and.Location`. A field whose marker line or pattern is not
#'   found on a page is `NA`. Returns `NULL` when `course_url` is empty.
courses <- function(course_url) {
  # Any HTML tag.
  tag <- "<[^>]+>"
  # A space, one punctuation character, alphanumerics, one punctuation
  # character. NOTE(review): presumably targets HTML entities such as
  # " &nbsp;" left behind after tag removal -- confirm against a live page.
  entity_like <- " [[:punct:]][[:alnum:]]+[[:punct:]]"

  # First element of `x`, or NA when nothing was extracted, so that a missing
  # field becomes NA instead of a zero-length column that breaks data.frame().
  first_or_na <- function(x) {
    if (length(x) > 0) x[1] else NA_character_
  }

  # Index of the last line of `page` matching `pattern` (integer(0) if none).
  last_hit <- function(pattern, page) {
    hits <- grep(pattern, page)
    hits[length(hits)]
  }

  # Extract all fields from one outline page; returns a one-row data frame.
  scrape_page <- function(url) {
    # warn = FALSE suppresses readLines() warnings about a missing final
    # newline or embedded nuls, which are noise for downloaded HTML.
    course_page <- readLines(url, warn = FALSE)

    # Class number and delivery method: first and second <h3> lines.
    h3_text <- gsub(tag, "", course_page[grep("<h3", course_page)])
    details <- gsub("^\\s+|\\s+$", "", h3_text)
    classnum <- gsub("[^[:digit:]]", "", details[1])
    # The delivery method is the last two words of the second <h3> line.
    delivmethod <- regmatches(
      details[2],
      regexpr("[[:alpha:]]+\\s[[:alpha:]]+$", details[2])
    )

    # Course name and number (e.g. "STAT 100"): taken from the second <h1>.
    h1_text <- gsub(tag, "", course_page[grep("<h1", course_page)][2])
    h1_text <- gsub("^\\s+|\\s+$", "", h1_text)
    coursenum <- regmatches(
      h1_text,
      regexpr("[[:upper:]]{2,}\\s[[:digit:]]+\\b", h1_text)
    )

    # Course title: the line right after the last <h2 id="title"> line.
    title_at <- last_hit('<h2 id="title">', course_page)
    coursetitle <- gsub("^\\s+", "", course_page[title_at + 1])

    # Instructor: the line right after the last "Instructor:" heading.
    instructor_at <- last_hit("<h4>Instructor:</h4>", course_page)
    instructorname <- gsub(tag, "", course_page[instructor_at + 1])
    instructorname <- gsub("^\\s+", "", instructorname)

    # Class times: the second line that closes an <h4> or a <p>.
    closing_lines <- course_page[grepl("</h4>|</p>", course_page)]
    classtimes <- gsub(tag, " ", closing_lines[2])
    classtimes <- gsub("(^\\s+| $)", "", classtimes)
    classtimes <- gsub(entity_like, "", classtimes)

    # Textbook: four lines below the first line containing "READING".
    # reading_at[1] is NA when there is no such line, which indexes to NA.
    reading_at <- grep("READING", course_page)
    textbook <- gsub(tag, " ", course_page[reading_at[1] + 4])
    textbook <- gsub("(^\\s+| $)", "", textbook)
    textbook <- gsub("[&]+[[:alnum:]]{3,}[;]", "", textbook) # HTML entities
    textbook <- gsub("(\\s{2,})", " ", textbook)

    # Exam time and location, read relative to the last "Exam Times" line.
    exam_at <- last_hit("Exam Times", course_page)
    if (length(exam_at) == 0) {
      exam <- NA_character_
    } else {
      # Lines at offsets 0, 2, 3, 6 and 7 from the marker (offsets 1, 4 and 5
      # are skipped); offsets past the end of the page index to NA.
      exam_text <- gsub(tag, " ", course_page[exam_at + c(0, 2, 3, 6, 7)])
      exam_text <- gsub("(^\\s+| $)", "", exam_text)
      exam_text <- gsub(entity_like, "", exam_text)
      # NOTE(review): "[PM]" is a character class (a single "P" or "M"), not
      # the literal "PM"; kept unchanged -- confirm the intent against a live
      # page before tightening it.
      times <- regmatches(exam_text, regexpr(".+[PM]\\b", exam_text))
      place_pattern <-
        "[[:upper:]]+\\s[[:digit:]]{4,}(\\s|[[:punct:]])+[[:alnum:]]+$"
      place <- regmatches(exam_text, regexpr(place_pattern, exam_text))
      # Up to two time/place pairs with the fourth kept line in between;
      # whatever is missing (NA) is dropped before joining.
      exam_parts <- c(times[1], place[1], exam_text[4], times[2], place[2])
      exam <- paste(exam_parts[!is.na(exam_parts)], collapse = " ")
    }

    data.frame(
      Class.Number = first_or_na(classnum),
      Delivery.Method = first_or_na(delivmethod),
      Course.Name.And.Number = first_or_na(coursenum),
      Title = first_or_na(coursetitle),
      Instructor = first_or_na(instructorname),
      Course.Times.and.Locations = first_or_na(classtimes),
      Textbook = first_or_na(textbook),
      Exam.Time.and.Location = first_or_na(exam)
    )
  }

  # One data frame per page, bound once at the end. unname() keeps automatic
  # row names even if `course_url` is a named vector, and do.call(rbind, list())
  # is NULL, so an empty input yields NULL.
  rows <- lapply(course_url, scrape_page)
  do.call(rbind, unname(rows))
}
# Scrape every page listed in course_url; as a top-level expression, the
# returned table is printed in the notebook output.
courses(course_url)
```