-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.rb
158 lines (135 loc) · 5.56 KB
/
spider.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
require 'rest_client'
require 'nokogiri'
require 'json'
require 'iconv'
require 'uri'
require_relative 'course.rb'
# 難得寫註解,總該碎碎念。
# Crawler for the course-query web application hosted at 140.118.31.215.
#
# Typical use:
#   spider = Spider.new
#   spider.prepare_post_data  # fetch form tokens, semester list and cookies
#   spider.get_courses        # crawl every course of one semester
#   spider.save_to            # dump the collected courses as JSON
class Spider
  attr_reader :semester_list, :courses_list, :query_url, :result_url

  # Stores the query-form and result-page URLs and starts with an empty
  # course collection, so that save_to is well defined before any crawl.
  def initialize
    @query_url = "http://140.118.31.215/querycourse/ChCourseQuery/QueryCondition.aspx"
    @result_url = "http://140.118.31.215/querycourse/ChCourseQuery/QueryResult.aspx"
    @courses = []
  end

  # Loads the query form once and remembers everything a later POST needs:
  # the three hidden form-validation fields, the selectable semester values
  # and the response cookies.
  #
  # Returns nil.
  def prepare_post_data
    response = RestClient.get @query_url
    query_page = Nokogiri::HTML(response.to_s)

    @view_state = hidden_field(query_page, '__VIEWSTATE')
    @view_state_generator = hidden_field(query_page, '__VIEWSTATEGENERATOR')
    @event_validation = hidden_field(query_page, '__EVENTVALIDATION')
    @semester_list = query_page.css('#semester_list option').map { |option| option['value'] }
    @cookies = response.cookies
    nil
  end

  # Crawls the course list of one semester and every course's detail page,
  # replacing any previously collected courses.
  #
  # sem - index into semester_list (default 0, the first option of the form).
  #
  # Prints progress to stdout. Raises RuntimeError when the result page has no
  # table; HTTP errors from RestClient propagate unchanged.
  #
  # Returns nil.
  def get_courses(sem = 0)
    # Fetch the form tokens on demand instead of failing on a nil semester list.
    prepare_post_data if @semester_list.nil?

    @courses = []
    submit_query(sem)

    # The result page is tied to the session, so the cookies must be sent along.
    puts "loading Courses List..."
    listing = RestClient.get(@result_url, :cookies => @cookies)
    @courses_list = Nokogiri::HTML(listing.to_s)

    course_rows.each_with_index do |row, index|
      # Lightweight progress log.
      print "#{index}, "

      cells = row.css('td')
      # Skip filler rows: rows lacking the columns read below or a detail link.
      next if cells.length < 12 || cells[2].css('a').empty?

      @courses << build_course(cells)
    end
    nil
  end

  # Writes the collected courses to +filename+ as pretty-printed JSON.
  #
  # Returns whatever the underlying IO#write call returns.
  def save_to(filename = 'courses.json')
    File.open(filename, 'w') { |file| file.write(JSON.pretty_generate(@courses)) }
  end

  private

  # Value of the hidden <input> called +name+ on +page+.
  def hidden_field(page, name)
    page.css(%(input[name="#{name}"])).first['value']
  end

  # Posts the query form so the server registers the search for this session.
  def submit_query(sem)
    post_data = {
      :__VIEWSTATE => @view_state,
      :__VIEWSTATEGENERATOR => @view_state_generator,
      :__EVENTVALIDATION => @event_validation,
      :Acb0101 => 'on',
      :BCH0101 => 'on',
      # Which semester to query; index 0 unless the caller chose another one.
      :semester_list => @semester_list[sem],
      :QuerySend => '送出查詢'
    }

    RestClient.post(@query_url, post_data, :cookies => @cookies) do |response, request, result, &block|
      if [301, 302, 307].include? response.code
        response.follow_redirection(request, result, &block)
      else
        response.return!(request, result, &block)
      end
    end
  end

  # Data rows of the last table on the result page, without the header row.
  def course_rows
    table = @courses_list.css('table').last
    raise "course list page contains no result table" if table.nil?

    table.css('tr').drop(1)
  end

  # Builds the hash for one course from its list-row cells plus its detail page.
  def build_course(cells)
    detail_url = escape_url(cells[2].css('a').first['href'])
    detail_page = fetch_detail_page(detail_url)
    # The detail page has two large tables; the time/room label is read from the first.
    table_head = detail_page.css('.tblMain').first

    Course.new({
      "title" => cells[1].text,
      "code" => cells[0].text,
      "lecturer" => cells[6].text,
      "credits" => cells[3].text,
      # NOTE(review): assumes the column holds '必' (required) or '選' (elective).
      # The earlier comparison against '選' flagged electives as required.
      "required" => (cells[4].text.strip == '必'),
      "full_or_half_semester" => cells[5].text,
      "semester" => detail_page.css('#lbl_semester').text,
      "people_in_course" => cells[10].text,
      "time_location" => parse_time_location(table_head.css('#lbl_timenode').text),
      "english_title" => detail_page.css('#lbl_engname').text,
      "prerequisites" => detail_page.css('#lbl_precourse').text,
      "website" => detail_page.css('#hlk_coursehttp').text,
      "objective" => detail_page.css('#tbx_object').text,
      "outline" => detail_page.css('#tbx_content').text,
      "textbook" => detail_page.css('#tbx_textbook').text,
      "references" => detail_page.css('#tbx_refbook').text,
      "notice" => detail_page.css('#tbx_note').text,
      "grading" => detail_page.css('#tbx_grading').text,
      "note" => detail_page.css('#tbx_remark').text,
      "about" => cells[11].text,
      "url" => detail_url
    }).to_hash
  end

  # Downloads and parses one course detail page.
  def fetch_detail_page(url)
    Nokogiri::HTML(scrub_utf8(RestClient.get(url).to_s))
  end

  # Turns "M6(IB-509) M7(IB-509)" into {"M6" => "IB-509", "M7" => "IB-509"}.
  def parse_time_location(raw)
    raw.split(' ').each_with_object({}) do |node, slots|
      # to_s guards against a one-character token, for which the slice is nil.
      slots[node[0..1]] = node[2..-1].to_s.gsub(/[\(\)]/, '')
    end
  end

  # Drops byte sequences that are not valid UTF-8 so the HTML parser does not
  # choke on them. Uses String#encode rather than Iconv, which was removed from
  # the standard library in Ruby 2.0. The detour through UTF-16BE is needed
  # because a UTF-8 to UTF-8 conversion would be skipped as a no-op.
  def scrub_utf8(raw)
    text = raw.to_s.dup.force_encoding('UTF-8')
    text.encode('UTF-16BE', :invalid => :replace, :undef => :replace, :replace => '').encode('UTF-8')
  end

  # Percent-encodes characters that are not allowed in a URL.
  # URI.encode was removed in Ruby 3.0; the default parser offers the same escaping.
  def escape_url(url)
    URI::DEFAULT_PARSER.escape(url)
  end
end
# Script entry point: fetch the form tokens, crawl the default semester
# (index 0) and write the result to the default file, courses.json.
Spider.new.tap do |crawler|
  crawler.prepare_post_data
  crawler.get_courses
  crawler.save_to
end