# -*- coding: utf-8 -*-
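"""
crawler.py

A small web crawler for www.coep.org.in: starting from a seed URL, it
fetches pages, follows in-domain links, extracts text from HTML (and from
PDFs via `pdf_extract`), and stores checksummed documents in Elasticsearch
through the helpers in the `es` module.
"""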
import requests
from bs4 import BeautifulSoup
from hashlib import sha256
# NOTE: `delete_page` is called below when a stored page has changed; it is
# assumed to be provided by the same `es` helper module.
from es import insert_url, insert_data, create_index, search_url, delete_page
from pdf_extract import pdf_caller
SEED = "http://www.coep.org.in/"
TIMEOUT = 15
def remove_special_chars(text):
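    '''
    Keep letters, digits and a whitelist of punctuation characters; map
    escape characters (tabs, newlines, etc.) to spaces and collapse runs
    of spaces. Everything else is dropped.

    Example (illustrative):
        >>> remove_special_chars("Fees = Rs. 500/- only;")
        'Fees Rs. 500/- only'
    '''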
res = ""
for c in text:
# is `c` a letter (upper or lowercase)?
if (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z'):
res += c
# is `c` any punctuation mark or special character?
elif c in ('.', ',', '!', '@', '#', '&', '*', '+', '/', '%', '(', ')', '[', ']', '-', '_', ':', '?'):
res += c
# is `c` a digit?
elif c.isdigit():
res += c
# is `c` an escape character?
elif c in ('\r', '\f', '\v', '\b', '\t', '\n'):
res += ' '
# is `c` a single or double quote?
#elif c in ('\'', '\"'):
# res += '\\\\' + c
# is `c` a whitespace?
elif c == ' ' and res:
if res[-1] != ' ':
res += c
return res
def extract_links(doc, seed="http://www.coep.org.in"):
'''
returns a list of links to be crawled
'''
a_list = doc.find_all('a', href=True)
links = [el['href'] for el in a_list]
    if len(links) == 0:
        # return an empty list (not 0) so callers can safely concatenate the result
        return []
nlinks = [] # final list of urls
    for link in links:
        if special_url(link):
            continue
        try:
            if link[0] == '/' and link != '/' and link != '#':  # link is relative
                nlinks.append(seed + link)  # convert relative to absolute
            elif "coep.org" in link:  # skip absolute links that do not contain "coep.org"
                nlinks.append(link)
        except Exception:
            print("Failed in extract_links for", link)
            continue
return nlinks
starts = ["mailto", "http://www.coep.org.in/calendar", "http://www.coep.org.in/node", "http://www.coep.org.in/user", "http://www.coep.org.in/ham/", "http://nextcloud", "http://www.outlook.com", "https://www.outlook.com", "http://kpoint", "http://portal", "https://login", "http://moodle", "http://foss"]
ends = [".png", ".jpg", ".jpeg", ".doc", ".rar", ".xyz", ".zip", ".war", ".gz", ".tar.gz", "#main-content", "javascript:void(0)"]
misc = ["download/file/fid", "facebook", "twitter", "/node", "www.sedo.com"]
def special_url(url):
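    '''
    Return True when `url` should be skipped: mail links, login/portal and
    calendar pages, binary/image downloads, social-media links, and the other
    patterns listed in `starts`, `ends` and `misc` above.

    Examples (illustrative):
        >>> special_url("mailto:office@coep.ac.in")
        True
        >>> special_url("http://www.coep.org.in/admissions")
        False
    '''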
# check whether url starts with any element in `starts`
for spc_url in starts:
if url.startswith(spc_url):
return True
# check whether url ends with any element in `ends`
for spc_url in ends:
if url.endswith(spc_url):
return True
# check misc
for term in misc:
if term in url:
return True
return False
def get_title(doc):
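    '''
    Return the page <title> text, cleaned and with the trailing
    "College of Engineering, Pune" suffix (28 characters) stripped, unless
    stripping it would leave an unusably short title.
    '''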
# extract the document's title
try:
title = remove_special_chars(doc.find_all('title')[0].text)
ntitle = title[:-28] # to remove "College of Engineering, Pune" as it occurs in all titles
if len(ntitle) < 3:
return title
return ntitle
except:
return "DEFAULT_TITLE"
def get_content(doc):
    '''
    Return a (title, text) pair for the page: the cleaned <title> and the
    de-duplicated text of all <span> and <p> elements.
    '''
# find title
title = get_title(doc)
# find all span tags
span_text_list = doc.find_all('span')
span_text = set() # insert into a set as sometimes span elements repeat
for el in span_text_list:
if len(el.text):
span_text.add(el.text)
# find all p tags
p_text_list = doc.find_all('p')
for el in p_text_list:
if len(el.text):
span_text.add(el.text)
final_text = remove_special_chars(" ".join(span_text))
return title, final_text
def sha256_checksum(text):
    # find the sha256 hash of a string;
    # `sha256` requires bytes, so encode the text first
    return sha256(text.encode()).hexdigest()
def remove_already_seen_links(links, prev_links):
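    '''
    Return the links from `links` that are not already present in
    `prev_links`, preserving their order.

    Example (illustrative):
        >>> remove_already_seen_links(["/a", "/b"], ["/b"])
        ['/a']
    '''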
res = []
for link in links:
# if `link` has been seen previously
if link in prev_links:
continue
else:
res.append(link)
return res
# main crawl code
def crawl(seed):
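    '''
    Crawl the site starting from `seed`.

    The seed page is fetched first and its in-domain links are queued. The
    loop then walks the growing `links` list: each page (HTML or PDF) is
    cleaned, checksummed and stored in Elasticsearch, unless an identical
    copy is already indexed; links discovered along the way are appended to
    the list while it is being iterated, so the crawl keeps expanding until
    no unseen links remain. Returns a count of successfully crawled URLs.
    '''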
print("Crawling", seed)
# create a requests session for faster GET
req_sess = requests.Session()
resp = None
try:
resp = req_sess.get(seed, timeout=TIMEOUT)
print(seed, resp)
if resp.status_code != 200: # 200 OK
print("***", seed, resp.status_code)
exit()
except:
if resp:
print(seed, resp, resp.text)
else:
print(seed, resp)
exit()
doc = BeautifulSoup(resp.text, 'html.parser')
# list of all links
links = [seed]
# get all links in doc
links += extract_links(doc)
crawled_cntr = 1
for idx, link in enumerate(links):
if idx == 0:
continue
PDF_FLAG = False
print("--------------------")
print("%.2f"%(float(idx)/len(links)*100)+"% done | ("+str(idx)+"/"+str(len(links))+") | "+link)
try:
resp = req_sess.get(link, timeout=TIMEOUT)
if resp.status_code != 200:
print("***", link, resp.status_code)
continue
        except requests.exceptions.ConnectionError:
            print(link, "connection error")
            continue
        except requests.exceptions.ReadTimeout:
            print(link, "read timed out")
            continue
        except Exception:
            # any other failure while fetching this link: skip it
            continue
# check if resp is a PDF file
if (resp.text[:4] == "%PDF") or (link[-4:] == ".pdf"):
PDF_FLAG = True
pdftext = pdf_caller(link, req_sess)
if (pdftext is None) or (len(pdftext) < 5):
continue
body = remove_special_chars(pdftext)
title = link.split('/')[-1].replace("%20", ' ').replace(".pdf", '')
if not PDF_FLAG:
doc = BeautifulSoup(resp.text, 'html.parser')
title, body = get_content(doc)
# extract new links on link page
new_links = extract_links(doc)
if new_links:
new_links = remove_already_seen_links(new_links, links)
if new_links:
links += new_links # append new links to links
# find sha256 checksum of title and body
chksum = sha256_checksum(title+body)
# search whether link is already present in ES
present, res = search_url(link)
if present: # url is present in the `duplicate_urls` index
# check the previous and current checksum
print("URL exists in index.")
try:
if res["hits"]["total"] == 1:
# find previously stored checksum
prev_chksum = res["hits"]["hits"][0]["_source"]["checksum"]
if prev_chksum == chksum:
continue
else:
_id = res["hits"]["hits"][0]["_id"]
stat = delete_page(_id)
else:
print("Multiple records found", link)
print("res", res)
continue
except TypeError:
continue
else:
# insert url and document
stat = insert_url(link, chksum)
if not stat:
print("Link insertion failed", link)
continue
stat = insert_data(link.encode('utf-8'), title.encode('utf-8'), body.encode('utf-8'))
if not stat:
print("Data insertion failed", link)
continue
crawled_cntr += 1
return crawled_cntr
def main():
create_index()
total_crawled = crawl(SEED)
print("____________________")
print("* Successfully crawled %d URLs *" % total_crawled)
print("____________________")
if __name__ == "__main__":
main()