/
implementation3.py
122 lines (92 loc) · 3.29 KB
/
implementation3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from multiprocessing import Process
from multiprocessing.managers import BaseManager
from concurrentqueue import ConcurrentQueue
from spider import Spider
from BeautifulSoup import BeautifulSoup
import os
import random
import json
import requests
import re
import time
import redis
import pickle
from pymongo import MongoClient
# total number of processes used by main(): one crawls first and then parses,
# the remaining NUM_PROCESSES - 1 only parse
NUM_PROCESSES = 4
# open an error log; mode 'w' truncates any log left by a previous run, and
# the handle is never explicitly closed in this file
log = open('error.log', 'w')
# set up database connection (MongoDB on localhost, default port)
client = MongoClient('localhost', 27017)
db = client.legal_db
# parsed documents are inserted into this collection
collection = db.legal
# regex for getting the links to the actual data from the main link.
# The match runs up to and including a closing single quote, which the caller
# strips. Written as a raw string so the backslashes reach the regex engine
# untouched instead of relying on Python leaving the unknown "\." escape alone.
link_re = re.compile(r"http://www\.legis\.state\.pa\.us//WU01/LI/LI/CT/HTM/[0-9]+/[0-9].*'")
def parse_urls(red_serve):
    """
    The main parsing function. Takes a redis client as an argument.

    Pops pickled items off the right end of the redis list 'urls' until the
    list is empty. For each item it fetches item['link'], looks for the link
    to the actual data with link_re, fetches that page, stores the extracted
    text in item['data'] and inserts the item into the mongo collection.
    Problems with a single item are written to the error log and the loop
    moves on to the next item.
    """
    s = requests.Session()
    while True:
        # pop first and test the result: checking llen() and then popping lets
        # another process take the last item in between, leaving None here
        raw = red_serve.rpop('urls')
        if raw is None:
            break
        # NOTE(review): pickle.loads runs arbitrary code from its input, so the
        # redis list must only ever be written by trusted code - confirm
        item = pickle.loads(raw)

        try:
            r = s.get(item['link'])
        except requests.RequestException as e:
            _log_error('error fetching ' + str(item['link']) + ': ' + str(e))
            continue

        # try to find link to data using regex
        matches = link_re.findall(r.text)
        if not matches:
            _log_error('data link not found in page ' + item['link'])
            continue

        # the pattern ends on a single quote; drop it to get the bare url
        link = matches[0].replace("'", '')
        try:
            r2 = s.get(link)
        except requests.RequestException as e:
            _log_error('error fetching ' + str(link) + ': ' + str(e))
            continue

        soup = BeautifulSoup(r2.text)
        try:
            item['data'] = _extract_text(soup)
        except Exception as e:
            # best effort: the item is still stored below, just without 'data'
            _log_error('error occured: ' + str(e))
            _log_error('url is ' + str(link))

        # NOTE(review): Collection.insert is deprecated in newer pymongo
        # releases in favour of insert_one - confirm the installed version
        try:
            collection.insert(item)
        except Exception as e:
            _log_error('error adding item to database: ' + str(e))


def _log_error(message):
    """Write one line to the error log and flush it straight away."""
    # the log file handle is used by several processes; a newline per entry
    # keeps messages apart and flushing keeps them from sitting in a buffer
    log.write(message + '\n')
    log.flush()


def _extract_text(soup):
    """Return the stripped text of the first pre tag, else of all p tags."""
    # try to find text surrounded by pre tag
    # this applies to some documents and not others
    pre = soup.find('pre')
    if pre is not None:
        return pre.getText().strip()
    # otherwise, just get all the p tags, one per line
    return '\n'.join(p.getText() for p in soup.findAll('p')).strip()
def crawl(url, r_server):
    """
    Entry point for the crawling process.

    Runs a Spider over `url`, handing it the redis client `r_server`
    (presumably the spider pushes the urls it finds onto the shared redis
    list - verify against Spider). Once the crawl call returns, this process
    joins the other processes in parsing urls.
    """
    spider = Spider()
    spider.crawl(url, r_server)
    # crawling has returned, so spend the rest of this process's life parsing
    parse_urls(r_server)
def main(url=None):
    """
    Crawls a site for document urls and parses them using NUM_PROCESSES
    processes in total.

    One process runs crawl() (spider first, then parsing); the other
    NUM_PROCESSES - 1 processes run parse_urls() and are started once the
    redis list 'urls' holds at least one item, or once the crawling process
    has exited. Returns after every process has finished.

    url -- page to start crawling from; defaults to the Pennsylvania
           consolidated statutes index.
    """
    if url is None:
        url = "http://www.legis.state.pa.us/cfdocs/legis/LI/Public/cons_index.cfm"
    # connect to the redis server (it must already be running on localhost)
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    # start a spider crawling for urls
    p = Process(target=crawl, args=(url, r, ))
    p.start()
    # prepare the other processes for parsing the urls
    processes = [Process(target=parse_urls, args=(r, ))
                 for _ in range(NUM_PROCESSES - 1)]
    # wait until some items are in the queue before starting the parsing
    # processes, since parse_urls returns as soon as it sees an empty queue.
    # Sleep between polls instead of spinning, and stop waiting if the crawler
    # has exited so an empty crawl cannot hang this loop forever.
    while int(r.llen('urls')) < 1 and p.is_alive():
        time.sleep(0.1)
    for worker in processes:
        worker.start()
    for worker in processes:
        worker.join()
    p.join()


# guard the entry point: multiprocessing re-imports this module in child
# processes on platforms that spawn instead of fork
if __name__ == '__main__':
    main()