/
PlanningExplorer.py
794 lines (598 loc) · 31.2 KB
/
PlanningExplorer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
import urllib2
import urllib
import urlparse
import cgi
import re
import datetime
import cookielib
# Module-level cookie jar shared by all parsers: the initial GET hands out
# session cookies which must be replayed on the search POST.
cookie_jar = cookielib.CookieJar()
from BeautifulSoup import BeautifulSoup
from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText
# Date format to enter into search boxes (dd/mm/yyyy)
date_format = "%d/%m/%Y"

# Regex for getting the application code out of an info url
# (needed for building the comments url, when one exists)
app_code_regex = re.compile("PARAM0=(\d*)")
class PlanningExplorerParser:
    """Base scraper for councils running a PlanningExplorer site.

    Subclasses customise behaviour for a particular authority by
    overriding the class attributes below and, where necessary, the
    underscore-prefixed hook methods.
    """

    # If this authority doesn't have a comments page,
    # then set this email_address to an address for the
    # planning department, and it will be used in lieu of
    # a comments url.
    comments_email_address = None

    # These are the directories where the info urls, and search urls,
    # usually live underneath the base_url.
    # If these are different for a particular
    # authority, then they can be overridden in a subclass.
    info_url_path = "MVM/Online/Generic/"
    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"

    # This is the most common place for comments urls to live
    # The %s will be filled in with an application code
    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"

    # Most authorities don't need the referer header on the post
    # request. If one does, override this in the subclass
    use_referer = False

    # Some authorities won't give us anything back if we use the
    # python urllib2 useragent string. In that case, override this
    # in a subclass to pretend to be firefox.
    use_firefox_user_agent = False

    # This is the most common css class of the table containing the
    # the search results. If it is different for a particular authority
    # it can be overridden in a subclass
    results_table_attrs = {"class": "ResultsTable"}

    # These are the most common column positions for the
    # council reference, the address, and the description
    # in the results table.
    # They should be overridden in subclasses if they are different
    # for a particular authority.
    reference_td_no = 0
    address_td_no = 1
    description_td_no = 2

    # In some cases we won't be able to get the full
    # address/description/postcode without getting the info page for each
    # app. If fetch_info_page is set to true, then we need to get a copy
    # of the info page and pass it to the _get* hooks below.
    fetch_info_page = False

    # Matches the hidden ASP state inputs (e.g. __VIEWSTATE) whose
    # name/value pairs must be echoed back in the search POST.
    asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

    def _modify_response(self, response):
        """For most sites, we have managed to get all the apps on a
        single page by choosing the right parameters.
        If that hasn't been possible, override this method to get a
        new response object which has all the apps in one page.
        (See, for example, Hackney).
        """
        return response

    def _find_trs(self, results_table):
        """Normally, we just want a list of all the trs except the first one
        (which is usually a header).
        If the authority requires a different list of trs, override this method.
        """
        return results_table.findAll("tr")[1:]

    def _sanitisePostHtml(self, html):
        """This method can be overriden in subclasses if the
        html that comes back from the post request is bad, and
        needs tidying up before giving it to BeautifulSoup."""
        return html

    def _sanitiseInfoUrl(self, url):
        """If an authority has info urls which are for some reason full
        of crap (like Broadland does), then this method should be overridden
        in order to tidy them up.

        The default implementation just strips all whitespace."""
        return ''.join(url.split())

    def _getHeaders(self):
        """If the authority requires any headers for the post request,
        override this method returning a dictionary of header key to
        header value."""
        headers = {}
        if self.use_firefox_user_agent:
            headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"
        if self.use_referer:
            headers["Referer"] = self.search_url
        return headers

    def _getPostData(self, asp_args, search_date):
        """Accepts asp_args (a tuple of key value pairs of the pesky ASP
        parameters, and search_date, a datetime.date object for the day
        we are searching for.

        This seems to be the most common set of post data which is needed
        for PlanningExplorer sites. It won't work for all of them, so
        will sometimes need to be overridden in a subclass.

        The parameter edrDateSelection is often not needed.
        It is needed by Charnwood though, so I've left it in
        to keep things simple.
        """
        year_month_day = search_date.timetuple()[:3]
        post_data = urllib.urlencode(asp_args + (
            ("_ctl0", "DATE_REGISTERED"),
            ("rbGroup", "_ctl5"),
            ("_ctl7_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
            ("_ctl8_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
            ("edrDateSelection", "1"),
            ("csbtnSearch", "Search"),
            ("cboNumRecs", "99999"),
            ))
        return post_data

    def _getAddress(self, tds, info_soup):
        """Return the address for the current application.

        tds is the list of cells for the current results row; info_soup
        is the parsed info page (or None unless fetch_info_page is set).
        """
        # If this td contains a div, then the address is the
        # string in there - otherwise, use the string in the td.
        address_td = tds[self.address_td_no]
        if address_td.div is not None:
            address = address_td.div.string
        else:
            address = address_td.string
        return address

    def _getPostCode(self, info_soup):
        """In most cases, the postcode can be got from the address in
        the results table. Some councils put the address there without the
        postcode. In this case we will have to go to the info page to get
        the postcode. This should be done by overriding this method with
        one that parses the info page."""
        return getPostcodeFromText(self._current_application.address)

    def _getDescription(self, tds, info_soup):
        """Return the description text for the current results row."""
        description_td = tds[self.description_td_no]
        if description_td.div is not None:
            # Mostly this is in a div
            # Use the empty string if the description is missing
            description = description_td.div.string or ""
        else:
            # But sometimes (eg Crewe) it is directly in the td.
            # Use the empty string if the description is missing
            description = description_td.string or ""
        return description

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Store the authority's details and precompute the search and
        info urls from base_url and the class-level path attributes."""
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.search_url = urlparse.urljoin(base_url, self.search_url_path)
        self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)
        self.debug = debug
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape all applications received on the given date and return
        the accumulated PlanningAuthorityResults object."""
        search_date = datetime.date(year, month, day)
        # First do a get, to get some state
        get_request = urllib2.Request(self.search_url)
        get_response = urllib2.urlopen(get_request)
        cookie_jar.extract_cookies(get_response, get_request)
        html = get_response.read()
        # We need to find those ASP parameters such as __VIEWSTATE
        # so we can use them in the next POST
        # re.findall gets us a list of key value pairs.
        # We want to concatenate it with a tuple, so we must
        # make it a tuple
        asp_args = tuple(re.findall(self.asp_args_regex, html))
        # The post data needs to be different for different councils
        # so we have a method on each council's scraper to make it.
        post_data = self._getPostData(asp_args, search_date)
        headers = self._getHeaders()
        request = urllib2.Request(self.search_url, post_data, headers)
        cookie_jar.add_cookie_header(request)
        post_response = urllib2.urlopen(request)
        # We have actually been returned here by an http302 object
        # moved, and the response called post_response is really a get.
        # In some cases, we can't get the page size set high
        # until now. In that case, override _modify_response
        # so that we get back a response with all the apps on one page.
        post_response = self._modify_response(post_response)
        html = self._sanitisePostHtml(post_response.read())
        soup = BeautifulSoup(html)
        results_table = soup.find("table", attrs=self.results_table_attrs)
        # If there is no results table, then there were no apps on that day.
        if results_table:
            trs = self._find_trs(results_table)
            self._current_application = None
            # The first tr is just titles, cycle through the trs after that
            for tr in trs:
                self._current_application = PlanningApplication()
                # There is no need to search for the date_received, it's what
                # we searched for
                self._current_application.date_received = search_date
                tds = tr.findAll("td")
                self._current_application.council_reference = tds[self.reference_td_no].a.string
                relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
                self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)
                # Fetch the info page if we need it, otherwise set it to None
                if self.fetch_info_page:
                    # We need to quote the spaces in the info url
                    info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
                    info_soup = BeautifulSoup(urllib2.urlopen(info_request))
                else:
                    info_soup = None
                # What about a comment url?
                # There doesn't seem to be one, so we'll use the email address
                if self.comments_email_address is not None:
                    # We're using the email address, as there doesn't seem
                    # to be a web form for comments
                    self._current_application.comment_url = self.comments_email_address
                else:
                    # This link contains a code which we need for the comments url
                    # (on those sites that use it)
                    # NOTE(review): app_code_regex.search will raise if the
                    # info url has no PARAM0 - confirm all non-email sites
                    # include it.
                    application_code = app_code_regex.search(relative_info_url).groups()[0]
                    relative_comments_url = self.comments_path %(application_code)
                    self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
                self._current_application.address = self._getAddress(tds, info_soup)
                self._current_application.postcode = self._getPostCode(info_soup)
                self._current_application.description = self._getDescription(tds, info_soup)
                self._results.addApplication(self._current_application)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: coerce string date parts to ints and
        return the day's results rendered via displayXML()."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class BroadlandLike:
    """Mixin holding the settings and post data shared by the
    Northgate-flavoured ("Broadland-like") PlanningExplorer sites."""
    # FIXME - BroadlandLike authorities don't have postcodes on their site,
    # but they do have grid references. We should use these.

    results_table_attrs = {"class": "display_table"}
    info_url_path = "Northgate/PlanningExplorer/Generic/"
    search_url_path = "Northgate/PlanningExplorer/GeneralSearch.aspx"

    use_firefox_user_agent = True
    use_referer = True

    def _getPostData(self, asp_args, search_date):
        """Build the search POST body for a one-day date range."""
        formatted_date = search_date.strftime(date_format)
        search_fields = (
            ("cboSelectDateValue", "DATE_RECEIVED"),
            ("rbGroup", "rbRange"),
            ("dateStart", formatted_date),
            ("dateEnd", formatted_date),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)

    def _sanitiseInfoUrl(self, url):
        """The broadland info urls arrive full of rubbish. Remove all
        whitespace and single-character hex entities (e.g. carriage
        return / line feed entities) from the url."""
        junk_re = re.compile("(?:(?:\s)|(?:&#x\w;))*")
        return junk_re.sub('', url)
class BlackburnParser(PlanningExplorerParser):
    # Blackburn only needs the firefox user-agent workaround.
    use_firefox_user_agent = True
class BroadlandParser(BroadlandLike, PlanningExplorerParser):
    # FIXME - is http://secure.broadland.gov.uk/mvm/Online/PL/GeneralSearch.aspx
    # a better url for Broadland?

    def _sanitisePostHtml(self, html):
        """The page that comes back from the post for the broadland site
        has a broken doctype declaration - the closing doublequote on the
        system identifier is missing. Repair it before the page goes to
        BeautifulSoup."""
        broken_doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>'
        fixed_doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
        return html.replace(broken_doctype, fixed_doctype)
class CamdenParser(BroadlandLike, PlanningExplorerParser):
    # Camden's comments page lives under the Northgate directory.
    comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
class CharnwoodParser(PlanningExplorerParser):
    # Charnwood only needs the firefox user-agent workaround.
    use_firefox_user_agent = True
class CreweParser(PlanningExplorerParser):
    """Scraper for Crewe and Nantwich's Northgate-style site."""
    use_firefox_user_agent = True
    use_referer = True

    info_url_path = "Northgate/PlanningExplorer/Generic/"
    search_url_path = "northgate/planningexplorer/generalsearch.aspx"
    results_table_attrs = {"class": "display_table"}

    def _getPostData(self, asp_args, search_date):
        """Build the full general-search POST body for a one-day range.

        Note: the previous version computed search_date.timetuple()[:3]
        into an unused local; that dead code has been removed.
        """
        post_data = urllib.urlencode(asp_args + (
            ("txtApplicantName", ""),
            ("txtAgentName", ""),
            ("cboStreetReferenceNumber", ""),
            ("txtProposal", ""),
            ("cboWardCode", ""),
            ("cboParishCode", ""),
            ("cboApplicationTypeCode", ""),
            ("cboDevelopmentTypeCode", ""),
            ("cboStatusCode", ""),
            ("cboSelectDateValue", "DATE_RECEIVED"),
            ("cboMonths", "1"),
            ("cboDays", "1"),
            ("rbGroup", "rbRange"),
            ("dateStart", search_date.strftime(date_format)),
            ("dateEnd", search_date.strftime(date_format)),
            ("edrDateSelection", ""),
            ("csbtnSearch", "Search"),
            ))
        return post_data
class EastStaffsParser(PlanningExplorerParser):
    use_firefox_user_agent = True
    # East Staffs puts the address and description in different columns.
    address_td_no = 4
    description_td_no = 1
class EppingForestParser(PlanningExplorerParser):
    use_firefox_user_agent = True
    # Epping Forest puts the address and description in different columns.
    address_td_no = 3
    description_td_no = 1
class ForestHeathParser(BroadlandLike, PlanningExplorerParser):
    # Plain BroadlandLike site - nothing to override.
    pass
class HackneyParser(PlanningExplorerParser):
    """Scraper for the London Borough of Hackney.

    Hackney pages its results ten at a time; _modify_response re-requests
    the results url with PS=99999 so everything arrives on one page.
    (An earlier FIXME here claimed paging was still unhandled - that
    comment predated _modify_response and was stale.)
    """
    use_firefox_user_agent = True
    address_td_no = 6
    description_td_no = 5

    def _modify_response(self, response):
        """Re-fetch the results url with PS=99999 so that we don't have
        to worry about paging. Returns the new response."""
        # The POST 302-redirected us, so response.geturl() is the real
        # results url, complete with its query string.
        real_url_tuple = urlparse.urlsplit(response.geturl())
        query_string = real_url_tuple[3]
        # Get the query as a list of key, value pairs
        parsed_query_list = list(cgi.parse_qsl(query_string))
        # Go through the query string replacing any PS (page size)
        # parameters with PS=99999
        for i, (key, value) in enumerate(parsed_query_list):
            if key == "PS":
                parsed_query_list[i] = (key, "99999")
        new_query_string = urllib.urlencode(parsed_query_list)
        new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]
        new_url = urlparse.urlunsplit(new_url_tuple)
        new_request = urllib2.Request(new_url, None, self._getHeaders())
        new_response = urllib2.urlopen(new_request)
        return new_response

    #txtApplicationNumber=&ctl00=DATE_REGISTERED&ctl01=1&ctl02=1&rbGroup=ctl05&ctl07_hidden=&ctl07_input=28%2F08%2F2008&ctl08_hidden=&ctl08_input=28%2F08%2F2008&edrDateSelection=1&cboApplicationTypeCode=&txtLocality=&txtPostCode=&txtPropertyName=&txtPropertyNumber=&txtSiteAddress=&txtStreetName=&csbtnSearch=Search&
    def _getPostData(self, asp_args, search_date):
        """Note - using date registered here, not date received. There is too much time taken
        between the council 'receiving' an app and 'registering' it for the latter to be useful."""
        post_data = urllib.urlencode(asp_args + (
            ("txtApplicationNumber", ""),
            ("ctl00", "DATE_REGISTERED"),
            ("ctl01", "1"),
            ("ctl02", "1"),
            ("rbGroup", "ctl05"),
            ("ctl07_hidden", ""),
            ("ctl07_input", search_date.strftime(date_format)),
            ("ctl08_hidden", ""),
            ("ctl08_input", search_date.strftime(date_format)),
            ("edrDateSelection", "1"),
            ("cboApplicationTypeCode", ""),
            ("txtLocality", ""),
            ("txtPostCode", ""),
            ("txtPropertyName", ""),
            ("txtPropertyNumber", ""),
            ("txtSiteAddress", ""),
            ("txtStreetName", ""),
            ("csbtnSearch", "Search"),
            ))
        return post_data
class KennetParser(BroadlandLike, PlanningExplorerParser):
    # Kennet's comments page lives under the Northgate directory.
    comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
class LincolnParser(PlanningExplorerParser):
    """Scraper for Lincoln City Council's Northgate-style site."""
    use_firefox_user_agent = True
    use_referer = True

    results_table_attrs = {"class": "display_table"}
    search_url_path = "northgate/planningexplorer/generalsearch.aspx"
    info_url_path = "Northgate/PlanningExplorer/Generic/"

    def _getPostData(self, asp_args, search_date):
        """Build the general-search POST body for a one-day range,
        searching on date registered."""
        formatted_date = search_date.strftime(date_format)
        search_fields = (
            ("txtApplicationNumber", ""),
            ("txtApplicantName", ""),
            ("txtAgentName", ""),
            ("cboApplicationTypeCode", ""),
            ("cboStatusCode", ""),
            ("txtPropertyName", ""),
            ("txtPropertyNumber", ""),
            ("cboStreetReferenceNumber", ""),
            ("txtPostCode", ""),
            ("cboLocality", ""),
            ("txtProposal", ""),
            ("cboSelectDateValue", "DATE_REGISTERED"),
            ("cboMonths", "1"),
            ("rbGroup", "rbDay"),
            ("cboDays", "10"),
            ("dateStart", formatted_date),
            ("dateEnd", formatted_date),
            ("edrDateSelection", ""),
            ("csbtnSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)
class LiverpoolParser(PlanningExplorerParser):
    """Scraper for Liverpool City Council. There is no comments web
    form, so the planning department's email address is used instead."""
    comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
    use_firefox_user_agent = True
    use_referer = True

    # Liverpool's results table is identified by its mvm xml namespace
    # attribute rather than by a css class.
    results_table_attrs = {"xmlns:mvm": "http://www.mvm.co.uk"}

    info_url_path = "mvm/"
    search_url_path = "mvm/planningsearch.aspx"

    def _find_trs(self, results_table):
        """Take the trs whose class is row0 or row1, skipping the first
        three of those rows (they are not application rows)."""
        matching_rows = results_table.findAll("tr", {"class": ["row0", "row1"]})
        return matching_rows[3:]

    def _getPostData(self, asp_args, search_date):
        """Build the POST body for a one-day received-date range."""
        formatted_date = search_date.strftime(date_format)
        search_fields = (
            ("dummy", "dummy field\tused for custom\tvalidator"),
            ("drReceived$txtStart", formatted_date),
            ("drReceived$txtEnd", formatted_date),
            ("cboNumRecs", "99999"),
            ("cmdSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)

    def _sanitiseInfoUrl(self, url):
        """The liverpool info urls arrive full of rubbish. Remove all
        whitespace and single-character hex entities from the url."""
        junk_re = re.compile("(?:(?:\s)|(?:&#x\w;))*")
        return junk_re.sub('', url)
class MertonParser(PlanningExplorerParser):
    """Scraper for the London Borough of Merton. The results table does
    not carry usable address/description text, so each application's
    info page is fetched and parsed instead."""
    use_firefox_user_agent = True
    fetch_info_page = True

    def _getAddress(self, tds, info_soup):
        """Read the address from the cell following the "Site Address"
        label on the info page."""
        label = info_soup.find(text="Site Address")
        return label.findNext("td").string.strip()

    def _getDescription(self, tds, info_soup):
        """Read the description from the cell following the
        "Development Proposal" label on the info page."""
        label = info_soup.find(text="Development Proposal")
        return label.findNext("td").string.strip()
class ShrewsburyParser(PlanningExplorerParser):
    # Shrewsbury only needs the firefox user-agent workaround.
    use_firefox_user_agent = True
class BirminghamParser(PlanningExplorerParser):
    """Scraper for Birmingham City Council's PlanningExplorer."""
    search_url_path = "PlanningExplorer/GeneralSearch.aspx"
    info_url_path = "PlanningExplorer/Generic/"
    comments_path = "PlanningExplorer/PLComments.aspx?pk=%s"

    use_firefox_user_agent = True
    use_referer = True

    results_table_attrs = {"class": "display_table"}

    def _getPostData(self, asp_args, search_date):
        """Build the general-search POST body for a one-day range,
        searching on date registered."""
        formatted_date = search_date.strftime(date_format)
        search_fields = (
            ("txtApplicationNumber", ""),
            ("cboApplicationTypeCode", ""),
            ("txtSiteAddress", ""),
            ("txtProposal", ""),
            ("cboWardCode", ""),
            ("cboConstituencyCode", ""),
            ("txtApplicantName", ""),
            ("txtAgentName", ""),
            ("cboDevelopmentTypeCode", ""),
            ("cboSelectDateValue", "DATE_REGISTERED"),
            ("cboMonths", "1"),
            ("cboDays", "10"),
            ("rbGroup", "rbRange"),
            ("dateStart", formatted_date),
            ("dateEnd", formatted_date),
            ("edrDateSelection", ""),
            ("csbtnSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)
class SouthNorfolkParser(PlanningExplorerParser):
    # South Norfolk only needs the firefox user-agent workaround.
    use_firefox_user_agent = True
class SouthShropshireParser(PlanningExplorerParser):
    """Scraper for South Shropshire. No comments web form, so the
    planning department's email address is used instead."""
    comments_email_address = "planning@southshropshire.gov.uk"
    use_firefox_user_agent = True
    info_url_path = "MVM/Online/PL/"

    def _getPostData(self, asp_args, search_date):
        """Build the POST body. This site wants dd-mm-yyyy dates and an
        extra combined from~to range field."""
        local_date_format = "%d-%m-%Y"
        year, month, day = search_date.timetuple()[:3]
        formatted_date = search_date.strftime(local_date_format)
        date_range = "%(day)d-%(month)d-%(year)d~%(day)d-%(month)d-%(year)d" %({"day":day, "month":month, "year":year})
        search_fields = (
            ("edrDateSelection:htxtRange", "radRangeBetween"),
            ("cboDateList", "DATE_REGISTERED"),
            ("edrDateSelection:txtStart", formatted_date),
            ("edrDateSelection:txtEnd", formatted_date),
            ("edrDateSelection:txtDateReceived", date_range),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)
class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
    # Unlike the other BroadlandLike sites, there are postcodes :-)
    pass
class StockportParser(PlanningExplorerParser):
    """Scraper for Stockport. No comments web form, so the development
    control admin email address is used instead."""
    comments_email_address = "admin.dc@stockport.gov.uk"
    info_url_path = "MVM/Online/PL/"

    def _getPostData(self, asp_args, search_date):
        """Build the POST body for a one-day received-date range."""
        formatted_date = search_date.strftime(date_format)
        search_fields = (
            ("drDateReceived:txtStart", formatted_date),
            ("drDateReceived:txtEnd", formatted_date),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)
class SwanseaParser(BroadlandLike, PlanningExplorerParser):
    # Unlike the other BroadlandLike sites, there are postcodes :-)
    pass
class TamworthParser(PlanningExplorerParser):
    # No comments web form, so use the planning admin email address.
    comments_email_address = "planningadmin@tamworth.gov.uk"
    use_firefox_user_agent = True
    info_url_path = "MVM/Online/PL/"
class TraffordParser(PlanningExplorerParser):
    # There are no postcodes on the Trafford site.
    use_firefox_user_agent = True
    address_td_no = 3
class WestOxfordshireParser(PlanningExplorerParser):
    # West Oxfordshire puts the address and description in different columns.
    address_td_no = 3
    description_td_no = 1
    use_firefox_user_agent = True
class WalthamForestParser(PlanningExplorerParser):
    """Scraper for Waltham Forest's PlanningExplorer."""
    search_url_path = "PlanningExplorer/GeneralSearch.aspx"
    info_url_path = "PlanningExplorer/Generic/"
    use_firefox_user_agent = True
    use_referer = True

    # I know - I should change this so that the attribute is not
    # comments_email_address, but something more general: here it
    # actually holds a comments web form url.
    comments_email_address = "https://www1.walthamforest.gov.uk/webforms/plan_comments/"

    results_table_attrs = {"class": "display_table"}

    def _getPostData(self, asp_args, search_date):
        """Build the general-search POST body for a one-day range,
        searching on date registered."""
        formatted_date = search_date.strftime(date_format)
        search_fields = (
            ("txtApplicantName", ""),
            ("txtAgentName", ""),
            ("cboStreetReferenceNumber", ""),
            ("txtProposal", ""),
            ("cboWardCode", ""),
            ("cboParishCode", ""),
            ("cboApplicationTypeCode", ""),
            ("cboDevelopmentTypeCode", ""),
            ("cboStatusCode", ""),
            ("cboSelectDateValue", "DATE_REGISTERED"),
            ("cboMonths", "1"),
            ("cboDays", "10"),
            ("rbGroup", "rbRange"),
            ("dateStart", formatted_date),
            ("dateEnd", formatted_date),
            ("edrDateSelection", ""),
            ("csbtnSearch", "Search"),
            )
        return urllib.urlencode(asp_args + search_fields)
class ConwyParser(BroadlandLike, PlanningExplorerParser):
    # Conwy's pages live under an English-language Northgate directory.
    search_url_path = "Northgate/planningexplorerenglish/generalsearch.aspx"
    info_url_path = "Northgate/PlanningExplorerEnglish/Generic/"
    comments_path = "Northgate/PlanningExplorerEnglish/PLComments.aspx?pk=%s"
    use_firefox_user_agent = True
class MendipParser(BroadlandLike, PlanningExplorerParser):
    # No comments web form, so use the customer services email address.
    comments_email_address = "customerservices@mendip.gov.uk"
    # search_url_path = "northgate/planningexplorer/generalsearch.aspx"

    # Example query strings captured from the live site, kept for
    # reference when debugging the post data:
    #&first=1&quick=1&search=&txtApplicationNumber=&txtApplicantName=&txtAgentName=&txtProposal=&txtSiteAddress=&txtStreetName=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=12%2F06%2F2009&dateEnd=12%2F06%2F2009&edrDateSelection=&csbtnSearch=Search
    #&txtApplicationNumber=&txtProposal=&txtSiteAddress=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=10%2F07%2F2008&dateEnd=20%2F07%2F2008&edrDateSelection=&csbtnSearch=Search
    #txtApplicantName=
    #txtAgentName=
    #cboStreetReferenceNumber=
    #txtProposal=
    #cboWardCode=
    #cboParishCode=
    #cboApplicationTypeCode=
    #cboDevelopmentTypeCode=
    #cboStatusCode=
    #cboSelectDateValue=DATE_RECEIVED
    #cboMonths=1
    #cboDays=1
    #rbGroup=rbRange
    #dateStart=01%2F03%2F2008
    #dateEnd=01%2F04%2F2008
    #edrDateSelection=
    #csbtnSearch=Search
if __name__ == '__main__':
    # Manual smoke test: uncomment the parser you want to exercise.
    # NOTE - 04/11/2007 is a sunday
    # I'm using it to test that the scrapers behave on days with no apps.
    # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
    # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
    # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
    # NOTE(review): this assignment is dead - it is overwritten by the
    # Birmingham parser below before getResults is called.
    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
    # parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
    # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
    # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
    # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
    # parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
    # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
    # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
    # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
    # parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
    # parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
    # parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
    # parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
    # parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
    # parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
    # parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.91/")
    # parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
    # parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
    # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
    # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
    # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
    # parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/")
    parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/")
    print parser.getResults(12, 6, 2009)
# To Do
# Sort out paging:
# South Shropshire - pages on 6
# Investigate catching unavailable message:
# Charnwood
# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site...
# Notes:
# Since the change, Liverpool and Crewe look rather similar. They are also a little Broadlandlike. Maybe we can do some consolidation