-
Notifications
You must be signed in to change notification settings - Fork 207
/
planned_parenthood.py
45 lines (37 loc) · 1.64 KB
/
planned_parenthood.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import re
from scrapy.spiders import SitemapSpider
from locations.items import Feature
class PlannedParenthoodSpider(SitemapSpider):
    """Sitemap-driven spider for Planned Parenthood health-center pages.

    Sitemap URLs matching the health-center pattern in ``sitemap_rules`` are
    routed to :meth:`parse_venue`, which yields one ``Feature`` per page.
    """

    name = "planned_parenthood"
    item_attributes = {
        "brand": "Planned Parenthood",
        "brand_wikidata": "Q2553262",
        "country": "US",
    }
    allowed_domains = ["www.plannedparenthood.org"]
    sitemap_urls = ["https://www.plannedparenthood.org/sitemap.xml"]
    sitemap_rules = [
        (
            r"https:\/\/www\.plannedparenthood\.org\/health-center\/[-\w]+\/[-\w]+\/(\d+)\/[-\w]+$",
            "parse_venue",
        )
    ]

    # Note: the source Microdata is malformed; parse_venue reads each
    # ``itemprop`` value with its own XPath query.

    def parse_venue(self, response):
        """Build a ``Feature`` from a single health-center page.

        Args:
            response: The downloaded health-center page.

        Yields:
            One ``Feature`` carrying the address fields, the phone number and
            the page URL (used as both ``ref`` and ``website``). ``lat`` and
            ``lon`` are included only when the map image URL contains a
            ``center=<lat>,<lon>&zoom`` pair in which both values parse as
            floats.
        """
        if response is None:
            # Ignoring redirects
            return

        properties = {
            "street_address": response.xpath('//*[@itemprop="streetAddress"]/text()').get(),
            "city": response.xpath('//*[@itemprop="addressLocality"]/text()').get(),
            "state": response.xpath('//*[@itemprop="addressRegion"]/text()').get(),
            "postcode": response.xpath('//*[@itemprop="postalCode"]/text()').get(),
            "phone": response.xpath('//a[@itemprop="telephone"][@data-link]/text()').get(),
            "ref": response.url,
            "website": response.url,
        }

        # The coordinates are only exposed through the ``center=`` query
        # parameter of the lazily loaded map image URL.
        if map_image := response.xpath('//img[@class="address-map"]/@data-lazy-interchange').get():
            if match := re.search(r"center=(.*?),(.*?)&zoom", map_image):
                try:
                    # Convert both values before storing either one, so a bad
                    # longitude cannot leave a latitude-only feature behind.
                    lat = float(match.group(1))
                    lon = float(match.group(2))
                except ValueError:
                    # Empty or non-numeric coordinates: keep the feature and
                    # simply omit its position instead of losing the item.
                    self.logger.warning(
                        "Unparseable map coordinates %r on %s",
                        match.group(0),
                        response.url,
                    )
                else:
                    properties["lat"] = lat
                    properties["lon"] = lon

        yield Feature(**properties)