Add initial outline of scrapy project
iandees committed Jun 15, 2016
1 parent aff10b9 commit 49cb7fc
Showing 8 changed files with 319 additions and 0 deletions.
Empty file added locations/__init__.py
Empty file.
13 changes: 13 additions & 0 deletions locations/items.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GeojsonPointItem(scrapy.Item):
properties = scrapy.Field()
lon_lat = scrapy.Field()
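A GeojsonPointItem carries just these two fields: a dict of feature properties and a [longitude, latitude] pair. As a rough usage sketch (the values below are made-up sample data, not part of this commit), it populates like any other scrapy.Item:

from locations.items import GeojsonPointItem

# Sketch only: the "ref", name, and coordinates are illustrative placeholders.
item = GeojsonPointItem(
    properties={"ref": "store-123", "name": "Example Store"},
    lon_lat=[-79.768, 37.758],  # GeoJSON order: longitude first, then latitude
)
print(item["properties"]["name"])  # declared fields support dict-style access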
42 changes: 42 additions & 0 deletions locations/pipelines.py
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from scrapy.exceptions import DropItem


class LocationsPipeline(object):
def process_item(self, item, spider):
return item

class GeoJsonWriterPipeline(object):

def __init__(self):
self.file = open('items.jl', 'wb')

def process_item(self, item, spider):
line = json.dumps({
"type": "Feature",
"properties": item['properties'],
"geometry": {
"type": "Point",
"coordinates": item['lon_lat']
}
}) + ",\n"
self.file.write(line)
return item

class DuplicatesPipeline(object):

def __init__(self):
self.ids_seen = set()

def process_item(self, item, spider):
if item['properties']['ref'] in self.ids_seen:
raise DropItem("Duplicate item found: %s" % item)
else:
self.ids_seen.add(item['properties']['ref'])
return item
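As a quick standalone illustration of how DuplicatesPipeline behaves (the sample item and its 'ref' value are made up), feeding the same item through twice raises DropItem on the second pass:

from scrapy.exceptions import DropItem
from locations.pipelines import DuplicatesPipeline

pipeline = DuplicatesPipeline()
sample = {"properties": {"ref": "store-123"}}   # illustrative item

pipeline.process_item(sample, spider=None)      # first pass: returned unchanged, ref recorded
try:
    pipeline.process_item(sample, spider=None)  # same ref again
except DropItem:
    print("duplicate dropped")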
91 changes: 91 additions & 0 deletions locations/settings.py
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

# Scrapy settings for locations project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'locations'

SPIDER_MODULES = ['locations.spiders']
NEWSPIDER_MODULE = 'locations.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'locations (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'locations.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'locations.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'locations.pipelines.DuplicatesPipeline': 200,
'locations.pipelines.GeoJsonWriterPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
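These values apply project-wide; an individual spider can also override them through Scrapy's custom_settings class attribute. A minimal sketch (the spider name and the one-second delay are arbitrary examples, not part of this commit):

import scrapy

class PoliteExampleSpider(scrapy.Spider):
    name = "polite_example"  # hypothetical spider, for illustration only
    custom_settings = {
        "DOWNLOAD_DELAY": 1.0,         # slow down just this spider
        "AUTOTHROTTLE_ENABLED": True,  # without touching settings.py
    }

    def parse(self, response):
        pass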
4 changes: 4 additions & 0 deletions locations/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
145 changes: 145 additions & 0 deletions locations/spiders/kroger.py
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
import scrapy
import json

from locations.items import GeojsonPointItem

class KrogerSpider(scrapy.Spider):
name = "kroger"
allowed_domains = ["www.kroger.com"]
start_urls = (
'https://www.kroger.com/stores?address=37.7578595,-79.76804&includeThirdPartyFuel=true&maxResults=50&radius=3000&showAllStores=false&useLatLong=true',
)

store_types = {
'' : "unknown-blank",
'C': "grocery",
'F': "unknown-f",
'G': "gas station",
'I': "unknown-i",
'J': "unknown-j",
'M': "grocery",
'Q': "unknown-q",
'S': "grocery",
'X': "unknown-x",
}

ll_requests = set()

def store_hours(self, store_hours):
if all([h == '' for h in store_hours.values()]):
return None
else:
day_groups = []
this_day_group = None
for day in ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'):
day_open = store_hours[day + 'Open']
day_close = store_hours[day + 'Close']
hours = day_open + "-" + day_close
day_short = day.title()[:2]

if not this_day_group:
this_day_group = dict(from_day=day_short, to_day=day_short, hours=hours)
elif this_day_group['hours'] == hours:
this_day_group['to_day'] = day_short
elif this_day_group['hours'] != hours:
day_groups.append(this_day_group)
this_day_group = dict(from_day=day_short, to_day=day_short, hours=hours)
day_groups.append(this_day_group)

if len(day_groups) == 1:
opening_hours = day_groups[0]['hours']
if opening_hours == '07:00-07:00':
opening_hours = '24/7'
else:
opening_hours = ''
for day_group in day_groups:
if day_group['from_day'] == day_group['to_day']:
opening_hours += '{from_day} {hours}; '.format(**day_group)
else:
opening_hours += '{from_day}-{to_day} {hours}; '.format(**day_group)
opening_hours = opening_hours[:-2]

return opening_hours

def phone_number(self, phone):
return '{}-{}-{}'.format(phone[0:3], phone[3:6], phone[6:10])

def address(self, address):
if not address:
return None

(num, rest) = address['addressLineOne'].split(' ', 1)
addr_tags = {
"addr:housenumber": num.strip(),
"addr:street": rest.strip(),
"addr:city": address['city'],
"addr:state": address['state'],
"addr:postcode": address['zipCode'],
}

return addr_tags


def parse(self, response):
data = json.loads(response.body_as_unicode())

bounding_box = {
'min_lat': 100,
'max_lat': -100,
'min_lon': 300,
'max_lon': -300,
}

for store in data:
store_information = store['storeInformation']
store_hours = store['storeHours']

properties = {
"phone": self.phone_number(store_information['phoneNumber']),
"ref": store_information['recordId'],
"name": store_information['localName'],
"type": self.store_types[store_information['storeType']],
"hours": self.store_hours(store_hours),
}

address = self.address(store_information['address'])
if address:
properties.update(address)

lon_lat = [
float(store_information['latLong']['longitude']),
float(store_information['latLong']['latitude']),
]

bounding_box['min_lat'] = min(bounding_box['min_lat'], lon_lat[1])
bounding_box['max_lat'] = max(bounding_box['max_lat'], lon_lat[1])
bounding_box['min_lon'] = min(bounding_box['min_lon'], lon_lat[0])
bounding_box['max_lon'] = max(bounding_box['max_lon'], lon_lat[0])


yield GeojsonPointItem(
properties=properties,
lon_lat=lon_lat,
)

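        # Fan the crawl outward: request the same endpoint again from each corner of the
        # bounding box around the stores just seen, skipping corners already requested.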
if data:
box_corners = [
'{},{}'.format(bounding_box['min_lat'], bounding_box['min_lon']),
'{},{}'.format(bounding_box['max_lat'], bounding_box['min_lon']),
'{},{}'.format(bounding_box['min_lat'], bounding_box['max_lon']),
'{},{}'.format(bounding_box['max_lat'], bounding_box['max_lon']),
]

for corner in box_corners:
if corner in self.ll_requests:
self.logger.info("Skipping request for %s because we already did it", corner)
else:
self.ll_requests.add(corner)
yield scrapy.Request(
'https://www.kroger.com/stores?address={}&includeThirdPartyFuel=true&maxResults=50&radius=3000&showAllStores=false&useLatLong=true'.format(
corner
),
)
else:
self.logger.info("No results")
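To see what store_hours produces, here is a rough standalone check; the dict below only mimics the shape of the storeHours payload the spider expects, and the times are invented:

from locations.spiders.kroger import KrogerSpider

spider = KrogerSpider()
sample_hours = {}
for day in ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'):
    sample_hours[day + 'Open'], sample_hours[day + 'Close'] = '08:00', '22:00'
sample_hours['sundayOpen'], sample_hours['sundayClose'] = '09:00', '21:00'

print(spider.store_hours(sample_hours))  # -> Mo-Sa 08:00-22:00; Su 09:00-21:00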
13 changes: 13 additions & 0 deletions locations/spiders/walmart.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy


class WalmartSpider(scrapy.Spider):
name = "walmart"
allowed_domains = ["walmart.com"]
start_urls = (
'http://www.walmart.com/',
)

def parse(self, response):
pass
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = locations.settings

[deploy]
#url = http://localhost:6800/
project = locations
