Add initial outline of scrapy project
iandees committed Jun 15, 2016
1 parent aff10b9 commit 49cb7fc
Showing 8 changed files with 319 additions and 0 deletions.
Empty file added locations/__init__.py
Empty file.
13 changes: 13 additions & 0 deletions locations/items.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GeojsonPointItem(scrapy.Item):
properties = scrapy.Field()
lon_lat = scrapy.Field()
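A GeojsonPointItem carries just these two fields: a dict of feature properties and a [longitude, latitude] pair. As a rough usage sketch (the values below are made-up sample data, not part of this commit), it populates like any other scrapy.Item:

from locations.items import GeojsonPointItem

# Sketch only: the "ref", name, and coordinates are illustrative placeholders.
item = GeojsonPointItem(
    properties={"ref": "store-123", "name": "Example Store"},
    lon_lat=[-79.768, 37.758],  # GeoJSON order: longitude first, then latitude
)
print(item["properties"]["name"])  # declared fields support dict-style access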
42 changes: 42 additions & 0 deletions locations/pipelines.py
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from scrapy.exceptions import DropItem


class LocationsPipeline(object):
def process_item(self, item, spider):
return item

class GeoJsonWriterPipeline(object):

def __init__(self):
self.file = open('items.jl', 'wb')

def process_item(self, item, spider):
line = json.dumps({
"type": "Feature",
"properties": item['properties'],
"geometry": {
"type": "Point",
"coordinates": item['lon_lat']
}
}) + ",\n"
self.file.write(line)
return item

class DuplicatesPipeline(object):

def __init__(self):
self.ids_seen = set()

def process_item(self, item, spider):
if item['properties']['ref'] in self.ids_seen:
raise DropItem("Duplicate item found: %s" % item)
else:
self.ids_seen.add(item['properties']['ref'])
return item
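As a quick standalone illustration of how DuplicatesPipeline behaves (the sample item and its 'ref' value are made up), feeding the same item through twice raises DropItem on the second pass:

from scrapy.exceptions import DropItem
from locations.pipelines import DuplicatesPipeline

pipeline = DuplicatesPipeline()
sample = {"properties": {"ref": "store-123"}}   # illustrative item

pipeline.process_item(sample, spider=None)      # first pass: returned unchanged, ref recorded
try:
    pipeline.process_item(sample, spider=None)  # same ref again
except DropItem:
    print("duplicate dropped")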
91 changes: 91 additions & 0 deletions locations/settings.py
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

# Scrapy settings for locations project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'locations'

SPIDER_MODULES = ['locations.spiders']
NEWSPIDER_MODULE = 'locations.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'locations (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'locations.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'locations.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'locations.pipelines.DuplicatesPipeline': 200,
'locations.pipelines.GeoJsonWriterPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
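These values apply project-wide; an individual spider can also override them through Scrapy's custom_settings class attribute. A minimal sketch (the spider name and the one-second delay are arbitrary examples, not part of this commit):

import scrapy

class PoliteExampleSpider(scrapy.Spider):
    name = "polite_example"  # hypothetical spider, for illustration only
    custom_settings = {
        "DOWNLOAD_DELAY": 1.0,         # slow down just this spider
        "AUTOTHROTTLE_ENABLED": True,  # without touching settings.py
    }

    def parse(self, response):
        pass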
4 changes: 4 additions & 0 deletions locations/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
145 changes: 145 additions & 0 deletions locations/spiders/kroger.py
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
import scrapy
import json

from locations.items import GeojsonPointItem

class KrogerSpider(scrapy.Spider):
name = "kroger"
allowed_domains = ["www.kroger.com"]
start_urls = (
'https://www.kroger.com/stores?address=37.7578595,-79.76804&includeThirdPartyFuel=true&maxResults=50&radius=3000&showAllStores=false&useLatLong=true',
)

store_types = {
'' : "unknown-blank",
'C': "grocery",
'F': "unknown-f",
'G': "gas station",
'I': "unknown-i",
'J': "unknown-j",
'M': "grocery",
'Q': "unknown-q",
'S': "grocery",
'X': "unknown-x",
}

ll_requests = set()

def store_hours(self, store_hours):
if all([h == '' for h in store_hours.values()]):
return None
else:
day_groups = []
this_day_group = None
for day in ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'):
day_open = store_hours[day + 'Open']
day_close = store_hours[day + 'Close']
hours = day_open + "-" + day_close
day_short = day.title()[:2]

if not this_day_group:
this_day_group = dict(from_day=day_short, to_day=day_short, hours=hours)
elif this_day_group['hours'] == hours:
this_day_group['to_day'] = day_short
elif this_day_group['hours'] != hours:
day_groups.append(this_day_group)
this_day_group = dict(from_day=day_short, to_day=day_short, hours=hours)
day_groups.append(this_day_group)

if len(day_groups) == 1:
opening_hours = day_groups[0]['hours']
if opening_hours == '07:00-07:00':
opening_hours = '24/7'
else:
opening_hours = ''
for day_group in day_groups:
if day_group['from_day'] == day_group['to_day']:
opening_hours += '{from_day} {hours}; '.format(**day_group)
else:
opening_hours += '{from_day}-{to_day} {hours}; '.format(**day_group)
opening_hours = opening_hours[:-2]

return opening_hours

def phone_number(self, phone):
return '{}-{}-{}'.format(phone[0:3], phone[3:6], phone[6:10])

def address(self, address):
if not address:
return None

(num, rest) = address['addressLineOne'].split(' ', 1)
addr_tags = {
"addr:housenumber": num.strip(),
"addr:street": rest.strip(),
"addr:city": address['city'],
"addr:state": address['state'],
"addr:postcode": address['zipCode'],
}

return addr_tags


def parse(self, response):
data = json.loads(response.body_as_unicode())

bounding_box = {
'min_lat': 100,
'max_lat': -100,
'min_lon': 300,
'max_lon': -300,
}

for store in data:
store_information = store['storeInformation']
store_hours = store['storeHours']

properties = {
"phone": self.phone_number(store_information['phoneNumber']),
"ref": store_information['recordId'],
"name": store_information['localName'],
"type": self.store_types[store_information['storeType']],
"hours": self.store_hours(store_hours),
}

address = self.address(store_information['address'])
if address:
properties.update(address)

lon_lat = [
float(store_information['latLong']['longitude']),
float(store_information['latLong']['latitude']),
]

bounding_box['min_lat'] = min(bounding_box['min_lat'], lon_lat[1])
bounding_box['max_lat'] = max(bounding_box['max_lat'], lon_lat[1])
bounding_box['min_lon'] = min(bounding_box['min_lon'], lon_lat[0])
bounding_box['max_lon'] = max(bounding_box['max_lon'], lon_lat[0])


yield GeojsonPointItem(
properties=properties,
lon_lat=lon_lat,
)

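        # Fan the crawl outward: request the same endpoint again from each corner of the
        # bounding box around the stores just seen, skipping corners already requested.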
if data:
box_corners = [
'{},{}'.format(bounding_box['min_lat'], bounding_box['min_lon']),
'{},{}'.format(bounding_box['max_lat'], bounding_box['min_lon']),
'{},{}'.format(bounding_box['min_lat'], bounding_box['max_lon']),
'{},{}'.format(bounding_box['max_lat'], bounding_box['max_lon']),
]

for corner in box_corners:
if corner in self.ll_requests:
self.logger.info("Skipping request for %s because we already did it", corner)
else:
self.ll_requests.add(corner)
yield scrapy.Request(
'https://www.kroger.com/stores?address={}&includeThirdPartyFuel=true&maxResults=50&radius=3000&showAllStores=false&useLatLong=true'.format(
corner
),
)
else:
self.logger.info("No results")
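To see what store_hours produces, here is a rough standalone check; the dict below only mimics the shape of the storeHours payload the spider expects, and the times are invented:

from locations.spiders.kroger import KrogerSpider

spider = KrogerSpider()
sample_hours = {}
for day in ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday'):
    sample_hours[day + 'Open'], sample_hours[day + 'Close'] = '08:00', '22:00'
sample_hours['sundayOpen'], sample_hours['sundayClose'] = '09:00', '21:00'

print(spider.store_hours(sample_hours))  # -> Mo-Sa 08:00-22:00; Su 09:00-21:00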
13 changes: 13 additions & 0 deletions locations/spiders/walmart.py
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy


class WalmartSpider(scrapy.Spider):
name = "walmart"
allowed_domains = ["walmart.com"]
start_urls = (
'http://www.walmart.com/',
)

def parse(self, response):
pass
11 changes: 11 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = locations.settings

[deploy]
#url = http://localhost:6800/
project = locations
