In [1]:
from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
import re
import pickle
import random
import PIL
from PIL import Image
import datetime

In [2]:
class Scraper():
    def save_image(self):
        pass
    def get_recipe(self):
        pass

In [3]:
class AllRecipesScraper(Scraper):
    def save_image(self,element,idx,basewidth = 300):
        first_rec_photo = element.by_class("rec-photo")[0]
        url = first_rec_photo.attributes.get('src','')
        img_url = URL(url)
        img = Image.open(img_url)
        wpercent = (basewidth / float(img.size[0]))
        hsize = int((float(img.size[1]) * float(wpercent)))
        img = img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
        img.save("img/"+str(idx) + extension(img_url.page))
        
    def get_ingredients(self,element):
        ing_nodes = element.by_class("recipe-ingred_txt added")
        return "\n".join([plaintext(a.content) for a in ing_nodes 
                          if "Add all ingredients to list" not in plaintext(a.content)])

    
    def get_instructions(self,element):
        instr_nodes = element.by_class("recipe-directions__list--item")
        return "\n".join([plaintext(a.content) for a in instr_nodes])
    
    def get_recipe(self,element):
        return self.get_ingredients(element)+"\n"+self.get_instructions(element)
        

In [4]:
class AllRecipesCrawler(Crawler):
    def __init__(self,links, delay,recipe_list = None):
        super( AllRecipesCrawler, self ).__init__(links=links, delay=delay)
        self.scraper = AllRecipesScraper()
        if recipe_list is None:
            self.recipe_list = {}
        else:
            self.recipe_list = recipe_list
        self.count = 0
        
        
    def reset_count(self):
        self.count = 0
        
    def follow(self, link):
        if "recipes/" in str(link.url):
            yield True
        else:
            yield False
            
    def visit(self, link, source=None):
        if "recipe/" in str(link.url):
            print("visiting", str(link.url),self.count)
            try:
                rec_id = re.search(".*recipe/(.*)(/.*/)+", str(link.url)).group(1)
                if rec_id not in self.recipe_list.keys():
                    self.scrape(source, rec_id)
                else:
                    print("already scraped",rec_id)
            except Exception as detail:
                print 'Run-time error:', detail
            
            
    def scrape(self,source,rec_id):
        print("scraping", rec_id)
        element = Element(source)
        try:
            recipe = self.scraper.get_recipe(element)
            self.scraper.save_image(element, rec_id)
            self.recipe_list[rec_id]=recipe
            self.count += 1
        except Exception as detail:
            print 'Handling run-time error:', detail
            
    


In [16]:
base_url = "http://allrecipes.com/"
limit = 5000
crawler = AllRecipesCrawler(links=[base_url], delay=1)


In [None]:
#rerun this code every time
crawler.reset_count()
limit = 5000
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
save_as = "recipe_lists/recipe_list"+str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))+".p"
pickle.dump( crawler.recipe_list, open(save_as , "wb" ) ) 
print "Saved as", save_as

('visiting', 'http://allrecipes.com/recipe/246946/hatch-chile-corn/', 0)
('scraping', '246946')
('visiting', 'http://allrecipes.com/recipe/246947/pulled-pork-hatch-chile-stew/', 1)
('scraping', '246947')
('visiting', 'http://allrecipes.com/recipe/18417/spanakopita-greek-spinach-pie/?src=VD_Summary', 2)
('already scraped', '18417')
('visiting', 'http://allrecipes.com/recipe/18417/spanakopita-greek-spinach-pie', 2)
Run-time error: 'NoneType' object has no attribute 'group'
('visiting', 'http://allrecipes.com/recipe/24202/shepherds-pie-vi/', 2)
('scraping', '24202')
('visiting', 'http://allrecipes.com/recipe/11689/greek-pasta-salad-i/', 3)
('scraping', '11689')


In [None]:
#In case of kernel restart run this (need to combined saved dicts)

In [6]:
recipe_list = pickle.load( open( "recipe_lists/recipe_list2016-04-22.p", "rb" ) )

In [None]:
base_url = "http://allrecipes.com/"
limit = 5000
crawler = AllRecipesCrawler(links=[base_url], delay=1,recipe_list = recipe_list )
crawler.reset_count()
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
save_as = "recipe_lists/recipe_list"+str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))+".p"
pickle.dump( crawler.recipe_list, open(save_as , "wb" ) ) 
print "Saved as", save_as


('visiting', 'http://allrecipes.com/recipe/20876/crustless-spinach-quiche/', 0)
('scraping', '20876')
('visiting', 'http://allrecipes.com/recipe/221043/sensational-slow-cooked-beef-brisket/', 1)
('scraping', '221043')
('visiting', 'http://allrecipes.com/recipe/246350/easy-cloud-bread/', 2)
('scraping', '246350')
('visiting', 'http://allrecipes.com/recipe/232745/mother-earths-baked-beans/', 3)
('scraping', '232745')
('visiting', 'http://allrecipes.com/recipe/23600/worlds-best-lasagna/', 4)
('already scraped', '23600')
