In [47]:
from pattern.web import URL, DOM, plaintext, Element, extension, Crawler, DEPTH
import re
import pickle
import random

In [5]:
class Scraper():
    def save_image(self):
        pass
    def get_recipe(self):
        pass

In [25]:
class AllRecipesScraper(Scraper):
    def save_image(self,element,idx):
        first_rec_photo = element.by_class("rec-photo")[0]
        url = first_rec_photo.attributes.get('src','')
        print("Image URL", url)
        img_url = URL(url)
        f = open("img/"+str(idx) + extension(img_url.page), 'wb')
        f.write(img_url.download())
        f.close()
        
    def get_ingredients(self,element):
        ing_nodes = element.by_class("recipe-ingred_txt added")
        return "\n".join([plaintext(a.content) for a in ing_nodes])

    
    def get_instructions(self,element):
        instr_nodes = element.by_class("recipe-directions__list--item")
        return "\n".join([plaintext(a.content) for a in instr_nodes])
    
    def get_recipe(self,element):
        return self.get_ingredients(element)+"\n"+self.get_instructions(element)
        

In [43]:
class AllRecipesCrawler(Crawler):
    def __init__(self,links, delay):
        super( AllRecipesCrawler, self ).__init__(links=links, delay=delay)
        self.scraper = AllRecipesScraper()
        self.recipe_list = {}
        self.count = 0
        
    def reset_count(self):
        self.count = 0
        
    def follow(self, link):
        print("following", str(link.url))
        if "recipes/" in str(link.url):
            yield True
        else:
            yield False
            
    def visit(self, link, source=None):
        if "recipe/" in str(link.url):
            print("visiting", str(link.url))
            rec_id = re.search(".*recipe/(.*)(/.*/)+", str(link.url)).group(1)
            print("rec_id",rec_id)
            self.scrape(source, rec_id)
            
    def scrape(self,source,rec_id):
        print("scraping", rec_id)
        element = Element(source)
        try:
            recipe = self.scraper.get_recipe(element)
            self.scraper.save_image(element, rec_id)
            self.recipe_list[rec_id]=recipe
            self.count += 1
        except Exception as detail:
            print 'Handling run-time error:', detail
            
    


In [59]:
class AllRecipesRandomSearch():
    def __init__(self,tried_ids = None, recipe_list = None ):
        self.scraper = AllRecipesScraper()
        if tried_ids is None:
            self.tried_ids = set()
        else:
            self.tried_ids = tried_ids
        if recipe_list is None:
            self.recipe_list = {}
        else:
            self.recipe_list = recipe_list
        self.count = 0
        
    def reset_count(self):
        self.count = 0
        
    def new_id(self,rec_id):
        return rec_id not in self.tried_ids
          
    def visit(self,rec_id):
        url = URL("http://allrecipes.com/recipe/"+str(rec_id))
        try:
            source = url.download(cached=True)
            self.scrape(source, rec_id)
        except Exception as detail:
            print 'Unable to Scrape:', rec_id
            self.tried_ids.add(rec_id)
            
    def scrape(self,source,rec_id):
        print("scraping", rec_id)
        element = Element(source)
        recipe = self.scraper.get_recipe(element)
        self.scraper.save_image(element, rec_id)
        self.recipe_list[rec_id]=recipe
        self.tried_ids.add(rec_id)
        self.count += 1

            

In [60]:
rand_limit = 10
search = AllRecipesRandomSearch()

while search.count < limit:
    rec_id = random.randint(1,10000)
    if search.new_id(rec_id):
        search.visit(rec_id)

pickle.dump( search.recipe_list, open( "recipes.p", "wb" ) )
pickle.dump( search.recipe_list, open( "tried_ids.p", "wb" ) )

Unable to Scrape: 2169
Unable to Scrape: 4582
Unable to Scrape: 6490
Unable to Scrape: 1265
Unable to Scrape: 5850
Unable to Scrape: 25
Unable to Scrape: 6166
Unable to Scrape: 3603
Unable to Scrape: 1000
Unable to Scrape: 4515
Unable to Scrape: 737
Unable to Scrape: 1315
('scraping', 9749)
('Image URL', u'http://images.media-allrecipes.com/global/recipes/nophoto/nopicture-910x511.png')
Unable to Scrape: 5993
Unable to Scrape: 6318
Unable to Scrape: 3974
Unable to Scrape: 604
Unable to Scrape: 4371
('scraping', 8418)
('Image URL', u'http://images.media-allrecipes.com/userphotos/250x250/203263.jpg')


In [44]:
base_url = "http://allrecipes.com/"
limit = 2
crawler = AllRecipesCrawler(links=[base_url], delay=3)
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)

for key, value in crawler.recipe_list.iteritems():
    print value

('visiting', 'http://allrecipes.com/recipe/242279/spicy-chipotle-lettuce-wraps/')
('rec_id', '242279')
('scraping', '242279')
('Image URL', u'http://images.media-allrecipes.com/userphotos/720x405/3481781.jpg')
('visiting', 'http://allrecipes.com/recipe/141370/mexican-strawberry-water-agua-de-fresa/')
('rec_id', '141370')
('scraping', '141370')
('Image URL', u'http://images.media-allrecipes.com/userphotos/720x405/1033332.jpg')
Sauce:
2 (15 ounce) cans tomato sauce
1 cup water
1/2 cup chipotle peppers in adobo sauce
3 tablespoons chili powder
1 teaspoon dried oregano
1 teaspoon sea salt
Wraps:
2 tablespoons extra-virgin olive oil
2 small onions, finely chopped
2 green bell peppers, finely chopped
1 pinch salt
1 1/2 pounds ground beef
4 cloves garlic, minced
2 (15 ounce) cans kidney beans, rinsed and drained
1 (15 ounce) can black beans, rinsed and drained
16 romaine lettuce leaves
Add all ingredients to list
Add all ingredients to list
Blend tomato sauce, water, chipotle peppers in adobo

In [45]:
crawler.reset_count()
while (not crawler.done) and crawler.count < limit:
    crawler.crawl(method=DEPTH, cached=False)
for key, value in crawler.recipe_list.iteritems():
    print value

('visiting', 'http://allrecipes.com/recipe/222744/dandelion-greens-with-a-kick/')
('rec_id', '222744')
('scraping', '222744')
('Image URL', u'http://images.media-allrecipes.com/userphotos/250x250/838595.jpg')
('visiting', 'http://allrecipes.com/recipe/221936/quick-fish-tacos/')
('rec_id', '221936')
('scraping', '221936')
('Image URL', u'http://images.media-allrecipes.com/userphotos/720x405/1017268.jpg')
Sauce:
2 (15 ounce) cans tomato sauce
1 cup water
1/2 cup chipotle peppers in adobo sauce
3 tablespoons chili powder
1 teaspoon dried oregano
1 teaspoon sea salt
Wraps:
2 tablespoons extra-virgin olive oil
2 small onions, finely chopped
2 green bell peppers, finely chopped
1 pinch salt
1 1/2 pounds ground beef
4 cloves garlic, minced
2 (15 ounce) cans kidney beans, rinsed and drained
1 (15 ounce) can black beans, rinsed and drained
16 romaine lettuce leaves
Add all ingredients to list
Add all ingredients to list
Blend tomato sauce, water, chipotle peppers in adobo sauce, chili powder, o

In [41]:
pickle.dump( crawler.recipe_list, open( "recipes.p", "wb" ) )

In [None]:
#TODO
#get rid of "Add all ingredients to list"
#resize images?
#random generation instead of crawling?