In [3]:
import boto3

# Bucket whose object keys are listed below.
bucket_name = 'cocktail-ingredient-images'
session = boto3.Session(profile_name='personal')
client = session.client('s3', region_name='us-west-2')

# A single list call returns at most 1000 keys and omits 'Contents' entirely
# for an empty bucket, so paginate and treat a missing 'Contents' as no keys.
paginator = client.get_paginator('list_objects_v2')
s3_images = [
    obj['Key']
    for s3_page in paginator.paginate(Bucket=bucket_name)
    for obj in s3_page.get('Contents', [])
]

In [4]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
# Context managers make sure both file handles are closed after loading.
with open('cocktails_db.pkl', 'rb') as f:
    cdf = pickle.load(f)
with open('ingredients_db.pkl', 'rb') as f:
    idf = pickle.load(f)

## Map each ingredient category_id to an image link, using the category's most prevalent ingredient

In [5]:
from collections import Counter, defaultdict
# Flatten the per-cocktail ingredient lists into one list of ingredient records.
ings = []
for ing_list in cdf.ingredients.tolist():
    ings.extend(ing_list)

# Only ingredients with a truthy 'link' take part in the counts and lookups.
linked = [ing for ing in ings if ing['link']]
# How many times each linked ingredient id occurs across all cocktails.
ing_counts = dict(Counter(ing['id'] for ing in linked))
# Ingredient id -> link (the last occurrence wins if links differ).
ing_links = {ing['id']: ing['link'] for ing in linked}

# category_id -> ids of its linked ingredients (one entry per occurrence).
ing_cat_ids = defaultdict(list)
for ing in ings:
    if not ing['link']:
        continue
    ing_cat_ids[ing['category_id']].append(ing['id'])

# For every category keep the link of its most frequently occurring ingredient.
ing_cat_links = {}
for cat in ing_cat_ids:
    most_common_id = max(ing_cat_ids[cat], key=ing_counts.__getitem__)
    ing_cat_links[cat] = ing_links[most_common_id]

In [6]:
from bs4 import BeautifulSoup as soup
# category_id -> image URL, for categories that have no '<category_id>.png'
# key in s3_images yet.
images = dict()
# Set membership avoids rescanning the whole key list for every category.
existing_keys = set(s3_images)
# Container <div> classes searched for the image, in priority order.
image_container_classes = ['product-gallery__display', 'product-gallery-static']
for category_id in ing_cat_links:
    if category_id + '.png' in existing_keys:
        continue  # an image with this key is already in the bucket listing
    url = ing_cat_links[category_id]
    # Saved pages are named after the link with '/' replaced by '_'.
    file_path = url.replace('/', '_') + '.txt'
    with open('../0_data_scraping/diffordsguide_ingredients/' + file_path,'rb') as f:
        content = f.read()
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one explicitly for reproducible parsing.
    page = soup(content)
    # Get image_url
    image = None
    for iclass in image_container_classes:
        div = page.find('div', {'class':iclass})
        img = div.find('img') if div is not None else None
        if img is not None:
            image = img.get('src')
            break
    if not image:
        # Fallback: first <img> inside the 'grid-container' section. find()
        # returns None on a miss, so guard each step instead of chaining;
        # otherwise a missing section/img raises before the failure report.
        section = page.find('section', {'class':'grid-container'})
        img = section.find('img') if section is not None else None
        image = img.get('src') if img is not None else None
    if not image:
        print('failed to pull image: ', category_id)
    else:
        images[category_id] = image

# Download the images and upload them to S3

In [8]:
import requests

# Seconds to wait on a stalled connection before giving up; without a timeout
# requests.get can block indefinitely.
REQUEST_TIMEOUT = 30

# Fetch each image URL, save it locally as '<category_id>.png', then upload
# that file to the bucket under the same name.
for category_id in images:
    response = requests.get(images[category_id], timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # Skip rather than upload an HTTP error page under an image's name.
        print('failed to download image: ', category_id, response.status_code)
        continue
    file_name = category_id + '.png'
    # The context manager closes (and flushes) the file before the upload,
    # even if the write raises.
    with open(file_name, "wb") as file:
        file.write(response.content)
    client.upload_file(file_name, bucket_name, file_name)

# Check Ingredient Counts ready for Production