In [1]:
import os
import io
import os
import time
import json
import click
import requests
import pandas as pd
from PIL import Image
from tqdm import tqdm
import urllib.request
import plotly.express as px
import multiprocessing as mp
from calendar import Calendar
from datetime import datetime
from typing import Any, Dict, List, Tuple
from toolz.curried import compose, pipe, curry

In [3]:
def _image_worker(args):
    r"""Helper method for parallelizing image downloads."""
    image_urls, image_uid, downloader_fn = args
    download_status = []
    for ix, image_url in enumerate(image_urls):
        save_to = f"datasets/catalog/{image_uid}/{ix}.png"
        download_status.append(downloader_fn(image_url, save_to=save_to))

    # Sleep for 2 seconds for Imgur, and 0.1 seconds for Reddit and Flickr.
    # This takes care of all request rate limits.
    if "imgur" in image_url:
        time.sleep(2.0)
    else:
        time.sleep(0.1)

    return download_status

@curry
def image_downloader(
    url: str, save_to: str, longer_resize: int = 512, timeout: float = 30.0
) -> bool:
    r"""
    Download image from ``url`` and save it to ``save_to``.

    Args:
        url: Image URL to download from.
        save_to: Local path to save the downloaded image.
        longer_resize: Target size (in pixels) of the longer image side. The
            aspect ratio is preserved. Pass a value ``<= 0`` to keep the
            original size.
        timeout: Seconds to wait for the server before giving up. A timed-out
            request is reported as a failed download.

    Returns:
        Boolean variable indicating whether the download was successful
        (``True``) or not (``False``). Any exception raised while fetching,
        decoding, resizing or saving is printed and reported as ``False``.
    """
    try:
        # 'response.content' will have our image (as bytes) if successful.
        # Without a timeout a single stalled server would block this worker
        # indefinitely.
        response = requests.get(url, timeout=timeout)

        # Check if image was downloaded (response must be 200). One exception:
        # Imgur gives response 200 with "removed.png" image if not found.
        if response.status_code != 200 or "removed.png" in response.url:
            return False

        # Decode the downloaded bytes and normalise to three-channel RGB.
        pil_image = Image.open(io.BytesIO(response.content)).convert("RGB")

        # Resize so the longer side equals ``longer_resize`` while preserving
        # aspect ratio. Note that this scales in both directions: images
        # smaller than ``longer_resize`` are enlarged as well.
        if longer_resize > 0:
            image_width, image_height = pil_image.size

            scale = longer_resize / float(max(image_width, image_height))

            if scale != 1.0:
                # Clamp to 1 pixel so extreme aspect ratios cannot round the
                # shorter side down to an invalid size of 0.
                new_width, new_height = tuple(
                    max(1, int(round(d * scale)))
                    for d in (image_width, image_height)
                )
                pil_image = pil_image.resize((new_width, new_height))

        # Save the downloaded image to disk. ``os.makedirs("")`` raises, so
        # only create the parent directory when the path actually has one.
        parent_dir = os.path.dirname(save_to)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        pil_image.save(save_to)

        return True

    except Exception as err:
        # Best-effort download: report which URL failed and carry on.
        print(f"Failed to download {url}: {err}")
        return False
    
    
def download_imgs(
    annotations_list: List[Dict[str, Any]],
    resize: int,
    workers: int,
) -> List[List[bool]]:
    r"""
    Download the images of every annotation using a pool of worker processes.

    Args:
        annotations_list: Annotations to process. Each item must provide an
            ``"image_links"`` entry (URLs to download) and an ``"id"`` entry
            (used as the output sub-folder name).
        resize: Forwarded to ``image_downloader`` as ``longer_resize``.
        workers: Number of worker processes in the pool.

    Returns:
        One list of per-image download results for each annotation, in the
        same order as ``annotations_list``.
    """
    # Bind the resize setting once; every task shares the same downloader.
    downloader_fn = image_downloader(longer_resize=resize)
    worker_args: List[Tuple] = [
        (ann["image_links"], ann["id"], downloader_fn) for ann in annotations_list
    ]

    download_status: List[List[bool]] = []

    # Parallelize image downloads. ``imap`` (not ``imap_unordered``) keeps the
    # results aligned with the input order.
    with mp.Pool(processes=workers) as p:
        with tqdm(total=len(worker_args), desc="Downloading Images") as pbar:
            for _status in p.imap(_image_worker, worker_args):
                download_status.append(_status)
                pbar.update()

    # Hand the collected results back instead of silently discarding them.
    return download_status


# uid = annotations_list[0]["id"]
# for ix, url in enumerate(annotations_list[0]['image_links']):
#     save_to = f"datasets/catalog/{uid}/{ix}.png"
#     output = image_downloader(url=url, save_to=save_to, longer_resize=resize)

#### The following code snippet was used to generate and save (locally) a CSV file containing the "Scraped Attributes" of 10000 ready-to-send annotations.

<code>
select SCRAPED_ATTRIBUTES 
from PUBLIC.PRODUCTVARIANTS pv
where STATUS = 'ready_to_send'
limit 10000
</code>

In [4]:
# Load the exported CSV; every row holds one JSON-encoded annotation.
df = pd.read_csv("datasets/catalog/result_10000.csv")

# The decoding below reads the first column only, so fail loudly if the export
# ever contains more than that single column.
assert df.shape[1] == 1, "expected exactly one column of JSON annotations"

# Iterate the column directly instead of `iterrows()`, which builds a Series
# object for every row.
annotations_list = [json.loads(raw) for raw in df.iloc[:, 0]]

The following snippet can be used to download images associated with one of the annotations in the above created list.

In [42]:
# ann = annotations_list[0]
# args = (ann["image_links"], ann["id"], image_downloader(longer_resize=resize))
# output = _image_worker(args)
# print("{}, Broken link fraction: {:1.3f}".format(ix, 1.-(sum(output)/len(output))))

The following snippet can be used to iterate over all items in the annotations list and save the images locally in the "datasets/catalog" folder, with subfolders to distinguish between items with different ids.

In [46]:
resize = 512
# `ix` is the position inside the slice, i.e. it refers to annotations_list[10 + ix].
for ix, ann in enumerate(annotations_list[10:20]):
    args = (ann["image_links"], ann["id"], image_downloader(longer_resize=resize))
    output = _image_worker(args)
    # Guard against an empty result list so the fraction never divides by zero;
    # such a row is reported as "nan".
    broken_fraction = (1. - (sum(output) / len(output))) if output else float("nan")
    print("{}, Broken link fraction: {:1.3f}".format(ix, broken_fraction))

0, Broken link fraction: 0.000
1, Broken link fraction: 0.000
2, Broken link fraction: 0.000
3, Broken link fraction: 0.667
4, Broken link fraction: 0.000
5, Broken link fraction: 0.000
6, Broken link fraction: 0.000
7, Broken link fraction: 0.000
8, Broken link fraction: 0.000
9, Broken link fraction: 0.000


In [12]:
# Inspect annotation 13, i.e. position 3 of the `annotations_list[10:20]` slice
# (the row with a 0.667 broken-link fraction in the recorded output).
annotations_list[13]

{'availability': True,
 'brand': 'Faber-Castell',
 'bullets': ['Faber-Castell Albrecht Dürer Watercolour Markers will amaze you with their highly pigmented, lightfast color and stunning precision. These double ended markers allow artists to achieve unbeatable definition and control with watercolor painting. Simply blend your strokes with water to create soft and vibrant watercolor washes.'],
 'currency': 'USD',
 'description': 'Faber-Castell Albrecht Dürer Watercolour Markers will amaze you with their highly pigmented, lightfast color and stunning precision. These double ended markers allow artists to achieve unbeatable definition and control with watercolor painting. Simply blend your strokes with water to create soft and vibrant watercolor washes.\n',
 'description_structured': {'sections': [{'content': '<meta charset="utf-8">\n<p>Faber-Castell Albrecht Dürer Watercolour Markers will amaze you with their highly pigmented, lightfast color and stunning precision. These double ended mar

In [14]:
# Root directory for downloaded images; same prefix `_image_worker` writes to.
folder = os.path.join("datasets", "catalog")

In [18]:
# Pick one annotation and one image position to build an example path from.
ann = annotations_list[13]
ix = 0

In [19]:
# Path layout for a saved image: <folder>/<annotation id>/<image index>.png
os.path.join(folder, ann["id"], f"{ix}.png")

'datasets/catalog/31320113512518/0.png'

In [20]:
# Display the annotation at index 755 for inspection.
annotations_list[755]

{'age_group': '',
 'availability': '',
 'brand': 'Balmain',
 'breadcrumbs': 'Home > Men > Bags > View All',
 'bullets': ["['Protective pouch provided', 'Size: 4 x 44 x 14 cm / 1.5 x 17.3 x 5.5 in', 'Made in Italy', 'Main materials: Cotton, Polyester, Calfskin', 'Item: VM0AB146TJMOGFE']"],
 'color_family': '45564761fg',
 'currency': 'USD',
 'description': 'Ivory and black jacquard belt bag \n\nAdjustable leather strap, zipper closure on top, burnished silver-tone hardware details, black leather panel, interior flat pocket',
 'description_structured': {'sections': [{'content': 'Ivory and black jacquard belt bag <br>\nAdjustable leather strap, zipper closure on top, burnished silver-tone hardware details, black leather panel, interior flat pocket',
    'description_placement': 'DESCRIPTION_MAIN',
    'name': 'Description\n                <span class="icon"></span>'},
   {'content': 'Protective pouch provided <br> \nSize: 4 x 44 x 14 cm / 1.5 x 17.3 x 5.5 in <br> \nMade in Italy <br> \nMai

In [13]:
# The annotation id; the download loop passes it to `_image_worker` as the sub-folder name.
annotations_list[13]["id"]

'31320113512518'

In [7]:
# The image URLs that the download loop hands to `_image_worker` for this annotation.
annotations_list[13]["image_links"]

['https://cdn.shopify.com/s/files/1/0827/1023/products/FCWCM_colors.jpg?v=1592520129',
 'https://cdn.shopify.com/s/files/1/0827/1023/products/5302_fabercastell_WCM.jpg?v=1592520129',
 'https://cdn.shopify.com/s/files/1/0827/1023/products/5302_fabercastell_WCM2.jpg?v=1592520129']

In [None]:
# download_imgs(annotations_list[20:30], resize=512, workers=4)