# Imports


In [1]:
import re
import os
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
from pathlib import Path
from functools import partial
import urllib
from PIL import Image

# Helper Functions

In [2]:
# Entity-unit mapping as provided
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Extract all allowed units dynamically from entity_unit_map
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

# Helper functions

def common_mistake(unit):
    """ Correct common unit mistakes like 'centimeter' to 'centimetre' and 'feet' to 'foot'. """
    if unit in allowed_units:
        return unit
    if unit.replace('ter', 'tre') in allowed_units:
        return unit.replace('ter', 'tre')
    if unit.replace('feet', 'foot') in allowed_units:
        return unit.replace('feet', 'foot')
    return unit

def parse_string(s):
    """ Parse the numeric value and unit from a string (e.g., '100 gram'). """
    s_stripped = "" if s == None or str(s) == 'nan' else s.strip()
    if s_stripped == "":
        return None, None
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError(f"Invalid format in {s}")
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in allowed_units:
        raise ValueError(f"Invalid unit [{unit}] found in {s}. Allowed units: {allowed_units}")
    return number, unit

def create_placeholder_image(image_save_path):
    """ Create a placeholder image if the download fails. """
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        print(f"Error creating placeholder image: {e}")

def download_image(image_link, save_folder, retries=3, delay=3):
    """ Download a single image from a URL, with retries on failure. """
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for _ in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception as e:
            print(f"Error downloading {image_link}: {e}")
            time.sleep(delay)
    
    create_placeholder_image(image_save_path)  # Create a black placeholder image for invalid links

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """ Download multiple images either sequentially or in parallel. """
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=5, delay=3)

        with multiprocessing.Pool(64) as pool:
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)

# Main script to read image URLs from a CSV file and download a specified number of them

def main(csv_file_path, num_images_to_download):
    """ Download a specified number of images from the CSV file. """
    # Reading the image URLs from a CSV file
    df = pd.read_csv(csv_file_path)
    
    # Get the specified number of image links
    image_links = df['image_link'].head(num_images_to_download).tolist()
    
    # Folder to save the downloaded images
    download_folder = '../dataset/images'
    
    # Start the download process
    start_time = timer()
    download_images(image_links, download_folder, allow_multiprocessing=True)
    print(f"Downloaded {num_images_to_download} images in {timer() - start_time:.2f} seconds")

if __name__ == '__main__':
    # Path to the CSV file
    csv_file_path = '../dataset/cleaned/train_clean.csv'  # Update with your CSV file path
    
    # Number of images to download
    num_images_to_download = 5000  # Set the number of images you want to download
    
    # Run the main function
    main(csv_file_path, num_images_to_download)


Process SpawnPoolWorker-1:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-7:
Process SpawnPoolWorker-8:
Traceback (most recent call last):
  File "/Users/arnav/anaconda3/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/arnav/anaconda3/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/arnav/anaconda3/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/arnav/anaconda3/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'download_image' on <module '__main__' (built-in)>
Process SpawnPoolWorker-11:
Traceback (most recent call last):
  File "/Users/arnav/anaconda3/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.r

KeyboardInterrupt: 