In [None]:
import concurrent.futures
import re
import threading
import urllib
import urllib.error
import urllib.parse
import urllib.request
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
from lxml.html import fromstring



In [None]:
# Load the seed price list that the rest of the notebook works from
catalog_df = pd.read_excel(io='jelitto_pricelist.xls')

In [None]:
# Keep long cell text from being truncated when frames are displayed
pd.options.display.max_colwidth = 0

In [None]:
# Subset of rows to test.  Take an explicit copy so the later index changes and
# column assignments act on an independent frame rather than on a slice of the
# full spreadsheet (which pandas flags as a chained assignment).
catalog_df = catalog_df.iloc[:25].copy()

In [None]:
# Preview the first five rows of the working frame
catalog_df.head(n=5)

In [None]:
# Build the candidate catalog image url for every item number (lower-cased)
image_urls = catalog_df['Item No.'].apply(
    lambda item_no: f'https://www.jelitto.com/out/pictures/master/product/1/{item_no.lower()}.jpg'
)


In [None]:
def response_code_error(url: str, timeout: float = 10.0) -> float:
    """Confirm that the catalog has an image of the plant.

    Opens ``url`` and reports the status code of the response.

    Parameters
    ----------
    url: str
        URL to check
    timeout: float, optional
        Seconds to wait on the server before giving up (default 10), so a
        stalled connection cannot block a worker thread indefinitely.

    Returns
    -------
        int|np.nan
        The status code of the response, or ``np.nan`` when the url could
        not be opened (HTTP error, unreachable host, timeout, ...).
    """
    print(f"Checking url: {url}")
    try:
        # The context manager closes the connection once the status is read.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            code = response.getcode()
    except OSError as err:
        # URLError/HTTPError and socket-level timeouts are all OSError subclasses.
        print(f"No Jelitto image {err}")
        code = np.nan
    return code

In [None]:
# Save checked urls to a dictionary keyed by the item number embedded in the url.
confident_urls = defaultdict(str)

# Threaded catalog image url checking.
with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
    check_url = {executor.submit(response_code_error, url): url for url in image_urls}
    for future in concurrent.futures.as_completed(check_url):
        url = check_url[future]
        # The item number is the file stem that follows the '/1/' path segment.
        match = re.search(r'\/1\/(.*)\.', url)
        if match is None:
            print(f"Skipping {url}: no item number found in url")
            continue
        try:
            status = future.result()
        except Exception as exc:
            print(f"{url} generated an exception: {exc}")
            # Still record the item (as missing) so every checked url has an
            # entry and the result covers the same items as the catalog.
            status = np.nan
        confident_urls[match.group(1)] = url if status == 200 else np.nan


In [None]:
# Turn the checked-url mapping into a one-column frame ordered by its index (the Item No.)
confident_urls_df = pd.DataFrame.from_dict(
    confident_urls, orient='index', columns=['url']
).sort_index()

In [None]:
# Set the index of the catalog df to Item No & sort.  The set_index step is
# skipped when the index is already 'Item No.', so re-running this cell does
# not fail with a KeyError on the column that the first run consumed.
if catalog_df.index.name != 'Item No.':
    catalog_df = catalog_df.set_index('Item No.')
catalog_df = catalog_df.sort_index()


In [None]:
# Add image_url column to the catalog df.  confident_urls_df is keyed by the
# lower-cased item number taken from each url, so look every catalog row up by
# that key instead of relying on both frames having the same length and order.
# Items without an entry get NaN.
catalog_df['image_url'] = (
    catalog_df.index.to_series()
    .str.lower()
    .map(confident_urls_df['url'])
    .values
)

In [None]:
# Build Wikimedia Commons search urls for the rows where the catalog had no image:
# one lookup per scientific name, plus a common-name lookup to fall back on when
# the scientific name has no results.

_url = 'https://commons.wikimedia.org/w/index.php?search='
updates = catalog_df[catalog_df['image_url'].isnull()]


def _search_url(term: str) -> str:
    """Return the Commons search url for ``term`` with the query text encoded."""
    # quote_plus turns spaces into '+' and escapes characters such as '&' or '#'
    # that would otherwise cut the search term short.
    return _url + urllib.parse.quote_plus(term)


# Iterate the columns directly (rather than DataFrame.apply(axis=1)) so this is
# always a list of (item no, url) pairs, including when ``updates`` is empty.
# A missing genus or species is left out of the search term instead of raising.
scientific_names = []
for ix, genus, species in zip(updates.index, updates['Genus'], updates['Species ']):
    term = ' '.join(str(part) for part in (genus, species) if pd.notna(part)).lower().strip()
    if term:
        scientific_names.append((ix, _search_url(term)))

# Only the first comma-separated common name is searched; non-string cells
# (e.g. NaN) and blank names are skipped.
common_names = {}
for ix, names in updates['Common Names'].items():
    if not isinstance(names, str):
        continue
    term = ' '.join(names.split(',')[0].split())
    if term:
        common_names[ix] = _search_url(term)


In [None]:
def alt_url(url: tuple, timeout: float = 10.0) -> tuple:
    """Test tuple of _id and url to find an image on wikimedia commons.

    Parameters
    ----------
    url: tuple
        A tuple of _id and a url
    timeout: float, optional
        Seconds to wait for the search page before giving up (default 10).

    Returns
    -------
    alt_url: tuple
        (_id, new_url).  new_url is None when the request fails, the page
        contains no search result or gallery image, or the first hit does
        not end in one of the accepted file extensions.
    """
    _id = url[0]
    search_url = url[1]
    try:
        page = requests.get(search_url, timeout=timeout)
        page.raise_for_status()
    except requests.RequestException as err:
        # Best effort: a failed lookup just means "no alternative image".
        print(f"No Wikimedia result for {_id}: {err}")
        return (_id, None)

    tree = fromstring(page.content)
    hits = tree.xpath("//li[contains(@class,'mw-search-result')]//a/@href|//ul[contains(@class, 'gallery')]//img/@src")
    if not hits:
        return (_id, None)

    # Only the first hit is considered.
    # NOTE(review): '.pdf' is accepted although it is not an image - confirm intent.
    first_hit = hits[0]
    if not first_hit.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        return (_id, None)

    # urljoin prefixes site-relative hrefs with the Commons host and leaves
    # absolute or protocol-relative image sources pointing at their own host.
    return (_id, urllib.parse.urljoin('https://commons.wikimedia.org', first_hit))



In [None]:
alt_urls = []
# Checking wikimedia urls for the scientific names
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its (_id, url) lookup
    check_url = {executor.submit(alt_url, url): url for url in scientific_names}
    for future in concurrent.futures.as_completed(check_url):
        try:
            url_tup = future.result()
        except Exception as exc:
            # One failed lookup should not abort the remaining ones.
            print(f"{check_url[future][1]} generated an exception: {exc}")
            continue
        # Keep only lookups that produced a url.
        if url_tup[1]:
            alt_urls.append(url_tup)

In [None]:
# Checking wikimedia common names, but only for items that do not already have
# a url in alt_urls, so each item ends up with at most one alternative url.
found_ids = {found[0] for found in alt_urls}
lookups = [(k, v) for k, v in common_names.items() if k not in found_ids]
if lookups:
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        check_url = {executor.submit(alt_url, url): url for url in lookups}
        for future in concurrent.futures.as_completed(check_url):
            try:
                url_tup = future.result()
            except Exception as exc:
                # One failed lookup should not abort the remaining ones.
                print(f"{check_url[future][1]} generated an exception: {exc}")
                continue
            # Keep only lookups that produced a url.
            if url_tup[1]:
                alt_urls.append(url_tup)

In [None]:
# Cast alt_urls to a df indexed by _id so it aligns with the catalog df.
# Naming the columns up front keeps this working when no alternative url was
# found (an empty list would otherwise give a frame without an '_id' column).
altdf = pd.DataFrame(alt_urls, columns=['_id', 'image_url']).set_index('_id')
# Keep a single url per item: the first one that was appended to alt_urls.
altdf = altdf[~altdf.index.duplicated(keep='first')]


In [None]:
# Replace null values in image_url with urls from altdf.  Only the image_url
# column is filled, matched on the index, so the catalog's other columns, its
# column order and its index are left as they are.
alt_image_urls = altdf['image_url'].dropna()
# fillna aligns on the index, which needs one url per item.
alt_image_urls = alt_image_urls[~alt_image_urls.index.duplicated(keep='first')]
catalog_df = catalog_df.assign(
    image_url=catalog_df['image_url'].fillna(alt_image_urls)
)

In [None]:
# Inspect the name columns alongside the resolved image url
catalog_df.loc[:, ['Genus', 'Species ', 'image_url']]

In [None]:
# Write the catalog, including the image_url column, out as csv
catalog_df.to_csv(path_or_buf='image_urls_2.csv')