In [None]:
import concurrent.futures
import logging
import re
import threading
import urllib
import urllib.error
import urllib.parse
import urllib.request
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
from lxml.html import fromstring



In [None]:
# Load the Jelitto price list; every later cell derives from this frame.
jelitto = pd.read_excel(io='jelitto_pricelist.xls')

In [None]:
# Don't truncate text (long urls must stay readable in displayed frames).
# None is the supported "no limit" value; the old -1 sentinel is deprecated
# and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Subset of rows to test.
# Take an explicit copy: the frame is re-indexed and gets a new column in
# later cells, and mutating a bare slice triggers SettingWithCopyWarning.
jelitto = jelitto.iloc[0:25, :].copy()

In [None]:
# Preview the first five rows of the working frame
jelitto.head(n=5)

In [None]:
# Build the candidate Jelitto image url for every item number
# (the result is a Series aligned with jelitto, not a new row).
def _jelitto_image_url(item_no):
    """Return the product-image url Jelitto would use for one item number."""
    return f'https://www.jelitto.com/out/pictures/master/product/1/{item_no.lower()}.jpg'


image_url = jelitto['Item No.'].apply(_jelitto_image_url)


In [None]:
def response_code_error(url, timeout=10):
    """Confirm that Jelitto has an image of the plant.

    Opens the url, reads only the status code and closes the connection
    again. Any network-level failure is reported and mapped to np.nan so
    the caller always gets a value back for every url.

    Parameters
    ----------
    url: str
        URL to check
    timeout: float
        Seconds to wait on the connection before giving up.
        Optional, by default 10

    Returns
    -------
        int|np.nan
        The status code reported by the response, or np.nan when the
        request failed.
    """
    print(f"Checking url: {url}")
    try:
        # The context manager closes the response so worker threads do not
        # leak open connections.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            code = response.getcode()
    except OSError as err:
        # URLError/HTTPError are OSError subclasses; catching OSError also
        # covers socket timeouts and connection resets raised while the
        # response headers are being read.
        print(f"No Jelitto image {err}")
        return np.nan
    print(code)
    return code

In [None]:
# Save checked urls to a dictionary keyed by the lower-cased item number
# that is embedded in each url.
confident_urls = defaultdict(str)

# Extracts the item number from ".../1/<item>.<ext>".
_item_no_pattern = re.compile(r'\/1\/(.*)\.')

# Threaded Jelitto image url checking.
with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
    check_url = {executor.submit(response_code_error, url): url for url in image_url}
    for future in concurrent.futures.as_completed(check_url):
        url = check_url[future]
        item_no = _item_no_pattern.search(url).group(1)
        try:
            data = future.result()
        except Exception as exc:
            print(f"{url} generated an exception: {exc}")
            # Still record the item (as "no image") so every item number
            # has an entry when the results are merged back into jelitto.
            data = np.nan
        confident_urls[item_no] = url if data == 200 else np.nan


In [None]:
# Turn confident_urls into a one-column ('url') frame whose index is the
# item number taken from the dictionary keys, ordered by that index.
df = pd.DataFrame.from_dict(
    confident_urls,
    orient='index',
    columns=['url'],
).sort_index()

In [None]:
# Set the index of the jelitto df to Item No & sort.
# Rebinding to the returned frame (instead of inplace=True) avoids mutating
# a frame that was produced by slicing and keeps the step a single chain.
jelitto = jelitto.set_index('Item No.').sort_index()


In [None]:
# Add image_url column to jelitto.
# Look each item up by key rather than pasting df's values in positionally:
# df is keyed by the lower-cased item number, so its sort order and length
# are not guaranteed to match jelitto's index.
_item_keys = jelitto.index.to_series().str.lower()
jelitto['image_url'] = _item_keys.map(df['url']).values

In [None]:
# Create list of wikimedia_urls for rows where jelitto had no image
# and create alternative lookups for the common name to check if the scientific name has no results
_url = 'https://commons.wikimedia.org/w/index.php?search='
updates = jelitto[jelitto['image_url'].isnull()]

# Build "genus species" column-wise so the result is always a Series, even
# when `updates` is empty, and so missing values do not break the join.
scientific_names = (
    updates['Genus'].fillna('').astype(str)
    + ' '
    + updates['Species '].fillna('').astype(str)
).str.strip().str.lower()

# quote_plus escapes characters such as '&' or '#' that would otherwise
# cut the search term short.
wikimedia_urls = [
    (ix, _url + urllib.parse.quote_plus(name))
    for ix, name in scientific_names.items()
    if name
]

# Only the first comma-separated common name is used for the fallback
# search; rows without a common name (NaN) get no fallback.
alternate_lookups = {
    ix: _url + urllib.parse.quote_plus(' '.join(names.split(',')[0].split()))
    for ix, names in updates['Common Names'].items()
    if isinstance(names, str)
}

# Create list of alternative urls for the threaded wikimedia lookups to add to
ALT_URLS = []


In [None]:
def alt_url(url: tuple, scientific:bool = False, alt_urls:list = ALT_URLS, timeout: float = 30):
    """Test tuple of _id and url to find an image on wikimedia commons.

    The first search-result link or gallery thumbnail found on the page is
    taken as the candidate. If it points at a png/jpg/jpeg it is appended to
    ``alt_urls``; for a scientific-name lookup the matching entry is then
    removed from the module-level ``alternate_lookups`` so that no
    common-name fallback is attempted for that item.

    When nothing usable is found, a scientific-name lookup records nothing
    (the common-name lookup may still succeed), while a common-name lookup
    records ``(_id, np.nan)``.

    Parameters
    ----------
    url: tuple
        A tuple of _id and a url
    scientific: bool
        Optional, by default False
    alt_urls: list
        The list to append to if an alternative image url is found.
        Optional, by default ALT_URLS
    timeout: float
        Seconds to wait for wikimedia before giving up.
        Optional, by default 30
    """
    _id = url[0]
    search_url = url[1]
    try:
        page = requests.get(search_url, timeout=timeout)
    except requests.RequestException as exc:
        # A failed request is treated like a search without results.
        print(f"{search_url} could not be fetched: {exc}")
        hits = []
    else:
        hits = fromstring(page.content).xpath(
            "//li[@class='mw-search-result']//a/@href|//ul[contains(@class, 'gallery')]//img/@src"
        )

    # An empty result list must not raise; '' simply fails the suffix check.
    new_url = hits[0] if hits else ''
    if new_url.lower().endswith(('.png', '.jpg', '.jpeg')):
        # urljoin resolves site-relative hrefs against the commons host and
        # leaves already-absolute urls untouched.
        candidate = (_id, urllib.parse.urljoin("https://commons.wikimedia.org", new_url))
        if candidate not in alt_urls:
            alt_urls.append(candidate)
        if scientific:
            # pop() instead of del: items without a common name have no
            # fallback entry to remove.
            alternate_lookups.pop(_id, None)
    elif not scientific:
        alt_urls.append((_id, np.nan))
    # scientific lookup without a hit: nothing recorded, try common name next



In [None]:
# Checking wikimedia urls with scientific name lookup
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its (_id, url) tuple
    check_url = {executor.submit(alt_url, url, scientific=True): url for url in wikimedia_urls}
    for future in concurrent.futures.as_completed(check_url):
        url = check_url[future]
        try:
            # alt_url returns nothing; result() is called only so that an
            # exception raised inside the worker is reported, not dropped.
            future.result()
        except Exception as exc:
            print(f"{url} generated an exception: {exc}")

In [None]:
# Checking wikimedia common name lookup
if alternate_lookups:
    # Snapshot the remaining fallbacks as (_id, url) tuples
    lookups = list(alternate_lookups.items())
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        check_url = {executor.submit(alt_url, url): url for url in lookups}
        for future in concurrent.futures.as_completed(check_url):
            url = check_url[future]
            try:
                # alt_url returns nothing; result() is called only so that an
                # exception raised inside the worker is reported, not dropped.
                future.result()
            except Exception as exc:
                print(f"{url} generated an exception: {exc}")

In [None]:
# Cast ALT_URLS to a df indexed by _id so it lines up with jelitto's index.
# Naming the columns in the constructor keeps this working when ALT_URLS is
# empty (renaming the default integer columns would leave no '_id' column
# to index on).
altdf = pd.DataFrame(ALT_URLS, columns=['_id', 'image_url']).set_index('_id')


In [None]:
# Replace null values in image_url with urls from altdf.
# Only the image_url column is filled: a frame-level combine_first re-aligns
# both axes, which can reorder the columns and lose the 'Item No.' index
# name because altdf's index is named differently.
_fallback_urls = altdf['image_url'].dropna()
# fillna aligns on the index, so each item number may appear only once.
_fallback_urls = _fallback_urls[~_fallback_urls.index.duplicated(keep='first')]
jelitto = jelitto.assign(image_url=jelitto['image_url'].fillna(_fallback_urls))

In [None]:
# Show the scientific name next to the image url that was found
jelitto.loc[:, ['Genus', 'Species ', 'image_url']]

In [None]:
# Write the final frame (index included) to a csv file
jelitto.to_csv(path_or_buf='image_urls.csv')