In [111]:
import concurrent.futures
import re
import threading
import urllib
import urllib.error
import urllib.parse
import urllib.request
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
from lxml.html import fromstring



In [112]:
# Load the Jelitto seed price list spreadsheet into a DataFrame
jelitto = pd.read_excel(io='jelitto_pricelist.xls')

In [113]:
# Show cell text (e.g. long urls) in full instead of truncating it
pd.options.display.max_colwidth = 0

In [114]:
# Subset of rows to test.
# .copy() makes the subset an independent frame, so the later set_index and
# column assignments do not act on a slice of the full price list (which is
# what triggers pandas' SettingWithCopyWarning).
jelitto = jelitto.iloc[0:25, :].copy()

In [115]:
# Preview the first five rows of the price list
jelitto.head(n=5)

Unnamed: 0,Characteristic (GOLD NUGGET SEED® etc.),Herbs/Ornamental Grasses,Item No.,Genus,Species,Series,Variety,Marketing Name,Synonyme,Common Names,...,Height to (cm),Flowering from,Flowering to,Hardiness Zone from,Hardiness Zone to,g/1.000 Plants,Flower Habit,Catalogue Description,Restrictions,Alphabetical Sorting
0,,,AA008,ABUTILON,vitifolium,,,,,"Indian Mallow, Flowering Maple",...,150,May,June,Z8,Z11,10.0,,"mostly blue, rarely white, large Mallow flowers",,10002000
1,,,AA001,ACAENA,buchananii,,,,,"New Zealand Bur, Bidibidi, Piripiri",...,10,July,August,Z5,Z8,5.0,,blue-green foliage,Australia:prohibited,10003500
2,,,AA002,ACAENA,caesiiglauca,,,,,"New Zealand Bur, Bidibidi, Piripiri",...,10,July,August,Z5,Z8,5.0,,grey-green leaves,Australia:prohibited,10004000
3,,,AA011,ACAENA,fissistipula,,,,,New Zealand Burr,...,10,July,August,Z5,Z8,5.0,,"fine, blue-green leaves, white flowers with red anthers",Australia:prohibited,10004500
4,,,AA003,ACAENA,inermis,,,,,"New Zealand Burr, Sheep's Burr",...,10,July,August,Z5,Z8,8.0,,"large, unprickled blossom heads, green foliage",Australia:prohibited,10005000


In [116]:
# Build the candidate Jelitto product-image url for every item number
# (the item number is lower-cased and used as the image file name).
image_url = jelitto['Item No.'].map(
    lambda item_no: f'https://www.jelitto.com/out/pictures/master/product/1/{item_no.lower()}.jpg'
)


In [117]:
def response_code_error(url: str, timeout: float = 10.0) -> float:
    """Confirm that Jelitto has an image of the plant.

    Opens ``url`` and reports the status code of the response.

    Parameters
    ----------
    url: str
        URL to check
    timeout: float, optional
        Seconds to wait for the server before giving up (default 10), so a
        stalled connection cannot block a worker thread indefinitely.

    Returns
    -------
        int|np.nan
        The status code of the response, or ``np.nan`` when the URL could not
        be opened (HTTP error, unknown host/scheme, timeout, connection reset).
    """
    print(f"Checking url: {url}")
    try:
        # The context manager closes the connection once the code is read.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            code = response.getcode()
    except OSError as err:
        # URLError/HTTPError are OSError subclasses; catching OSError also
        # covers socket timeouts and resets raised while reading the response.
        print(f"No Jelitto image {err}")
        code = np.nan
    return code

In [118]:
# Save checked urls to a dictionary keyed by the lower-case item number.
confident_urls = defaultdict(str)

# The item number is the file stem that follows ".../1/" in the image url.
item_no_pattern = re.compile(r'\/1\/(.*)\.')

# Threaded Jelitto image url checking.
with concurrent.futures.ThreadPoolExecutor(max_workers=25) as executor:
    # Map every future back to the url it is checking.
    check_url = {executor.submit(response_code_error, url): url for url in image_url}
    for future in concurrent.futures.as_completed(check_url):
        url = check_url[future]
        item_no = item_no_pattern.search(url).group(1)
        try:
            status = future.result()
        except Exception as exc:
            print(f"{url} generated an exception: {exc}")
            # Still record the item (as "no image") so that every checked url
            # has an entry and the result stays aligned with the price list.
            status = np.nan
        confident_urls[item_no] = url if status == 200 else np.nan


Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa008.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa001.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa002.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa011.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa003.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa019.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa005.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa021.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa007.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa009.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa012.jpg
Checking url: https://www.jelitto.com/out/pictures/master/product/1/aa014.jpg
Checking url: https://www.jelitto.com/out/pictures/master/produc

In [119]:
# Turn the checked urls into a one-column frame ('url') whose index is the
# lower-case item number, ordered by that index.
df = pd.DataFrame.from_dict(confident_urls, orient='index', columns=['url']).sort_index()

In [120]:
# Set the index of the jelitto df to Item No & sort.
# Rebinding instead of inplace=True avoids mutating a frame that may be a slice
# of the full price list, and the guard keeps the cell safe to re-run (a second
# set_index would raise KeyError because the column is already the index).
if jelitto.index.name != 'Item No.':
    jelitto = jelitto.set_index('Item No.')
jelitto = jelitto.sort_index()


In [121]:
# Add image_url column to jelitto.
# df is indexed by the lower-cased item number, so look every row up by that
# key instead of assuming both frames have the same length and sort order.
# Items without a checked url get NaN.
jelitto['image_url'] = df['url'].reindex(jelitto.index.str.lower()).values

In [122]:
# Rows where Jelitto had no image need a fallback lookup on Wikimedia Commons.
_url = 'https://commons.wikimedia.org/w/index.php?search='
updates = jelitto[jelitto['image_url'].isnull()]

# Primary lookup: (item no, search url) for the scientific name "genus species".
# Missing name parts are skipped rather than crashing the join, and the query
# is url-encoded so spaces and special characters survive the request.
# The spreadsheet's species column is named 'Species ' (with a trailing space).
scientific_names = [
    ' '.join(str(part) for part in names if pd.notnull(part)).lower()
    for names in updates[['Genus', 'Species ']].values
]
wikimedia_urls = [
    (ix, _url + urllib.parse.quote_plus(name))
    for ix, name in zip(updates.index, scientific_names)
]

# Alternative lookup, used when the scientific name has no results: search for
# the first common name (the text before the first comma). Rows without a
# common name (NaN) are left out.
alt_lookups = {
    ix: _url + urllib.parse.quote_plus(' '.join(common.split(',')[0].split()))
    for ix, common in updates['Common Names'].items()
    if isinstance(common, str)
}


In [123]:
def alt_url(url: tuple, timeout: float = 10.0) -> tuple:
    """Test tuple of _id and url to find an image on wikimedia commons.

    Fetches the search page and inspects the first search-result link or
    gallery image found on it.

    Parameters
    ----------
    url: tuple
        A tuple of _id and a url
    timeout: float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    alt_url: tuple
        (_id, new_url); new_url is None when the request fails, the page has
        no results, or the first result does not end in one of the accepted
        file extensions.
    """
    _id, search_url = url[0], url[1]
    try:
        page = requests.get(search_url, timeout=timeout)
        page.raise_for_status()
    except requests.RequestException as err:
        # Best-effort lookup: a failed request means "no alternative image".
        print(f"No Wikimedia result for {search_url}: {err}")
        return (_id, None)

    tree = fromstring(page.content)
    candidates = tree.xpath("//li[contains(@class,'mw-search-result')]//a/@href|//ul[contains(@class, 'gallery')]//img/@src")
    # A search without hits yields no nodes; indexing [0] would raise IndexError.
    if not candidates:
        return (_id, None)

    new_url = candidates[0]
    if new_url.lower().endswith(('.png', '.jpg', '.jpeg', '.pdf')):
        # urljoin prefixes site-relative hrefs with the Commons host and leaves
        # absolute or protocol-relative image sources pointing at their own host.
        return (_id, urllib.parse.urljoin('https://commons.wikimedia.org', new_url))
    return (_id, None)



In [124]:
# (item no, image url) pairs found on Wikimedia Commons
ALT_URLS = []
# Checking wikimedia urls
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its URL
    check_url = {executor.submit(alt_url, url): url for url in wikimedia_urls}
    for future in concurrent.futures.as_completed(check_url):
        try:
            url_tup = future.result()
        except Exception as exc:
            # One failing lookup must not abort the remaining ones.
            print(f"{check_url[future]} generated an exception: {exc}")
            continue
        # Keep only lookups that actually produced an image url.
        if url_tup[1]:
            ALT_URLS.append(url_tup)

In [125]:
# Checking wikimedia common names from alt_lookups.
# Only items whose scientific-name search found nothing are looked up, so an
# item never ends up with two entries in ALT_URLS (also makes re-runs harmless).
found_ids = {url_tup[0] for url_tup in ALT_URLS}
lookups = [(k, v) for k, v in alt_lookups.items() if k not in found_ids]
if lookups:
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        check_url = {executor.submit(alt_url, url): url for url in lookups}
        for future in concurrent.futures.as_completed(check_url):
            try:
                url_tup = future.result()
            except Exception as exc:
                # One failing lookup must not abort the remaining ones.
                print(f"{check_url[future]} generated an exception: {exc}")
                continue
            # Keep only lookups that actually produced an image url.
            if url_tup[1]:
                ALT_URLS.append(url_tup)

In [126]:
# Cast alternate_urls to df with index aligned to Jelitto.
# Naming the columns up front keeps this working when ALT_URLS is empty
# (a frame built from an empty list has no column 0 to rename, so the
# set_index('_id') would raise KeyError).
altdf = pd.DataFrame(ALT_URLS, columns=['_id', 'image_url']).set_index('_id')
# Drop lookups without a url and keep the first hit per item so that the index
# is unique when it is aligned with the jelitto frame.
altdf = altdf.dropna(subset=['image_url'])
altdf = altdf[~altdf.index.duplicated(keep='first')]


In [127]:
# Replace null values in image_url with urls from alt_urls.
# Only the image_url column is filled (aligned on the item-number index);
# combine_first would operate on the union of rows and columns of both frames.
fallback_urls = altdf['image_url'].dropna()
# A unique index is required for the label-aligned fill; keep the first hit.
fallback_urls = fallback_urls[~fallback_urls.index.duplicated(keep='first')]
jelitto = jelitto.assign(image_url=jelitto['image_url'].fillna(fallback_urls))

In [128]:
# Inspect the scientific name and the resolved image url of every row
jelitto.loc[:, ['Genus', 'Species ', 'image_url']]

Unnamed: 0_level_0,Genus,Species,image_url
Item No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA001,ACAENA,buchananii,https://www.jelitto.com/out/pictures/master/product/1/aa001.jpg
AA002,ACAENA,caesiiglauca,https://www.jelitto.com/out/pictures/master/product/1/aa002.jpg
AA003,ACAENA,inermis,https://www.jelitto.com/out/pictures/master/product/1/aa003.jpg
AA005,ACAENA,microphylla,https://www.jelitto.com/out/pictures/master/product/1/aa005.jpg
AA007,ACAENA,novae-zelandiae,https://www.jelitto.com/out/pictures/master/product/1/aa007.jpg
AA008,ABUTILON,vitifolium,https://www.jelitto.com/out/pictures/master/product/1/aa008.jpg
AA009,ACAENA,saccaticupula,https://www.jelitto.com/out/pictures/master/product/1/aa009.jpg
AA011,ACAENA,fissistipula,https://www.jelitto.com/out/pictures/master/product/1/aa011.jpg
AA012,ACANTHUS,hungaricus,https://www.jelitto.com/out/pictures/master/product/1/aa012.jpg
AA014,ACANTHUS,mollis,https://www.jelitto.com/out/pictures/master/product/1/aa014.jpg


In [129]:
# Write the price list, including the image_url column, to a csv file
jelitto.to_csv(path_or_buf='image_urls.csv')