# Web scraping mushroom pictures with Python

### Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import time
import random
import pickle
import shutil
import os

### Data set up

In [2]:
# Edibility symbols, grouped by category
edible = ['🟢!', '🟢']
non_edible = ['🟡🟢', '🟡']
deadly = ['🔴', '🔴☠']

# Every recognised symbol, in category order (edible, non-edible, deadly)
elements = edible + non_edible + deadly

# Edibility dict: category name -> its symbols (independent copies of the lists above)
edibility = {
    'edible': list(edible),
    'non_edible': list(non_edible),
    'deadly': list(deadly),
}

Create 2 lists:
* mushroom names (in format 'Amanita_echinocephala')
* mushroom webpages (in format 'https://grzyby.pl/pelna/gatunki/Amanita_echinocephala.htm')

In [3]:
os.chdir('C:\\Users\\User\\OneDrive\\Edu\\Deep Learning')

# Credentials to use full page functionality.
# config.properties holds one key=value pair per line; lines starting with '#'
# are skipped. Blank lines are ignored and only the first '=' separates key from
# value, so a value may itself contain '='.
with open('config.properties') as config_file:
    credentials = dict(
        line.rstrip().split('=', 1)
        for line in config_file
        if line.strip() and not line.startswith("#")
    )
URL_init = "https://www.grzyby.pl/logowanie"

# Log in using credential
form_data = credentials
s = requests.Session()
server = s.post(URL_init, data = form_data)

# Get full list of species
URL = 'https://www.grzyby.pl/pelna/gatunki/'
page = s.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

# Both lists are filled in lockstep: mushroom_names[i] belongs to mushroom_webs[i].
mushroom_names = []
mushroom_webs = []
for item in soup.find_all('td'):
    cell_html = str(item)
    # Keep table cells that contain a link and whose markup has '_' but no '-'.
    if item.find('a', href = True) and '_' in cell_html and '-' not in cell_html:
        name = item.find('a').text
        name_clean = name.replace('.htm', '')
        # Only two-part names (e.g. 'Amanita_echinocephala') are kept.
        if len(name_clean.split('_')) == 2:
            mushroom_webs.append('https://grzyby.pl/pelna/gatunki/'+name)
            mushroom_names.append(name_clean)

print(f'*********************************')
print(f'Length of mushroom_names: {len(mushroom_names)}.\nLength of mushroom_webs: {len(mushroom_webs)}.')
print(f'*********************************')

*********************************
Length of mushroom_names: 4909.
Length of mushroom_webs: 4909.
*********************************


### Functions

##### F: Dictionary set up

In [4]:
# name - name of dictionary
# l_names - list with mushroom names
# l_webs - list with mushroom websites

def mushroom_dict_init(l_names:list, l_webs:list, name:str=''):
    """Build the mushroom dictionary from parallel lists of names and webpages.

    Every name becomes a key mapped to {'edible': None, 'web': <page>, 'img': []},
    where <page> is the element of l_webs at the same position. An extra
    'dict_name' key stores 'mushroom_dict' or, when name is given,
    'mushroom_dict_<name>'.

    Returns the dictionary. On any error a message is printed and None is returned.
    """
    try:
        mushroom_dict = {}
        for idx in range(len(l_names)):
            species = l_names[idx]
            mushroom_dict[species] = {'edible': None, 'web': l_webs[idx], 'img' : []}

        # Label stored next to the species entries
        label = 'mushroom_dict' if name == '' else f'mushroom_dict_{name}'
        mushroom_dict['dict_name'] = label

        # The reported size includes the 'dict_name' entry itself
        print(f'Created {mushroom_dict["dict_name"]} with {len(mushroom_dict)} elements.')
        return mushroom_dict

    except Exception as e:
        print(f'ERROR - not possible to create dictionary: {e}.')

##### F: Short dictionary set up
Short dictionary used for further tests - downloads only the selected genera

In [5]:
# l_names - list with mushroom names
# l_webs - list with mushroom websites
# l_mushs - list with genera (name prefixes) expected to be included in final dictionary
# name - name of dictionary

def mushroom_dict_short_init(l_names:list, l_webs:list, l_mushs:list, name:str=''):
    """Build a mushroom dictionary restricted to the selected name prefixes.

    Parameters:
        l_names - list with mushroom names (e.g. 'Amanita_echinocephala')
        l_webs  - list with mushroom webpages; l_webs[i] must belong to l_names[i]
        l_mushs - list with name prefixes (e.g. genera) to keep; matching is
                  case-insensitive and done on the start of the mushroom name
        name    - optional suffix for the dictionary name

    Every kept name is mapped to {'edible': None, 'web': <page>, 'img': []}.
    An extra 'dict_name' key stores 'mushroom_dict_short' or
    'mushroom_dict_short_<name>'.

    Returns the dictionary. On any error (including lists of different length)
    a message is printed and None is returned.
    """
    try:
        # Names and webpages are paired by position, so the lists must line up.
        if len(l_names) != len(l_webs):
            raise ValueError(
                f'l_names has {len(l_names)} elements but l_webs has {len(l_webs)}'
            )

        # str.startswith accepts a tuple, so each name is tested against all prefixes at once.
        prefixes = tuple(el.lower() for el in l_mushs)

        # Filter (name, webpage) pairs together so a name can never end up with
        # another mushroom's webpage, whatever the URL looks like.
        mushroom_dict_short = {
            mushroom_name: {'edible': None, 'web': mushroom_web, 'img' : []}
            for mushroom_name, mushroom_web in zip(l_names, l_webs)
            if mushroom_name.lower().startswith(prefixes)
        }

        if name != '':
            mushroom_dict_short['dict_name'] = f'mushroom_dict_short_{name}'
        else:
            mushroom_dict_short['dict_name'] = 'mushroom_dict_short'

        # The reported size includes the 'dict_name' entry itself
        print(f'Created {mushroom_dict_short["dict_name"]} with {len(mushroom_dict_short)} elements.')
        return mushroom_dict_short

    except Exception as e:
        print(f'ERROR - not possible to create dictionary: {e}.')

##### F: Dictionary clean up (remove mushrooms w/o edibility)

In [6]:
# t - 'long' or 'short' sleep times between requests
# dict1 - dictionary to clean (entries are removed / updated in place)
# dict2 - copy of dict1, used only for iteration

def dict_edibility_cleanup(dict1:dict, dict2:dict, t = 'long'):
    """Look up the edibility of every mushroom and drop those without a known symbol.

    For each entry of dict2 (except 'dict_name') the page stored under 'web' is
    downloaded and the first whitespace-separated token of the first link inside
    the element with id 'tytul-blok' is read as the edibility symbol.
    If the symbol is one of the recognised ones it is stored in
    dict1[key]['edible']; otherwise the key is deleted from dict1.
    When everything is processed, dict1 is pickled to '<dict_name>.pkl' and a
    summary is printed.

    Parameters:
        dict1 - dictionary to clean; modified in place
        dict2 - dictionary that is iterated over (normally a copy of dict1)
        t     - 'long' waits 10 s between connection retries and 7-17 s between
                pages; any other value waits 5 s and 3-7 s

    Returns None. Errors on a single page are printed and the entry is left
    untouched; any other error is printed and stops the run without saving.

    NOTE(review): pages are fetched with a plain requests.get, not with a
    logged-in session - confirm the edibility block is visible without login.
    """
    elements = ['🟢!','🟢','🟡🟢','🟡','🔴','🔴☠']

    # Edibility counters
    edible = ['🟢!','🟢']
    edible_cnt = 0
    non_edible = ['🟡🟢','🟡']
    non_edible_cnt = 0
    deadly = ['🔴','🔴☠']
    deadly_cnt = 0

    dictionary_name = dict1['dict_name']

    try:
        # Iterate over a snapshot so deleting from dict1 is safe even when the
        # caller passes the same object as dict1 and dict2.
        for position, (key, value) in enumerate(list(dict2.items()), start=1):
            if key == 'dict_name':
                continue

            URL = value['web']

            # Keep retrying while the connection fails. Only connection problems
            # are retried, so a kernel interrupt or a malformed URL is not
            # swallowed by an endless loop.
            while True:
                try:
                    page = requests.get(URL)
                    break
                except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                    x = 10 if t == 'long' else 5

                    print('Connection refused by the server.')
                    print(f'{x} sec break.')
                    time.sleep(x)
                    print('Continue...')

            soup = BeautifulSoup(page.content, "html.parser")

            try:
                if t == 'long':
                    y = random.randint(7,17)
                else:
                    y = random.randint(3,7)

                print(f'Starting with {key} ({position}/{len(dict2)}).')
                edibility = soup.find('div', {'id' : 'tytul-blok'}).find('a').get_text().split()[0]

                if edibility not in elements:
                    del dict1[key]
                    print(f'>>>>>{key} removed. Mushrooms in clean dictionary: {len(dict1)}/{len(dict2)}. Break for {y} sec.')

                else:
                    dict1[key]['edible'] = edibility

                    if edibility in edible:
                        edible_cnt += 1
                    if edibility in non_edible:
                        non_edible_cnt += 1
                    if edibility in deadly:
                        deadly_cnt += 1

                    print(f'>>>>>Edibility {edibility} added for: {key}. Break for {y} sec.')

                time.sleep(y)

            except Exception as e1:
                # E.g. the page has no 'tytul-blok' element. The entry stays in
                # dict1 with whatever 'edible' value it already had.
                if t == 'long':
                    z = random.randint(5,17)
                else:
                    z = random.randint(3,7)

                print(f'Error:{str(e1)}. Break for {z} sec.')
                time.sleep(z)

        with open(f'{dictionary_name}.pkl', 'wb') as fp:
            pickle.dump(dict1, fp)

        print('*************************')
        print(f'Clean up done! Dictionary saved as {dictionary_name}.')
        print(f'Mushrooms in clean dictionary: {len(dict1)}')
        print(f'Edible mushrooms: {edible_cnt}')
        print(f'Non-edible mushrooms: {non_edible_cnt}')
        print(f'Deadly mushrooms: {deadly_cnt}')
        print('*************************')

    except Exception as e2:
        print(f'Error:{str(e2)}.')
##### F: Find pictures

In [21]:
def photo_search(dict1:dict, t = 'short'):
    """Collect picture URLs for every mushroom in dict1.

    Logs in to grzyby.pl with the key=value pairs read from 'config.properties',
    downloads each page stored under 'web' and appends to the entry's 'img' list
    the URL of every <img> whose alt text contains the Latin name (the first two
    words of the page title) and whose src contains neither 'mapa' nor 'icon'.
    Finally dict1 is pickled to '<dict_name>_photos.pkl'.

    Parameters:
        dict1 - mushroom dictionary; 'img' lists are extended in place, so
                running the search twice on the same dictionary duplicates URLs
        t     - 'short' waits 7 s between connection retries and 1-7 s between
                pages; any other value waits 17 s and 7-17 s

    Returns None. Errors are printed: level 3 for a single image, level 2 for a
    single page, level 1 for anything that stops the whole run (nothing is
    saved in that case).
    """
    photos_cnt = 0
    dictionary_name = f'{dict1["dict_name"]}_photos'

    # Credentials: one key=value pair per line, '#' lines and blank lines are
    # skipped, and only the first '=' splits key from value.
    with open('config.properties') as config_file:
        credentials = dict(
            line.rstrip().split('=', 1)
            for line in config_file
            if line.strip() and not line.startswith("#")
        )
    URL_init = "https://www.grzyby.pl/logowanie"

    # Log in using credential
    s = requests.Session()
    s.post(URL_init, data = credentials)

    # Pause between pages; the same range is used after a page-level error.
    if t == 'short':
        pause_range = (1, 7)
    else:
        pause_range = (7, 17)

    try:
        total = len(dict1)
        for position, (key, value) in enumerate(dict1.items(), start=1):
            if key == 'dict_name':
                continue

            URL = value['web']

            # Keep retrying while the connection fails. Only connection problems
            # are retried, so a kernel interrupt or a malformed URL is not
            # swallowed by an endless loop.
            while True:
                try:
                    page = s.get(URL)
                    break
                except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                    x = 7 if t == 'short' else 17

                    print('Connection refused by the server.')
                    print(f'{x} sec break.')
                    time.sleep(x)
                    print('Continue...')

            try:
                soup = BeautifulSoup(page.content, "html.parser")
                # Parsed inside the try block so that a page without <title>
                # only skips this mushroom instead of aborting the whole run.
                latin_name = ' '.join(soup.find('title').text.split(' ')[0:2])

                y = random.randint(*pause_range)

                print(f'Starting with {key} ({position}/{total}).')

                for item in soup.find_all('img'):

                    try:
                        # Only images captioned with this species' Latin name
                        if 'alt' not in item.attrs or latin_name not in str(item['alt']):
                            continue

                        src = str(item['src'])
                        if 'mapa' in src or 'icon' in src:
                            continue

                        # Sources lacking the 'pelna' part get it prepended
                        if 'pelna' in src:
                            value['img'].append(('https://www.grzyby.pl'+item['src']))
                        else:
                            value['img'].append(('https://www.grzyby.pl/pelna'+item['src']))

                        v = random.randint(3,11)
                        print(f'>>>>>>>>>>Photo found! Break for {v} sec.')
                        time.sleep(v)

                    except Exception as e3:
                        print(f"Error on level 3 for {item}: {e3}.")

                photos_cnt += len(value['img'])
                print(f">>>>>{len(value['img'])} photos found. Break for {y} sec.")
                time.sleep(y)

            except Exception as e2:
                z = random.randint(*pause_range)

                print(f'Error on level 2: {e2}. Break for {z} sec.')
                time.sleep(z)

        with open(f'{dictionary_name}.pkl', 'wb') as fp:
            pickle.dump(dict1, fp)

        print('*************************')
        print(f'{photos_cnt} photos found!')
        print(f'Dictionary saved as {dictionary_name}.pkl.')
        print('*************************')

    except Exception as e1:
        print(f'Error on level 1:{e1}.')

##### F: Download pictures per species/genus

In [8]:
#1st version - creates folder based on genus (first part of the mushroom name)
def photo_download_specie(dict1:dict,t = 'long'):
    """Download every picture listed in dict1 into one folder per genus.

    Logs in to grzyby.pl with the key=value pairs read from 'config.properties'.
    For every mushroom the URLs in its 'img' list are downloaded to
    '<cwd>/<genus>/<key>_NNN.jpg', where <genus> is the lower-cased part of the
    key before the first '_' and NNN is the 1-based position of the URL in the
    list. Folders are created on demand in the current working directory.

    Parameters:
        dict1 - mushroom dictionary with 'img' lists
        t     - 'long' waits 17 s between connection retries and 7-17 s between
                mushrooms; any other value waits 7 s and 1-5 s

    Returns None; a summary with the number of downloaded pictures and created
    folders is printed at the end.
    """
    total_photo_cnt = 0
    folders_cnt = 0

    # Credentials: one key=value pair per line, '#' lines and blank lines are
    # skipped, and only the first '=' splits key from value.
    with open('config.properties') as config_file:
        credentials = dict(
            line.rstrip().split('=', 1)
            for line in config_file
            if line.strip() and not line.startswith("#")
        )
    URL_init = "https://www.grzyby.pl/logowanie"

    # Log in using credential
    s = requests.Session()
    s.post(URL_init, data = credentials)

    total = len(dict1)
    for position, (key1, value1) in enumerate(dict1.items(), start=1):
        if key1 == 'dict_name':
            continue

        photo_cnt = 0
        print(f'Starting with {key1} ({position}/{total}).')

        if t == 'long':
            y = random.randint(7,17)
        else:
            y = random.randint(1,5)

        if 'img' not in value1:
            continue

        # Folder named after the first part of the key (e.g. 'agaricus')
        folder_name = str(key1).split("_")[0].lower()
        path = os.path.join(os.getcwd(), folder_name)

        # Numbering by position keeps file names unique even when the same URL
        # is listed more than once.
        for number, web in enumerate(value1['img'], start=1):

            # Created lazily so mushrooms without pictures leave no empty folder
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)
                folders_cnt += 1
                print(f'>>>>>Created folder {folder_name}.')

            # File name in format Genus_species_000.jpg
            filename = os.path.join(path, f'{key1}_{number:03d}.jpg')

            # Keep retrying while the connection fails. Only connection problems
            # are retried, so a kernel interrupt or a malformed URL is not
            # swallowed by an endless loop.
            while True:
                try:
                    page = s.get(web, stream = True)
                    break

                except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                    x = 17 if t == 'long' else 7

                    print('Connection refused by the server.')
                    print(f'{x} sec break.')
                    time.sleep(x)
                    print('Continue...')

            try:
                if page.status_code == 200:
                    # Let urllib3 undo any content encoding while streaming
                    page.raw.decode_content = True

                    with open(filename,'wb') as f:
                        shutil.copyfileobj(page.raw, f)
                    photo_cnt += 1

                else:
                    print('Image couldn\'t be retreived.')
            finally:
                # Streamed responses hold their connection until closed
                page.close()

        if photo_cnt == 0:
            print(f'>>>>>No photos found. Break {y} sec.')
            time.sleep(y)
        else:
            print(f'>>>>>Downloaded {photo_cnt} pictures to {path}. Break {y} sec.')
            total_photo_cnt += photo_cnt
            time.sleep(y)

    print('*************************')
    print(f'Job done!')
    print(f'Photos downloaded: {total_photo_cnt}.')
    print(f'Folders created: {folders_cnt}.')
    print('*************************')

##### F: Download pictures per edibility label

In [10]:
#2nd version - creates folder based on edibility
def photo_download_edibility(dict1:dict,t = 'long'):
    """Download every picture listed in dict1 into one folder per edibility class.

    Logs in to grzyby.pl with the key=value pairs read from 'config.properties'.
    The 'edible' symbol of each mushroom selects the folder ('edible',
    'non_edible' or 'deadly'), created on demand in the current working
    directory. The URLs in the 'img' list are downloaded to
    '<cwd>/<folder>/<key>_NNN.jpg', NNN being the 1-based position of the URL.
    Mushrooms whose 'edible' value matches no class are skipped, so their
    pictures can never land in the folder of a previously processed mushroom.

    Parameters:
        dict1 - mushroom dictionary with 'edible' symbols and 'img' lists
        t     - 'long' waits 17 s between connection retries and 7-17 s between
                mushrooms; any other value waits 5 s and 1-5 s

    Returns None; a summary with the number of downloaded pictures and created
    folders is printed at the end.
    """
    edibility = {'edible':['🟢!','🟢'],'non_edible':['🟡🟢','🟡'],'deadly':['🔴','🔴☠']}
    total_photo_cnt = 0
    folders_cnt = 0

    # Credentials: one key=value pair per line, '#' lines and blank lines are
    # skipped, and only the first '=' splits key from value.
    with open('config.properties') as config_file:
        credentials = dict(
            line.rstrip().split('=', 1)
            for line in config_file
            if line.strip() and not line.startswith("#")
        )
    URL_init = "https://www.grzyby.pl/logowanie"

    # Log in using credential
    s = requests.Session()
    s.post(URL_init, data = credentials)

    total = len(dict1)
    for position, (key1, value1) in enumerate(dict1.items(), start=1):
        if key1 == 'dict_name':
            continue

        photo_cnt = 0
        print(f'Starting with {key1} ({position}/{total}).')

        if t == 'long':
            y = random.randint(7,17)
        else:
            y = random.randint(1,5)

        # Resolve the folder for this mushroom only; it is looked up directly
        # instead of relying on 'edible' being stored before 'img'.
        symbol = value1.get('edible')
        folder_name = None
        for category, symbols in edibility.items():
            if symbol in symbols:
                folder_name = category
                break

        if folder_name is None:
            print(f'>>>>>No edibility class for {key1} - skipped.')
            continue

        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
            print(f'>>>>>Created folder {folder_name}.')
            folders_cnt += 1

        if 'img' not in value1:
            continue

        path = os.path.join(os.getcwd(), folder_name)

        # Numbering by position keeps file names unique even when the same URL
        # is listed more than once.
        for number, web in enumerate(value1['img'], start=1):

            filename = os.path.join(path, f'{key1}_{number:03d}.jpg')

            # Keep retrying while the connection fails. Only connection problems
            # are retried, so a kernel interrupt or a malformed URL is not
            # swallowed by an endless loop.
            while True:
                try:
                    page = s.get(web, stream = True)
                    break

                except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
                    x = 17 if t == 'long' else 5

                    print('Connection refused by the server.')
                    print(f'{x} sec break.')
                    time.sleep(x)
                    print('Continue...')

            try:
                if page.status_code == 200:
                    # Let urllib3 undo any content encoding while streaming
                    page.raw.decode_content = True

                    with open(filename,'wb') as f:
                        shutil.copyfileobj(page.raw, f)
                    photo_cnt += 1

                else:
                    print('Image couldn\'t be retreived.')
            finally:
                # Streamed responses hold their connection until closed
                page.close()

        total_photo_cnt += photo_cnt

        if photo_cnt == 0:
            print(f'>>>>>No photos found. Break {y} sec.')
            time.sleep(y)
        else:
            print(f'>>>>>Downloaded {photo_cnt} pictures to {path}. Break {y} sec.')
            time.sleep(y)

    print('*************************')
    print(f'Job done!')
    print(f'Total pictures downloaded: {total_photo_cnt}.')
    print(f'Folders created: {folders_cnt}.')
    print('*************************')

##### F: Dictionary split
With dictionaries that are too big, the download functions may stop, so a large dictionary can be split into smaller ones first.

In [11]:
def dict_split(dict1:dict, n:int):
    """Split a mushroom dictionary into n smaller dictionaries.

    The keys of dict1 (without 'dict_name') are cut, in order, into n
    consecutive groups of len(keys) // n keys; the last group additionally
    takes the remainder, so no key is lost. Each sub-dictionary gets its own
    'dict_name' of the form '<dict_name>_<i>' with i starting at 1.
    If n is larger than the number of keys, the leading sub-dictionaries
    contain only their 'dict_name'.

    Parameters:
        dict1 - dictionary to split; must contain a 'dict_name' key
        n     - number of sub-dictionaries, at least 1

    Returns a list with n dictionaries. The values are the same objects as in
    dict1 (they are not copied).

    Raises:
        ValueError - if n is smaller than 1
        KeyError   - if dict1 has no 'dict_name' key
    """
    if n < 1:
        raise ValueError('n must be a positive integer.')

    dict_name = dict1['dict_name']

    keys = [k for k in dict1 if k != 'dict_name']

    num_items_per_dict = len(keys) // n

    # Create the sub-dictionaries.
    sub_dicts = []
    for i in range(n):
        start = i * num_items_per_dict
        # The last sub-dictionary also receives the remaining keys
        stop = len(keys) if i == n - 1 else start + num_items_per_dict

        sub_dict = {k: dict1[k] for k in keys[start:stop]}
        sub_dict['dict_name'] = f"{dict_name}_{i+1}"
        sub_dicts.append(sub_dict)
        print(f'{sub_dict["dict_name"]} created.')

    return sub_dicts

### Run functions

In [12]:
# Create list with 6 common genera (lower-case prefixes matched against mushroom names)
mush_short = ['agaricus','amanita','boletus','leccinum','hygrocybe','russula']

In [22]:
# Build the short dictionary holding only the mushrooms whose names start with a genus from mush_short
mushroom_dict_short = mushroom_dict_short_init(mushroom_names,mushroom_webs,mush_short)

Created mushroom_dict_short with 321 elements.


In [23]:
# Display the dictionary to inspect its entries
mushroom_dict_short

{'Agaricus_altipes': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_altipes.htm',
  'img': []},
 'Agaricus_arvensis': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_arvensis.htm',
  'img': []},
 'Agaricus_augustus': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_augustus.htm',
  'img': []},
 'Agaricus_benesii': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_benesii.htm',
  'img': []},
 'Agaricus_bernardii': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_bernardii.htm',
  'img': []},
 'Agaricus_bisporus': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_bisporus.htm',
  'img': []},
 'Agaricus_bitorquis': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_bitorquis.htm',
  'img': []},
 'Agaricus_bohusii': {'edible': None,
  'web': 'https://grzyby.pl/pelna/gatunki/Agaricus_bohusii.htm',
  'img': []},
 'Agaricus_bresadolanus': {'edible': None,
  'web'

Since we split mushrooms per genus, we don't need the edibility clean-up - the dict_edibility_cleanup() function is skipped.

In [24]:
# Collect picture URLs for every mushroom; 'long' selects the longer pauses between requests
photo_search(mushroom_dict_short,'long')

Starting with Agaricus_altipes (1/321).
>>>>>0 photos found. Break for 10 sec.
Starting with Agaricus_arvensis (2/321).
>>>>>>>>>>Photo found! Break for 4 sec.
>>>>>>>>>>Photo found! Break for 5 sec.
>>>>>>>>>>Photo found! Break for 8 sec.
>>>>>>>>>>Photo found! Break for 5 sec.
>>>>>4 photos found. Break for 14 sec.
Starting with Agaricus_augustus (3/321).
>>>>>>>>>>Photo found! Break for 11 sec.
>>>>>>>>>>Photo found! Break for 6 sec.
>>>>>>>>>>Photo found! Break for 5 sec.
>>>>>>>>>>Photo found! Break for 5 sec.
>>>>>4 photos found. Break for 9 sec.
Starting with Agaricus_benesii (4/321).
>>>>>0 photos found. Break for 11 sec.
Starting with Agaricus_bernardii (5/321).
>>>>>>>>>>Photo found! Break for 5 sec.
>>>>>>>>>>Photo found! Break for 11 sec.
>>>>>2 photos found. Break for 9 sec.
Starting with Agaricus_bisporus (6/321).
>>>>>>>>>>Photo found! Break for 11 sec.
>>>>>>>>>>Photo found! Break for 7 sec.
>>>>>>>>>>Photo found! Break for 10 sec.
>>>>>>>>>>Photo found! Break for 5 sec

In [26]:
# Switch to the target folder: the download functions save pictures below the current working directory
os.chdir('C:\\Users\\User\\OneDrive\\Edu\\Deep Learning\\deep_mushroom')

In [28]:
# Download the collected pictures into one folder per genus; 'long' selects the longer pauses
photo_download_specie(mushroom_dict_short,'long')

Starting with Agaricus_altipes (1/321).
>>>>>No photos found. Break 16 sec.
Starting with Agaricus_arvensis (2/321).
>>>>>Created folder agaricus.
>>>>>Downloaded 4 pictures to C:\Users\User\OneDrive\Edu\Deep Learning\deep_mushroom\agaricus. Break 12 sec.
Starting with Agaricus_augustus (3/321).
>>>>>Downloaded 4 pictures to C:\Users\User\OneDrive\Edu\Deep Learning\deep_mushroom\agaricus. Break 14 sec.
Starting with Agaricus_benesii (4/321).
>>>>>No photos found. Break 9 sec.
Starting with Agaricus_bernardii (5/321).
>>>>>Downloaded 2 pictures to C:\Users\User\OneDrive\Edu\Deep Learning\deep_mushroom\agaricus. Break 7 sec.
Starting with Agaricus_bisporus (6/321).
>>>>>Downloaded 84 pictures to C:\Users\User\OneDrive\Edu\Deep Learning\deep_mushroom\agaricus. Break 15 sec.
Starting with Agaricus_bitorquis (7/321).
>>>>>Downloaded 93 pictures to C:\Users\User\OneDrive\Edu\Deep Learning\deep_mushroom\agaricus. Break 8 sec.
Starting with Agaricus_bohusii (8/321).
>>>>>No photos found. Break