# Importing Wild Food UK dataset

This dataset will be used as a separate validation dataset. It contains images of species not referenced in the main dataset

Please note that to use the Kaggle API, you need to have a Kaggle account and a Kaggle API token. 
The token is a JSON file that you can download from your Kaggle account settings page. 
Once downloaded, place it in the location ~/.kaggle/kaggle.json on your machine.

In [2]:
from bs4 import BeautifulSoup
import kaggle
import os
import pandas as pd
from PIL import Image
import shutil



In [5]:
project_path = '../..'
dataset_path = os.path.join(project_path, 'dataset', 'wildfooduk')

In [11]:
# Authenticate with your Kaggle account
kaggle.api.authenticate()

# Download the dataset
kaggle.api.dataset_download_files('daniilonishchenko/mushrooms-images-classification-215', path=dataset_path, unzip=True)





MaxRetryError: HTTPSConnectionPool(host='www.kaggle.com', port=443): Max retries exceeded with url: /api/v1/datasets/download/daniilonishchenko/mushrooms-images-classification-215?datasetVersionNumber=None (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fbd682e7df0>, 'Connection to www.kaggle.com timed out. (connect timeout=None)'))

## Identifying scientific name of complementary dataset

In [6]:
# only common names are indicated. 
# creating a dataframe common english name

# List of common names
reference_table = pd.read_csv(os.path.join(dataset_path, 'mushrooms.txt'), names=['common_name']  )
reference_table['scientific_name'] = ''
reference_table['edibility'] = ''
reference_table.head()


Unnamed: 0,common_name,scientific_name,edibility
0,almond_mushroom,,
1,amanita_gemmata,,
2,amethyst_chanterelle,,
3,amethyst_deceiver,,
4,aniseed_funnel,,


In [7]:
from bs4 import BeautifulSoup
import requests

# Make a GET request to the website
response = requests.get('https://www.wildfooduk.com/mushroom-guide/')

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all the <td> elements with class="spotlight-text"
td_elements = soup.find_all('td', class_='spotlight-text')

# Extract the text from each <td> element
scientific_names  = [td.text.strip() for td in td_elements]


In [8]:
# Find all the <td> elements with class="mushroom-image"
td_elements = soup.find_all('td', class_='mushroom-image')

# Find all the <img> elements within the <td> element
img_elements = [td.find('img') for td in td_elements] 

# Extract the text from each <td> element
common_names  = [img.get('alt') for img in img_elements]

In [9]:
# Find all the <td> elements with class="mushroom-icon"
td_elements = soup.find_all('td', class_='mushroom-icon')

# Find all the <img> elements within the <td> element
img_elements = [td.find('img') for td in td_elements] 

# Extract the text from each <td> element
edibility  = [img.get('alt') for img in img_elements]

In [10]:
reference_table = pd.DataFrame({'common_name': common_names, 'scientific_name': scientific_names, 'edibility' : edibility})
reference_table.head()

Unnamed: 0,common_name,scientific_name,edibility
0,Blushing Rosette,Abortiporus biennis,Inedible
1,Horse Mushroom,Agaricus arvensis,Edible
2,The Prince,Agaricus augustus,Edible
3,Pavement Mushroom,Agaricus bitorquis,Edible
4,Medusa Mushroom,Agaricus bohusii,Edible


In [None]:
reference_table['edibility'].value_counts()

In [11]:
reference_table['edibility'] = reference_table['edibility'].replace({'Edible': 1, 'Poisonous': 0, 'Inedible': 0})

In [12]:
reference_table['edibility'].value_counts()

edibility
1    125
0     97
Name: count, dtype: int64

In [13]:
reference_table['common_name'] = reference_table['common_name'].str.split('/').str[-1].str.lstrip().str.rstrip().str.replace('.',"").str.replace("'","").str.replace("-"," ")
reference_table['scientific_name'] = reference_table['scientific_name'].str.split('/').str[-1].str.lstrip().str.rstrip()

In [16]:
reference_table.to_csv('../../dataset/wildfooduk_mapping_table.csv', index=False, sep=';')

## Cross analysing base dataset and complementary dataset

In [17]:
df = pd.read_csv('../../dataset/observations_mushroom.csv')
df.head()

  df = pd.read_csv('../../dataset/observations_mushroom.csv')


Unnamed: 0,image_lien,image_id,observation,label,image_url,user,date,gbif_info/kingdom,gbif_info/family,gbif_info/speciesKey,...,gbif_info/phylumKey,gbif_info/class,gbif_info/synonym,gbif_info/scientificName,gbif_info/genus,gbif_info/order,thumbnail,location,gbif_info/note,gbif_info
0,1.jpg,1,1,Xylaria polymorpha,http://mushroomobserver.org/images/320/1,1,2006-05-21 07:17:05,Fungi,Xylariaceae,5255104.0,...,95.0,Sordariomycetes,False,"Xylaria polymorpha (Pers.) Grev., 1824",Xylaria,Xylariales,1,214.0,,
1,2.jpg,2,2,Xylaria magnoliae,http://mushroomobserver.org/images/320/2,1,2006-05-21 07:17:06,Fungi,Xylariaceae,3461845.0,...,95.0,Sordariomycetes,False,"Xylaria magnoliae J.D. Rogers, 1979",Xylaria,Xylariales,1,53.0,,
2,3.jpg,3,3,Xylaria hypoxylon,http://mushroomobserver.org/images/320/3,1,2006-05-21 07:17:08,Fungi,Xylariaceae,8631710.0,...,95.0,Sordariomycetes,False,"Xylaria hypoxylon (L.) Grev., 1824",Xylaria,Xylariales,1,60.0,,
3,4.jpg,4,4,Xylaria hypoxylon,http://mushroomobserver.org/images/320/4,1,2006-05-21 07:17:10,Fungi,Xylariaceae,8631710.0,...,95.0,Sordariomycetes,False,"Xylaria hypoxylon (L.) Grev., 1824",Xylaria,Xylariales,1,5.0,,
4,5.jpg,5,5,Xeromphalina,http://mushroomobserver.org/images/320/5,1,2006-05-21 07:17:12,Fungi,Mycenaceae,,...,34.0,Agaricomycetes,False,Xeromphalina Kühner & Maire,Xeromphalina,Agaricales,1,36.0,,


In [18]:

match_count = 0
print("species not present in the initial dataset")
print("-------------------------------------------")   

for element in reference_table['scientific_name'].unique():
    if element in df['label'].unique():
        match_count = match_count + 1
    else:
        print(element)

print("-------------------------------------------")        
print("matching species count: " + str(match_count))

species not present in the initial dataset
-------------------------------------------
Agaricus bohusii
Amanita citrina var. alba
Amanita citrina var. citrina
Hortiboletus bubalinus
Neoboletus praestigiator
Xerocomellus porosporus
Boletus reticulatus
Thaxterogaster purpurascens
Hapalopilus rutilans
Heboloma crustuliniforme
Gliophorus reginae
Porpolomopsis calyptriformis
Inosperma erubescens
Jackrogersella multiformis
Lactifluus vellereus
Apioperdon pyriforme
Mucidula mucida
Russula undulata
Scleroderma spp.
-------------------------------------------
matching species count: 203


## Adding identified species into the edible dataset

In [None]:
# edible mushrooms of the imported dataset will be filtered to be added in the edible dataset
# images will get a unique identifier not already used by the existing dataset starting from 1 000 000 for clarity
# only the scientific name has been deduced. The reste of the data frame needs also to be filled
# images are converted to jpeg for homogeneity

In [None]:
df.index.max

In [21]:
df_edible = pd.read_csv('../../dataset/order_classification/edible_mushrooms.csv')
df_edible.head()

Unnamed: 0,image_lien,image_id,family,rank,phylum,species,confidence,matchType,status,canonicalName,class,synonym,scientificName,genus,order
0,16.jpg,16,Pluteaceae,SPECIES,Basidiomycota,Volvopluteus gloiocephalus,98.0,EXACT,ACCEPTED,Volvopluteus gloiocephalus,Agaricomycetes,False,"Volvopluteus gloiocephalus (DC.) Vizzini, Cont...",Volvopluteus,Agaricales
1,54.jpg,54,Tricholomataceae,SPECIES,Basidiomycota,Tricholoma atrosquamosum,98.0,EXACT,ACCEPTED,Tricholoma atrosquamosum,Agaricomycetes,False,"Tricholoma atrosquamosum Sacc., 1887",Tricholoma,Agaricales
2,55.jpg,55,Tricholomataceae,SPECIES,Basidiomycota,Tricholoma atrosquamosum,98.0,EXACT,ACCEPTED,Tricholoma atrosquamosum,Agaricomycetes,False,"Tricholoma atrosquamosum Sacc., 1887",Tricholoma,Agaricales
3,95.jpg,95,Sparassidaceae,SPECIES,Basidiomycota,Sparassis crispa,98.0,EXACT,SYNONYM,Sparassis radicata,Agaricomycetes,True,"Sparassis radicata Weir, 1917",Sparassis,Polyporales
4,94.jpg,94,Sparassidaceae,SPECIES,Basidiomycota,Sparassis crispa,98.0,EXACT,SYNONYM,Sparassis radicata,Agaricomycetes,True,"Sparassis radicata Weir, 1917",Sparassis,Polyporales


In [22]:
df_classes = df_edible.drop(columns = ['image_lien', 'image_id'])
df_classes = df_classes[[ 'species','phylum', 'class', 'order', 'family','genus']].drop_duplicates()
df_classes.head()

Unnamed: 0,species,phylum,class,order,family,genus
0,Volvopluteus gloiocephalus,Basidiomycota,Agaricomycetes,Agaricales,Pluteaceae,Volvopluteus
1,Tricholoma atrosquamosum,Basidiomycota,Agaricomycetes,Agaricales,Tricholomataceae,Tricholoma
3,Sparassis crispa,Basidiomycota,Agaricomycetes,Polyporales,Sparassidaceae,Sparassis
5,Russula virescens,Basidiomycota,Agaricomycetes,Russulales,Russulaceae,Russula
6,Russula cyanoxantha,Basidiomycota,Agaricomycetes,Russulales,Russulaceae,Russula


In [23]:
# handling exceptions:
name_exceptions = {
"blackening polypore" : "giant polypore",
"cauliflower fungus" : "wood cauliflower",
"clouded agaric" : "clouded funnel"
}


In [24]:
# reviewing possibles matches : common name, synonyms, and scientific name used as a common name

def find_scientific_name(common_name, reference_table, name_exceptions):
    try:
        scientific_name = reference_table[reference_table['common_name'].str.lower() == common_name]['scientific_name'].iloc[0]
    except:
        try:
            scientific_name = reference_table[reference_table['scientific_name'].str.lower() == common_name]['scientific_name'].iloc[0]
        except:
            try:
                synonym = name_exceptions[common_name.lower()]
                scientific_name = reference_table[reference_table['common_name'].str.lower() == synonym]['scientific_name'].iloc[0]
            except:
                print( "not found : " + common_name)   
                scientific_name = ""
    return scientific_name

In [25]:
def scrape_wikipedia_page(mushroom_name):

    # Preparing classification
    classification = dict()
    classification["species"] = mushroom_name

    # Replace spaces in the mushroom name with underscores to match Wikipedia's URL format
    mushroom_name = mushroom_name.replace(' ', '_')

    # Make the HTTP request
    response = requests.get(f"https://en.wikipedia.org/wiki/{mushroom_name}")

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to get page: {response.status_code} " + " for " + mushroom_name)
        return

    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the title of the page
    title = soup.find(id="firstHeading")

    # Print the title
    #print(title.string)

    # Find the table with the scientific classification
    rows = soup.find_all('td')

    # Iterate over each row
    iterator = iter(rows)
    for row in iterator:
        #print(repr(row.text.replace(":","").rstrip()))
        if(row.text.replace(":","").rstrip() == "Division"):
            classification["phylum"] = next(iterator).text.strip()

        if(row.text.replace(":","").rstrip() == "Class"):
            classification["class"] = next(iterator).text.strip()
        
        if(row.text.replace(":","").rstrip() == "Order"):
            classification["order"] = next(iterator).text.strip()

        if(row.text.replace(":","").rstrip() == "Family"):
            classification["family"] = next(iterator).text.strip()

        if(row.text.replace(":","").rstrip() == "Genus"):
            classification["genus"] = next(iterator).text.strip()

    # Check for edibility
    edibility = soup.find('a', {'href': '/wiki/Edible_mushroom'})
    classification["edible"] = 1 if (edibility is not None) else 0

    return classification

In [26]:

# checking genus if species is not found in the main dataframeb
not_found_cnt = 0
species_cnt = 0
classification_list = []
wiki_match_cnt = 0
species_match = 0

for dir_name, subdir_list, file_list in os.walk(os.path.join(dataset_path, 'data', 'data')):
    species_cnt = species_cnt + 1  

    # getting common name from subdirectory name       
    common_name = dir_name.split(os.sep)[-1].replace('_', ' ')
    scientific_name = find_scientific_name(common_name, reference_table,name_exceptions)

    # getting information related to the species
    try:
        scientific_classification = df_classes[df_classes["species"].str.lower() == scientific_name.lower()].iloc[0]
        species_match = species_match + 1
        # no common name is stored at the moment, which is required to build the test dataset
        classification = scientific_classification.to_dict()
        classification['common_name'] = common_name
        classification_list.append(classification)       
    except:
        try:
            # in this case the  classification from wikipedia is used
            classification = scrape_wikipedia_page(scientific_name)
            
            # species is added only if edible
            if classification['edible'] == 1:
                classification['common_name'] = common_name
                classification_list.append(classification)

            wiki_match_cnt = wiki_match_cnt + 1
        except:
            print("not found : " + str(species_cnt) + " " + scientific_name)
            not_found_cnt = not_found_cnt + 1

print("species match: " + str(species_match) + " wiki match: " + str(wiki_match_cnt) + " not found: " + str(not_found_cnt) +  " total species: " + str(species_cnt))



not found : data
Failed to get page: 404  for Russula_undulata
not found : 7 Russula undulata
Failed to get page: 404  for Lycoperdon_excipuliforme
not found : 9 Lycoperdon excipuliforme
Failed to get page: 404  for Agaricus_crocodilinus
not found : 22 Agaricus crocodilinus
Failed to get page: 404  for Lycoperdon_utriforme
not found : 25 Lycoperdon utriforme
Failed to get page: 404  for Russula_parazurea
not found : 27 Russula parazurea
Failed to get page: 404  for Cuphophyllus_flavipes
not found : 41 Cuphophyllus flavipes
Failed to get page: 404  for Agaricus_bohusii
not found : 42 Agaricus bohusii
Failed to get page: 404  for Jackrogersella_multiformis
not found : 58 Jackrogersella multiformis
Failed to get page: 404  for Amanita_sect._Vaginatae
not found : 66 Amanita sect. Vaginatae
Failed to get page: 404  for Neoboletus_praestigiator
not found : 83 Neoboletus praestigiator
Failed to get page: 404  for Amanita_citrina_var._alba
not found : 100 Amanita citrina var. alba
Failed to ge

In [27]:
# Those which don't have a matching wikipedia page are completely removed
filtered_list = [d for d in classification_list if (d is not None) ]

# now we can update the classification dataframe we the newly identified species. 
df_classes_update = pd.DataFrame(filtered_list)
df_classes_update = df_classes_update.dropna(axis=0)
df_classes_update = df_classes_update.drop(columns=['edible'])
df_classes_update.tail(10)


Unnamed: 0,species,phylum,class,order,family,genus,common_name
109,Hymenopellis radicata,Basidiomycota,Agaricomycetes,Agaricales,Physalacriaceae,Hymenopellis,rooting shank
110,Agaricus subrufescens,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,almond mushroom
111,Porpolomopsis calyptriformis,Basidiomycota,Agaricomycetes,Agaricales,Hygrophoraceae,Porpolomopsis,pink waxcap
113,Mycena galericulata,Basidiomycota,Agaricomycetes,Agaricales,Mycenaceae,Mycena,common bonnet
114,Hygrocybe coccinea,Basidiomycota,Agaricomycetes,Agaricales,Hygrophoraceae,Hygrocybe,scarlet waxcap
116,Amanita phalloides,Basidiomycota,Agaricomycetes,Agaricales,Amanitaceae,Amanita,deathcap
118,Fomes fomentarius,Basidiomycota,Agaricomycetes,Polyporales,Polyporaceae,Fomes,hoof fungus
120,Pseudohydnum gelatinosum,Basidiomycota,Agaricomycetes,Auriculariales,incertae sedis,Pseudohydnum,jelly tooth
121,Agaricus sylvicola,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,wood mushroom
123,Gliophorus laetus,Basidiomycota,Agaricomycetes,Agaricales,Hygrophoraceae,Gliophorus,heath waxcap


In [28]:
df_class_augmented = pd.concat([df_classes, df_classes_update],ignore_index=True)
df_class_augmented.tail(10)


Unnamed: 0,species,phylum,class,order,family,genus,common_name
184,Hymenopellis radicata,Basidiomycota,Agaricomycetes,Agaricales,Physalacriaceae,Hymenopellis,rooting shank
185,Agaricus subrufescens,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,almond mushroom
186,Porpolomopsis calyptriformis,Basidiomycota,Agaricomycetes,Agaricales,Hygrophoraceae,Porpolomopsis,pink waxcap
187,Mycena galericulata,Basidiomycota,Agaricomycetes,Agaricales,Mycenaceae,Mycena,common bonnet
188,Hygrocybe coccinea,Basidiomycota,Agaricomycetes,Agaricales,Hygrophoraceae,Hygrocybe,scarlet waxcap
189,Amanita phalloides,Basidiomycota,Agaricomycetes,Agaricales,Amanitaceae,Amanita,deathcap
190,Fomes fomentarius,Basidiomycota,Agaricomycetes,Polyporales,Polyporaceae,Fomes,hoof fungus
191,Pseudohydnum gelatinosum,Basidiomycota,Agaricomycetes,Auriculariales,incertae sedis,Pseudohydnum,jelly tooth
192,Agaricus sylvicola,Basidiomycota,Agaricomycetes,Agaricales,Agaricaceae,Agaricus,wood mushroom
193,Gliophorus laetus,Basidiomycota,Agaricomycetes,Agaricales,Hygrophoraceae,Gliophorus,heath waxcap


# building the order test dataset

In [29]:
# renaming and converting to jpg all images with a unique identifier in their source directory
cnt_img = 0
start_index = 1000000

for dir_name, subdir_list, file_list in os.walk(os.path.join(dataset_path, 'data', 'data')):
    for file_name in file_list:
        if file_name.endswith(".png"):
            # create unique identifier
            unique_identifier  = str(start_index + cnt_img)
            cnt_img = cnt_img + 1

            # Get the full path of the file
            old_file_path = os.path.join(dir_name, file_name)

            # Construct the new file name with the unique identifier
            new_file_name = unique_identifier + '.jpg'

            # Construct the new full path with the new file name
            new_file_path = os.path.join(dir_name, new_file_name)

            # Open and convert the PNG image to JPG using Pillow
            image = Image.open(old_file_path)
            image = image.convert("RGB")
            image.save(new_file_path, "JPEG")

            # Remove the old PNG file
            os.remove(old_file_path)

            print(f"Converted '{old_file_path}' to '{new_file_path}'")    
            

print(cnt_img)

    

0


In [30]:
validation_datatset_path = '../../dataset/order_classification/validation'

# Create the folder silently
os.makedirs(validation_datatset_path, exist_ok=True)

In [34]:
import os
import shutil

def copy_folder(src, dst):
    if not os.path.exists(dst):
        # If the destination directory doesn't exist, copy entire source directory
        shutil.copytree(src, dst)
    else:
        # If the destination directory exists, copy each file in the source directory
        for item in os.listdir(src):
            s = os.path.join(src, item)
            d = os.path.join(dst, item)
            if os.path.isdir(s):
                copy_folder(s, d)  # Call function recursively if item is a directory
            else:
                shutil.copy2(s, d)  # Copy files

In [35]:
# from the subdirectory name, common name is extracted and matched to its order
not_found_cnt = 0
rootdir = os.path.join(dataset_path, 'data', 'data')

# Get a list of all subdirectories
subdirectories = [d for d in os.listdir(rootdir)]

# Create a dictionary to store the mapping of subdirectories to orders
subdir_to_order = {}

# Iterate through subdirectories and match with 'common_name' or 'species'
for subdir in subdirectories:
    common_name = subdir.replace('_', ' ')

    scientific_name = find_scientific_name(common_name, df_classes_update.rename(columns={'species': 'scientific_name'}),name_exceptions)

    matching_species = df_classes_update[df_classes_update['species'] == scientific_name]

    if not matching_species.empty:
        order_value = matching_species.iloc[0]['order'].lower()
        subdir_to_order[subdir] = order_value

        # Copy the source folder and its contents to the destination folder
        copy_folder(os.path.join(rootdir, subdir), os.path.join(validation_datatset_path, order_value))
        #shutil.copytree(os.path.join(rootdir, subdir), os.path.join(validation_datatset_path, order_value))
    else: 
        # at that stage not found means not edible
        not_found_cnt = not_found_cnt + 1

print("found: " + str(len(subdirectories) - not_found_cnt) + " / " + str(df_classes_update['species'].count()))

not found : scarlet elfcup
not found : turkey tail
not found : purple brittlegill
not found : brown birch bolete
not found : pestle puffball
not found : poplar bell
not found : cauliflower fungus
not found : hen of the woods
not found : stinking dapperling
not found : deadly fibrecap
not found : golden bootleg
not found : horn of plenty
not found : common morel
not found : common rustgill
not found : blushing bracket
not found : macro mushroom
not found : yellow false truffle
not found : mosaic puffball
not found : pine bolete
not found : powdery brittlegill
not found : stinkhorn
not found : amethyst chanterelle
not found : the blusher
not found : birch polypore
not found : splitgill
not found : thimble morel
not found : devils bolete
not found : slimy waxcap
not found : pavement mushroom
not found : yellow foot waxcap
not found : medusa mushroom
not found : horse mushroom
not found : cinnamon bracket
not found : trooping funnel
not found : freckled dapperling
not found : blackening br