# Importing Wild Food UK dataset

This dataset will be used as a separate validation dataset. It contains images of species not referenced in the main dataset.

Please note that to use the Kaggle API, you need to have a Kaggle account and a Kaggle API token. 
The token is a JSON file that you can download from your Kaggle account settings page. 
Once downloaded, place it in the location ~/.kaggle/kaggle.json on your machine.

In [None]:
from bs4 import BeautifulSoup
import kaggle
import os
import pandas as pd
from PIL import Image
import shutil

In [None]:
# Project root, expressed relative to the directory the notebook runs from
project_path = '..'

# Folder receiving the Wild Food UK download: <project_path>/dataset/wildfooduk
dataset_path = os.path.join(os.path.join(project_path, 'dataset'), 'wildfooduk')

In [None]:
# Log in to Kaggle (requires the API token described at the top of the notebook)
kaggle_api = kaggle.api
kaggle_api.authenticate()

# Fetch the dataset archive and unzip it into dataset_path
kaggle_api.dataset_download_files(
    'daniilonishchenko/mushrooms-images-classification-215',
    path=dataset_path,
    unzip=True,
)

## Identifying scientific name of complementary dataset

In [None]:
# The downloaded dataset only provides common English names.
# Load that list and add empty placeholder columns for scientific name and edibility.
common_names_file = os.path.join(dataset_path, 'mushrooms.txt')
reference_table = pd.read_csv(common_names_file, names=['common_name'])
for placeholder_column in ('scientific_name', 'edibility'):
    reference_table[placeholder_column] = ''
reference_table.head()


In [None]:
from bs4 import BeautifulSoup
import requests

# Download the Wild Food UK mushroom guide page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of silently parsing an error page.
response = requests.get('https://www.wildfooduk.com/mushroom-guide/', timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all the <td> elements with class="spotlight-text"
td_elements = soup.find_all('td', class_='spotlight-text')

# The stripped text of each of those cells is used as the scientific name
scientific_names = [td.text.strip() for td in td_elements]


In [None]:
# Select the <td> cells carrying class="mushroom-image"
td_elements = soup.find_all('td', class_='mushroom-image')

# First <img> tag found inside each selected cell
img_elements = [cell.find('img') for cell in td_elements]

# The alt attribute of each image is used as the common name
common_names = [picture.get('alt') for picture in img_elements]

In [None]:
# Select the <td> cells carrying class="mushroom-icon"
td_elements = soup.find_all('td', class_='mushroom-icon')

# First <img> tag found inside each selected cell
img_elements = [cell.find('img') for cell in td_elements]

# The alt attribute of each icon is used as the edibility label
edibility = [icon.get('alt') for icon in img_elements]

In [None]:
# Assemble the three scraped lists into one table, one column per list
scraped_columns = {
    'common_name': common_names,
    'scientific_name': scientific_names,
    'edibility': edibility,
}
reference_table = pd.DataFrame(scraped_columns)
reference_table.head()

In [None]:
# Number of rows per edibility label
edibility_column = reference_table['edibility']
edibility_column.value_counts()

In [None]:
# Encode edibility as a flag: 1 for 'Edible', 0 for 'Poisonous' and 'Inedible'.
# Labels outside this mapping are left unchanged by replace().
edibility_codes = {'Edible': 1, 'Poisonous': 0, 'Inedible': 0}
reference_table['edibility'] = reference_table['edibility'].replace(edibility_codes)

In [None]:
# Number of rows per edibility value
edibility_column = reference_table['edibility']
edibility_column.value_counts()

In [None]:
# Normalise the scraped names: keep only the part after the last '/', and trim
# surrounding whitespace. Common names additionally lose dots and apostrophes,
# and their hyphens become spaces.
# regex=False is explicit because, read as a regular expression, '.' matches every
# character; the default of `regex` differs between pandas versions.
reference_table['common_name'] = (
    reference_table['common_name']
    .str.split('/').str[-1]
    .str.strip()
    .str.replace('.', '', regex=False)
    .str.replace("'", '', regex=False)
    .str.replace('-', ' ', regex=False)
)
reference_table['scientific_name'] = (
    reference_table['scientific_name']
    .str.split('/').str[-1]
    .str.strip()
)

In [None]:
# Write the mapping table to disk as a ';'-separated CSV, without the index column
mapping_table_path = '../dataset/wildfooduk_mapping_table.csv'
reference_table.to_csv(mapping_table_path, sep=';', index=False)

## Cross analysing base dataset and complementary dataset

In [None]:
# Load the observations file of the base dataset
observations_path = '../dataset/observations_mushroom.csv'
df = pd.read_csv(observations_path)
df.head()

In [None]:

# List the scraped scientific names that are absent from the base dataset's
# 'label' column, and count the ones that are present.
# The labels are collected into a set once, outside the loop, so every
# membership test is a constant-time lookup.
known_labels = set(df['label'].unique())
match_count = 0

print("species not present in the initial dataset")
print("-------------------------------------------")

for element in reference_table['scientific_name'].unique():
    if element in known_labels:
        match_count += 1
    else:
        print(element)

print("-------------------------------------------")
print("matching species count: " + str(match_count))

## Adding identified species into the edible dataset

In [None]:
# edible mushrooms of the imported dataset will be filtered to be added in the edible dataset
# images will get a unique identifier not already used by the existing dataset starting from 1 000 000 for clarity
# only the scientific name has been deduced. The rest of the data frame also needs to be filled
# images are converted to jpeg for homogeneity

In [None]:
# Highest index value of the base dataset.
# max must be called: the bare attribute only displays the bound method.
df.index.max()

In [None]:
# Load edible_mushrooms.csv from the project's dataset folder
edible_csv_path = '../dataset/edible_mushrooms.csv'
df_edible = pd.read_csv(edible_csv_path)
df_edible.head()

In [None]:
# One row per distinct taxonomy: remove the image columns, keep the taxonomic
# columns in a fixed order, then de-duplicate.
taxonomy_columns = ['species', 'phylum', 'class', 'order', 'family', 'genus']
without_images = df_edible.drop(columns=['image_lien', 'image_id'])
df_classes = without_images[taxonomy_columns].drop_duplicates()
df_classes.head()

In [None]:
# Synonyms for names that have no direct match: each lower-case key is looked up
# under the lower-case common name given as its value.
name_exceptions = {
    "blackening polypore": "giant polypore",
    "cauliflower fungus": "wood cauliflower",
    "clouded agaric": "clouded funnel",
}


In [None]:
# reviewing possibles matches : common name, synonyms, and scientific name used as a common name

def find_scientific_name(common_name, reference_table, name_exceptions):
    """Return the scientific name matching *common_name*, or "" when none is found.

    Lookups are tried in this order, all of them case-insensitively:
      1. *common_name* against the ``common_name`` column;
      2. *common_name* against the ``scientific_name`` column (a scientific name
         used as a common name);
      3. the synonym registered for *common_name* in *name_exceptions* against
         the ``common_name`` column.

    Args:
        common_name: Name to resolve.
        reference_table: DataFrame with ``common_name`` and ``scientific_name`` columns.
        name_exceptions: Mapping of lower-case name to lower-case synonym.

    Returns:
        The first matching ``scientific_name`` value. When every lookup fails,
        a "not found" message is printed and "" is returned.
    """
    def _first_match(column, value):
        # A missing or non-text column counts as "no match" rather than an error.
        try:
            hits = reference_table.loc[
                reference_table[column].str.lower() == value, 'scientific_name'
            ]
        except (KeyError, AttributeError):
            return None
        return None if hits.empty else hits.iloc[0]

    # Lower-case once so that every lookup treats the input the same way
    wanted = common_name.lower()

    candidates = [('common_name', wanted), ('scientific_name', wanted)]
    synonym = name_exceptions.get(wanted)
    if synonym is not None:
        candidates.append(('common_name', synonym))

    for column, value in candidates:
        scientific_name = _first_match(column, value)
        if scientific_name is not None:
            return scientific_name

    print( "not found : " + common_name)
    return ""

In [None]:
def scrape_wikipedia_page(mushroom_name):
    """Scrape the classification of *mushroom_name* from its English Wikipedia page.

    Args:
        mushroom_name: Species name; spaces are replaced with underscores to
            build the page URL.

    Returns:
        A dict holding ``species`` (the name as given), ``edible`` (1 when the
        page contains a link to ``/wiki/Edible_mushroom``, otherwise 0) and
        whichever of ``phylum``, ``class``, ``order``, ``family`` and ``genus``
        were found in the page's table cells.
        ``None`` when the name is empty or the response status is not 200.
    """
    # An empty name would request the bare /wiki/ URL, which is not a species page.
    if not mushroom_name:
        return None

    # Label found in a table cell -> key under which the following cell is stored
    rank_keys = {
        "Division": "phylum",
        "Class": "class",
        "Order": "order",
        "Family": "family",
        "Genus": "genus",
    }

    classification = {"species": mushroom_name}

    # Replace spaces in the mushroom name with underscores to match Wikipedia's URL format
    mushroom_name = mushroom_name.replace(' ', '_')

    # The timeout prevents one stalled request from blocking the caller forever
    response = requests.get(f"https://en.wikipedia.org/wiki/{mushroom_name}", timeout=30)

    if response.status_code != 200:
        print(f"Failed to get page: {response.status_code} " + " for " + mushroom_name)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Walk the cells with one shared iterator: when a cell is a known label, the
    # next cell is consumed as its value.
    cells = iter(soup.find_all('td'))
    for cell in cells:
        key = rank_keys.get(cell.text.replace(":", "").rstrip())
        if key is None:
            continue
        value_cell = next(cells, None)
        if value_cell is None:
            # A label in the very last cell has no value cell after it
            break
        classification[key] = value_cell.text.strip()

    # Check for edibility
    edibility = soup.find('a', {'href': '/wiki/Edible_mushroom'})
    classification["edible"] = 1 if (edibility is not None) else 0

    return classification

In [None]:

# For every species folder of the downloaded dataset, resolve the scientific name
# and collect its classification: from the main dataframe when the species is
# listed there, otherwise from Wikipedia (kept only when flagged edible).
not_found_cnt = 0
species_cnt = 0
classification_list = []
wiki_match_cnt = 0
species_match = 0

images_root = os.path.join(dataset_path, 'data', 'data')

for dir_name, subdir_list, file_list in os.walk(images_root):
    # os.walk yields the root folder itself first; it is not a species folder
    if dir_name == images_root:
        continue

    species_cnt = species_cnt + 1

    # getting common name from subdirectory name
    common_name = dir_name.split(os.sep)[-1].replace('_', ' ')
    scientific_name = find_scientific_name(common_name, reference_table, name_exceptions)

    # getting information related to the species from the main dataframe
    known_species = df_classes[df_classes["species"].str.lower() == str(scientific_name).lower()]
    if not known_species.empty:
        species_match = species_match + 1
        # no common name is stored at the moment, which is required to build the test dataset
        classification = known_species.iloc[0].to_dict()
        classification['common_name'] = common_name
        classification_list.append(classification)
        continue

    # otherwise the classification from wikipedia is used; the lookup is best-effort,
    # so any failure marks the species as not found instead of stopping the loop
    # (Exception, unlike a bare except, still lets Ctrl-C interrupt the cell)
    try:
        classification = scrape_wikipedia_page(scientific_name)
    except Exception:
        classification = None

    if classification is None:
        print("not found : " + str(species_cnt) + " " + scientific_name)
        not_found_cnt = not_found_cnt + 1
        continue

    # species is added only if edible
    if classification['edible'] == 1:
        classification['common_name'] = common_name
        classification_list.append(classification)

    wiki_match_cnt = wiki_match_cnt + 1

print("species match: " + str(species_match) + " wiki match: " + str(wiki_match_cnt) + " not found: " + str(not_found_cnt) +  " total species: " + str(species_cnt))



In [None]:
# Entries for which no classification was retrieved (None) are removed
filtered_list = [d for d in classification_list if d is not None]

# Build the table of identified species.
# 'edible' is dropped BEFORE dropna(): only Wikipedia-sourced entries carry that
# key, so leaving it in place would make dropna() discard every species that was
# matched in the main dataframe. errors='ignore' covers the case where no entry
# came from Wikipedia and the column does not exist.
# NOTE(review): assumes species matched in the main dataframe are meant to be kept
# (they are given a common_name for that purpose) - confirm.
df_classes_update = pd.DataFrame(filtered_list)
df_classes_update = df_classes_update.drop(columns=['edible'], errors='ignore')

# Any remaining NaN is a missing field: such rows are discarded
df_classes_update = df_classes_update.dropna(axis=0)
df_classes_update.tail(10)


In [None]:
# Stack the newly identified species under the existing classes, renumbering the index
df_class_augmented = pd.concat((df_classes, df_classes_update), ignore_index=True)
df_class_augmented.tail(10)


# building the order test dataset

In [None]:
# Convert every PNG of the downloaded dataset to JPEG inside its own folder.
# Each result is named with a unique numeric identifier starting at start_index,
# and the source PNG is deleted afterwards.
cnt_img = 0
start_index = 1000000

for dir_name, subdir_list, file_list in os.walk(os.path.join(dataset_path, 'data', 'data')):
    for file_name in file_list:
        # Case-insensitive test so that '.PNG' files are converted as well
        if not file_name.lower().endswith(".png"):
            continue

        # create unique identifier
        unique_identifier = str(start_index + cnt_img)
        cnt_img = cnt_img + 1

        old_file_path = os.path.join(dir_name, file_name)
        new_file_name = unique_identifier + '.jpg'
        new_file_path = os.path.join(dir_name, new_file_name)

        # The context manager guarantees the source file is closed before it is removed
        with Image.open(old_file_path) as image:
            # JPEG has no alpha channel, hence the conversion to RGB
            image.convert("RGB").save(new_file_path, "JPEG")

        # Remove the old PNG file
        os.remove(old_file_path)

        print(f"Converted '{old_file_path}' to '{new_file_path}'")

print(cnt_img)

    

In [None]:
# Destination folder of the validation images
validation_datatset_path = '../dataset/order/validation'

# exist_ok=True: no error is raised when the folder is already there
os.makedirs(name=validation_datatset_path, exist_ok=True)

In [None]:
# Copy into the validation folder every species folder whose name resolves to a
# species of df_classes_update, recording the 'order' of each copied folder.
not_found_cnt = 0
rootdir = os.path.join(dataset_path, 'data', 'data')

# Only directories are candidate species folders; plain files in rootdir are ignored
subdirectories = [d for d in os.listdir(rootdir) if os.path.isdir(os.path.join(rootdir, d))]

# Create a dictionary to store the mapping of subdirectories to orders
subdir_to_order = {}

# Loop-invariant: find_scientific_name reads a 'scientific_name' column
lookup_table = df_classes_update.rename(columns={'species': 'scientific_name'})

for subdir in subdirectories:
    # from the subdirectory name, common name is extracted and matched to its order
    common_name = subdir.replace('_', ' ')
    scientific_name = find_scientific_name(common_name, lookup_table, name_exceptions)
    matching_species = df_classes_update[df_classes_update['species'] == scientific_name]

    if matching_species.empty:
        # at that stage not found means not edible
        not_found_cnt = not_found_cnt + 1
        continue

    order_value = matching_species.iloc[0]['order']
    subdir_to_order[subdir] = order_value

    # dirs_exist_ok=True (Python 3.8+) lets the cell be re-run: without it
    # copytree raises FileExistsError once the destination folder exists
    shutil.copytree(
        os.path.join(rootdir, subdir),
        os.path.join(validation_datatset_path, subdir),
        dirs_exist_ok=True,
    )

print("found: " + str(len(subdirectories) - not_found_cnt) + " / " + str(df_classes_update['species'].count()))