In [2]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import os

import time

In [3]:
# This function will take in one website soup object and append a row to a dataframe
# Containing a unique row for each mushroom

def Compile_Mushrooms(one_mushroom_soup):
    name = []
    family = []
    location = []
    min_cap_cm = []
    max_cap_cm = []
    min_stem_hgt = []
    max_stem_hgt = []
    min_stem_diam = []
    max_stem_diam = []
    edibility = []
    long_desc = []
    

    # Create Family Column Data in the list
    family.append(one_mushroom_soup[0])
    
    # Create Location Column Data in the list
    location.append(one_mushroom_soup[1])
    
    # Try to create dimension columns in the lists, some mushrooms don't have enough all dimensions
    try:
        min_cap_cm.append([float(s) for s in re.findall(r'(\d+(?:\.\d+)?)',one_mushroom_soup[2])][0])
    except:
        pass
    
    try:
        max_cap_cm.append([float(s) for s in re.findall(r'(\d+(?:\.\d+)?)',one_mushroom_soup[2])][1])
    except:
        pass
    
    try:
        min_stem_hgt.append([float(s) for s in re.findall(r'(\d+(?:\.\d+)?)',one_mushroom_soup[2])][2])
    except:
        pass
    
    try:    
        max_stem_hgt.append([float(s) for s in re.findall(r'(\d+(?:\.\d+)?)',one_mushroom_soup[2])][3])
    except:
        pass
    
    try:   
        min_stem_diam.append([float(s) for s in re.findall(r'(\d+(?:\.\d+)?)',one_mushroom_soup[2])][4])
    except:
        pass
    
    try:
        max_stem_diam.append([float(s) for s in re.findall(r'(\d+(?:\.\d+)?)',one_mushroom_soup[2])][5])
    except:
        pass
    


    # Create edibility column data in the list
    edibility.append(
        one_mushroom_soup[3]\
        .replace(' (see important information about picking mushrooms)','')
                    )
    # Create long description column data in the list
    long_desc.append(one_mushroom_soup[4])
    
    # Create name/species column data in the list
    name.append(one_mushroom_soup[5])
    
    # use lists to generate rows in the dataframes
    family_df   = pd.DataFrame(family,columns=["family"])
    location_df = pd.DataFrame(location, columns=["location"])
    min_cap_cm_df  = pd.DataFrame(min_cap_cm, columns=["min_cap_cm"])
    max_cap_cm_df  = pd.DataFrame(max_cap_cm, columns=["max_cap_cm"])
    min_stem_cm_df  = pd.DataFrame(min_stem_hgt, columns=["min_stem_hgt_cm"])
    max_stem_cm_df  = pd.DataFrame(max_stem_hgt, columns=["max_stem_hgt_cm"])
    min_stem_diam_df = pd.DataFrame(min_stem_diam, columns=["min_stem_diam_cm"])
    max_stem_diam_df = pd.DataFrame(max_stem_diam, columns=["max_stem_diam_cm"])
    edibility_df = pd.DataFrame(edibility, columns=["edibility"])
    long_desc_df = pd.DataFrame(long_desc, columns=["long_desc"])
    name_df = pd.DataFrame(name,columns=["name"])
    
    #Smoosh all data frames togther starting with the family name
    outout_df = family_df.join(location_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(min_cap_cm_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(max_cap_cm_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(min_stem_cm_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(max_stem_cm_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(min_stem_diam_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(max_stem_diam_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(edibility_df, lsuffix='_caller',rsuffix='_other')
    outout_df = outout_df.join(long_desc_df, lsuffix='_caller',rsuffix='_other')    
    outout_df = outout_df.join(name_df, lsuffix='_caller',rsuffix='_other') 
    
    return outout_df

In [4]:
#url for alphabetical list
url = 'https://www.mushroom.world/mushrooms/namelist'

In [5]:
res = requests.get(url)

In [6]:
res.status_code

200

In [7]:
# this soup object will hold the data for the page with links to each mushroom
soup = BeautifulSoup(res.content,'lxml')

In [8]:
name_list = soup.find('div', {'class':'item'})

In [9]:
# One link for each mushroom from the alphabetical Url
link_list = []
for item in soup.find_all('div', {'class':'item'}):
    link_list.append(item.find('a').attrs['href'])

In [10]:
link_list[0:10]

['https://www.mushroom.world/show?n=Agaricus-arvensis',
 'https://www.mushroom.world/show?n=Agaricus-augustus',
 'https://www.mushroom.world/show?n=Albatrellus-confluens',
 'https://www.mushroom.world/show?n=Albatrellus-ovinus',
 'https://www.mushroom.world/show?n=Amanita-ceciliae',
 'https://www.mushroom.world/show?n=Amanita-fulva',
 'https://www.mushroom.world/show?n=Amanita-muscaria',
 'https://www.mushroom.world/show?n=Amanita-pantherina',
 'https://www.mushroom.world/show?n=Amanita-phalloides',
 'https://www.mushroom.world/show?n=Amanita-porphyria']

In [11]:
# This will extract the genus and species from each link
name_list = []
for each_link in link_list:
    name_list.append(
        (re.search(r'(\w*-\w*)',each_link)).group().lower().replace("""-""","""_""")
    )

In [12]:
link_list[:5]

['https://www.mushroom.world/show?n=Agaricus-arvensis',
 'https://www.mushroom.world/show?n=Agaricus-augustus',
 'https://www.mushroom.world/show?n=Albatrellus-confluens',
 'https://www.mushroom.world/show?n=Albatrellus-ovinus',
 'https://www.mushroom.world/show?n=Amanita-ceciliae']

In [13]:
name_list[:5]

['agaricus_arvensis',
 'agaricus_augustus',
 'albatrellus_confluens',
 'albatrellus_ovinus',
 'amanita_ceciliae']

In [14]:
# one species per hyperlink
len(link_list), len(name_list)

(139, 139)

# make all the soups

In [15]:
#Go to one link - read the webpage associated with the link make a list containing one soup object per page

mushroom_soups = []
for link in link_list:
    item_res = requests.get(link)
    item_soup = BeautifulSoup(item_res.content,'lxml')
    mushroom_soups.append(item_soup)
    time.sleep(5)

# long description code

In [16]:
names_df=pd.DataFrame(name_list, columns = ['mushroom_long_name'])

In [17]:
names_df.head()

Unnamed: 0,mushroom_long_name
0,agaricus_arvensis
1,agaricus_augustus
2,albatrellus_confluens
3,albatrellus_ovinus
4,amanita_ceciliae


In [18]:
len(names_df),len(mushroom_soups)

(139, 139)

In [19]:
# Use each soup object to read the specific website data in order to create a list for each individual mushroom.
final_df = pd.DataFrame()

num = 0
while num < len(mushroom_soups):
    try:
        alltheshrooms = []
        # Gets name of mushroom species
        stringy = (mushroom_soups[num].find('div',{'class':'caption'}).text)
        
        # Gets 'dimensions'
        for i in mushroom_soups[num].find_all('div',{'class':'textus'}):
            alltheshrooms.append(i.text)
        
        # Gets long description
        alltheshrooms.append(mushroom_soups[num].find('div',{'class':'longtextus'}).text)
        alltheshrooms.append(
            re.search(r'([^\r\n]+)',stringy).group().strip().lower()
                            )
        
        final_df = pd.concat([final_df,Compile_Mushrooms(alltheshrooms)], ignore_index=True)
    
    # common exceptions occur when a mushroom does not have 3 dimensions
    except IndexError:
        error_shroom = []
        stringy = (mushroom_soups[num].find('div',{'class':'caption'}).text)
        error_shroom = re.search(r'([^\r\n]+)',stringy).group().strip().lower()
        
        final_df = pd.concat([final_df,pd.DataFrame([error_shroom ],columns=['name'])],ignore_index=True)
        print(f"Missing measurements/index error on line {num} - {error_shroom}")
    
    except AttributeError:
        print(f"Missing name/attribute error on line {num}")
        
    num+=1



In [20]:
alltheshrooms

['Boletaceae',
 'North America, Europe',
 'Cap 6-12 cm diamter, stem 7-10 cm tall * 2-3 cm diameter',
 'Inedible',
 'Slightly to distinctly pink pores on the underside of the bun-shaped brown cap and a dark net on the thick stem are characteristic of this bolete. It is also distinguished by a very bitter taste.\r\n\nCap snuff- or fulvous-brown; convex or bun-shaped, at first slightly downy then smooth and dry. Flesh whitish, with pinkish tinge beneath cap cuticle, unchanging, thick and firm. Stem pallid background with brown reticulation, stout and slightly bulbous. The mushroom has no ring.\r\n\nSimilar species When young this is easily mistaken for Boletus edulis, except that it is very bitter and has a dark stem net.\r\n\nTylopilus felleus on the MushroomExpert.Com Web site.',
 'tylopilus felleus']

In [21]:
final_df.shape

(139, 11)

In [22]:
final_df.head(5)

Unnamed: 0,family,location,min_cap_cm,max_cap_cm,min_stem_hgt_cm,max_stem_hgt_cm,min_stem_diam_cm,max_stem_diam_cm,edibility,long_desc,name
0,Agaricaceae,"North America, Europe",8.0,20.0,8,10,2.0,3,Edible and good,"Agaricus arvensis, commonly known as the horse...",agaricus arvensis
1,Agaricaceae,"North America, Europe",10.0,20.0,10,20,2.0,4,Edible and good,"Agaricus augustus, also known as the prince, i...",agaricus augustus
2,Polyporaceae,"North America, Europe",7.0,18.0,3,7,1.0,3,Inedible,"From above, this pale orange polypore looks li...",albatrellus confluens
3,Polyporaceae,"North America, Europe",7.0,18.0,3,7,1.0,3,Edible,"Albatrellus ovinus, also known as Sheep Polypo...",albatrellus ovinus
4,Pluteaceae,"North America, Europe",7.0,12.0,8,13,1.5,2,Inedible,"The cap of this large, grayish brown, fleshy ...",amanita ceciliae


In [23]:
final_df.tail(5)

Unnamed: 0,family,location,min_cap_cm,max_cap_cm,min_stem_hgt_cm,max_stem_hgt_cm,min_stem_diam_cm,max_stem_diam_cm,edibility,long_desc,name
134,Tricholomataceae,"North America, Europe",4.0,10.0,,,,,Inedible,The green or brown cap of this agaric is moist...,tricholoma sejunctum
135,Tricholomataceae,"North America, Europe",4.0,12.0,3.5,5.5,1.0,1.5,Inedible,"This is a medium to large, fleshy agaric, cap ...",tricholomopsis decora
136,Tricholomataceae,"North America, Europe",2.0,12.0,3.5,5.5,1.0,1.5,Inedible,"This medium to large, fleshy agaric has a cap ...",tricholomopsis rutilans
137,Gomphaceae,"North America, Asia",5.0,15.0,8.0,15.0,,,Inedible,This mushroom has an orange-capped vase- or tr...,turbinellus floccosus
138,Boletaceae,"North America, Europe",6.0,12.0,7.0,10.0,2.0,3.0,Inedible,Slightly to distinctly pink pores on the under...,tylopilus felleus


In [24]:
# Replace all spaces with underscore
final_df.name = final_df['name'].apply(lambda x:"_".join(x.split()))

In [25]:
names_df.to_csv('CSVs/names_only.csv', index=False)

In [26]:
final_df.to_csv('CSVs/my_mushrooms.csv', index=False)

# single image finding

In [27]:
item_url = 'https://www.mushroom.world/show?n=Agaricus-arvensis'

In [28]:
item_res = requests.get(item_url)

In [29]:
item_res.status_code

200

In [30]:
item_soup = BeautifulSoup(item_res.content,'lxml')

In [31]:
item_soup

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="CReYpkkP3rUFpFBcsfDcqgVzipFv861EiWBS9pWmZ04" name="google-site-verification"/>
<title>Agaricus arvensis - Mushroom World</title>
<link href="/Content/css?v=V76rujkPCSEs2ikOqH5TEGzc1v6VAtO8LZKvVflOKTE1" rel="stylesheet"/>
<script src="/bundles/modernizr?v=wBEWDufH_8Md-Pbioxomt90vm6tJN2Pyy9u9zHtWsPo1"></script>
<!-- swipebox http://brutaldesign.github.io/swipebox/ -->
<link href="/swipebox/src/css/swipebox.css" rel="stylesheet"/>
<!-- https://cookieconsent.insites.com -->
<link href="//cdnjs.cloudflare.com/ajax/libs/cookieconsent2/3.0.3/cookieconsent.min.css" rel="stylesheet" type="text/css"/>
<script src="//cdnjs.cloudflare.com/ajax/libs/cookieconsent2/3.0.3/cookieconsent.min.js"></script>
<script>
        window.addEventListener("load", function () {
            window.cookieconsent.initialise({
                "palette": {
                    "po

In [32]:
item_soup.find_all('a')

[<a class="navbar-brand" href="/home/index">Home</a>,
 <a href="/mushrooms/search">Search the database</a>,
 <a href="/mushrooms/list">Identification helper</a>,
 <a class="dropdown-toggle" data-toggle="dropdown" href="#">Mushrooms <b class="caret"></b></a>,
 <a href="/mushrooms/namelist">Alphabetical list</a>,
 <a href="/mushrooms/edible">Edible mushrooms</a>,
 <a href="/mushrooms/inedible">Inedible mushrooms</a>,
 <a href="/mushrooms/poisonous">Poisonous mushrooms</a>,
 <a href="/mushrooms/guide">Guide to picking mushrooms</a>,
 <a href="/home/upload">Upload picture</a>,
 <a class="dropdown-toggle" data-toggle="dropdown" href="#">Other <b class="caret"></b></a>,
 <a href="/home/about">About this site</a>,
 <a href="/home/contact">Contact information</a>,
 <a href="/home/links">Useful links</a>,
 <a href="/home/disclaimer">Disclaimer</a>,
 <a href="/Account/Login" id="loginLink">Log in</a>,
 <a class="swipebox" href="/../data/fungi/Agaricusarvensis1.JPG" rel="Agaricus arvensis" title=

# Single image ending

In [33]:
name_list = []
for item in item_soup.find_all('div', {'class':'longtextus'}):
    print(item.find('a').text.replace(' ',"""_"""))
    name_list.append(item.find('a').text.replace(' ',"""_"""))

Agaricus_arvensis


In [34]:
image_list = []
for image in item_soup.find_all('div', {'class':'image'}):
    print(image.find('a').attrs['href'].replace('/../data/fungi/',''))
    image_list.append(image.find('a').attrs['href'].replace('/../data/fungi/',''))

Agaricusarvensis1.JPG
Agaricusarvensis2.JPG
Agaricusarvensis3.JPG
Agaricusarvensis4.JPG
Agaricusarvensis5.JPG


In [35]:
final_df.head()

Unnamed: 0,family,location,min_cap_cm,max_cap_cm,min_stem_hgt_cm,max_stem_hgt_cm,min_stem_diam_cm,max_stem_diam_cm,edibility,long_desc,name
0,Agaricaceae,"North America, Europe",8.0,20.0,8,10,2.0,3,Edible and good,"Agaricus arvensis, commonly known as the horse...",agaricus_arvensis
1,Agaricaceae,"North America, Europe",10.0,20.0,10,20,2.0,4,Edible and good,"Agaricus augustus, also known as the prince, i...",agaricus_augustus
2,Polyporaceae,"North America, Europe",7.0,18.0,3,7,1.0,3,Inedible,"From above, this pale orange polypore looks li...",albatrellus_confluens
3,Polyporaceae,"North America, Europe",7.0,18.0,3,7,1.0,3,Edible,"Albatrellus ovinus, also known as Sheep Polypo...",albatrellus_ovinus
4,Pluteaceae,"North America, Europe",7.0,12.0,8,13,1.5,2,Inedible,"The cap of this large, grayish brown, fleshy ...",amanita_ceciliae


In [36]:
try:
    os.mkdir('scrape_images')
except:
    print('Failed to make directory, may already exist')

Failed to make directory, may already exist


In [38]:
image_list = []

image_num = 0
# for each link, find all the images and append it to image_list
while image_num <(len(mushroom_soups)):
    for image in mushroom_soups[image_num].find_all('div', {'class':'image'}):
        image_list.append(image.find('a').attrs['href'].replace('/../data/fungi/',''))
    image_num+=1

    

In [39]:
image_num

139

In [40]:
len(image_list)

627

In [41]:
image_list[:10]

['Agaricusarvensis1.JPG',
 'Agaricusarvensis2.JPG',
 'Agaricusarvensis3.JPG',
 'Agaricusarvensis4.JPG',
 'Agaricusarvensis5.JPG',
 'Agaricusaugustus1.jpg',
 'Agaricusaugustus2.jpg',
 'Agaricusaugustus3.jpg',
 'Agaricusaugustus4.jpg',
 'Albatrellusconfluens1.JPG']

## for each link -  get all images on that page

In [42]:
# For each image from the entire alphabetical list - save the image in the scrape_images folder
counter = 0
for pic in image_list[:]:
    try:
        image_url = f'https://www.mushroom.world/data/fungi/{pic}'
        picture = requests.get(image_url, timeout=60)
        fp_str = f'scrape_images/{pic.lower()}'
        fp = open(fp_str, 'wb')
        fp.write(picture.content)
        fp.close()
        counter+=1
        time.sleep(5)
    except:
        print(f'error on {pic.lower()}')
        counter+=1
        pass