# Species Data 

## Project Description

In this notebook I scrape and clean up data about tree species for use in the Tree Stats book.

## Sources
- [Trees of Canada — treecanada.ca](https://treecanada.ca/resources/trees-of-canada/)


In [17]:
from bs4 import BeautifulSoup
import csv
import nltk
import pandas as pd
import requests

# Download the NLTK data used by this notebook; nltk.word_tokenize and
# nltk.pos_tag are called later when deriving the "family" column.
# NOTE(review): presumably 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# are what those two calls need, and 'popular' is a broader convenience
# collection — confirm whether 'popular' is still required.
nltk.download(['popular', 'punkt_tab', 'averaged_perceptron_tagger_eng'])

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/codespace/nltk_data...
[nltk_data]    |   Package movie_review

True

In [10]:

# Page that holds the species table, and a browser-like User-Agent string
# that is sent along with the request.
url = "https://treecanada.ca/resources/trees-of-canada/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0"}

# Give up after 10 seconds instead of hanging the kernel on a stalled connection.
res = requests.get(url, headers=headers, timeout=10)

# Fail here on a 4xx/5xx response rather than parsing an error page and
# hitting a confusing "no table" failure in a later cell.
res.raise_for_status()

# Parse the raw response bytes with the standard-library HTML parser backend.
soup = BeautifulSoup(res.content, 'html.parser')

In [65]:
# Build one record per species from the first <table> on the scraped page.
# Each record is [common_name, latin_name, family, variety, origin, edible].
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the fetched page")
rows = table.find_all('tr')

data_out = []
for row in rows[1:]:  # the first row is skipped (treated as the header row)
    cells = row.find_all('td')
    if len(cells) < 4:
        # Rows without the four data cells read below (e.g. a header-only
        # or spacer row) cannot be parsed; skip them instead of raising.
        continue
    names = cells[0].text.strip()

    # The first cell is read as "Common name (Latin name)". partition() keeps
    # this safe when the parentheses are missing: the Latin name is then
    # empty rather than a slice of the common name with its last char cut.
    name_parts = names.partition("(")
    common_name = name_parts[0].replace("-", " ").strip()
    latin_name = name_parts[2].partition(")")[0].strip()
    variety = cells[1].text.strip()
    origin = cells[2].text.strip()
    # Any non-empty text in the fourth cell marks the species as edible.
    edible = "Edible" if cells[3].text.strip() else "Inedible"

    # First attempt at extracting the general family of tree: take the first
    # token of the common name that the POS tagger labels 'NN', or 'none'
    # when no token gets that tag.
    tagged = nltk.pos_tag(nltk.word_tokenize(common_name))
    family = next((word for word, tag in tagged if tag == 'NN'), 'none')

    data_out.append([common_name, latin_name, family, variety, origin, edible])

print(data_out)


[['Ailanthus', 'Ailanthus altissima', 'none', 'Deciduous', 'Naturalized', 'Inedible'], ['Alaska paper birch', 'Betula neoalaskana', 'paper', 'Deciduous', 'Native', 'Inedible'], ['Alternate leaf dogwood', 'Cornus alternifolia', 'leaf', 'Deciduous', 'Native', 'Inedible'], ['American beech', 'Fagus grandifolia', 'beech', 'Deciduous', 'Native', 'Inedible'], ['American chestnut', 'Castanea dentata', 'chestnut', 'Deciduous', 'Native', 'Edible'], ['American elder', 'Sambucus canadensis', 'elder', 'Deciduous', 'Native', 'Edible'], ['American hazel', 'Corylus americana', 'hazel', 'Deciduous', 'Native', 'Edible'], ['American holly', 'Ilex opaca', 'none', 'Deciduous', 'Introduced', 'Inedible'], ['American mountain ash', 'Sorbus americana', 'mountain', 'Deciduous', 'Native', 'Inedible'], ['American plum', 'Prunus americana', 'plum', 'Deciduous', 'Native', 'Edible'], ['Amur choke cherry', 'Prunus maackii', 'cherry', 'Deciduous', 'Introduced', 'Edible'], ['Amur corktree', 'Phellodendron amurense', '

In [66]:
# Column names for the CSV, in the same order as the fields of each
# data_out record.
HEADERS = ["species", "latin_species", "family", "variety", "origin_canada", "edible"]

# newline='' is how the csv module documentation says to open files that
# are handed to csv.writer.
with open('data/species-data.csv', 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)
    # Header line first, then every scraped record in one call.
    writer.writerow(HEADERS)
    writer.writerows(data_out)

In [67]:
# Read the CSV written in the previous cell back into a DataFrame and show
# it as this cell's output.
species = pd.read_csv("data/species-data.csv")
species

Unnamed: 0,species,latin_species,family,variety,origin_canada,edible
0,Ailanthus,Ailanthus altissima,none,Deciduous,Naturalized,Inedible
1,Alaska paper birch,Betula neoalaskana,paper,Deciduous,Native,Inedible
2,Alternate leaf dogwood,Cornus alternifolia,leaf,Deciduous,Native,Inedible
3,American beech,Fagus grandifolia,beech,Deciduous,Native,Inedible
4,American chestnut,Castanea dentata,chestnut,Deciduous,Native,Edible
...,...,...,...,...,...,...
309,Witch hazel,Hamamelis virginiana,hazel,Deciduous,Native,Inedible
310,Yellow birch,Betula alleghaniensis,birch,Deciduous,Native,Inedible
311,Yellow cedar,Chamaecyparis nootkatensis,cedar,Conifer,Native,Inedible
312,Yellow Twig Dogwood,Cornus sericea,none,Deciduous,Naturalized,Inedible


In [68]:
# List the distinct values of the NLTK-derived "family" column to judge how
# well the extraction worked.
# NOTE(review): the recorded output contains case variants (e.g. 'cedar' and
# 'Cedar') and modifier words such as 'paper' and 'mountain', so the
# heuristic still needs refinement.
species["family"].unique()

array(['none', 'paper', 'leaf', 'beech', 'chestnut', 'elder', 'hazel',
       'mountain', 'plum', 'cherry', 'corktree', 'maple', 'Arbutus',
       'cedar', 'pine', 'cypress', 'fir', 'poplar', 'willow', 'Basswood',
       'hickory', 'ash', 'cottonwood', 'hawthorn', 'locust', 'oak',
       'spruce', 'walnut', 'berry', 'gum', 'birch', 'bush', 'Butternut',
       'yew', 'buckthorn', 'Cedar', 'juniper', 'redwood', 'apple',
       'hoptree', 'laburnum', 'lilac', 'prickly', 'winterberry',
       'viburnum', 'tree', 'larch', 'serviceberry', 'chinquapin',
       'hackberry', 'dogwood', 'hemlock', 'elderberry', 'redcedar', 'elm',
       'alder', 'cranberry', 'euonymus', 'filbert', 'Ginkgo',
       'Horsechestnut', 'Ironwood', 'flowering', 'zelkova', 'coffeetree',
       'linden', 'plane', 'puzzle', 'Mountain', 'catalpa', 'pin',
       'buckeye', 'Osage', 'bayberry', 'crab', 'rhododendron', 'Pawpaw',
       'ivy', 'sumac', 'horsechestnut', 'mulberry', 'Redbud', 'olive',
       'Saskatoon', 'magno