In [1]:
# importing dependencies 
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import time

In [2]:
# URL of page to be scraped
domain = 'https://www.fws.gov/'
url = f'{domain}/refuges/databases/ThreatenedEndangeredSpecies/ThreatenedEndangered_Display.cfm'
response = requests.get(url)

In [3]:
# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.content, 'html.parser')

In [4]:
# Extract tables
table = soup.find_all('table')[1]
df = pd.read_html(str(table))[1]

In [5]:
# Showing dataframe 
df

Unnamed: 0,0,1,2,3,4
0,Organism Type,Species Name,Federal Status,Unit Name,State
1,Amphibian,Arroyo Toad,Endangered,San Diego NWR,CA
2,Amphibian,California Red-legged Frog,Endangered,Guadalupe-Nipomo Dunes NWR,CA
3,Amphibian,California Red-legged Frog,Endangered,San Diego NWR,CA
4,Amphibian,Cheat Mountain Salamander,Threatened,Canaan Valley NWR,WV
...,...,...,...,...,...
961,Snail,Oahu Tree Snail - Achatinella decipiens,Endangered,Oahu Forest NWR,HI
962,Snail,Oahu Tree Snail - Achatinella leucorraphe,Endangered,Oahu Forest NWR,HI
963,Snail,Oahu Tree Snail - Achatinella sowerbyana,Endangered,Oahu Forest NWR,HI
964,Snail,Round rocksnail,Threatened,Cahaba River National Wildlife Refuge,AL


In [6]:
# renaming columns
df.columns = ['Organism Type', 'Species Name', 'Federal Status', 'Unit Name', 'State']
df

Unnamed: 0,Organism Type,Species Name,Federal Status,Unit Name,State
0,Organism Type,Species Name,Federal Status,Unit Name,State
1,Amphibian,Arroyo Toad,Endangered,San Diego NWR,CA
2,Amphibian,California Red-legged Frog,Endangered,Guadalupe-Nipomo Dunes NWR,CA
3,Amphibian,California Red-legged Frog,Endangered,San Diego NWR,CA
4,Amphibian,Cheat Mountain Salamander,Threatened,Canaan Valley NWR,WV
...,...,...,...,...,...
961,Snail,Oahu Tree Snail - Achatinella decipiens,Endangered,Oahu Forest NWR,HI
962,Snail,Oahu Tree Snail - Achatinella leucorraphe,Endangered,Oahu Forest NWR,HI
963,Snail,Oahu Tree Snail - Achatinella sowerbyana,Endangered,Oahu Forest NWR,HI
964,Snail,Round rocksnail,Threatened,Cahaba River National Wildlife Refuge,AL


In [7]:
# Pulling all plants from dataframe
plants_df = df.loc[df['Organism Type'] == 'Plant']
plants_df

Unnamed: 0,Organism Type,Species Name,Federal Status,Unit Name,State
681,Plant,"Pogogyne abramsii, San Diego mesa mint",Endangered,San Diego NWR,CA
682,Plant,"Pogogyne nudiuscula, Otay mesa mint",Endangered,San Diego NWR,CA
683,Plant,"Aconitum noveboracense, Northern Wild Monkshood",Endangered,Driftless Area NWR,IA
684,Plant,"Aeschynomene virginica, Sensitive Joint-vetch",Threatened,Mattamuskeet NWR,NC
685,Plant,"Agalinis acuta, sandplain gerardia",Endangered,Wertheim NWR,NY
...,...,...,...,...,...
807,Plant,"Stahlia monosperma, Cobana negra",Threatened,Cabo Rojo NWR,PR
808,Plant,"Stahlia monosperma, Cobana negra",Threatened,Vieques NWR,PR
809,Plant,"Tetraplasandra gymnocarpa, no common name",Endangered,Oahu Forest NWR,HI
810,Plant,"Trifolium stoloniferum, Running Buffalo Clover",Threatened,Ottawa NWR,OH


In [8]:
#Birds
bird_df = df.loc[df['Organism Type'] == 'Bird']
bird_df

Unnamed: 0,Organism Type,Species Name,Federal Status,Unit Name,State
9,Bird,Akiapolaau,Endangered,Hakalau Forest NWR,HI
10,Bird,Attwater's Greater Prairie-chicken,Endangered,Attwater Prairie Chicken NWR,TX
11,Bird,Attwater's Greater Prairie-chicken,Endangered,Aransas NWR,TX
12,Bird,Audubon's Crested Caracara,Threatened,Arthur R. Marshall Loxahatchee NWR,FL
13,Bird,Bachman's Warbler,Endangered,Ernest F. Hollings ACE Basin NWR,SC
...,...,...,...,...,...
416,Bird,Wood Stork,Endangered,Arthur R. Marshall Loxahatchee NWR,FL
417,Bird,Wood Stork,Endangered,Banks Lake NWR,GA
418,Bird,Wood Stork,Endangered,Blackbeard Island NWR,GA
419,Bird,Yellow-shouldered Blackbird,Endangered,Cabo Rojo NWR,PR


In [10]:
# reading in csv with lat long
latdf = pd.read_csv('../../data/Lat_long_states.csv') 
latdf

Unnamed: 0,State,Lat,Long
0,WI,44.5,-89.5
1,WV,39.0,-80.5
2,VT,44.0,-72.699997
3,TX,31.0,-100.0
4,SD,44.5,-100.0
5,RI,41.700001,-71.5
6,OR,44.0,-120.5
7,NY,43.0,-75.0
8,NH,44.0,-71.5
9,NE,41.5,-100.0


In [11]:
# merging bird df on state 
merged_bird_df = pd.merge(latdf,bird_df, on='State', how='left')
merged_bird_df

Unnamed: 0,State,Lat,Long,Organism Type,Species Name,Federal Status,Unit Name
0,WI,44.50000,-89.500000,Bird,Kirtland's Warbler,Endangered,St. Croix WMD
1,WV,39.00000,-80.500000,,,,
2,VT,44.00000,-72.699997,,,,
3,TX,31.00000,-100.000000,Bird,Attwater's Greater Prairie-chicken,Endangered,Attwater Prairie Chicken NWR
4,TX,31.00000,-100.000000,Bird,Attwater's Greater Prairie-chicken,Endangered,Aransas NWR
...,...,...,...,...,...,...,...
386,LA,30.39183,-92.329100,Bird,Piping Plover,Endangered,Delta NWR
387,LA,30.39183,-92.329100,Bird,Piping Plover,Endangered,Shell Keys NWR
388,LA,30.39183,-92.329100,Bird,Red-cockaded Woodpecker,Endangered,Upper Ouachita NWR
389,LA,30.39183,-92.329100,Bird,Red-cockaded Woodpecker,Endangered,Big Branch Marsh NWR


In [12]:
#dropping NAN rows because there are no endangered birds in the states with NaN values 
cleaned_bird_df = merged_bird_df.dropna(how='any')
cleaned_bird_df

Unnamed: 0,State,Lat,Long,Organism Type,Species Name,Federal Status,Unit Name
0,WI,44.50000,-89.5000,Bird,Kirtland's Warbler,Endangered,St. Croix WMD
3,TX,31.00000,-100.0000,Bird,Attwater's Greater Prairie-chicken,Endangered,Attwater Prairie Chicken NWR
4,TX,31.00000,-100.0000,Bird,Attwater's Greater Prairie-chicken,Endangered,Aransas NWR
5,TX,31.00000,-100.0000,Bird,Black-capped Vireo,Endangered,Santa Ana NWR
6,TX,31.00000,-100.0000,Bird,Black-capped Vireo,Endangered,Balcones Canyonlands NWR
...,...,...,...,...,...,...,...
386,LA,30.39183,-92.3291,Bird,Piping Plover,Endangered,Delta NWR
387,LA,30.39183,-92.3291,Bird,Piping Plover,Endangered,Shell Keys NWR
388,LA,30.39183,-92.3291,Bird,Red-cockaded Woodpecker,Endangered,Upper Ouachita NWR
389,LA,30.39183,-92.3291,Bird,Red-cockaded Woodpecker,Endangered,Big Branch Marsh NWR


In [13]:
# Export to CSV file
cleaned_bird_df.to_csv("../../data/Endangered_Birds.csv", index=False)

In [14]:
# merging plant df on state 
merged_plant_df = pd.merge(latdf,plants_df, on='State', how='left')
merged_plant_df

Unnamed: 0,State,Lat,Long,Organism Type,Species Name,Federal Status,Unit Name
0,WI,44.500000,-89.500000,Plant,"Cirsium pitcheri, Pitcher's Thistle",Threatened,Leopold WMD
1,WI,44.500000,-89.500000,Plant,"Iris lacustris, Dwarf Lake Iris",Threatened,Leopold WMD
2,WI,44.500000,-89.500000,Plant,"Lespedeza leptostachya, Prairie Bush-clover",Threatened,Leopold WMD
3,WI,44.500000,-89.500000,Plant,"Oxytropis campestris var. chartacea, Fassett's...",Threatened,Leopold WMD
4,WI,44.500000,-89.500000,Plant,"Platanthera leucophaea, Eastern Prairie Fringe...",Threatened,Leopold WMD
...,...,...,...,...,...,...,...
134,ID,44.068203,-114.742043,,,,
135,WY,43.075970,-107.290283,,,,
136,NC,35.782169,-80.793457,Plant,"Aeschynomene virginica, Sensitive Joint-vetch",Threatened,Mattamuskeet NWR
137,NC,35.782169,-80.793457,Plant,"Amaranthus pumilus, Seabeach Amaranth",Endangered,Currituck NWR


In [27]:
merged_plant_df['Location'] = merged_plant_df[['Lat', 'Long']].values.tolist()
merged_plant_df

Unnamed: 0,State,Lat,Long,Organism Type,Species Name,Federal Status,Unit Name,Location
0,WI,44.500000,-89.500000,Plant,"Cirsium pitcheri, Pitcher's Thistle",Threatened,Leopold WMD,"[44.5, -89.5]"
1,WI,44.500000,-89.500000,Plant,"Iris lacustris, Dwarf Lake Iris",Threatened,Leopold WMD,"[44.5, -89.5]"
2,WI,44.500000,-89.500000,Plant,"Lespedeza leptostachya, Prairie Bush-clover",Threatened,Leopold WMD,"[44.5, -89.5]"
3,WI,44.500000,-89.500000,Plant,"Oxytropis campestris var. chartacea, Fassett's...",Threatened,Leopold WMD,"[44.5, -89.5]"
4,WI,44.500000,-89.500000,Plant,"Platanthera leucophaea, Eastern Prairie Fringe...",Threatened,Leopold WMD,"[44.5, -89.5]"
...,...,...,...,...,...,...,...,...
134,ID,44.068203,-114.742043,,,,,"[44.068203000000004, -114.742043]"
135,WY,43.075970,-107.290283,,,,,"[43.07597, -107.290283]"
136,NC,35.782169,-80.793457,Plant,"Aeschynomene virginica, Sensitive Joint-vetch",Threatened,Mattamuskeet NWR,"[35.782169, -80.79345699999999]"
137,NC,35.782169,-80.793457,Plant,"Amaranthus pumilus, Seabeach Amaranth",Endangered,Currituck NWR,"[35.782169, -80.79345699999999]"


In [28]:
#dropping NAN rows because there are no endangered plants in the states with NaN values 
cleaned_plants_df = merged_plant_df.dropna(how='any')
cleaned_plants_df

Unnamed: 0,State,Lat,Long,Organism Type,Species Name,Federal Status,Unit Name,Location
0,WI,44.500000,-89.500000,Plant,"Cirsium pitcheri, Pitcher's Thistle",Threatened,Leopold WMD,"[44.5, -89.5]"
1,WI,44.500000,-89.500000,Plant,"Iris lacustris, Dwarf Lake Iris",Threatened,Leopold WMD,"[44.5, -89.5]"
2,WI,44.500000,-89.500000,Plant,"Lespedeza leptostachya, Prairie Bush-clover",Threatened,Leopold WMD,"[44.5, -89.5]"
3,WI,44.500000,-89.500000,Plant,"Oxytropis campestris var. chartacea, Fassett's...",Threatened,Leopold WMD,"[44.5, -89.5]"
4,WI,44.500000,-89.500000,Plant,"Platanthera leucophaea, Eastern Prairie Fringe...",Threatened,Leopold WMD,"[44.5, -89.5]"
...,...,...,...,...,...,...,...,...
131,CA,36.778259,-119.417931,Plant,"Oenothera deltoides howellii, Antioch Dunes Ev...",Endangered,Antioch Dunes NWR,"[36.778259000000006, -119.41793100000001]"
132,CA,36.778259,-119.417931,Plant,"Orcuttia californica, California Orcutt grass",Endangered,San Diego NWR,"[36.778259000000006, -119.41793100000001]"
133,CA,36.778259,-119.417931,Plant,"Rorippa gambellii, Gambelæ¯ Watercress",Endangered,Guadalupe-Nipomo Dunes NWR,"[36.778259000000006, -119.41793100000001]"
136,NC,35.782169,-80.793457,Plant,"Aeschynomene virginica, Sensitive Joint-vetch",Threatened,Mattamuskeet NWR,"[35.782169, -80.79345699999999]"


In [29]:
# Export to csv
cleaned_plants_df.to_csv("../../data/Endangered_Plants.csv", index=False)