# Web Scraping For Lists of Birds - Github

### Alexindata 03/17/2017

### Tools: Jupyter Notebook with Python 3.5, BeautifulSoup, PostgreSQL RDBMS


## 1. Connect to a PostgreSQL database named 'birds'

In [23]:
import psycopg2
import urllib.request
from bs4 import BeautifulSoup, NavigableString, Tag
import re
from unidecode import unidecode
from IPython.display import display
import pprint as pp


In [2]:
# Database credentials (redacted in this published copy).
user, pw = '######', "######"

# Connection settings for the local PostgreSQL database named 'birds'.
db_settings = dict(
    dbname="birds",
    host="localhost",
    port=5432,
    user=user,
    password=pw,
)

# One connection and one cursor, used by every later cell.
conn = psycopg2.connect(**db_settings)
cur = conn.cursor()


## 2. Collect names of North American birds from two websites

### Scrape the first web page: an eBird list of North American birds with ABA codes

In [3]:
# Pages to scrape: url_1 is the eBird help page on ABA codes, url_2 is the
# Wikipedia list of birds of Canada and the United States.
url_1 = 'http://help.ebird.org/customer/portal/articles/1101706-aba-codes-and-ebird'
url_2 = 'https://en.wikipedia.org/wiki/List_of_birds_of_Canada_and_the_United_States'

# Download the eBird page as raw bytes. The `with` block closes the HTTP
# response once the body has been read instead of leaving it open.
with urllib.request.urlopen(url_1) as uh:
    html = uh.read()

In [4]:
# ebird table: one row per bird scraped from the eBird page. Only 'name' and
# 'aba' are filled by the scraping cell; 'family' and 'wiki_id' are populated
# later by the UPDATE that merges in the wiki table.

try:
    cur.execute("CREATE TABLE IF NOT EXISTS ebird (ebird_id SERIAL PRIMARY KEY, name TEXT NOT NULL, family TEXT, aba INTEGER, wiki_id INTEGER)")
except psycopg2.Error as e:
    # Report the failure instead of hiding it, then roll back so the
    # connection leaves the aborted transaction and can be used again.
    print('Error in SQL command, {}'.format(e))
    conn.rollback()
else:
    # conn.commit() is required to persist the change; without it no changes
    # to the db will show.
    conn.commit()


In [5]:
# Parse the downloaded eBird page with the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')

In [9]:
# The bird list is in the 5th <p> of the page: fetch the first five <p> tags
# and take the last one. (The variable name 'third_p' is kept unchanged in
# case other cells refer to it.)
third_p = soup.find_all('p', limit=5)[-1]

# Patterns are raw strings (no invalid escape sequences) and are compiled once
# outside the loop.
#   name_re: everything between the first space and the last " (" on the line
#   aba_re:  the single digit 1-6 inside parentheses
name_re = re.compile(r' (.+) \(')
aba_re = re.compile(r'.+ \(([1-6])\)')

# Each list entry is a text node immediately followed by a <br>.
for br in third_p.find_all('br'):
    previous = br.previous_sibling
    if not (previous and isinstance(previous, NavigableString)):
        continue
    text = previous.strip()
    # Only lines that start with a digit (the entry number) are bird entries.
    if not text or not text[0].isdigit():
        continue

    name_match = name_re.search(text)
    aba_match = aba_re.search(text)
    if name_match is None or aba_match is None:
        # Do not crash on a numbered line that lacks the expected
        # "<number>. <name> (<code>)" shape; report it and move on.
        print('Skipping unparsable line: {!r}'.format(text))
        continue

    name = name_match.group(1).replace('-', ' ')  # replace '-' with space
    aba = int(aba_match.group(1))

    try:
        # psycopg2 placeholders are always %s, whatever the Python type of
        # the value; the driver does the adaptation.
        cur.execute('INSERT INTO ebird (name, aba) VALUES (%s, %s);', (name, aba))
    except psycopg2.Error as e:
        print('Error in SQL command, {}'.format(e))
        conn.rollback()
    else:
        # Commit per row so that one failing insert does not discard the rest.
        conn.commit()
  

In [33]:
# Show 3 birds from the ebird table whose name contains 'warbler'
# (case-insensitive).
try:
    cur.execute("SELECT * FROM ebird WHERE LOWER(name) LIKE '%warbler%' LIMIT 3;")
except psycopg2.Error as e:
    # Say why nothing was printed, then roll back the aborted transaction.
    print('Error in SQL command, {}'.format(e))
    conn.rollback()
else:
    q1 = cur.fetchall()
    pp.pprint(q1)

[(696, 'Arctic Warbler', 'Leaf-warblers', 2, 731),
 (770, "Bachman's Warbler", 'Wood-warblers', 6, 816),
 (800, 'Bay breasted Warbler', 'Wood-warblers', 1, 847)]


### Scrape the second web page (Wikipedia list of birds of Canada and the United States)

In [27]:
# Download the Wikipedia page on North American birds as raw bytes. The
# `with` block closes the HTTP response once the body has been read.

with urllib.request.urlopen(url_2) as uh2:
    html2 = uh2.read()

In [29]:
# Parse the downloaded Wikipedia page with the 'html.parser' backend.
soup2 = BeautifulSoup(markup=html2, features='html.parser')

In [30]:
# wiki table: one row per bird scraped from the Wikipedia list, with its
# family, scientific name, conservation status and article link.
try:
    cur.execute("CREATE TABLE IF NOT EXISTS wiki (wiki_id SERIAL PRIMARY KEY, name TEXT NOT NULL, family TEXT NOT NULL, sci_name TEXT, status TEXT, status_info TEXT, link TEXT)")
except psycopg2.Error as e:
    # Report the failure instead of hiding it, then roll back so the
    # connection can be used again.
    print('Error in SQL command, {}'.format(e))
    conn.rollback()
else:
    conn.commit()


In [30]:
# Walk the section headlines of the Wikipedia page and insert every bird
# listed under each family headline into the wiki table.

idx = 0
base = 'https://en.wikipedia.org'

# The span title is expected to look like "<label>: <description>"; keep the
# part after the first ": ".
status_info_re = re.compile(r': (.+)$')

for a in soup2.select('h2 span'):
    # .get() avoids a KeyError on <span> elements that carry no class at all.
    if a.get('class') != ['mw-headline']:
        continue
    idx += 1
    # Only headlines number 3 through 92 are processed as bird families.
    if not 3 <= idx <= 92:
        continue
    family = a.get_text()

    # The birds of a family are the items of the first <ul> after the heading.
    bird_list = a.parent.find_next_sibling('ul')
    if bird_list is None:
        continue

    for b in bird_list:
        # Iterating a tag yields its children; skip the bare text nodes
        # between the list items.
        if isinstance(b, NavigableString):
            continue

        name = b.next_element.get_text()
        name = re.sub('-', ' ', name)  # remove '-' in bird names, as they are irregular
        name = unidecode(str(name))  # force utf8 conversion to closest ASCII, using unidecode package
        link = base + b.a['href']
        sci_name = b.i.get_text()

        # Reset for every bird: otherwise an entry without a status <span>
        # would silently reuse the previous bird's status (or raise a
        # NameError if it is the very first entry).
        status = None
        status_info = None
        if b.span is not None:
            status = b.span.get_text()
            title = b.span.get('title')
            if title:
                info_match = status_info_re.search(title)
                # Fall back to the whole title when it has no ": " separator.
                status_info = info_match.group(1) if info_match else title

        try:
            cur.execute("INSERT INTO wiki (name, family, sci_name, status, status_info, link) VALUES (%s, %s, %s, %s, %s, %s);", (name, family, sci_name, status, status_info, link))
        except psycopg2.Error as e:
            print('Error in SQL command, {}'.format(e))
            conn.rollback()
        else:
            # Commit per row so that one failing insert does not discard the rest.
            conn.commit()


In [32]:
# Show 3 birds from the wiki table whose name contains 'warbler'
# (case-insensitive).
try:
    cur.execute("SELECT * FROM wiki WHERE LOWER(name) LIKE '%warbler%' LIMIT 3;")
except psycopg2.Error as e:
    # Say why nothing was printed, then roll back the aborted transaction.
    print('Error in SQL command, {}'.format(e))
    conn.rollback()
else:
    q2 = cur.fetchall()
    pp.pprint(q2)

[(726,
  'Willow warbler',
  'Leaf-warblers',
  'Phylloscopus trochilus',
  'LC',
  'Least Concern',
  'https://en.wikipedia.org/wiki/Willow_warbler'),
 (727,
  'Wood warbler',
  'Leaf-warblers',
  'Phylloscopus sibilatrix',
  'LC',
  'Least Concern',
  'https://en.wikipedia.org/wiki/Wood_warbler'),
 (728,
  'Dusky warbler',
  'Leaf-warblers',
  'Phylloscopus fuscatus',
  'LC',
  'Least Concern',
  'https://en.wikipedia.org/wiki/Dusky_warbler')]


### Merge bird family and wiki_id from the wiki table into the ebird table

In [44]:
# Update the 'family' and 'wiki_id' columns in the ebird table using the wiki
# table, matching rows on the bird name case-insensitively.

try:
    cur.execute("UPDATE ebird e SET (family, wiki_id) = (w.family, w.wiki_id) FROM wiki w WHERE LOWER(e.name) = LOWER(w.name);")
except psycopg2.Error as e:
    # Catch database errors only: a bare `except:` would also intercept
    # KeyboardInterrupt and SystemExit. Report the error before rolling back.
    print('Error in SQL command, {}'.format(e))
    conn.rollback()
else:
    conn.commit()


In [34]:
# Show 3 birds from the ebird table whose name contains 'warbler'
# (case-insensitive), now including the merged family and wiki_id columns.
try:
    cur.execute("SELECT * FROM ebird WHERE LOWER(name) LIKE '%warbler%' LIMIT 3;")
except psycopg2.Error as e:
    # Say why nothing was printed, then roll back the aborted transaction.
    print('Error in SQL command, {}'.format(e))
    conn.rollback()
else:
    q3 = cur.fetchall()
    pp.pprint(q3)

[(696, 'Arctic Warbler', 'Leaf-warblers', 2, 731),
 (770, "Bachman's Warbler", 'Wood-warblers', 6, 816),
 (800, 'Bay breasted Warbler', 'Wood-warblers', 1, 847)]


In [35]:
# Release the database resources: the cursor first, then the connection.
for handle in (cur, conn):
    handle.close()