In [2]:
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import re
import pdb

# Fetch the page
url = "https://en.wikipedia.org/wiki/List_of_reptiles_of_Northern_America"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page,
# which would silently produce an empty or meaningless result further down.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')


In [85]:
# Output schema: taxonomy levels first, then the per-species fields.
columns = [
    'Order',
    'Suborder',
    'Family',
    'Subfamily',
    'Species',
    'Common_Name',
    'Conservation_Status',
    'Endangered_Species_Status',
    'Alien_Status',
    'Raw',
]

# Empty frame with that schema; parsed rows are appended to it later.
df = pd.DataFrame(columns=columns)

def extract_species_from_i(el):
    """Return the plain text of the first ``<i>...</i>`` element found in ``el``.

    Parameters
    ----------
    el : object
        Anything whose ``str()`` is an HTML fragment (a BeautifulSoup tag or a
        plain string).

    Returns
    -------
    str
        The text inside the first italic element, with any nested markup
        (for example a wrapping ``<a>`` link) removed, the
        "(page does not exist)" marker dropped and surrounding whitespace
        stripped. An empty string is returned when there is no ``<i>`` element.
    """
    markup = str(el)
    # Accept attributes on the opening tag and content that spans lines.
    # ``\b`` keeps tags such as <img> from being mistaken for <i>.
    match = re.search(r'<i\b[^>]*>(.*?)</i>', markup, flags=re.DOTALL)
    if match is None:
        return ""

    # The italic text is often wrapped in a link; keep only the visible text
    # so callers never receive raw HTML as a species name.
    inner_text = re.sub(r'<[^>]+>', '', match.group(1))
    return inner_text.replace(r"(page does not exist)", "").strip()

def extract_species(text):
    """Return the content of the first parenthesised group in ``text``.

    A group that itself contains one level of nested parentheses is preferred;
    if none exists, the first simple ``(...)`` group is used. The
    "(page does not exist)" marker is removed from the result, which is then
    stripped. Returns an empty string when ``text`` has no parenthesised group.
    """
    patterns = (
        r'\(([^()]*\([^()]*\)[^()]*)\)',  # group containing one nested (...) pair
        r'\(([^()]*)\)',                  # plain group without nesting
    )
    for pattern in patterns:
        found = re.search(pattern, text)
        if found:
            inner = found.group(1)
            return inner.replace(r'(page does not exist)', '').strip()
    return ""

# Map a conservation status code found in the text to its full label
def extract_conservation_status(text):
    """Return the label of the first known status code present in ``text``.

    ``text`` is split on single spaces and a code only counts when it equals a
    whole token. Codes are tried in the order listed below, not in the order
    they appear in ``text``. Returns an empty string when no code is present.
    """
    code_labels = (
        ('EX', 'Extinct'),
        ('EW', 'Extinct in the Wild'),
        ('CR', 'Critically Endangered'),
        ('EN', 'Endangered'),
        ('VU', 'Vulnerable'),
        ('NT', 'Near Threatened'),
        ('LC', 'Least Concern'),
        ('DD', 'Data Deficient'),
        ('NE', 'Not Evaluated'),
    )
    tokens = text.split(' ')
    return next((label for code, label in code_labels if code in tokens), '')
# Map an endangered-species status code found in the text to its full label
def extract_endangered_status(text):
    """Return the label of the first known endangered-status code in ``text``.

    ``text`` is split on single spaces and a code only counts when it equals a
    whole token (so ``T(S/A)`` is not mistaken for ``T``). Codes are tried in
    the order listed below. Returns an empty string when no code is present.
    """
    code_labels = (
        ('E', 'Endangered'),
        ('T', 'Threatened'),
        ('XN', 'Experimental Non-Essential'),
        ('XE', 'Experimental Essential'),
        ('E(S/A)', 'Endangered due to similarity of appearance'),
        ('T(S/A)', 'Threatened due to similarity of appearance'),
    )
    tokens = text.split(' ')
    return next((label for code, label in code_labels if code in tokens), '')

def _first_link_title(li):
    """Return the cleaned ``title`` of the first ``<a>`` in ``li``, or None.

    None means the item has no link, or its first link has no ``title``
    attribute, so the caller can keep the name it already has.
    """
    link = li.find('a')
    title = link.get("title") if link is not None else None
    if title is None:
        return None
    return title.replace(r"(page does not exist)", "").strip()


def _row_from_li(li, current_hierarchy):
    """Build the row dict for one leaf ``<li>`` (an item without a nested list)."""
    # Collapse all whitespace runs and drop the red-link marker.
    species_text = ' '.join(li.text.split()).\
        replace(r'(page does not exist)', '').strip()

    # The common name is whatever precedes the first " (" in the item text.
    parts = species_text.split(' (')
    _common_name = parts[0].strip() if len(parts) > 1 else ''

    # Scientific name: parenthesised text first, then the italic element,
    # then the title of the first link.
    _species = extract_species(species_text)
    if _species == _common_name or _species == '' or ":" in _species:
        _species = extract_species_from_i(li)
        if _species == _common_name or _species == '':
            title = _first_link_title(li)
            if title is not None:
                _species = title
    # A single word, or text with these characters / the word "snake", is not
    # accepted as a scientific name; use the first link's title when there is one.
    if len(_species.split(' ')) == 1 or ":" in _species or "[" in _species \
            or "'" in _species or 'snake' in _species:
        title = _first_link_title(li)
        if title is not None:
            _species = title

    return {**current_hierarchy,
            "Species": _species,
            "Common_Name": _common_name,
            "Conservation_Status": extract_conservation_status(species_text),
            "Endangered_Species_Status": extract_endangered_status(species_text),
            # An asterisk in the item text is recorded as alien status.
            "Alien_Status": '*' in species_text,
            "Raw": species_text}


def _rows_from_ul(el, current_hierarchy):
    """Collect one row dict per leaf ``<li>`` under ``el``, in document order."""
    rows = []
    # recursive=False: look only at the direct items of this list. Nested
    # lists are reached through the explicit recursion below, so each item
    # is visited exactly once.
    for li in el.find_all('li', recursive=False):
        sub_els = [child for child in li.children
                   if isinstance(child, Tag) and child.name == 'ul']
        if sub_els:
            # An item holding a nested list yields no row of its own; only
            # the items of its nested list(s) do.
            for sub_ul in sub_els:
                rows.extend(_rows_from_ul(sub_ul, current_hierarchy))
        else:
            rows.append(_row_from_li(li, current_hierarchy))
    return rows


def update_df_from_ul(el, current_hierarchy, df):
    """Append one row per leaf list item of ``el`` to ``df`` and return the result.

    Parameters
    ----------
    el : bs4.element.Tag
        The ``<ul>`` element to read. Nested ``<ul>`` elements are followed.
    current_hierarchy : dict
        Values copied into every new row (Order, Suborder, Family, ...).
    df : pandas.DataFrame
        Frame holding the rows collected so far. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the new rows, using the module-level ``columns``
        order. ``df`` itself is returned when ``el`` has no leaf items.
    """
    rows = _rows_from_ul(el, current_hierarchy)
    if not rows:
        return df
    # Build the new rows in one go and concatenate once, instead of once per row.
    new_rows = pd.DataFrame(rows, columns=columns)
    return pd.concat([df, new_rows], ignore_index=True)

# Walk every "Order: ..." section of the page and collect its species lists.
orders = [h2 for h2 in soup.find_all('h2') if re.search(r'id="Order:_[^"]*"', str(h2))]
for order in orders:
    _order = order.text.replace('[edit]', '').strip().replace('Order: ', '')
    # Each order starts with a clean hierarchy so nothing carries over from
    # the previous order.
    current_hierarchy = {'Order': _order, 'Suborder': '', 'Family': '', 'Subfamily': '',
                     'Species': '', 'Common_Name': '', 'Conservation_Status': '',
                     'Endangered_Species_Status':'', 'Raw':''}
    for sibling in order.find_next_siblings():
        # The next <h2> marks the end of this order's section.
        if sibling.name == "h2":
            break
        if isinstance(sibling, (NavigableString, Tag)):
            if re.search(r'id="Suborder:_[^"]*"', str(sibling)):
                _suborder = sibling.text.replace('[edit]', '').strip().replace('Suborder: ', '')
                current_hierarchy['Suborder'] = _suborder
                # A new suborder invalidates the family and subfamily seen so far.
                current_hierarchy['Family'] = ''
                current_hierarchy['Subfamily'] = ''
            elif re.search(r'id="Family:_[^"]*"', str(sibling)):
                _family = sibling.text.replace('[edit]', '').strip().replace('Family: ', '')
                current_hierarchy["Family"] = _family
                # A new family must not inherit the previous family's subfamily.
                current_hierarchy["Subfamily"] = ''
            elif re.search(r'Subfamily:', sibling.text):
                if sibling.name == "p":
                    # A paragraph that only names the subfamily.
                    _subfamily = sibling.text.replace('[edit]', '').strip().replace('Subfamily: ', '')
                    current_hierarchy["Subfamily"] = _subfamily
                else:
                    # A container mixing subfamily labels and species lists:
                    # process its children in order so each list picks up the
                    # label that precedes it.
                    for children in sibling.children:
                        if re.search(r'Subfamily: <b>', children.text) or \
                            re.search(r'Subfamily: <b>', str(children)):
                            _subfamily = children.text.replace('[edit]', '').strip().replace('Subfamily: ', '')
                            current_hierarchy["Subfamily"] = _subfamily
                        elif isinstance(children, (NavigableString, Tag)):
                            if children.name == 'ul':
                                df = update_df_from_ul(children, current_hierarchy, df)
                            else:
                                # Lists may sit one level deeper inside a wrapper tag.
                                if isinstance(children, (Tag)):
                                    for children_2 in children.children:
                                        if isinstance(children_2, (NavigableString, Tag)):
                                            if children_2.name == 'ul':
                                                df = update_df_from_ul(children_2, current_hierarchy, df)
            else:
                if sibling.name == 'ul':
                    df = update_df_from_ul(sibling, current_hierarchy, df)
                else:
                    # Otherwise look for lists among the direct children.
                    for children in sibling.children:
                        if isinstance(children, (NavigableString, Tag)):
                            if children.name == 'ul':
                                df = update_df_from_ul(children, current_hierarchy, df)

# Save the DataFrame to a CSV file
# The path is relative to the notebook's working directory.
# NOTE(review): assumes the ../data directory already exists - confirm.
output_path = '../data/reptiles_of_northern_america_parsed.csv'
# index=False leaves the row index out of the file.
df.to_csv(output_path, index=False)
# Last expression of the cell: displays the first rows as a quick preview.
df.head()

Anolis distichus dominicensis
Anolis distichus ignigularis
Anolis distichus dominicensis
Anolis distichus ignigularis
Phyllodactylus nocticolus
Sonora annulata
Sonora annulata
Sonora palarostris
Nerodia erythrogaster
Thamnophis sirtalis


Unnamed: 0,Order,Suborder,Family,Subfamily,Species,Common_Name,Conservation_Status,Endangered_Species_Status,Alien_Status,Raw
0,Crocodilia,,Alligatoridae,,Alligator mississippiensis,American alligator,Least Concern,Threatened due to similarity of appearance,False,American alligator (Alligator mississippiensis...
1,Crocodilia,,Alligatoridae,,Caiman crocodilus,Spectacled caiman,Least Concern,,True,Spectacled caiman (Caiman crocodilus) * LC
2,Crocodilia,,Crocodylidae,Crocodylinae,Crocodylus acutus,American crocodile,Vulnerable,Endangered,False,American crocodile (Crocodylus acutus) VU E (F...
3,Testudines (turtles),Pleurodira,Podocnemididae (side-necked turtles),,Podocnemis unifilis,Yellow-spotted Amazon River turtle,,,True,Yellow-spotted Amazon River turtle (Podocnemis...
4,Testudines (turtles),Cryptodira,Testudinidae (tortoises),,Gopherus polyphemus,Gopher tortoise,Vulnerable,Threatened,False,Gopher tortoise (Gopherus polyphemus) VU T


In [65]:
# Manual check of extract_species_from_i on a sample list item. The sample has
# an unclosed "(" before the italic scientific name, and the call is expected
# to return just that name.
extract_species_from_i('<li><a href="/wiki/Marbled_whiptail" title="Marbled whiptail">Marbled whiptail</a> (<i>Aspidoscelis marmorata</i></li>')

'Aspidoscelis marmorata'