In [230]:
import os, glob, time, argparse
from urllib import parse
import numpy as np 
import pandas as pd
import requests

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException, MaxRetryError
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep



In [231]:
parser = argparse.ArgumentParser()
parser.add_argument("--waittime", type=int, default=10, help="waiting time for 'WebDriverWait(driver, waittime)', default=10")
# args = parser.parse_args()
args = parser.parse_args(args=[]) # in jupyter
print(args)

Namespace(waittime=10)


In [232]:
# 啟動瀏覽器工具的選項
options = webdriver.ChromeOptions()
# options.add_argument("--headless")                #不開啟實體瀏覽器背景執行
# options.add_argument("--start-maximized")         #最大化視窗
options.add_argument("--incognito")               #開啟無痕模式
options.add_argument("--disable-popup-blocking ") #禁用彈出攔截

# 使用 Chrome 的 WebDriver (含 options)
driver = webdriver.Chrome(options = options)
waittime = 10
wait = WebDriverWait(driver, waittime)

In [233]:
#---------------------Define Fuctions--------------------------------------------------------------------------------------------------------------------------------

In [235]:
def get_url(family: str = None, genus: str = None) -> str :
    '''input query targets,'''
    family = family
    genus = genus
    website = 'https://www.nhm.ac.uk/our-science/data/butmoth/search/GenusList3.dsml?'
    query ={
        'searchPageURL':'index.dsml',
         'SUPERFAMIL':'',
        'FAMILYqtype':'starts+with',
        'FAMILY':family,
        'SUBFAMILYqtype':'starts+with',
        'SUBFAMILY':'',
        'TRIBEqtype':'starts+with',
        'TRIBE':'',
        'SUBTRIBEqtype':'starts+with',
        'SUBTRIBE':'',
        'GENUSqtype':'starts+with',
        'GENUS':genus,
        'AUTHORqtype':'starts+with',
        'AUTHOR':'',
        'YEARqtype':'equals',
        'YEAR':'',
        'sort':'GENUS'}
    query_url = website + parse.urlencode(query)
    return query_url

In [269]:
def get_higherClassification(url: str = None) -> str: 
    try:
        driver.get(url)  # 開啟網頁
        print('Lauch url')

        # 等待Accept cookies畫面跳出，並按下 
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#cookie-banner-form > button')))
        driver.find_element(By.CSS_SELECTOR, '#cookie-banner-form > button').click()
        print('Click "Accept cookies"')

        # 輸入的資料查詢到結果
        getrecord  = driver.find_element(By.CLASS_NAME, 'RecSetLoc')
        print(getrecord.text)

        # 等待查詢物種畫面跳出、按下欲查詢的屬名
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, genus)))
        driver.find_element(By.LINK_TEXT, genus).click()                  # 直接用屬名點選連結
        print(f'Click {genus}')

        # 等待屬名詳細頁面跳出，抓取Higher classification欄位內的文字 'Higher classification:\nSuperfamily : Family : SubFamily : Tribe'
        class_selector = '#microsite-body > table.dataTable_ms > tbody > tr:nth-child(3) > td > p'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, class_selector)))
        sleep(2)                             # 這邊需要強制等待一段時間才能抓到
        class_element = driver.find_element(By.CSS_SELECTOR, class_selector)   
        classes = driver.find_element(By.CSS_SELECTOR, class_selector).text.split('\n')

        c=0
        while classes[0] == '':              # 當強制等待後，target還是沒有抓到想要的內容時，送入迴圈處理
            c+=1
            sleep(2)                         # 強制等待後重新補抓
            classes = driver.find_element(By.CSS_SELECTOR, class_selector).text.split('\n')  
            if classes[0] == '':
                print(f'Wait again {c:2d}', end='\r')
                if c>5:                      # 迴圈重複執行n次還是沒結果的話，強制引發異常，並執行
                    classes = 'nan'
                    raise Exception('Assign classes as "None"')
                continue                     # 如果還是沒抓到的話，再回到迴圈等待
            else:
                print('Break loop')
                break
        classes = classes[1]
        print(f'Get {classes} !')
        return classes

    except NoSuchElementException as exc:    # 攔截查詢不到的錯誤訊息 
        error_selector = '#microsite-body > table > tbody > tr > td > p.msgError.center'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, error_selector)))  # 等待目標元素出現
        error_element = driver.find_element(By.CSS_SELECTOR, error_selector)
        classes = 'nan'
        print(f"Can't find '{family} {genus}' Assign classes as nan!")
        return classes

    except IndexError as e:
        print(e)
    except TimeoutException:
        print("等待逾時，關閉瀏覽器")
        sleep(3)
        driver.quit()
    except Exception as e:
        print(e)

In [238]:
def write_file(family: str, genus: str, classes: str) -> None:
    with open('./classification.csv','a') as file:
        file.write(",".join([family ,genus, classes]))

In [None]:
#---------------------Define main()--------------------------------------------------------------------------------------------------------------------------------

In [239]:
# 建立科名:屬名對應的Series資料集，方便後續取用
moth_meta = pd.read_csv(f'moth_meta.csv')
family_genus = (moth_meta.groupby(['Family','Genus']).count()
                .reset_index(level='Genus')['Genus'])
# print(len(family_genus)) # 4949

In [244]:
start = time.time()

for i, (family, genus) in enumerate(family_genus.items()) :
    pass_ = time.time() - start
    print(f'{i:4d}, {family:15s} : {genus:10s}.  Time Passed: {pass_//(60*60):3.0f}h,{pass_//60%60:2.0f}m,{pass_%60:2.0f}s')
    url = get_url(family=family, genus=genus)
    classes = get_higherClassification(url=url)
    print(f'Get classes: {classes}')
    
    sleep(2)
    driver.quit()

    if i == 3: break

   0, Acanthopteroctetidae : Acanthopteroctetes.  Time Passed:   0h, 0m, 0s
something wrong!
關閉瀏覽器
Get classes: None


TypeError: sequence item 2: expected str instance, NoneType found

In [252]:
classes

In [251]:
print(f'Get classes: {classes}')

Get classes: None


In [268]:
get_higherClassification(url=url)

something wrong! HTTPConnectionPool(host='127.0.0.1', port=2509): Max retries exceeded with url: /session/80f7f530fa7b565fa119092165306b23/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001E39A847970>: Failed to establish a new connection: [WinError 10061] 無法連線，因為目標電腦拒絕連線。'))


MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=2509): Max retries exceeded with url: /session/80f7f530fa7b565fa119092165306b23/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001E39A85BD30>: Failed to establish a new connection: [WinError 10061] 無法連線，因為目標電腦拒絕連線。'))