#### 1.1 同時輸入科名+屬名查詢
- 如果查詢不到
  - 僅輸入屬名
    - 只有1筆，抓取唯一的高階分類資料
    - 多筆，抓取所有的高階分類資料


#### 1.2 只輸入屬名查詢
- 得到1筆資料
  - 科名不同  
- 得到多筆資料
  - 將多筆資料全部抓下
  - 資料儲存格式
    - 　{科　: {屬名: 分類資料1,2,3 } }
    - 科：為我們輸入的科名


##### 有多筆資料的抓取與儲存方式
- 如果屬名對應到多個 family，以 list 方式同時保存多個 classes

### 英國自然史博物館網站
[Butterflies and Moths of the World](https://www.nhm.ac.uk/our-science/data/butmoth/index.html)

In [3]:
import os, glob, time, argparse
from urllib import parse
import numpy as np 
import pandas as pd
# import requests
# from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep

In [122]:
# Command-line options: explicit-wait timeout, pause between actions, and resume offset.
parser = argparse.ArgumentParser()
_int_options = (
    ("--waittime", 20, "waiting time for 'WebDriverWait(driver, waittime)', default=20"),
    ("--sleeptime", 3, "sleep time between actions', default=3"),
    ("--start", 0, "if break,  start fron ith iteration , default=0"),
)
for _flag, _default, _help in _int_options:
    parser.add_argument(_flag, type=int, default=_default, help=_help)
# args = parser.parse_args()  # when run from a terminal
args = parser.parse_args(args=[])  # in Jupyter: parse an empty argument list instead of sys.argv
print(args)

Namespace(sleeptime=3, start=0, waittime=20)


In [123]:
# Options used when launching the Chrome browser
options = webdriver.ChromeOptions()
# options.add_argument("--start-maximized")         # maximise the window
options.add_argument("--incognito")                 # incognito mode
options.add_argument("--disable-popup-blocking")    # disable the pop-up blocker
options.add_argument('--disable-gpu')               # recommended by the official docs to avoid assorted bugs
options.add_argument('blink-settings=imagesEnabled=false')             # do not load images, for speed
options.add_experimental_option('excludeSwitches', ['enable-logging']) # suppress log output
options.add_argument('--headless')                  # run in the background without a visible browser window

# Runtime settings taken from the parsed command-line arguments
wait_time = args.waittime
sleep_time = args.sleeptime
start = args.start

# Build a Series indexed by Family whose values are Genus (one row per
# Family/Genus pair), for convenient iteration later
moth_meta = pd.read_csv('moth_meta.csv')
family_genus = (moth_meta.groupby(['Family','Genus']).count()
                .reset_index(level='Genus')['Genus'])
# print(len(family_genus)) # 4949

# Inspect genus names that map to more than one family
# mask = family_genus.duplicated(keep=False)
# family_genus[mask].sort_values()             # 47 rows
# moth_meta

In [124]:
#---------------------Define Functions--------------------------------------------------------------------------------------------------------------------------------

In [125]:
def get_url(family: str = '', genus: str = '') -> str:
    '''Return the GenusList3 search URL with FAMILY and GENUS as URL-encoded query parameters.'''
    base = 'https://www.nhm.ac.uk/our-science/data/butmoth/search/GenusList3.dsml?'
    params = {'FAMILY': family, 'GENUS': genus}
    return f'{base}{parse.urlencode(params)}'

In [126]:
def wait_class_element(class_: str = '', driver=None,
                       class_selector: str = '#microsite-body > table.dataTable_ms > tbody > tr:nth-child(3) > td > p') -> str:
    '''Poll the page until the classification element has text, and return that text.

    Meant to be called when the element was still empty after the fixed wait.

    Args:
        class_: text already read from the element; returned unchanged when non-empty.
        driver: WebDriver used to re-read the element; required when ``class_`` is empty.
        class_selector: CSS selector of the element to read. It is a parameter with a
            default because this function cannot see the caller's local selector variable.

    Returns:
        The first non-empty text read from the element.

    Raises:
        ValueError: polling is needed but no ``driver`` was given.
        Exception: the element is still empty after six one-second retries.
    '''
    if class_ != '':
        return class_
    if driver is None:
        raise ValueError("a driver is required to poll for the class element")

    attempts = 0
    while class_ == '':
        attempts += 1
        sleep(1)                         # wait, then read the element again
        class_ = driver.find_element(By.CSS_SELECTOR, class_selector).text
        if class_ != '':
            print(f'\t\tBreak loop')
            break
        print(f'\t\tWait again {attempts:2d}', end='\r')
        if attempts > 5:                 # give up and let the caller's handler deal with it
            class_ = "can't get classes"
            raise Exception(f"\tAssign classes as '{class_}'")
    return class_

In [127]:
def get_higherClassification(family: str = '', genus: str = '', url: str = None) -> str:
    '''Open the search ``url`` and return the "Higher classification" text for ``genus``.

    Uses the module-level ``driver``, ``wait`` and ``sleep_time``; they must be set
    before this function is called.

    Every search hit whose link text equals ``genus`` is opened in its own tab and the
    second line of its classification cell is collected. If one of the collected strings
    has ``family`` (upper-cased) as its second ':'-separated field, that string is
    returned (the last such match wins); otherwise the first collected string is returned.

    Args:
        family: family name we expect; only used to pick among several hits.
        genus: genus name to click in the result list.
        url: search URL to load.

    Returns:
        The classification string, e.g. 'NOCTUOIDEA : NOCTUIDAE : CALPINAE';
        'genus not exist' when the site shows its error message instead of results;
        None when a timeout, an index error or any other exception occurred
        (a message is printed in that case).
    '''
    assert(family!=''); assert(genus!=''); assert(url!='')
    class_selector = '#microsite-body > table.dataTable_ms > tbody > tr:nth-child(3) > td > p'
    try:
        driver.get(url)  # open the page
        print('\tLaunch url')

        # Wait for the "Accept cookies" banner and click it
        cookie_button = (By.CSS_SELECTOR, '#cookie-banner-form > button')
        wait.until(EC.element_to_be_clickable(cookie_button))
        driver.find_element(*cookie_button).click()
        print('\tClick "Accept cookies"')

        # Present only when the query returned records; otherwise NoSuchElementException
        # is raised and handled below
        getrecord = driver.find_element(By.CLASS_NAME, 'RecSetLoc')
        print(f'\t{getrecord.text}')

        # Open every link named after the genus in a new tab. The href is passed as a
        # script argument rather than spliced into the JavaScript source.
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, genus)))
        for link in driver.find_elements(By.LINK_TEXT, genus):
            driver.execute_script('window.open(arguments[0]);', link.get_attribute('href'))

        sleep(sleep_time)   # give the new tabs time to open
        driver.close()      # close the search-result tab, leaving only the detail tabs

        # Read 'Higher classification:\nSuperfamily : Family : SubFamily : Tribe' from each tab
        classes = []
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            sleep(sleep_time)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, class_selector)))
            sleep(sleep_time)             # a forced pause is needed before the text is available
            class_ = driver.find_element(By.CSS_SELECTOR, class_selector).text

            # Still empty: keep polling until the text shows up
            if class_ == '':
                class_ = wait_class_element(class_=class_, driver=driver)
            classes.append(class_.split('\n')[1])

        # Prefer the entry whose family equals ours; otherwise fall back to the first entry
        print('\tComparing family name ...')
        matched = None
        for candidate in classes:
            fields = candidate.split(':')
            # entries without a family field are skipped instead of aborting the whole lookup
            if len(fields) > 1 and fields[1].strip() == family.upper():
                matched = candidate
        classes = matched if matched is not None else classes[0]

        print(f'\tGot "{classes}" !')
        return classes

    except NoSuchElementException:    # an expected element is missing, e.g. the query found nothing
        error_selector = '#microsite-body > table > tbody > tr > td > p.msgError.center'
        try:
            # Confirm that the site is really showing its error message
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, error_selector)))
        except TimeoutException:
            # Neither a result nor the error message: report failure instead of
            # letting the timeout escape from this handler
            print(f"\tNo result and no error message for '{family} {genus}'")
            return None
        classes = "genus not exist"
        print(f"\tCan't find '{family} {genus}' Assign classes as '{classes}' !")
        return classes

    except IndexError as e:
        print(e)
        return None
    except TimeoutException:
        print("\t等待逾時，關閉瀏覽器")
        sleep(1)
        driver.quit()
        return None
    except Exception as e:
        print(e)
        return None

In [128]:
def write_file(i, family: str, genus: str, classes: str, path: str = './classification.csv') -> None:
    '''Append one comma-separated row ``i,family,genus,classes`` to the file at ``path``.

    Args:
        i: row number, written as the first field.
        family: family name.
        genus: genus name.
        classes: classification text; converted with ``str()``, so ``None`` is written as 'None'.
        path: output file, opened in append mode as UTF-8.
    '''
    row = ",".join([str(i), family, genus, str(classes)])
    with open(path, 'a', encoding='utf-8') as file:
        file.write(row + '\n')   # each row ends with a newline

In [129]:
#---------------------Define main()--------------------------------------------------------------------------------------------------------------------------------

In [15]:
def main():
    '''Query every Family/Genus pair from position ``start`` onward and append each result to the CSV.

    A new Chrome session is created for each pair and published through the
    module-level ``driver`` and ``wait`` names, which ``get_higherClassification`` reads.
    The session is always closed, even if the iteration raises.
    '''
    global driver, wait
    start_time = time.time()
    total = len(family_genus)
    for i, (family, genus) in enumerate(family_genus.iloc[start:].items(), start=start):
        print(f'{i}/{total}, Querying... {family}, {genus}')

        driver = webdriver.Chrome(options=options)
        wait = WebDriverWait(driver, wait_time)
        try:
            url = get_url(family='', genus=genus)    # search by genus name only
            classes = get_higherClassification(family=family, genus=genus, url=url)
            write_file(i, family, genus, classes)
            pass_ = time.time() - start_time
            print(f'Time Passed: {pass_//(60*60):3.0f}h,{pass_//60%60:2.0f}m,{pass_%60:2.0f}s.')
        finally:
            driver.quit()   # do not leave a browser behind when an iteration fails
    

In [None]:
# '''主程式'''
# if __name__ == '__main__':
#     main()

In [None]:
# =====檢視、整理抓取到的資料=========================================================================================================================================================================================

In [1]:
# Load the scraped rows; the file has no header line and its first column is the row number.
df = pd.read_csv("classification.csv", index_col=0, header=None)
df = df.rename(columns={1: 'ImgFamily', 2: 'Genus', 3: 'Classes'})
df.index.names = ['']
df[:20]

NameError: name 'pd' is not defined

78


Unnamed: 0,ImgFamily,Genus,Classes
,,,
11,Autostichidae,Spinitibia,genus not exist
22,Bombycidae,Rotunda,genus not exist
73,Cossidae,Polyphagozerra,genus not exist
206,Crambidae,Hodebertia,genus not exist
282,Crambidae,Poliobotys,genus not exist
...,...,...,...
4540,Sphingidae,Cerberonoton,genus not exist
4613,Sphingidae,Notonagemia,genus not exist
4643,Sphingidae,Pseudoangonyx,genus not exist


In [385]:
# Rows whose Classes value is the literal text 'None'
mask = df['Classes'].eq('None')  # alternative: df.Classes.str.contains('^[Nn]one')==True
none_rows = df[mask]
print(f"Number of 'None' : {none_rows.Classes.size}")
index_None = none_rows.index.values
none_rows

Number of 'None' : 0


Unnamed: 0,ImgFamily,Genus,Classes
,,,


In [383]:
# Re-query every row whose Classes is 'None' and store the new result in df
start_time = time.time()
total = len(family_genus)
for index in index_None:
    # NOTE(review): `index` is a row label of df and is also used as a position in
    # family_genus — this assumes the labels are the positions written by main(); confirm.
    # A one-item slice keeps this a no-op when the position is out of range.
    for family, genus in family_genus.iloc[index:index + 1].items():
        print(f'{index}/{total}, Querying... {family}, {genus}')

        driver = webdriver.Chrome(options=options)
        wait = WebDriverWait(driver, wait_time)
        try:
            url = get_url(family='', genus=genus)    # search by genus name only
            classes = get_higherClassification(family=family, genus=genus, url=url)
        finally:
            driver.quit()   # always close the browser, even when the query raises

        # Assign through .loc on df itself; assigning via df.iloc[...].Classes
        # would write to a temporary row copy
        df.loc[index, 'Classes'] = classes
        pass_ = time.time() - start_time
        print(f'Time Passed: {pass_//(60*60):3.0f}h,{pass_//60%60:2.0f}m,{pass_%60:2.0f}s.')
    

958/4949, Querying... Erebidae, Olulis
	Launch url
	Click "Accept cookies"
	1 record found
	Comparing family name ...
	Got "NOCTUOIDEA : NOCTUIDAE : CALPINAE" !
Time Passed:   0h, 0m,19s.
1195/4949, Querying... Erebidae, Tritonaclia
	Launch url
	Click "Accept cookies"
	1 record found
	Comparing family name ...
	Got "NOCTUOIDEA : ARCTIIDAE : Subfamily unassigned" !
Time Passed:   0h, 0m,39s.
1399/4949, Querying... Geometridae, Antitrygodes
	Launch url
	Click "Accept cookies"
	1 record found
	Comparing family name ...
	Got "GEOMETROIDEA : GEOMETRIDAE : STERRHINAE" !
Time Passed:   0h, 0m,59s.
1402/4949, Querying... Geometridae, Apithecia
	Launch url
	Click "Accept cookies"
	1 record found
	Comparing family name ...
	Got "GEOMETROIDEA : GEOMETRIDAE : LARENTIINAE" !
Time Passed:   0h, 1m,19s.
1452/4949, Querying... Geometridae, Brachyctenistis
	Launch url
	Click "Accept cookies"
	1 record found
	Comparing family name ...
	Got "GEOMETROIDEA : GEOMETRIDAE : ENNOMINAE" !
Time Passed:   0h, 1m

In [6]:
# df.to_csv("classification_210607.csv")
# Reload the saved snapshot, using its first column as the index
df = pd.read_csv("classification_210607.csv", index_col=0)
df.head(20)

Unnamed: 0,ImgFamily,Genus,Classes
0,Acanthopteroctetidae,Acanthopteroctetes,ACANTHOPTEROCTETOIDEA : ACANTHOPTEROCTETIDAE
1,Acrolophidae,Acrolophus,TINEOIDEA : TINEIDAE : ACROLOPHINAE
2,Acrolophidae,Amydria,TINEOIDEA : TINEIDAE : ACROLOPHINAE
3,Adelidae,Adela,ADELOIDEA : ADELIDAE : ADELINAE
4,Adelidae,Cauchas,ADELOIDEA : ADELIDAE : ADELINAE
5,Adelidae,Nematopogon,ADELOIDEA : ADELIDAE : NEMATOPOGONINAE
6,Adelidae,Nemophora,ADELOIDEA : ADELIDAE : ADELINAE
7,Alucitidae,Alucita,ALUCITOIDEA : ALUCITIDAE
8,Alucitidae,Pterotopteryx,ALUCITOIDEA : ALUCITIDAE
9,Apatelodidae,Apatelodes,BOMBYCOIDEA : BOMBYCIDAE : APATELODINAE


In [409]:
# Rows whose Classes text contains 'genus not exist' (missing values count as no match)
mask = df['Classes'].str.contains('genus not exist', na=False)
print(len(df.loc[mask, 'Genus']))
df.loc[mask].sort_values('Genus')

78


Unnamed: 0,ImgFamily,Genus,Classes
2789,Noctuidae,Agrocholorta,genus not exist
2791,Noctuidae,Albocosta,genus not exist
2829,Noctuidae,Apterogenum,genus not exist
493,Erebidae,Ataboruza,genus not exist
2446,Limacodidae,Avatara,genus not exist
...,...,...,...
2485,Limacodidae,Thespea,genus not exist
2489,Limacodidae,Vanlangia,genus not exist
3401,Noctuidae,Viridiseptis,genus not exist
3402,Noctuidae,Viridistria,genus not exist


In [332]:
# Split the higher-classification text into one named column per ':'-separated field
rank_names = ['Superfamily', 'Family', 'Subfamily', 'Tribe', 'Subtribe']
normalized = df.Classes.str.title().str.replace(' ', '')
df_hc = (normalized.str.split(':', expand=True)   # one column per field
         .rename(columns=dict(enumerate(rank_names))))
df_hc

Unnamed: 0,Superfamily,Family,Subfamily,Tribe,Subtribe
,,,,,
0,Acanthopteroctetoidea,Acanthopteroctetidae,,,
1,Tineoidea,Tineidae,Acrolophinae,,
2,Tineoidea,Tineidae,Acrolophinae,,
3,Adeloidea,Adelidae,Adelinae,,
4,Adeloidea,Adelidae,Adelinae,,
...,...,...,...,...,...
4471,Bombycoidea,Saturniidae,Saturniinae,,
4472,Bombycoidea,Saturniidae,Hemileucinae,,
4473,Bombycoidea,Saturniidae,Ceratocampinae,,


In [4]:
# Put Genus and ImgFamily next to the split classification columns (aligned on the index)
df_hc = pd.concat([df_hc, df.loc[:, ['Genus', 'ImgFamily']]], axis=1)
df_hc.head(20)

NameError: name 'df_hc' is not defined

In [369]:
# Inspect rows where the Family column differs from ImgFamily, counted per ImgFamily
mask = df_hc['Family'].ne(df_hc['ImgFamily'])
df_FamilyWrong = df_hc.loc[mask].copy()
genus_counts = df_FamilyWrong.groupby(['ImgFamily'])['Genus'].count()
genus_counts.sort_values().iloc[::-1]   # ascending sort, then reversed

ImgFamily
Erebidae            801
Lycaenidae           40
Noctuidae            34
Geometridae          18
Endromidae           15
Oecophoridae         10
Gelechiidae           8
Nolidae               8
Elachistidae          7
Limacodidae           6
Pyralidae             5
Nymphalidae           5
Cossidae              4
Notodontidae          3
Crambidae             2
Roeslerstammidae      2
Acrolophidae          2
Mimallonidae          2
Chrysopolomidae       1
Eriocottidae          1
Bombycidae            1
Autostichidae         1
Apatelodidae          1
Saturniidae           1
Ethmiidae             1
Euteliidae            1
Glyphipterigidae      1
Lacturidae            1
Lecithoceridae        1
Lypusidae             1
Momphidae             1
Plutellidae           1
Riodinidae            1
Hesperiidae           1
Name: Genus, dtype: int64

In [310]:
# Family name error: 'Roeslerstammiidae'
# was mistyped as    'Roeslerstammidae'
df_hc.loc[df_hc['Family'] == 'Roeslerstammiidae']

Unnamed: 0,Superfamily,Family,Subfamily,Tribe,Subtribe,Genus,ImgFamily
,,,,,,,
4383.0,Gracillarioidea,Roeslerstammiidae,,,,Agriothera,Roeslerstammidae
4384.0,Gracillarioidea,Roeslerstammiidae,,,,Telethera,Roeslerstammidae
4385.0,Gracillarioidea,Roeslerstammiidae,,,,Roeslerstammia,Roeslerstammiidae


In [312]:
# Metadata rows whose Family contains 'Roeslerst' (matches both spellings)
mask = moth_meta['Family'].str.contains('Roeslerst')
moth_meta.loc[mask]

Unnamed: 0,Family,Genus,Species,DataID,SciName,Family_encode,Fam_Sample,Fam_Specie,Family_encodeYY
8986,Roeslerstammiidae,Roeslerstammia,erxlebella,3003689801_TAM_639_0.jpg,Roeslerstammia erxlebella,78,1,1,36
9887,Roeslerstammidae,Telethera,blepharacma,A43-20160713-132_TESRI_0.jpg,Telethera blepharacma,77,4,2,55
12916,Roeslerstammidae,Agriothera,issikii,A52-20160601-205_TESRI_0.jpg,Agriothera issikii,77,4,2,55
21503,Roeslerstammidae,Telethera,blepharacma,A58-20171125-072_TESRI_0.jpg,Telethera blepharacma,77,4,2,55
31937,Roeslerstammidae,Agriothera,issikii,V04-20130606-191_TESRI_0.jpg,Agriothera issikii,77,4,2,55


In [324]:
# Sorted array of the distinct family names in the metadata
np.sort(moth_meta['Family'].unique())

array(['Acanthopteroctetidae', 'Acrolophidae', 'Adelidae', 'Alucitidae',
       'Apatelodidae', 'Argyresthiidae', 'Autostichidae',
       'Batrachedridae', 'Blastobasidae', 'Bombycidae', 'Brachodidae',
       'Brahmaeidae', 'Bucculatricidae', 'Callidulidae', 'Carposinidae',
       'Castniidae', 'Chimabachidae', 'Choreutidae', 'Chrysopolomidae',
       'Coleophoridae', 'Cosmopterigidae', 'Cossidae', 'Crambidae',
       'Drepanidae', 'Elachistidae', 'Endromidae', 'Epermeniidae',
       'Epicopeiidae', 'Epipyropidae', 'Erebidae', 'Eriocottidae',
       'Eriocraniidae', 'Ethmiidae', 'Eupterotidae', 'Euteliidae',
       'Galacticidae', 'Gelechiidae', 'Geometridae', 'Glyphipterigidae',
       'Gracillariidae', 'Hedylidae', 'Heliodinidae', 'Heliozelidae',
       'Hepialidae', 'Hesperiidae', 'Hyblaeidae', 'Immidae',
       'Incurvariidae', 'Lacturidae', 'Lasiocampidae', 'Lecithoceridae',
       'Lemoniidae', 'Limacodidae', 'Lycaenidae', 'Lyonetiidae',
       'Lypusidae', 'Megalopygidae', 'Micr