[Python中re的match、search、findall、finditer区别](https://blog.csdn.net/djskl/article/details/44357389)
- search
  - 若string中包含pattern子串，则返回Match对象，否则返回None，注意，如果string中存在多个pattern子串，只返回第一个。
  - group()：母串中与模式pattern匹配的子串；group(0)：结果与group()一样；


In [2]:
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import numpy as np


In [None]:
# Pre-compiled patterns used by the urllib-based scraping loop.
#
# prog1: on the genus search-result page, a hit shows up as a link whose target
# starts with "GenusDetails.dsml?...". Group 1 is the link target up to (but not
# including) the first "&", e.g. 'GenusDetails.dsml?NUMBER=136.0'.
prog1 = re.compile(r'(GenusDetails\.dsml\?.+?)&')

# prog2: on the genus detail page the higher classification looks like
#   <B>Higher classification:</B><br />\r\n\t\tGEOMETROIDEA : GEOMETRIDAE : LARENTIINAE</changecase>
# Group 1 is the text between the leading whitespace and "</changecase>".
prog2 = re.compile(r'\s+(.+?)</changecase>')

# Cache of results: genus name -> higher classification string (or "Unknown").
genus_higher_taxa = dict()


In [4]:
# Load the moth metadata table from the working directory.
moth_meta = pd.read_csv('moth_meta.csv')

# Group by (Family, Genus), then move the Genus level out of the index so the
# result is a Series of genus names indexed by Family.
family_genus = (
    moth_meta
    .groupby(['Family', 'Genus'])
    .count()
    .reset_index(level='Genus')
    ['Genus']
)

# Distinct genus names, taken from the table sorted by Genus.
genus_list = list(moth_meta.sort_values('Genus')['Genus'].unique())

In [10]:
# Look up the higher classification of every genus in `genus_list` using
# urllib + regular expressions, and store it in `genus_higher_taxa`.
#
# For each genus not already cached:
#   1. query the genus search page;
#   2. if no "GenusDetails.dsml" link is found, record "Unknown";
#   3. otherwise fetch the detail page and record the text captured by `prog2`
#      with all spaces removed.
# Network failures are retried up to MAX_ATTEMPTS times; a genus that still
# fails (or whose detail page cannot be parsed) is left out of the cache so a
# later re-run of this cell tries it again.
import http.client
import urllib.parse

SEARCH_BASE_URL = "https://www.nhm.ac.uk/our-science/data/butmoth/search/"
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 10
REQUEST_TIMEOUT_SECONDS = 30

for genus in genus_list:
    if genus in genus_higher_taxa:
        print("Ignoring", genus)
        continue

    print("Fetching", genus)
    # Quote the genus so spaces / non-ASCII characters cannot break the URL.
    q1 = (f"{SEARCH_BASE_URL}GenusList3.dsml?GENUSqtype=equals"
          f"&GENUS={urllib.parse.quote(str(genus))}")

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            with urllib.request.urlopen(q1, timeout=REQUEST_TIMEOUT_SECONDS) as response1:
                # read() returns bytes; decode to str before regex matching.
                html1 = response1.read().decode('utf-8')
            # group(1) is e.g. 'GenusDetails.dsml?NUMBER=136.0'
            # (group(0) would additionally contain the trailing '&').
            s1 = prog1.search(html1)
            if s1 is None:
                genus_higher_taxa[genus] = "Unknown"
                break
            q2 = SEARCH_BASE_URL + s1.group(1)
            with urllib.request.urlopen(q2, timeout=REQUEST_TIMEOUT_SECONDS) as response2:
                html2 = response2.read().decode('utf-8')
        except (OSError, http.client.HTTPException, UnicodeDecodeError) as err:
            # URLError / HTTPError / timeouts are OSError subclasses. Only
            # transient I/O problems are retried; KeyboardInterrupt and
            # programming errors propagate.
            print(f"\tAttempt {attempt}/{MAX_ATTEMPTS} failed: {err!r}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
            continue

        s2 = prog2.search(html2)
        if s2 is None:
            # Retrying cannot fix an unexpected page layout, so report and move on.
            print(f"\tCould not parse higher classification from {q2}")
        else:
            genus_higher_taxa[genus] = s2.group(1).replace(' ', '')
        break
    else:
        print(f"\tGiving up on {genus} after {MAX_ATTEMPTS} failed attempts")

Fetching Abaciscus
Fetching Abaeis
Fetching Abagrotis
Fetching Abegesta
Fetching Abisara
Fetching Abraxas
Fetching Abrochia
Fetching Abrostola
Fetching Abrota
Fetching Acada
Fetching Acantholipes
Fetching Acanthopteroctetes
Fetching Acasis
Fetching Acentria


KeyboardInterrupt: 

In [8]:
# Dump the genus -> higher-classification cache to 'genus_to_higher.csv'
# as a tab-separated two-column table (no index column).
pd.DataFrame(
    {
        'genus': list(genus_higher_taxa),
        'higher': list(genus_higher_taxa.values()),
    }
).to_csv('genus_to_higher.csv', sep='\t', index=False)

In [1]:
# ===採用requests + BeautifulSoup的寫法=================================================================================================================

In [6]:
# Start from an empty genus -> higher-classification cache.
genus_higher_taxa = {}

# Load the moth metadata table from the working directory.
moth_meta = pd.read_csv('moth_meta.csv')

# Group by (Family, Genus), then move the Genus level out of the index so the
# result is a Series of genus names indexed by Family.
family_genus = (
    moth_meta
    .groupby(['Family', 'Genus'])
    .count()
    .reset_index(level='Genus')
    ['Genus']
)

# Distinct genus names, taken from the table sorted by Genus.
genus_list = list(moth_meta.sort_values('Genus')['Genus'].unique())

In [9]:
# Display the Series of genus names indexed by Family (cell output follows).
family_genus

Family
Acanthopteroctetidae    Acanthopteroctetes
Acrolophidae                    Acrolophus
Acrolophidae                       Amydria
Adelidae                             Adela
Adelidae                           Cauchas
                               ...        
Zygaenidae                         Pryeria
Zygaenidae                      Pyromorpha
Zygaenidae                        Rhagades
Zygaenidae                         Soritia
Zygaenidae                         Zygaena
Name: Genus, Length: 4949, dtype: object

In [7]:
# Look up the higher classification of every genus in `family_genus` using
# requests + BeautifulSoup, and store it in `genus_higher_taxa`.
#
# For each genus not already cached:
#   1. query the genus search page;
#   2. if no "GenusDetails.dsml" link is found, record "Unknown";
#   3. otherwise fetch every matching detail page and record the text of the
#      third table row (spaces removed, title-cased).
# NOTE: when several genus records are found, the last one parsed wins. A
# possible next step is to compare against `family` to pick the right record.
# Network/HTTP failures are retried up to MAX_ATTEMPTS times; a genus that
# still fails is left out of the cache so a later re-run tries it again.
SEARCH_BASE_URL = "https://www.nhm.ac.uk/our-science/data/butmoth/search/"
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 10
REQUEST_TIMEOUT_SECONDS = 30

# Compiled once, outside the loops. The first pattern counts every detail link;
# the second selects the links that are actually followed.
detail_link_pattern = re.compile(r"GenusDetails.dsml")
detail_link_with_query_pattern = re.compile(r"(GenusDetails.dsml\?.+?)&")

# `family` is currently unused; it is kept for the planned family comparison.
for family, genus in family_genus.items():
    if genus in genus_higher_taxa:
        print("Ignoring", genus)
        continue

    print("Fetching", genus)
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # `params` lets requests build and encode the query string.
            response = requests.get(
                SEARCH_BASE_URL + "GenusList3.dsml",
                params={"GENUSqtype": "equals", "GENUS": genus},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            # Treat HTTP error statuses as failures to retry instead of
            # parsing an error page as "no records".
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "lxml")  # use lxml as the parser
            genus_search = soup.find_all(href=detail_link_pattern)
            print(f'\tNumber of genus record found：{len(genus_search)}')
            if not genus_search:
                genus_higher_taxa[genus] = "Unknown"
                break

            for link in soup.find_all(href=detail_link_with_query_pattern):
                q2 = SEARCH_BASE_URL + link.get('href')
                response2 = requests.get(q2, timeout=REQUEST_TIMEOUT_SECONDS)
                response2.raise_for_status()
                soup2 = BeautifulSoup(response2.text, "lxml")
                # CSS selector for the third row of the data table.
                rows = soup2.select("#microsite-body > table.dataTable_ms > tr:nth-child(3)")
                try:
                    raw_taxa = rows[0].text.split('\r\n\t\t')[1].split('\n')[0]
                except IndexError:
                    # Retrying cannot fix an unexpected page layout, so skip this record.
                    print(f"\tCould not parse higher classification from {q2}")
                    continue
                genus_higher_taxa[genus] = raw_taxa.replace(' ', '').title()
            break
        except requests.exceptions.RequestException as err:
            # Only network/HTTP problems are retried; KeyboardInterrupt and
            # programming errors propagate.
            print(f"\tAttempt {attempt}/{MAX_ATTEMPTS} failed: {err!r}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
    else:
        print(f"\tGiving up on {genus} after {MAX_ATTEMPTS} failed attempts")

Fetching Abaciscus
	Number of genus record found：1
Fetching Abaeis
	Number of genus record found：1
Fetching Abagrotis
	Number of genus record found：1
Fetching Abegesta
	Number of genus record found：1
Fetching Abisara
	Number of genus record found：1
Fetching Abraxas
	Number of genus record found：1
Fetching Abrochia
	Number of genus record found：1
Fetching Abrostola
	Number of genus record found：1
Fetching Abrota
	Number of genus record found：1
Fetching Acada
	Number of genus record found：1
Fetching Acantholipes
	Number of genus record found：1
Fetching Acanthopteroctetes
	Number of genus record found：1
Fetching Acasis
	Number of genus record found：1
Fetching Acentria
	Number of genus record found：2
Fetching Acerra
	Number of genus record found：1
Fetching Achaea
	Number of genus record found：1
Fetching Achalarus
	Number of genus record found：1
Fetching Achatia
	Number of genus record found：3
Fetching Achatodes
	Number of genus record found：1
Fetching Acherdoa
	Number of genus record found

KeyboardInterrupt: 

In [8]:
# Dump the genus -> higher-classification cache to 'genus_to_higher.csv'
# as a comma-separated two-column table (no index column).
pd.DataFrame(
    {
        'genus': list(genus_higher_taxa),
        'higher': list(genus_higher_taxa.values()),
    }
).to_csv('genus_to_higher.csv', sep=',', index=False)