# 動態網頁爬蟲 - 使用 Selenium

* 了解 Selenium 用於動態網頁爬蟲的原理
* 能夠使用 Selenium 撰寫動態網頁爬蟲

## 作業目標


* 取出 台北市萬華區 2019/01 – 2019/07 的 table 資料



In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
from io import StringIO
import time

In [2]:
# Start a Chrome session driven by the chromedriver binary at ../chromedriver.
browser = webdriver.Chrome(executable_path='../chromedriver')
# Load the monthly-average query page; the query form is filled in by the next cell.
browser.get("https://taqm.epa.gov.tw/taqm/tw/MonthlyAverage.aspx")
# Closing is left disabled here because the following cell still uses `browser`.
#browser.close

In [3]:
# Fill in the query form, submit it, and capture the rendered HTML.
# Choose the monitoring site by its <option> value.
# NOTE(review): value '13' is presumably the Wanhua (萬華) station named in the
# assignment goal - confirm against the page's drop-down.
selectSite = Select(browser.find_element_by_id("ctl05_ddlSite"))
selectSite.select_by_value('13')
# Choose the year 2019.
selectYear = Select(browser.find_element_by_id("ctl05_ddlYear"))
selectYear.select_by_value('2019')
# Submit the query.
browser.find_element_by_id('ctl05_btnQuery').click()
time.sleep(5) # Without this pause, page_source is read before the query result has loaded, so html_source holds the pre-query page and the result table cannot be found.
html_source = browser.page_source
#html_source

In [4]:
# Show the installed pandas version; the output below this cell records the version used for this run.
pd.__version__

'0.25.1'

In [39]:
# Parse every <table> found in the rendered page into a list of DataFrames.
# The HTML text is wrapped in StringIO because newer pandas releases deprecate
# passing a literal HTML string to read_html; a file-like object is accepted by
# old and new versions alike.
dfs = pd.read_html(StringIO(html_source))
# Keep only the tables that have exactly 5 columns, then pick the wanted one.
# A separate name is used for the filtered list so that `df` is only ever
# bound to the final DataFrame.
five_column_tables = [table for table in dfs if table.shape[1] == 5]
# NOTE(review): index 1 is a hard-coded position among the 5-column tables -
# re-check it if the page layout changes.
df = five_column_tables[1]
df


Unnamed: 0,監測項目,單位,監測日期,監測值,標註
0,SO2,ppb,2019/01,1.90,
1,SO2,ppb,2019/02,2.00,
2,SO2,ppb,2019/03,1.90,
3,SO2,ppb,2019/04,2.50,
4,SO2,ppb,2019/05,2.00,
...,...,...,...,...,...
75,NO2,ppb,2019/10,17.96,
76,NO2,ppb,2019/11,19.64,
77,THC,ppm,,,無此測項
78,NMHC,ppm,,,無此測項


In [6]:
from datetime import datetime

https://blog.csdn.net/ls13552912394/article/details/79328762

In [7]:
# Method 1
#
# Parse '監測日期' into datetimes first, then promote it to the index
# (doing the two steps in the opposite order also works).
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(df['監測日期'], errors='coerce')
df['監測日期'] = parsed_dates
# Notes:
# - df['監測日期'].dt.strftime('%m / %Y') would only reformat the dates as strings;
#   an index built from them has dtype object and cannot be filtered by date.
# - df.index = pd.DatetimeIndex(df.index) converts such an object index to datetimes.
df = df.set_index('監測日期')
df

Unnamed: 0_level_0,監測項目,單位,監測值,標註
監測日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,SO2,ppb,1.90,
2019-02-01,SO2,ppb,2.00,
2019-03-01,SO2,ppb,1.90,
2019-04-01,SO2,ppb,2.50,
2019-05-01,SO2,ppb,2.00,
...,...,...,...,...
2019-10-01,NO2,ppb,17.96,
2019-11-01,NO2,ppb,19.64,
NaT,THC,ppm,,無此測項
NaT,NMHC,ppm,,無此測項


In [8]:
# Select the rows dated 2019/01 through 2019/07 (inclusive).
# The date index is not sorted (the months repeat for every pollutant and some
# rows are NaT), and newer pandas releases reject partial-string slicing such as
# df['2019-1':'2019-7'] on a non-monotonic DatetimeIndex. A boolean mask keeps
# the same rows in the same order and works regardless of index ordering;
# NaT rows compare False and are dropped.
period_start = pd.Timestamp('2019-01-01')
period_end = pd.Timestamp('2019-08-01')  # exclusive upper bound
df[(df.index >= period_start) & (df.index < period_end)]

Unnamed: 0_level_0,監測項目,單位,監測值,標註
監測日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,SO2,ppb,1.9,
2019-02-01,SO2,ppb,2.0,
2019-03-01,SO2,ppb,1.9,
2019-04-01,SO2,ppb,2.5,
2019-05-01,SO2,ppb,2.0,
2019-06-01,SO2,ppb,2.0,
2019-07-01,SO2,ppb,1.7,
2019-01-01,CO,ppm,0.57,
2019-02-01,CO,ppm,0.54,
2019-03-01,CO,ppm,0.56,


In [40]:
# Method 2
# Move '監測日期' into the index only if it is still a column, so this cell also
# runs when Method 1 has already made it the index (otherwise set_index raises
# KeyError and the notebook only works in one particular execution order).
if '監測日期' in df.columns:
    df = df.set_index('監測日期')
# Convert the index labels to a DatetimeIndex; missing dates become NaT.
df.index = pd.DatetimeIndex(df.index)
# Select 2019/01 through 2019/07 (inclusive) with a boolean mask. The index is
# not sorted (months repeat per pollutant), and newer pandas releases reject
# partial-string slicing like df.loc['2019/01':'2019/07'] on a non-monotonic
# DatetimeIndex; the mask returns the same rows in the same order.
period_start = pd.Timestamp('2019-01-01')
period_end = pd.Timestamp('2019-08-01')  # exclusive upper bound
df[(df.index >= period_start) & (df.index < period_end)]

Unnamed: 0_level_0,監測項目,單位,監測值,標註
監測日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,SO2,ppb,1.9,
2019-02-01,SO2,ppb,2.0,
2019-03-01,SO2,ppb,1.9,
2019-04-01,SO2,ppb,2.5,
2019-05-01,SO2,ppb,2.0,
2019-06-01,SO2,ppb,2.0,
2019-07-01,SO2,ppb,1.7,
2019-01-01,CO,ppm,0.57,
2019-02-01,CO,ppm,0.54,
2019-03-01,CO,ppm,0.56,


**取出  2019/01 – 2019/07 SO2資料**

`reindex` 與 `reset_index` 參考資料

https://zhuanlan.zhihu.com/p/30053389

https://blog.csdn.net/jingyi130705008/article/details/78162758

多層索引

https://ithelp.ithome.com.tw/articles/10194235

https://www.itread01.com/content/1548203786.html

In [41]:
# Build a two-level (MultiIndex) row index: pollutant first, then date.
#
# reset_index() turns the current index back into an ordinary column so that it
# can be combined with '監測項目' in the new index.
# (Reordering the columns with df.reindex(columns=[...]) is not needed here;
# reindex() would require every column name to be listed explicitly.)
df = df.reset_index().set_index(["監測項目", "監測日期"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,單位,監測值,標註
監測項目,監測日期,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SO2,2019-01-01,ppb,1.90,
SO2,2019-02-01,ppb,2.00,
SO2,2019-03-01,ppb,1.90,
SO2,2019-04-01,ppb,2.50,
SO2,2019-05-01,ppb,2.00,
...,...,...,...,...
NO2,2019-10-01,ppb,17.96,
NO2,2019-11-01,ppb,19.64,
THC,NaT,ppm,,無此測項
NMHC,NaT,ppm,,無此測項


In [43]:
# Take the SO2 rows via the outer index level; what remains is indexed by date only.
so2 = df.loc['SO2']
# Slice that date index from 2019/01 through 2019/07 (both ends inclusive).
so2.loc['2019/01':'2019/07']

Unnamed: 0_level_0,單位,監測值,標註
監測日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,ppb,1.9,
2019-02-01,ppb,2.0,
2019-03-01,ppb,1.9,
2019-04-01,ppb,2.5,
2019-05-01,ppb,2.0,
2019-06-01,ppb,2.0,
2019-07-01,ppb,1.7,
