# 空氣污染監測網 網路爬蟲實作練習


* 能夠利用 selenium + BeautifulSoup 撰寫爬蟲，並存放到合適的資料結構


## 作業目標

根據範例 ，完成以下問題：

* ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料
* ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料





In [1]:
# 打開瀏覽器

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

browser = webdriver.Chrome(executable_path='chromedriver')
browser.get("http://taqm.epa.gov.tw/taqm/tw/MonthlyAverage.aspx")

### ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料

In [2]:
# 模擬使用者操作行為，選擇/點擊

# <select name="ctl05$ddlSite" id="ctl05_ddlSite">
selectSite = Select(browser.find_element_by_id("ctl05_ddlSite"))
# <option selected="selected" value="11">臺北市-士林</option>
selectSite.select_by_value('11')
# <select name="ctl05$ddlYear" id="ctl05_ddlYear">
selectYear = Select(browser.find_element_by_id("ctl05_ddlYear"))
# <option value="2018">2018</option>
selectYear.select_by_value('2018')

browser.find_element_by_id('ctl05_btnQuery').click()

In [3]:
# 取得資料，丟到 BeautifulSoup 解析
html_source = browser.page_source

soup = BeautifulSoup(html_source, 'html.parser')
table = soup.find('table', class_='TABLE_G')
print(table)

<table align="Center" border="1" cellpadding="3" cellspacing="0" class="TABLE_G" id="ctl05_gv" rules="all" style="border-color:Black;border-width:1px;border-style:None;border-collapse:collapse;">
<tbody><tr style="color:Black;font-weight:normal;">
<th scope="col">監測項目</th><th scope="col">單位</th><th scope="col">監測日期</th><th scope="col">監測值</th><th scope="col">標註</th>
</tr><tr style="color:Black;">
<td class="no-alt" rowspan="12" style="white-space:nowrap;" valign="top">SO2</td><td class="no-alt" rowspan="12" valign="top">ppb</td><td>2018/01</td><td>1.80</td><td> </td>
</tr><tr class="ALT" style="color:Black;">
<td>2018/02</td><td>1.90</td><td> </td>
</tr><tr style="color:Black;">
<td>2018/03</td><td>2.20</td><td> </td>
</tr><tr class="ALT" style="color:Black;">
<td>2018/04</td><td>2.30</td><td> </td>
</tr><tr style="color:Black;">
<td>2018/05</td><td>3.10</td><td> </td>
</tr><tr class="ALT" style="color:Black;">
<td>2018/06</td><td>2.70</td><td> </td>
</tr><tr style="color:Black;">
<td>

In [4]:
d = {}
item = ''
date = ''

for t in table.find_all('td'):
    st = t.text.strip()
    if 'style' in t.attrs:
        #ex: <td class="no-alt" rowspan="12" style="white-space:nowrap;" valign="top">SO2</td>
        item = st
        d[item] = {}
    elif not 'class' in t.attrs:
        if len(st) == 0:
            continue
        if '/' in st:
            #ex: 2019/02
            date = st 
        else:
            #ex: 3.30
            d[item][date] = st
            
d

{'SO2': {'2018/01': '1.80',
  '2018/02': '1.90',
  '2018/03': '2.20',
  '2018/04': '2.30',
  '2018/05': '3.10',
  '2018/06': '2.70',
  '2018/07': '2.20',
  '2018/08': '2.40',
  '2018/09': '2.10',
  '2018/10': '1.70',
  '2018/11': '1.90',
  '2018/12': '1.80'},
 'CO': {'2018/01': '0.34',
  '2018/02': '0.44',
  '2018/03': '0.40',
  '2018/04': '0.38',
  '2018/05': '0.34',
  '2018/06': '0.29',
  '2018/07': '0.21',
  '2018/08': '0.30',
  '2018/09': '0.26',
  '2018/10': '0.29',
  '2018/11': '0.30',
  '2018/12': '0.35'},
 'O3': {'2018/01': '33.40',
  '2018/02': '32.50',
  '2018/03': '35',
  '2018/04': '38.40',
  '2018/05': '31.60',
  '2018/06': '29.50',
  '2018/07': '18.70',
  '2018/08': '26.40',
  '2018/09': '29.10',
  '2018/10': '45.90',
  '2018/11': '32.40',
  '2018/12': '30.70'},
 'PM10': {'2018/01': '23',
  '2018/02': '41',
  '2018/03': '39',
  '2018/04': '48',
  '2018/05': '37',
  '2018/06': '26',
  '2018/07': '24',
  '2018/08': '26',
  '2018/09': '28',
  '2018/10': '33',
  '2018/11': '2

### 存放到合適的資料結構

In [5]:
import pandas as pd
df = pd.DataFrame(d)
df

Unnamed: 0,SO2,CO,O3,PM10,NOx,NO,NO2,THC,NMHC,CH4
2018/01,1.8,0.34,33.4,23,14.17,3.41,10.77,,,
2018/02,1.9,0.44,32.5,41,19.43,4.48,14.95,,,
2018/03,2.2,0.4,35.0,39,18.5,4.93,13.56,,,
2018/04,2.3,0.38,38.4,48,15.24,2.6,12.64,,,
2018/05,3.1,0.34,31.6,37,15.2,2.26,12.94,,,
2018/06,2.7,0.29,29.5,26,13.79,2.41,11.38,,,
2018/07,2.2,0.21,18.7,24,11.6,2.68,8.92,,,
2018/08,2.4,0.3,26.4,26,14.75,2.68,12.08,,,
2018/09,2.1,0.26,29.1,28,12.4,2.41,10.0,,,
2018/10,1.7,0.29,45.9,33,12.45,2.13,10.33,,,


### 2018/01 – 2018/08 的 SO2 資料

In [6]:
df['SO2'][:8]

2018/01    1.80
2018/02    1.90
2018/03    2.20
2018/04    2.30
2018/05    3.10
2018/06    2.70
2018/07    2.20
2018/08    2.40
Name: SO2, dtype: object

### ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料

In [7]:
df[['SO2', 'CO']][:8]

Unnamed: 0,SO2,CO
2018/01,1.8,0.34
2018/02,1.9,0.44
2018/03,2.2,0.4
2018/04,2.3,0.38
2018/05,3.1,0.34
2018/06,2.7,0.29
2018/07,2.2,0.21
2018/08,2.4,0.3


In [8]:
# 關閉瀏覽器
browser.quit();