# Selenium 網站爬蟲
# 中學生網站 - 閱讀心得寫作比賽參賽作品

- 主網頁: https://www.shs.edu.tw/
- 搜尋頁: https://www.shs.edu.tw/index.php?p=search
- 作品頁: https://www.shs.edu.tw/search_view_over.php?work_id=2432715

In [1]:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, print_function, division

__author__ = 'I-HSUAN WU'

import os
import json
import time
import requests
import numpy as np
from random import randint
from tqdm import tqdm

# browser-based web crawler
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# Chrome launch options shared by every webdriver.Chrome(...) call in this file.
options = webdriver.ChromeOptions()
# Forward the "ignore-certificate-errors" switch to Chrome.
options.add_argument("ignore-certificate-errors")

In [2]:
# Common absolute-XPath prefix of every table on a work page.
_TABLE_ROOT = '/html/body/div[1]/div[2]/div[2]'
# Value cell of row N in the bibliographic table (table[4]).
_INFO_ROW = _TABLE_ROOT + '/table[4]/tbody/tr[%d]/td[2]'

# (output key, xpath) pairs whose cell text is stored as one stripped string.
_LINE_FIELDS = (
    ('作品', _TABLE_ROOT + '/table[1]/tbody/tr/td'),
    ('學校名稱', _TABLE_ROOT + '/table[3]/tbody/tr/td[2]'),
    ('作者', _INFO_ROW % 1),
    ('參賽標題', _INFO_ROW % 2),
    ('書籍ISBN', _INFO_ROW % 3),
    ('中文書名', _INFO_ROW % 4),
    ('原文書名', _INFO_ROW % 5),
    ('書籍作者', _INFO_ROW % 6),
    ('書籍編譯者', _INFO_ROW % 7),
    ('出版單位', _INFO_ROW % 8),
    ('出版年月', _INFO_ROW % 9),
    ('版次', _INFO_ROW % 10),
)

# (output key, xpath) pairs whose cell text is stored as a list of lines.
_PARAGRAPH_FIELDS = (
    ('圖書作者與內容簡介', _TABLE_ROOT + '/table[5]/tbody/tr[2]/td'),
    ('內容摘錄', _TABLE_ROOT + '/table[6]/tbody/tr[2]/td'),
    ('我的觀點', _TABLE_ROOT + '/table[7]/tbody/tr[2]/td'),
    ('討論議題', _TABLE_ROOT + '/table[8]/tbody/tr[2]/td'),
)


def _non_empty_lines(text):
    """Split *text* on newlines and return the stripped, non-empty lines."""
    stripped = (line.strip() for line in text.replace('<br>', '').split('\n'))
    return [line for line in stripped if line != '']


def article_content(browser, url):
    """Load one work page and extract its fields into a dict.

    Args:
        browser: object exposing ``get(url)`` and ``find_element_by_xpath(xpath)``
            (the latter returning something with a ``.text`` attribute), as the
            Selenium driver used elsewhere in this file does.
        url: address of the work page to load.

    Returns:
        A dict holding ``'url'`` plus one entry per field listed in
        ``_LINE_FIELDS`` (stripped strings) and ``_PARAGRAPH_FIELDS`` (lists of
        non-empty lines), or ``None`` when any field lookup fails.

    Note:
        ``browser.get`` is called outside the ``try``, so a page-load failure
        propagates to the caller rather than yielding ``None``.
    """
    print('Doing', url)
    browser.get(url)
    # Pause 0 or 1 second between pages; requesting too frequently gets blocked.
    time.sleep(randint(0, 1))

    try:
        record = {'url': url}

        for field, xpath in _LINE_FIELDS:
            text = browser.find_element_by_xpath(xpath).text
            if field == '學校名稱':
                # Only the school name has ideographic spaces normalised.
                text = text.replace('\u3000', ' ')
            record[field] = text.strip()

        for field, xpath in _PARAGRAPH_FIELDS:
            record[field] = _non_empty_lines(browser.find_element_by_xpath(xpath).text)

        return record
    except Exception:
        # Best effort per page: report and skip pages that do not match the
        # expected layout. KeyboardInterrupt/SystemExit are deliberately NOT
        # caught, so the crawl can still be interrupted.
        print('ERROR: ', url)
        return None

In [3]:
def crawler_article(i, urls):
    """Crawl one set of work pages and save the results as a JSON file.

    Opens a Chrome window, runs ``article_content`` on every URL in *urls*,
    drops the pages that returned ``None``, and writes the remaining dicts to
    ``result_article_<i>.json`` inside ``ARTICLE_SAVE_PATH``.

    Args:
        i: index of this URL set; used only to build the output file name.
        urls: iterable of work-page URLs.
    """
    browser = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)  # open the window
    try:
        articles = [article_content(browser, url) for url in urls]
    finally:
        # Always end the session, even if a page load raised; quit() also stops
        # the chromedriver process, which close() leaves running.
        browser.quit()

    articles = [art for art in articles if art is not None]

    out_path = os.path.join(ARTICLE_SAVE_PATH, 'result_article_'+str(i)+'.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)

In [4]:
def crawler_all_article(url_file_name, num_set, start_set=20):
    """Crawl the work pages listed in a URL file, one set at a time.

    The non-empty lines of *url_file_name* are split into *num_set* nearly
    equal sets with ``np.array_split``; each set from index *start_set* onward
    is passed to ``crawler_article`` together with its index.

    Args:
        url_file_name: path of a UTF-8 text file with one URL per line.
        num_set: number of sets to split the URLs into.
        start_set: zero-based index of the first set to crawl; earlier sets
            are skipped. The default of 20 keeps the previous hard-coded
            behaviour (with 22 sets, only the 21st and 22nd are crawled).
            Pass 0 to crawl every set.
    """
    with open(url_file_name, 'r', encoding='utf-8') as f:
        href = [line for line in f.read().split('\n') if line != '']
    print('Number of URLs: %d'%(len(href)))

    # split URLs into several sets
    urls_list = np.array_split(np.array(href), num_set)
    for i, urls in enumerate(tqdm(urls_list)):
        if i < start_set:
            continue     # skip the sets before start_set
        crawler_article(i, urls)

    print('All articles have DONE!!!')

In [5]:
def crawler_href(browser, la, lo):
    """Run one search on the search page and return the work-page links found.

    Args:
        browser: Selenium driver used to load ``MAIN_URL`` and submit the form.
        la: option value to select in the ``s_contest_number`` drop-down.
        lo: visible option text to select in the ``s_area`` drop-down.

    Returns:
        List of ``href`` values, taken from every ``<a href>`` on the page
        shown after clicking ``search_button``, that contain ``'work_id='``.
    """
    print(la, lo)
    browser.get(MAIN_URL)

    # Fill in the two drop-downs, then submit the search form.
    Select(browser.find_element_by_id('s_contest_number')).select_by_value(la)
    Select(browser.find_element_by_id('s_area')).select_by_visible_text(lo)
    browser.find_element_by_id('search_button').click()

    # Keep only the anchors that point at an individual work page.
    work_links = []
    for anchor in browser.find_elements_by_xpath("//a[@href]"):
        target = anchor.get_attribute('href')
        if target is not None and 'work_id=' in target:
            work_links.append(target)
    return work_links

In [6]:
def crawler_all_href(url_file_name):
    """Collect work-page URLs from the search page and write them to a file.

    Reads the option texts of the ``s_contest_number`` and ``s_area``
    drop-downs on ``MAIN_URL``, keeps only the first contest option and the
    first area option that is not ``'--'``, runs ``crawler_href`` for every
    (contest, area) pair, and writes all returned URLs, one per line, to
    *url_file_name*.

    Args:
        url_file_name: path of the UTF-8 text file to (over)write.
    """
    browser = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        browser.get(MAIN_URL)

        contest_options = browser.find_element_by_id('s_contest_number')
        contest_options = contest_options.find_elements_by_tag_name('option')
        # Only the first contest is crawled.
        ladder = [item.text for item in contest_options][:1]

        area_options = browser.find_element_by_id('s_area')
        area_options = area_options.find_elements_by_tag_name('option')
        # Only the first real area is crawled ('--' is the placeholder entry).
        location = [item.text for item in area_options if item.text != '--'][:1]

        # Use the % operator so the count is actually formatted into the message.
        print('Number of pages: %d' % (len(location)*len(ladder)))

        results = [crawler_href(browser, i, j) for i in ladder for j in location]
    finally:
        # Always end the session, even if a search raised; quit() also stops
        # the chromedriver process, which close() leaves running.
        browser.quit()

    with open(url_file_name, 'w', encoding='utf-8') as f:
        f.write('\n'.join([j for i in results for j in i]))
    print('All URLs have DONE!!!')

In [7]:
# Run configuration for launching the Chrome webdriver and storing results.
DRIVER_PATH = "./chromedriver"  # chromedriver executable passed to webdriver.Chrome
MAIN_URL = 'https://www.shs.edu.tw/index.php?p=search'  # search page opened by the href crawlers
ARTICLE_SAVE_PATH = '/Users/yuni/Retrieval/selenium_data'  # directory receiving result_article_<i>.json
URL_FILE_NAME = '/Users/yuni/Retrieval/selenium_data/url_list.txt'  # one work-page URL per line
NUM_set = 22  # number of sets the URL list is split into

# Step 1: collect the work-page URLs into URL_FILE_NAME.
crawler_all_href(url_file_name=URL_FILE_NAME)
# Step 2: crawl the pages listed in URL_FILE_NAME, set by set.
crawler_all_article(url_file_name=URL_FILE_NAME, 
                        num_set=NUM_set)

Number of pages: %d 1
1090315 南投區


  0%|          | 0/22 [00:00<?, ?it/s]

All URLs have DONE!!!
Number of URLs: 203
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2373242
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2448518
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2364126
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2432534
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2433218
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2437900
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2448470
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2445736
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2444506


 95%|█████████▌| 21/22 [00:42<00:02,  2.00s/it]

Doing https://www.shs.edu.tw/search_view_over.php?work_id=2454097
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2431766
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2431756
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2432957
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2437888
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2454122
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2448436
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2453112
Doing https://www.shs.edu.tw/search_view_over.php?work_id=2458921


100%|██████████| 22/22 [01:04<00:00,  2.94s/it]

All articles have DONE!!!



