# 練習

- 觀察 https://www.thsrc.com.tw/tw/TimeTable/SearchResult 並撰寫爬蟲程式
- 抓取一個禮拜後的高鐵時刻表
- 台北到台南下午兩點的班次
- 使用 requests + BeautifulSoup 實作
- 透過 pandas 輸出成 csv

In [1]:
import os
import requests
import pandas as pd
import datetime

from bs4 import BeautifulSoup

url = 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'

In [2]:
after_one_week = datetime.datetime.now() + datetime.timedelta(weeks=1)
after_one_week_format = after_one_week.strftime('%Y/%m/%d')
print('The date after one week - {}'.format(after_one_week_format))

form_data = {
    'StartStation': '977abb69-413a-4ccf-a109-0272c24fd490',
    'EndStation': '9c5ac6ca-ec89-48f8-aab0-41b738cb1814',
    'SearchDate': after_one_week_format,
    'SearchTime': '14:00',
    'SearchWay': 'DepartureInMandarin',
    'RestTime': '',
    'EarlyOrLater': ''
}

The date after one week - 2018/02/09


In [3]:
resp = requests.post(url, data=form_data)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')

In [4]:
rows = soup.table.find_all('tr')

colname, rows = rows[1], rows[2:]
colname = list(colname.stripped_strings)

for i, row in enumerate(rows):
    trips = row.find('td', class_='column1')
    t_departure = row.find('td', class_='column3')
    t_arrive = row.find('td', class_='column4')
    duration = row.find('td', class_='column2')
    early_ticket = row.find('td', class_='Width1')
    
    trips = trips.text if trips else None
    t_departure = t_departure.text if t_departure else ''
    t_arrive = t_arrive.text if t_arrive else ''
    duration = duration.text if duration else ''
    early_ticket = list(early_ticket.stripped_strings) if early_ticket else ''
    early_ticket = early_ticket[0] if early_ticket else ''
    
    rows[i] = [trips, t_departure, t_arrive, duration, early_ticket]
    
df = pd.DataFrame(rows, columns=colname)
df

Unnamed: 0,車次,出發時間,抵達時間,行車時間,早鳥
0,833,14:11,16:11,02:00,65折起
1,833,14:11,16:11,02:00,65折起
2,1649,14:21,16:06,01:45,8折起
3,1649,14:21,16:06,01:45,8折起
4,651,14:46,16:32,01:46,
5,651,14:46,16:32,01:46,
6,837,15:11,17:11,02:00,65折起
7,837,15:11,17:11,02:00,65折起
8,1655,15:21,17:06,01:45,8折起
9,1655,15:21,17:06,01:45,8折起


In [5]:
results = os.path.abspath('../results')
if not os.path.exists(results):
    os.makedirs(results)

filename = os.path.join(results, 'thsrc_{}.csv'.format(after_one_week.strftime('%Y%m%d')))
df.to_csv(filename, index=False)
print('Save csv to {}'.format(filename))

Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180209.csv
