In [1]:
# Import `requests`
import requests
# Import BeautifulSoup from bs4
from bs4 import BeautifulSoup
# pandas for dataframe
import pandas as pd
# use defaultdict with list
from collections import defaultdict
import os

In [2]:
# scrape 3 years of holidays from China
def _plain_text(node):
    """Return ``node.string`` as a built-in ``str``, or None when there is none.

    Beautiful Soup hands back NavigableString objects, which keep a reference
    to the whole parse tree; converting to a plain str before storing the
    value in a DataFrame lets the tree be garbage-collected.
    """
    text = node.string if node is not None else None
    return None if text is None else str(text)


def _scrape_holidays(year):
    """Download and parse the timeanddate.com China holiday table for one year.

    Parameters
    ----------
    year : int
        Calendar year appended to the holiday page URL.

    Returns
    -------
    pandas.DataFrame
        One row per holiday table row with the columns ``month_day``
        (the row's ``th`` text joined with the year by a comma), ``name``
        (text of the row's first link) and ``category``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a holiday row carries no date text.
    """
    url = 'https://www.timeanddate.com/holidays/china/' + str(year)

    # a timeout keeps the cell from hanging forever, and raise_for_status()
    # stops an error page from being parsed into a silently empty table
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html5lib')

    # holiday rows are looked up by the ids tr0, tr1, ...; stop at the first gap
    holiday_rows = []
    for i in range(365):
        row = soup.find(id='tr{:d}'.format(i))
        if row is None:
            break
        holiday_rows.append(row)

    # explicit keys so the columns exist even when no row was found
    holiday_dict = {'month_day': [], 'name': [], 'category': []}
    for row in holiday_rows:
        day_text = _plain_text(row.find('th'))
        if day_text is None:
            raise ValueError(
                'no date text in holiday row {!r} for year {}'.format(row.get('id'), year)
            )
        holiday_dict['month_day'].append(','.join([day_text, str(year)]))
        holiday_dict['name'].append(_plain_text(row.find('a')))
        # category is read from the second sibling after the row's first <td>
        holiday_dict['category'].append(_plain_text(row.td.next_sibling.next_sibling))

    return pd.DataFrame(holiday_dict)


# collect one dataframe per year in a list
holidays_df = []
for y in range(2015, 2017 + 1):
    holidays_df.append(_scrape_holidays(y))

In [3]:
# combine dataframe for each year
# ignore_index=True gives the combined frame one fresh 0..n-1 index instead of
# repeating each yearly frame's own 0..k labels (duplicate row labels)
df_holidays = pd.concat(holidays_df, ignore_index=True)

In [4]:
# convert date to datetime object and set it as index
# The guard makes the cell safe to re-run: once month_day is the index there is
# no month_day column left to convert.
# infer_datetime_format is intentionally not passed: it is deprecated in
# pandas 2.x, where format inference is the default behaviour.
if df_holidays.index.name != 'month_day':
    df_holidays = (
        df_holidays
        .assign(month_day=pd.to_datetime(df_holidays['month_day']))
        .set_index('month_day')
    )

In [6]:
# sanity check: (rows, columns) of the combined holiday table
df_holidays.shape

(154, 2)

In [7]:
# preview the first rows of the date-indexed holiday table
df_holidays.head()

Unnamed: 0_level_0,category,name
month_day,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,National holiday,New Year's Day
2015-01-02,National holiday,New Year's weekend
2015-01-03,National holiday,New Year's weekend
2015-01-04,Working day in weekend,Special Working Day
2015-02-15,Working day in weekend,Special Working Day


In [9]:
# save data as external data
# NOTE(review): the table is scraped from the .../holidays/china/ pages, yet the
# file is named holidays_taiwan.csv -- confirm which country is intended. The
# path is left unchanged here because other code may read this exact file.
holidays_path = os.path.join(os.pardir, 'data', 'external', 'holidays_taiwan.csv')

# to_csv does not create missing folders, so make sure the target exists first
holidays_dir = os.path.dirname(holidays_path)
if not os.path.isdir(holidays_dir):
    os.makedirs(holidays_dir)

df_holidays.to_csv(holidays_path)