# Scraper utility for obtaining data from the TWSE website
## References
- https://bitbucket.opst.c1.vanguard.com/users/uc8c/repos/gidm/browse/fdm1/fdm1.py
- https://www.twse.com.tw/en/page/trading/exchange/BWIBBU_d.html - P/E, P/B ratios
- https://www.twse.com.tw/en/page/trading/exchange/TWTASU.html - short sale vol
- https://www.twse.com.tw/en/page/trading/exchange/MI_INDEX.html#subtitle7 - daily summary - all
- https://www.twse.com.tw/en/page/trading/exchange/TWT85U.html - full delivery securities
- https://emops.twse.com.tw/server-java/t58query#
- https://app2.msci.com/eqb/custom_indexes/tw_performance.html - MSCI index constituents


In [48]:
# TWSE pages driven by the scraping helpers, keyed by data set.
urls = dict(
    # P/E, P/B ratios
    ratios='https://www.twse.com.tw/en/page/trading/exchange/BWIBBU_d.html',
    # short sale volume
    short_sales='https://www.twse.com.tw/en/page/trading/exchange/TWTASU.html',
    # daily summary - all
    summary_px_vol='https://www.twse.com.tw/en/page/trading/exchange/MI_INDEX.html#subtitle7',
    # full delivery securities
    full_delivery='https://www.twse.com.tw/en/page/trading/exchange/TWT85U.html',
)

In [1]:
import os
import shutil
import sys
import time

import pandas as pd
#for Chrome bot stuff
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

In [3]:
# Let pandas display up to 2000 rows before truncating the output.
pd.set_option("display.max_rows", 2000)

In [4]:
def createWebDriver():
    """Create and return the Chrome WebDriver used by the scraping helpers.

    The chromedriver binary is looked up as ``chromedriver.exe`` in the
    current working directory.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    chrome_path = os.path.join(os.getcwd(), "chromedriver.exe")
    chrome_options = Options()
    # To run without a visible browser window, enable:
    #   chrome_options.add_argument("--headless")

    # `options=` replaces the deprecated `chrome_options=` keyword.
    # NOTE(review): `executable_path=` is itself deprecated in Selenium 4 in
    # favour of a Service object; it is kept because the installed Selenium
    # version is not known from this file.
    driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)

    return driver

In [5]:
def get_full_delivery_data(driver, start_date='1/1/2022', end_date='1/5/2022'):
    """Download the full-delivery CSV from the TWSE website for a date range.

    Loads ``urls['full_delivery']`` and, for every business day produced by
    ``pd.bdate_range(start_date, end_date)``, selects the month and day in the
    page's date dropdowns, clicks the query button and then clicks the CSV
    download link. Nothing is returned; the browser handles the downloads.

    Args:
        driver: Selenium WebDriver used to drive the page.
        start_date: First date of the range, in any form ``pd.bdate_range``
            accepts.
        end_date: Last date of the range.

    Note:
        Only the month and day dropdowns are set. The year is never selected,
        so the range presumably has to fall within the year the page shows by
        default -- confirm before using a multi-year range.
    """
    driver.get(urls['full_delivery'])

    for d in pd.bdate_range(start=start_date, end=end_date):
        #select month (assumes option 0 is January)
        select_mo = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[2]'))
        select_mo.select_by_index(d.month - 1)
        time.sleep(1)

        #select day (assumes option 0 is the 1st)
        select_day = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[3]'))
        select_day.select_by_index(d.day - 1)
        time.sleep(1)

        #click query button; 'a.button.search' is a CSS selector, not a tag name
        driver.find_element(By.CSS_SELECTOR, 'a.button.search').click()
        time.sleep(4)

        try:
            #click download csv button
            driver.find_element(By.XPATH, '//*[@id="reports"]/div[1]/a[2]').click()
        except Exception:
            # Best effort, matching the ratios/summary scrapers: report the
            # date whose download failed and carry on with the next one.
            print(f"Error downloading data for: {d}")
        time.sleep(4)

In [6]:
def get_short_sales_data(driver, start_date='1/1/2022', end_date='1/5/2022'):
    """Download the short-sales CSV from the TWSE website for a date range.

    Loads ``urls['short_sales']`` and, for every business day produced by
    ``pd.bdate_range(start_date, end_date)``, selects the month and day in the
    page's date dropdowns, clicks the query button and then clicks the CSV
    download link. Nothing is returned; the browser handles the downloads.

    Args:
        driver: Selenium WebDriver used to drive the page.
        start_date: First date of the range, in any form ``pd.bdate_range``
            accepts.
        end_date: Last date of the range.

    Note:
        Only the month and day dropdowns are set. The year is never selected,
        so the range presumably has to fall within the year the page shows by
        default -- confirm before using a multi-year range.
    """
    driver.get(urls['short_sales'])

    for d in pd.bdate_range(start=start_date, end=end_date):
        #select month (assumes option 0 is January)
        select_mo = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[2]'))
        select_mo.select_by_index(d.month - 1)
        time.sleep(1)

        #select day (assumes option 0 is the 1st)
        select_day = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[3]'))
        select_day.select_by_index(d.day - 1)
        time.sleep(1)

        #click query button; 'a.button.search' is a CSS selector, not a tag name
        driver.find_element(By.CSS_SELECTOR, 'a.button.search').click()
        time.sleep(4)

        try:
            #click download csv button
            driver.find_element(By.XPATH, '//*[@id="reports"]/div[1]/a[2]').click()
        except Exception:
            # Best effort, matching the ratios/summary scrapers: report the
            # date whose download failed and carry on with the next one.
            print(f"Error downloading data for: {d}")
        time.sleep(4)

In [46]:
def get_ratios_data(driver, start_date='1/1/2022', end_date='1/5/2022'):
    """Download the ratios CSV from the TWSE website for a date range.

    Loads ``urls['ratios']`` and, for every business day produced by
    ``pd.bdate_range(start_date, end_date)``, selects the month and day in the
    page's date dropdowns, clicks the query button and then clicks the CSV
    download link. A failed download is reported with ``print`` and the loop
    continues. Nothing is returned; the browser handles the downloads.

    Args:
        driver: Selenium WebDriver used to drive the page.
        start_date: First date of the range, in any form ``pd.bdate_range``
            accepts.
        end_date: Last date of the range.

    Note:
        Only the month and day dropdowns are set. The year is never selected,
        so the range presumably has to fall within the year the page shows by
        default -- confirm before using a multi-year range.
    """
    driver.get(urls['ratios'])

    for d in pd.bdate_range(start=start_date, end=end_date):
        #select month (assumes option 0 is January)
        select_mo = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[2]'))
        select_mo.select_by_index(d.month - 1)
        time.sleep(1)

        #select day (assumes option 0 is the 1st)
        select_day = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[3]'))
        select_day.select_by_index(d.day - 1)
        time.sleep(1)

        #click query button; 'a.button.search' is a CSS selector, not a tag name
        driver.find_element(By.CSS_SELECTOR, 'a.button.search').click()
        time.sleep(3)

        try:
            #click download csv button
            driver.find_element(By.XPATH, '//*[@id="reports"]/div[1]/a[2]').click()
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C can still stop
            # a long run.
            print(f"Error downloading data for: {d}")
        time.sleep(3)

In [49]:
def get_summary_data(driver, start_date='1/1/2022', end_date='1/5/2022'):
    """Download the daily summary CSV from the TWSE website for a date range.

    Loads ``urls['summary_px_vol']`` and, for every business day produced by
    ``pd.bdate_range(start_date, end_date)``, selects the month and day in the
    page's date dropdowns, picks the 'All(no Warrant & CBBC & OCBBC)'
    category, clicks the query button and then clicks the CSV download link.
    A failed download is reported with ``print`` and the loop continues.
    Nothing is returned; the browser handles the downloads.

    Args:
        driver: Selenium WebDriver used to drive the page.
        start_date: First date of the range, in any form ``pd.bdate_range``
            accepts.
        end_date: Last date of the range.

    Note:
        Only the month and day dropdowns are set. The year is never selected,
        so the range presumably has to fall within the year the page shows by
        default -- confirm before using a multi-year range.
    """
    driver.get(urls['summary_px_vol'])

    for d in pd.bdate_range(start=start_date, end=end_date):
        #select month (assumes option 0 is January)
        select_mo = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[2]'))
        select_mo.select_by_index(d.month - 1)
        time.sleep(1)

        #select day (assumes option 0 is the 1st)
        select_day = Select(driver.find_element(By.XPATH, '//*[@id="d1"]/select[3]'))
        select_day.select_by_index(d.day - 1)
        time.sleep(1)

        #select category
        select_cat = Select(
            driver.find_element(By.XPATH, '//*[@id="main-form"]/div/div/form/select')
        )
        select_cat.select_by_visible_text('All(no Warrant & CBBC & OCBBC)')

        #click query button; 'a.button.search' is a CSS selector, not a tag name
        driver.find_element(By.CSS_SELECTOR, 'a.button.search').click()
        time.sleep(3)

        try:
            #click download csv button
            driver.find_element(By.XPATH, '//*[@id="reports"]/div[1]/a[2]').click()
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C can still stop
            # a long run.
            print(f"Error downloading data for: {d}")
        time.sleep(3)

In [8]:
################set up Chrome driver#######
try:
    driver = createWebDriver()
except Exception as exc:
    # Report the exception class, then re-raise so the cell still fails
    # visibly. type(exc) is used instead of sys.exc_info() so the handler
    # does not depend on `sys` being imported.
    print("Unexpected error:", type(exc))
    raise
else:
    print("Successfully created Chrome bot")
    time.sleep(1)

  driver = webdriver.Chrome(executable_path = chrome_path, chrome_options=chrome_options)


Successfully created Chrome bot


In [83]:
# Download the full delivery CSV files for the chosen date range.
get_full_delivery_data(
    driver,
    start_date='1/2/2022',
    end_date='1/5/2022',
)

In [86]:
# Zip the contents of ./data/ into an archive with the given base name.
shutil.make_archive(
    base_name="taiwan-watchlist-data-website-raw-01012022-10312022",
    format='zip',
    root_dir='./data/',
)

'c:\\Users\\uc8c\\Desktop\\daa.eq-service-prefunding-challenge\\taiwan-watchlist-data-website-raw-01012022-10312022.zip'

In [99]:
# Download the short sales CSV files for the chosen date range.
# Units: NT$, trading unit / thousand shares.
# The column to use downstream is "Margin Short Sales".
get_short_sales_data(
    driver,
    start_date='5/2/2022',
    end_date='10/31/2022',
)

In [100]:
# Zip the contents of ./data/ into an archive with the given base name.
shutil.make_archive(
    base_name="taiwan-shortsale-data-website-raw-01012022-10312022",
    format='zip',
    root_dir='./data/',
)

'c:\\Users\\uc8c\\Desktop\\daa.eq-service-prefunding-challenge\\taiwan-shortsale-data-website-raw-01012022-10312022.zip'

In [45]:
# Download the ratios CSV files for the chosen date range.
get_ratios_data(
    driver,
    start_date='1/27/2022',
    end_date='10/31/2022',
)

In [90]:
# Zip the contents of ./data into an archive with the given base name.
shutil.make_archive(
    base_name="all_data",
    format='zip',
    root_dir='./data',
)

'c:\\Users\\uc8c\\Desktop\\daa.eq-service-prefunding-challenge\\all_data.zip'

In [51]:
# Download the daily summary CSV files for the chosen date range.
get_summary_data(
    driver,
    start_date='1/6/2022',
    end_date='10/31/2022',
)

Error downloading data for: 2022-01-27 00:00:00
Error downloading data for: 2022-01-28 00:00:00
Error downloading data for: 2022-01-31 00:00:00
Error downloading data for: 2022-02-01 00:00:00
Error downloading data for: 2022-02-02 00:00:00
Error downloading data for: 2022-02-03 00:00:00
Error downloading data for: 2022-02-04 00:00:00
Error downloading data for: 2022-02-28 00:00:00
Error downloading data for: 2022-04-04 00:00:00
Error downloading data for: 2022-04-05 00:00:00
Error downloading data for: 2022-05-02 00:00:00
Error downloading data for: 2022-06-03 00:00:00
Error downloading data for: 2022-09-09 00:00:00
Error downloading data for: 2022-10-10 00:00:00


In [88]:
# Zip the contents of ./data into an archive with the given base name.
shutil.make_archive(
    base_name="taiwan-stockquotes-data-website-raw-010122-10312022",
    format='zip',
    root_dir='./data',
)

'c:\\Users\\uc8c\\Desktop\\daa.eq-service-prefunding-challenge\\taiwan-stockquotes-data-website-raw-010122-10312022.zip'

In [42]:
import glob

In [60]:
# Directory holding the raw summary downloads, relative to the working directory.
summary_dir_name = "taiwan-summary-website-raw-01012022-10312022"
summary_path = os.path.join(os.getcwd(), "other_data", summary_dir_name)

In [61]:
# Display the resolved summary directory path as the cell output.
summary_path

'c:\\Users\\uc8c\\Desktop\\daa.eq-service-prefunding-challenge\\data\\taiwan-summary-website-raw-01012022-10312022'

In [63]:
# Every entry in the summary directory. glob.glob() returns matches in
# arbitrary order, so sort them: later code slices this list by position.
summary_files = sorted(glob.glob(os.path.join(summary_path, '*')))

In [79]:
#extracting just the stock quotes from the summary files.
# For each summary file, write a sibling "<name>_stock_quotes.csv" containing
# the header line that mentions "Security Code" and every following line up
# to (but not including) the first line containing "Remarks:".
errors = []  # line indexes whose processing failed (previously never defined)

# NOTE(review): the first five entries of summary_files are skipped; the
# reason is not evident from this notebook -- confirm it is still wanted.
for filepath in summary_files[5:]:
    # Do not re-process files written by an earlier run of this cell.
    if filepath.endswith("_stock_quotes.csv"):
        continue

    quotes_path = os.path.splitext(filepath)[0] + "_stock_quotes.csv"
    in_table = False  # True once the "Security Code" header has been seen

    # NOTE(review): files are opened with the platform default encoding --
    # confirm this matches the downloaded CSVs.
    with open(filepath, 'r') as f, open(quotes_path, 'w') as temp:
        for i, line in enumerate(f):
            try:
                if "Security Code" in line:
                    in_table = True
                    temp.write(line)
                elif in_table:
                    if "Remarks:" in line:
                        break
                    temp.write(line)
            except Exception:
                print(f"Error reading line: {i}")
                errors.append(i)


In [84]:
# Load one extracted stock-quotes file and display it as the cell output.
quotes_csv = r"C:\Users\uc8c\Desktop\daa.eq-service-prefunding-challenge\data\taiwan-stockquotes-data-website-raw-010122-10312022\MI_INDEX_ALLBUT0999_20220103_stock_quotes.csv"
pd.read_csv(quotes_csv)

Unnamed: 0,Security Code,Trade Volume,Transaction,Trade Value,Opening Price,Highest Price,Lowest Price,Closing Price,Dir(+/-),Change,Last Best Bid Price,Last Best Bid Volume,Last Best Ask Price,Last Best Ask Volume,Price-Earning ratio,Unnamed: 15
0,"=""0050""",7064552,8298,1034974917,146.00,147.35,146.00,146.40,+,0.90,146.35,32,146.40,96,0.00,
1,"=""0051""",135357,185,8244702,60.90,61.30,60.75,60.90,+,0.10,60.75,2,60.90,45,0.00,
2,"=""0052""",882163,488,119306921,134.65,135.95,134.65,135.20,+,1.05,135.20,54,135.35,7,0.00,
3,"=""0053""",28349,25,1996804,70.25,70.75,70.25,70.45,+,0.50,70.40,20,70.45,1,0.00,
4,"=""0054""",3998,13,127031,31.68,31.80,31.68,31.80,+,0.12,31.59,24,31.80,2,0.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,9944,124352,74,2945481,24.05,24.05,23.10,23.65,-,0.40,23.65,5,23.80,2,71.67,
1146,9945,3458411,2685,220080151,63.80,64.10,63.30,63.60,-,0.20,63.50,53,63.60,27,8.97,
1147,9946,74544,57,1119248,15.10,15.10,15.00,15.05,-,0.05,15.00,51,15.05,5,35.83,
1148,9955,114432,285,1970087,17.35,17.45,17.00,17.05,-,0.20,17.05,5,17.15,5,0.00,
