In [23]:
import json
import re
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


class set_scraper:
    """Scrape the SET website (www.set.or.th) for a single stock symbol.

    Constructing an instance starts a headless Chrome WebDriver.
    ``retrieve_stock_info`` then visits the company profile, company
    highlight and factsheet pages and fills two dictionaries:

    * ``company_info`` -- the ``industry_group`` and ``business_type`` texts.
    * ``stock_info`` -- one ``<field>_list`` entry per key of
      ``value_mapper``, plus ``beta_list`` and ``benefit_value``.

    While parsing column headers the scraper also records every year it
    sees in the module-level collections ``fipc_years``, ``fisc_years`` and
    ``fasc_years``; those names must exist before the header parsers run.

    Call ``close()`` when done, or use the instance as a context manager,
    so the browser process is shut down.
    """

    stock_code = ""

    # Absolute XPath fragments shared by several lookups.  They are matched
    # against the live page markup, so they are kept exactly as they were.
    _MAIN_CONTENT_XPATH = (
        "/html[@class='no-js']/body/div[@class='container']"
        "/div[@class='row sidebar-body-content']/div[@id='body-content']"
        "/div[@class='row']/div[@id='maincontent']/div[@class='row']"
    )
    # NOTE: 'table-reponsive' (sic) is the class name this lookup has always
    # used for the company profile page; do not "correct" the spelling.
    _COMPANY_INFO_XPATH = (
        _MAIN_CONTENT_XPATH
        + "/div[@class='table-reponsive']/table[@class='table']/tbody/tr[3]/td"
        "/div[@class='row']/div[@class='col-xs-12 col-md-7']"
    )
    _HIGHLIGHT_TABLE_XPATH = (
        _MAIN_CONTENT_XPATH
        + "/div[@class='table-responsive']"
        "/table[@class='table table-hover table-info']"
    )
    _FACTSHEET_TABLE_XPATH = (
        "/html[@class='no-js']/body/table/tbody/tr[3]/td"
        "/table[@class='table-factsheet-padding3'][2]/tbody/tr[4]/td[2]"
        "/table[@class='table-factsheet-padding0'][1]/tbody"
    )

    # Compiled once instead of on every loop iteration; raw strings avoid
    # the invalid-escape warnings that "\d" triggers in a normal literal.
    _YEAR_PATTERN = re.compile(r"\d{4}")
    _DATE_PATTERN = re.compile(r"^\d{1,2}/\d{1,2}/\d{4}$")

    def __init__(self, stock_code):
        """Start a headless Chrome session and prepare the lookup tables.

        Args:
            stock_code: Symbol inserted into every page URL.
        """
        # Init webdriver with headless.  The command-line switch is used
        # because the ``Options.headless`` property is not available in
        # every Selenium release, while ``add_argument`` is.
        self.options = Options()
        self.options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=self.options)

        # Containers filled while scraping
        self.stock_code = stock_code
        self.company_info = dict()
        self.stock_info = dict()
        self.financial_period_cols_header = []
        self.financial_statistics_cols_header = []
        self.factsheet_statistics_cols_header = []

        # Init page and row mapper
        self.set_url_prefix = "https://www.set.or.th/set"
        self.set_url_suffix = "&language=th&country=TH"
        self.company_info_xpath_row_no = {
            "industry_group": 2,
            "business_type": 3
        }
        self.page_mapper = {
            "company_info": self.set_url_prefix + "/companyprofile.do?symbol=" + self.stock_code + "&ssoPageId=4" + self.set_url_suffix,
            "company_highlight": self.set_url_prefix + "/companyhighlight.do?symbol=" + self.stock_code + "&ssoPageId=5" + self.set_url_suffix,
            "factsheet": self.set_url_prefix + "/factsheet.do?symbol=" + self.stock_code + "&ssoPageId=3" + self.set_url_suffix
        }
        # "row" is the table row holding the figure; "header" selects which
        # header list / tbody the figure belongs to:
        # "fipc" -> financial_period_cols_header, "fisc" ->
        # financial_statistics_cols_header (see check_cols_header/set_tbody).
        self.value_mapper = {
            "net_profit": { "row": 7, "header": "fipc" },
            "pe": { "row": 4, "header": "fisc" },
            "bv": { "row": 6, "header": "fisc" },
            "p_bv": { "row": 5, "header": "fisc" },
            "roa": { "row": 10, "header": "fipc" },
            "roe": { "row": 11, "header": "fipc" },
            "eps": { "row": 8, "header": "fipc" }
        }

    # -- lifecycle ---------------------------------------------------------

    def close(self):
        """Quit the browser; safe to call more than once."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow exceptions

    # -- small helpers -----------------------------------------------------

    def _xpath_text(self, xpath):
        """Return the text of the single element located by *xpath*."""
        return self.driver.find_element(By.XPATH, xpath).text

    def _xpath_elements(self, xpath):
        """Return every element located by *xpath*."""
        return self.driver.find_elements(By.XPATH, xpath)

    @staticmethod
    def _register_year(years, year):
        """Record *year* in the shared collection *years* if it is new.

        The collection is normally a set, but calling code may have replaced
        it with a (sorted) list, so both container types are supported.
        """
        if year in years:
            return
        if hasattr(years, "add"):
            years.add(year)
        else:
            years.append(year)

    # -- navigation --------------------------------------------------------

    def jumper(self, page_key):
        """Load the page registered under *page_key* in ``page_mapper``.

        Raises:
            KeyError: if *page_key* is not a known page.
        """
        self.driver.get(self.page_mapper[page_key])

    def go_to_company_highlights_page(self):
        """Open the company highlight page."""
        self.jumper('company_highlight')

    def go_to_factsheet_page(self):
        """Open the factsheet page."""
        self.jumper('factsheet')

    # -- company profile ---------------------------------------------------

    def get_company_info_xpath(self, field_key):
        """Return the profile text for *field_key* from the current page.

        Raises:
            KeyError: if *field_key* is not in ``company_info_xpath_row_no``.
        """
        row_no = self.company_info_xpath_row_no[field_key]
        return self._xpath_text(
            self._COMPANY_INFO_XPATH
            + "/div[@class='row'][" + str(row_no) + "]"
            + "/div[@class='col-xs-9 col-md-5']"
        )

    # Get company industry group and business type from SET website
    def get_company_info(self):
        """Open the profile page and store both fields in ``company_info``."""
        self.jumper('company_info')

        for item in ['industry_group', 'business_type']:
            self.company_info[item] = self.get_company_info_xpath(item)

    # -- column headers ----------------------------------------------------

    # Get financial period column header from SET website
    def get_financial_period_column_header(self):
        """Parse the first ``thead`` of the highlight table.

        Keeps the cells whose text starts with "งบปี" and stores, for each,
        its 1-based column index, its text and the first 4-digit year found
        in it.  The list is rebuilt on every call so repeated calls do not
        accumulate duplicates.
        """
        headers = []
        cells = self._xpath_elements(self._HIGHLIGHT_TABLE_XPATH + "/thead[1]/tr/th")
        for index, column in enumerate(cells):
            text = column.text
            if not text.startswith("งบปี"):
                continue
            year = self._YEAR_PATTERN.findall(text)[0]
            self._register_year(fipc_years, year)
            headers.append({
                'index': index + 1,
                'text': text.replace("\n", " - "),
                'year': year,
            })
        self.financial_period_cols_header = headers

    # Get financial statistics column header from SET website
    def get_financial_statistics_column_header(self):
        """Parse the second ``thead`` of the highlight table.

        Keeps the cells whose whole text looks like ``d/m/yyyy`` and stores
        the 1-based column index, the text and the 4-digit year.
        """
        headers = []
        cells = self._xpath_elements(self._HIGHLIGHT_TABLE_XPATH + "/thead[2]/tr/th")
        for index, column in enumerate(cells):
            text = column.text
            if not self._DATE_PATTERN.match(text):
                continue
            year = self._YEAR_PATTERN.findall(text)[0]
            self._register_year(fisc_years, year)
            headers.append({'index': index + 1, 'text': text, 'year': year})
        self.financial_statistics_cols_header = headers

    # Get factsheet statistics column header from SET website
    def get_factsheet_statistics_column_header(self):
        """Parse the factsheet statistics header row.

        Only the third header cell is used, and only when its text does not
        start with "ข้อมูลสถิติ".
        """
        headers = []
        cells = self._xpath_elements(
            self._FACTSHEET_TABLE_XPATH + "/tr[2]/td[@class='factsheet-head']")
        for index, column in enumerate(cells):
            text = column.text
            if index + 1 != 3 or text.startswith("ข้อมูลสถิติ"):
                continue
            year = self._YEAR_PATTERN.findall(text)[0]
            self._register_year(fasc_years, year)
            headers.append({
                'index': index + 1,
                'text': text.replace("\n", " - "),
                'year': year,
            })
        self.factsheet_statistics_cols_header = headers

    def check_cols_header(self, header_name):
        """Return the header list for "fisc" or "fipc"; ``[]`` otherwise."""
        if header_name == "fisc":
            return self.financial_statistics_cols_header
        if header_name == "fipc":
            return self.financial_period_cols_header
        return []

    def set_tbody(self, header_name):
        """Return the ``tbody`` position: 2 for "fisc", 1 for "fipc", else 0."""
        if header_name == "fisc":
            return 2
        if header_name == "fipc":
            return 1
        return 0

    # -- values ------------------------------------------------------------

    def get_value_by_xpath(self, field_key):
        """Read one figure per parsed header column for *field_key*.

        The stripped cell texts are stored, in header order, under
        ``stock_info[field_key + '_list']``.

        Raises:
            KeyError: if *field_key* is not in ``value_mapper``.
        """
        mapping = self.value_mapper[field_key]
        header_name = mapping.get('header')
        row_xpath = (
            self._HIGHLIGHT_TABLE_XPATH
            + "/tbody[" + str(self.set_tbody(header_name)) + "]"
            + "/tr[" + str(mapping.get('row')) + "]"
        )
        self.stock_info[field_key + '_list'] = [
            self._xpath_text(row_xpath + "/td[" + str(column.get("index")) + "]").strip()
            for column in self.check_cols_header(header_name)
        ]

    def get_all_values(self):
        """Scrape every field declared in ``value_mapper``."""
        for field in self.value_mapper:
            self.get_value_by_xpath(field)

    def get_beta_value(self):
        """Read factsheet row 11 for each parsed factsheet header column."""
        row_xpath = self._FACTSHEET_TABLE_XPATH + "/tr[11]"
        self.stock_info['beta_list'] = [
            self._xpath_text(
                row_xpath + "/td[@class='factsheet'][" + str(column.get("index")) + "]"
            ).strip()
            for column in self.factsheet_statistics_cols_header
        ]

    def get_benefit_value(self):
        """Store the URL of the symbol's company-rights page (no page load)."""
        self.stock_info['benefit_value'] = (
            self.set_url_prefix + "/companyrights.do?symbol=" + self.stock_code
            + "&ssoPageId=7" + self.set_url_suffix
        )

    # -- orchestration -----------------------------------------------------

    def get_company_highlights(self):
        """Scrape the highlight page, then the factsheet page."""
        self.go_to_company_highlights_page()
        self.get_financial_period_column_header()
        self.get_financial_statistics_column_header()
        self.get_all_values()
        self.go_to_factsheet_page()
        self.get_factsheet_statistics_column_header()
        self.get_beta_value()
        self.get_benefit_value()

    def retrieve_stock_info(self):
        """Run the full scrape: company profile first, then the figures."""
        self.get_company_info()
        self.get_company_highlights()


# Years seen across all scraped stocks.  set_scraper looks these names up as
# module globals and adds to them while it parses column headers, so they
# must stay sets for the whole scraping loop (they are only turned into
# sorted lists once every stock has been processed).
fipc_years = set()
fisc_years = set()
fasc_years = set()

df_stock = {
    'industry_group': [],
    'business_type': [],
    'stock_name': [],
}
# Not used by the code below; kept so the name remains defined.
unsorted_stock_keys = {}

# Symbols to scrape.  They could equally be read from a text file, one
# symbol per line; surrounding whitespace/newlines are stripped below.
stock_list = ['MINT', 'CPALL']

# (column label prefix, key in scraper.stock_info) for each header family.
# The order here is the column order of the final table.
PERIOD_FIELDS = [
    ('กำไรสุทธิ ', 'net_profit_list'),
    ('ROA ', 'roa_list'),
    ('ROE ', 'roe_list'),
    ('กำไรต่อหุ้น ', 'eps_list'),
]
STATISTICS_FIELDS = [
    ('P/E ', 'pe_list'),
    ('BV ', 'bv_list'),
    ('P/BV ', 'p_bv_list'),
]
FACTSHEET_FIELDS = [
    ('Beta ', 'beta_list'),
]

# One dict per stock, mapping column name -> value.
stock_rows = []

for stock in stock_list:
    symbol = stock.strip()
    if not symbol:
        continue  # e.g. a blank line when the list comes from a file

    scraper = set_scraper(symbol)
    try:
        scraper.retrieve_stock_info()
    finally:
        # Always shut the browser down, otherwise one Chrome process is
        # left running per scraped stock.
        scraper.driver.quit()

    row = {
        'industry_group': scraper.company_info.get('industry_group'),
        'business_type': scraper.company_info.get('business_type'),
        'stock_name': symbol,
    }

    # Pair every value with the year of the header column it was read from.
    # Each value list is built by iterating the matching header list, so
    # zipping the two keeps values under the right year even when stocks do
    # not share the same set of years.
    header_groups = (
        (PERIOD_FIELDS, scraper.financial_period_cols_header),
        (STATISTICS_FIELDS, scraper.financial_statistics_cols_header),
        (FACTSHEET_FIELDS, scraper.factsheet_statistics_cols_header),
    )
    for fields, headers in header_groups:
        for label, info_key in fields:
            values = scraper.stock_info.get(info_key, [])
            for header, value in zip(headers, values):
                row[label + header['year']] = value

    row['Benefit'] = scraper.stock_info.get('benefit_value')
    stock_rows.append(row)

# All stocks are scraped; the year collections can now become sorted lists.
fipc_years = sorted(fipc_years)
fisc_years = sorted(fisc_years)
fasc_years = sorted(fasc_years)

# Column order: the three descriptive columns, then for every field all of
# its years in ascending order, then the benefit link.
columns = ['industry_group', 'business_type', 'stock_name']
year_groups = (
    (PERIOD_FIELDS, fipc_years),
    (STATISTICS_FIELDS, fisc_years),
    (FACTSHEET_FIELDS, fasc_years),
)
for fields, years in year_groups:
    for label, _ in fields:
        columns.extend(label + year for year in years)
columns.append('Benefit')

# Every column gets exactly one entry per stock; a year a stock does not
# report becomes None so all columns have equal length.
df_stock = {
    column: [row.get(column) for row in stock_rows]
    for column in columns
}

# print(df_stock)
df = pd.DataFrame(data=df_stock)
df

{'industry_group': ['เกษตรและอุตสาหกรรมอาหาร', 'บริการ'], 'business_type': ['อาหารและเครื่องดื่ม', 'พาณิชย์'], 'stock_name': ['MINT', 'CPALL'], 'กำไรสุทธิ 2559': ['6,590.00', '16,676.51'], 'กำไรสุทธิ 2560': ['5,415.40', '19,907.71'], 'กำไรสุทธิ 2561': ['5,444.77', '20,929.65'], 'กำไรสุทธิ 2562': ['10,697.93', '22,343.08'], 'ROA 2559': ['9.25', '8.39'], 'ROA 2560': ['7.20', '8.84'], 'ROA 2561': ['5.16', '8.81'], 'ROA 2562': ['6.66', '8.94'], 'ROE 2559': ['18.77', '36.04'], 'ROE 2560': ['12.78', '30.50'], 'ROE 2561': ['9.13', '26.14'], 'ROE 2562': ['14.52', '25.02'], 'กำไรต่อหุ้น 2559': ['1.50', '1.85'], 'กำไรต่อหุ้น 2560': ['1.22', '2.14'], 'กำไรต่อหุ้น 2561': ['1.18', '2.22'], 'กำไรต่อหุ้น 2562': ['2.04', '2.38'], 'P/E 2559': ['18.09', '34.55'], 'P/E 2560': ['39.64', '37.02'], 'P/E 2561': ['28.27', '29.55'], 'P/E 2562': ['19.73', '29.87'], 'P/E 2563': ['9.11', '28.55'], 'BV 2559': ['8.23', '4.57'], 'BV 2560': ['8.79', '7.81'], 'BV 2561': ['13.19', '8.86'], 'BV 2562': ['15.73', '9.83'],

Unnamed: 0,industry_group,business_type,stock_name,กำไรสุทธิ 2559,กำไรสุทธิ 2560,กำไรสุทธิ 2561,กำไรสุทธิ 2562,ROA 2559,ROA 2560,ROA 2561,...,BV 2561,BV 2562,BV 2563,P/BV 2559,P/BV 2560,P/BV 2561,P/BV 2562,P/BV 2563,Beta 2562,Benefit
0,เกษตรและอุตสาหกรรมอาหาร,อาหารและเครื่องดื่ม,MINT,6590.0,5415.4,5444.77,10697.93,9.25,7.2,5.16,...,13.19,15.73,16.34,4.34,5.2,2.58,2.29,1.29,1.24,https://www.set.or.th/set/companyrights.do?sym...
1,บริการ,พาณิชย์,CPALL,16676.51,19907.71,20929.65,22343.08,8.39,8.84,8.81,...,8.86,9.83,10.44,13.69,9.87,7.76,7.35,6.8,1.08,https://www.set.or.th/set/companyrights.do?sym...


In [24]:
# Write the assembled table to an Excel workbook (path is relative to the
# current working directory).
report_path = 'report1.xlsx'
df.to_excel(report_path)