In [92]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from time import sleep
import pandas as pd
import json
import re


class set_crawler:
    stock_code = ""

    def __init__(self, stock_code):
        # Init webdriver with headless
        self.options = Options()
        self.options.headless = True
        self.driver = webdriver.Chrome(options=self.options)

        # Init constructor for scarping
        self.stock_code = stock_code
        self.company_info = dict()
        self.stock_info = dict()
        self.financial_period_cols_header = []
        self.financial_statistics_cols_header = []
        self.factsheet_statistics_cols_header = []

    def go_to_company_highlights_page(self):
        self.driver.get(
            "https://www.set.or.th/set/companyhighlight.do?symbol=" + self.stock_code + "&ssoPageId=5&language=th&country=TH")

    def go_to_factsheet_page(self):
        self.driver.get(
            "https://www.set.or.th/set/factsheet.do?symbol=" + self.stock_code + "&ssoPageId=3&language=th&country=TH")

    def get_company_info(self):
        self.driver.get(
            "https://www.set.or.th/set/companyprofile.do?symbol=" + self.stock_code + "&ssoPageId=4&language=th&country=TH")

        industry_group = self.driver.find_element_by_xpath(
            "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-reponsive']/table[@class='table']/tbody/tr[3]/td/div[@class='row']/div[@class='col-xs-12 col-md-7']/div[@class='row'][2]/div[@class='col-xs-9 col-md-5']").text
        business_type = self.driver.find_element_by_xpath(
            "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-reponsive']/table[@class='table']/tbody/tr[3]/td/div[@class='row']/div[@class='col-xs-12 col-md-7']/div[@class='row'][3]/div[@class='col-xs-9 col-md-5']").text

        self.company_info['industry_group'] = industry_group
        self.company_info['business_type'] = business_type

    def get_financial_period_column_header(self):
        df_stock = self.driver.find_elements_by_xpath(
            "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/thead[1]/tr/th")

        for index, column in enumerate(df_stock):
            column_dict = dict()
            if (column.text.startswith("งบปี")):
                column_dict['index'] = index + 1
                column_dict['text'] = column.text.replace("\n", " - ")
                column_dict['year'] = re.findall("\d{4}", column.text)[0]
                if not column_dict['year'] in fipc_years:
                    fipc_years.add(column_dict['year'])
                self.financial_period_cols_header.append(column_dict)

    def get_financial_statistics_column_header(self):
        df_stock = self.driver.find_elements_by_xpath(
            "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/thead[2]/tr/th")

        for index, column in enumerate(df_stock):
            column_dict = dict()
            date_pattern = re.compile("^\d{1,2}\/\d{1,2}\/\d{4}$")
            if (date_pattern.match(column.text)):
                column_dict['index'] = index + 1
                column_dict['text'] = column.text
                column_dict['year'] = re.findall("\d{4}", column.text)[0]
                if not column_dict['year'] in fisc_years:
                    fisc_years.add(column_dict['year'])
                self.financial_statistics_cols_header.append(column_dict)

    def get_factsheet_statistics_column_header(self):
        column_list = self.driver.find_elements_by_xpath(
            "/html[@class='no-js']/body/table/tbody/tr[3]/td/table[@class='table-factsheet-padding3'][2]/tbody/tr[4]/td[2]/table[@class='table-factsheet-padding0'][1]/tbody/tr[2]/td[@class='factsheet-head']")
        
        for index, column in enumerate(column_list):
            column_dict = dict()
            if not column.text.startswith("ข้อมูลสถิติ"):
                column_dict['index'] = index + 1
                column_dict['text'] = column.text.replace("\n", " - ")
                column_dict['year'] = re.findall("\d{4}", column.text)[0]
                if not column_dict['year'] in fasc_years:
                    fasc_years.add(column_dict['year'])
                self.factsheet_statistics_cols_header.append(column_dict)

    def get_net_profit(self):
        profit_list = []
        for index, column in enumerate(self.financial_period_cols_header):
            profit = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[1]/tr[7]/td[" + str(column.get("index")) + "]").text
            profit_list.append(profit.strip())
        self.stock_info['profit_list'] = profit_list

    def get_pe_value(self):
        pe_list = []
        for index, column in enumerate(self.financial_statistics_cols_header):
            pe = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[2]/tr[4]/td[" + str(column.get("index")) + "]").text
            pe_list.append(pe.strip())
        self.stock_info['pe_list'] = pe_list

    def get_bv_value(self):
        bv_list = []
        for index, column in enumerate(self.financial_statistics_cols_header):
            bv = self.driver.find_element_by_xpath(
                "/ html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[2]/tr[6]/td[" + str(column.get("index")) + "]").text
            bv_list.append(bv.strip())
        self.stock_info['bv_list'] = bv_list

    def get_p_bv_value(self):
        p_bv_list = []
        for index, column in enumerate(self.financial_statistics_cols_header):
            p_bv = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[2]/tr[5]/td[" + str(column.get("index")) + "]").text
            p_bv_list.append(p_bv.strip())
        self.stock_info['p_bv_list'] = p_bv_list

    def get_roa_value(self):
        roa_list = []
        for index, column in enumerate(self.financial_period_cols_header):
            roa = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[1]/tr[10]/td[" + str(column.get("index"))+ "]").text
            roa_list.append(roa.strip())
        self.stock_info['roa_list'] = roa_list

    def get_roe_value(self):
        roe_list = []
        for index, column in enumerate(self.financial_period_cols_header):
            roe = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[1]/tr[11]/td[" + str(column.get("index")) + "]").text
            roe_list.append(roe.strip())
        self.stock_info['roe_list'] = roe_list

    def get_eps_value(self):
        eps_list = []
        for index, column in enumerate(self.financial_period_cols_header):
            eps = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/div[@class='container']/div[@class='row sidebar-body-content']/div[@id='body-content']/div[@class='row']/div[@id='maincontent']/div[@class='row']/div[@class='table-responsive']/table[@class='table table-hover table-info']/tbody[1]/tr[8]/td[" + str(column.get("index")) + "]").text
            eps_list.append(eps.strip())
        self.stock_info['eps_list'] = eps_list

    def get_beta_value(self):
        beta_list = []
        for index, column in enumerate(self.factsheet_statistics_cols_header):
            beta = self.driver.find_element_by_xpath(
                "/html[@class='no-js']/body/table/tbody/tr[3]/td/table[@class='table-factsheet-padding3'][2]/tbody/tr[4]/td[2]/table[@class='table-factsheet-padding0'][1]/tbody/tr[11]/td[@class='factsheet'][" + str(column.get("index")) + "]").text
            beta_list.append(beta.strip())
        self.stock_info['beta_list'] = beta_list

    def get_benefit_value(self):
        self.stock_info['benefit_value'] = "https://www.set.or.th/set/companyrights.do?symbol=" + self.stock_code + "&ssoPageId=7&language=th&country=TH"

    def get_company_highlights(self):
        self.go_to_company_highlights_page()
        self.get_financial_period_column_header()
        self.get_financial_statistics_column_header()
        self.get_net_profit()
        self.get_pe_value()
        self.get_bv_value()
        self.get_p_bv_value()
        self.get_roa_value()
        self.get_roe_value()
        self.get_eps_value()
        self.go_to_factsheet_page()
        self.get_factsheet_statistics_column_header()
        self.get_beta_value()
        self.get_benefit_value()

    def retrieve_stock_info(self):
        self.get_company_info()
        self.get_company_highlights()

        # print(self.financial_statistics_cols_header)
        # print(self.financial_period_cols_header)
        # print(self.factsheet_statistics_cols_header)


# Store list of years here
fipc_years = set([])
fisc_years = set([])
fasc_years = set([])

df_stock = dict()
df_stock = {
    'industry_group': [],
    'business_type': [],
    'stock_name': [],
}

# Get list of stock
# stock_list = open("stockList.txt", "r")
stock_list = ['MINT', 'CPALL']
for index, stock in enumerate(stock_list):
    craw = set_crawler(stock)
    craw.retrieve_stock_info()

    df_stock['industry_group'].append(craw.company_info.get('industry_group'))
    df_stock['business_type'].append(craw.company_info.get('business_type'))
    df_stock['stock_name'].append(stock.replace("\n", ""))

    # df_stock['industry_group'] = craw.company_info.get('industry_group')
    # df_stock['business_type'] = craw.company_info.get('business_type')
    # df_stock['stock_name'] = stock.replace("\n", "")

    # for idx, fipc in enumerate(craw.financial_period_cols_header):
        # df_stock['net_profit'].append(craw.stock_info.get('profit_list')[idx])
    #     df_stock["กำไรสุทธิ: " + fipc.get('text')] = craw.stock_info.get('profit_list')[idx]
    #     df_stock["ROA: " + fipc.get('text')] = craw.stock_info.get('roa_list')[idx]
    #     df_stock["ROE: " + fipc.get('text')] = craw.stock_info.get('roe_list')[idx]
    #     df_stock["กำไรต่อหุ้น: " + fipc.get('text')] = craw.stock_info.get('eps_list')[idx]

    # for idx, fisc in enumerate(craw.financial_statistics_cols_header):
    #     df_stock["P/E: " + fisc.get('text')] = craw.stock_info.get('pe_list')[idx]
    #     df_stock["BV: " + fisc.get('text')] = craw.stock_info.get('bv_list')[idx]
    #     df_stock["P/BV: " + fisc.get('text')] = craw.stock_info.get('p_bv_list')[idx]


    # df_stock.append(df_stock)

    print('outer index', index)
    # for i, year in enumerate(fipc_years):
    #     if i % 2 == 0:
    #         print('current i: ', i, ' i % 2: ', i % 2)
    #         df_stock['กำไรสุทธิ: ' + year] = []
    #     else:
    #         print('current i: ', i, ' ! i % 2: ', i % 2)
    #         df_stock['ROA: ' + year] = []

    unsorted_stock_keys = {}
    for i, year in enumerate(fipc_years):
        unsorted_stock_keys['1. กำไรสุทธิ ' + year] = craw.stock_info.get('profit_list')[i]
        unsorted_stock_keys['2. ROA ' + year] = craw.stock_info.get('roa_list')[i]
        unsorted_stock_keys['3. ROE ' + year] = craw.stock_info.get('roe_list')[i]
        unsorted_stock_keys['4. กำไรต่อหุ้น ' + year] = craw.stock_info.get('eps_list')[i]

    # Create array of custom key order
    # custom_key_order = []
    # for year in fipc_years:
    #     custom_key_order.append()

    if index == len(stock_list) - 1:
        # print('unsorted', unsorted_stock_keys.keys())
        # for key in unsorted_stock_keys.keys():
        #     if key.startswith

        # unsorted_stock_keys.keys()

        sorted_stock_keys = {}

        # custom_key_order = ['กำไรสุทธิ', 'ROA']
        # print('custom sorted', sorted(unsorted_stock_keys.items(), key=lambda i:custom_key_order.index(i[0])))

        # sorted_list = sorted(list.items(), key=lambda i:custom_key_order.index(i[0]))

        # for key in sorted(unsorted_stock_keys.items(), key=lambda i:custom_key_order.index(i[0])):
        #     sorted_stock_keys[key] = []
        #     sorted_stock_keys[key] = []

        for index_key, key in enumerate(sorted(unsorted_stock_keys.keys())):
            sorted_stock_keys[key] = unsorted_stock_keys[key]
            print('unsorted stock key: ', unsorted_stock_keys[key][index_key])

        print('sorted stock keys', sorted_stock_keys)

        df_stock = {**df_stock, **sorted_stock_keys}

print(df_stock)


# print("fipc_years", fipc_years)
# print("fisc_years", fisc_years)
# print("fasc_years", fasc_years)

df = pd.DataFrame(data=df_stock)
df

outer index 0
outer index 1
unsorted stock key:  1
unsorted stock key:  6
unsorted stock key:  ,
unsorted stock key:  3


IndexError: string index out of range

In [20]:
cols = {
    'industry_group': ['a', 'v'],
    'business_type': ['1', '2'],
    'stock_name': ['3', 4]
}

cols['industry_group'].append('xx')
cols['business_type'].append('44')
cols['stock_name'].append('22')

print(cols)

df = pd.DataFrame(data=cols)
df

{'industry_group': ['a', 'v', 'xx'], 'business_type': ['1', '2', '44'], 'stock_name': ['3', 4, '22']}


Unnamed: 0,industry_group,business_type,stock_name
0,a,1,3
1,v,2,4
2,xx,44,22


In [88]:
list = {
    'xl': [],
    's': []
}


# for item in ['m', 'l']:
#     print('pushing key: ', item)
#     list[item] = []
    
# print('tempList', sorted(tempList.keys()))

newTempList = {}

# list_tuples = sorted(list.items(), key=lambda x: len(x[0]))

# for elem in list_tuples:
#     print('elem: ', elem)

# custom_key_order = ['s', 'm', 'l', 'xl']

# sorted_list = sorted(list.items(), key=lambda i:custom_key_order.index(i[0]))

for key in sorted(list):
    newTempList['a_' + key] = list[key]

print(newTempList)

list = {**list, **newTempList}

print(list)

# for k in sorted(tempList.keys()):
#     newTempList[k] = []

# print('sorted newTempList ', newTempList)



# print('list', list)



{'a_s': [], 'a_xl': []}
{'xl': [], 's': [], 'a_s': [], 'a_xl': []}
