In [None]:
import logging
import requests
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup
from datetime import datetime

# Historical

In [None]:
def stringReplace(pdf, str_columns):
    """Return a copy of ``pdf`` with non-breaking spaces removed from the given columns.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Source frame; it is copied and never modified.
    str_columns : iterable
        Labels of the columns to clean. Each one is processed through the
        ``.str`` accessor, so it has to hold string data.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    cleaned = pdf.copy()
    nbsp = u'\xa0'
    for name in str_columns:
        # Drop every U+00A0 character from the strings of this column.
        cleaned[name] = cleaned[name].str.replace(nbsp, u'')
    return cleaned

def toNumeric(x):
    """Convert a value to ``float``, returning ``None`` when it is not numeric.

    The value is rendered with ``str()`` and thousands separators (``,``) are
    removed before parsing.

    Parameters
    ----------
    x : object
        Value to convert, typically the text of a table cell.

    Returns
    -------
    float or None
        The parsed number, or ``None`` if the text cannot be parsed as a float
        (empty string, date-like text, ...).
    """
    try:
        return float(str(x).replace(',', ''))
    except (TypeError, ValueError):
        # Only conversion failures mean "no number"; anything else
        # (e.g. KeyboardInterrupt) must keep propagating.
        return None
    
def strToFloat(pdf, num_cols):
    """Return a copy of ``pdf`` whose ``num_cols`` columns are parsed with ``toNumeric``.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Source frame; it is copied and never modified.
    num_cols : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        Copy in which every listed column holds the ``toNumeric`` result of the
        original values (unparseable entries become missing).
    """
    converted = pdf.copy()
    for col in num_cols:
        # Replace the column outright instead of writing through
        # ``.loc[:, col]``: newer pandas versions try to store such an
        # assignment inside the existing (string/object) column, which can
        # leave the parsed numbers in an object-dtype column.
        converted[col] = converted[col].apply(toNumeric)
    return converted

def toAdDate(roc_dt):
    """Shift the year of a ``'/'``-separated date string by +1911.

    The input is expected to look like ``'110/02/01'`` (optionally followed by
    more text after the day, e.g. a time); the first field is the year, which
    is converted with ``int(year) + 1911``. Month and day are passed through
    unchanged.

    Parameters
    ----------
    roc_dt : str or None
        Date text to convert.

    Returns
    -------
    str or None
        ``'<year+1911>/<month>/<day>'``, or ``None`` when the input is missing
        (not a string) or contains nothing but whitespace.

    Raises
    ------
    IndexError
        If the text has fewer than three ``'/'``-separated fields.
    ValueError
        If the year field is not an integer.
    """
    # Missing values and blank cells carry no date. A whitespace-only string
    # occurs when two empty cells are joined with ' ' before conversion.
    if not isinstance(roc_dt, str) or roc_dt.strip() == '':
        return None
    parts = roc_dt.split('/')
    roc_year, month, day = parts[0], parts[1], parts[2]
    ad_year = str(int(roc_year) + 1911)
    return '/'.join([ad_year, month, day])

# Scrape one result page per market type and year and collect one record per
# table row; the records end up in the ``pdf`` DataFrame.
result = []
type_list = ['sii', 'otc']
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
url = "https://mops.twse.com.tw/mops/web/ajax_t108sb27?encodeURIComponent=1&step=1&firstin=1&off=1&keyword4=&code1=&TYPEK2=&checkbtn=&queryName=&TYPEK=%s&co_id_1=&co_id_2=&year=%s&month=&b_date=&e_date=&type="
start_year = 2000
end_year = 2021


def _parse_dividend_row(cells, roc_year):
    """Build one record from the ``<td>`` cells of a result row.

    The cell positions depend on the queried year (the value sent in the
    ``year`` query parameter, i.e. calendar year minus 1911):

    * up to 104: cash-dividend cells start at index 11, capital-increase cells
      at index 16, announcement date/time are cells 20 and 21;
    * 105 to 107: cash-dividend cells start at 7, capital-increase cells at 11,
      announcement date/time are cells 14 and 15;
    * 108 and later: as 105-107, plus '參加分派總股數' in cell 14, which moves
      the announcement date/time to cells 15 and 16.

    Raises ``IndexError`` if the row has fewer cells than the layout needs.
    """
    text = [cell.text for cell in cells]
    if roc_year > 104:
        cash_start, capital_start = 7, 11
    else:
        cash_start, capital_start = 11, 16
    if roc_year > 107:
        total_shares, announce_at = text[14], 15
    elif roc_year > 104:
        # Cell 14 is the announcement date in this layout, not a share count,
        # so the share-count field is left empty.
        total_shares, announce_at = '', 14
    else:
        total_shares, announce_at = '', 20

    # Key order defines the column order of the resulting DataFrame.
    data = dict()
    data['股票代號'] = text[0]
    data['股利所屬期間'] = text[2]
    data['權利分派日'] = text[3]
    data['股票股利盈餘'] = text[4]
    data['股票股利公積'] = text[5]
    data['除權交易日'] = text[6]
    data['現金股利盈餘'] = text[cash_start]
    data['現金股利公積'] = text[cash_start + 1]
    data['除息交易日'] = text[cash_start + 2]
    data['現金股利發放日'] = text[cash_start + 3]
    data['現金增資總股數'] = text[capital_start]
    data['現金增資認股比率'] = text[capital_start + 1]
    data['現金增資認購價'] = text[capital_start + 2]
    data['參加分派總股數'] = total_shares
    data['公告時間'] = text[announce_at] + ' ' + text[announce_at + 1]
    return data


for t in type_list:
    print(t)
    for year in range(start_year, end_year+1):
        print(year)
        # The endpoint is queried with the calendar year minus 1911.
        roc_year = year - 1911
        # A timeout keeps a stalled connection from hanging the whole scrape.
        resp = requests.get(url % (t, roc_year), headers=headers, timeout=30)
        # Fail loudly on an HTTP error instead of silently recording no rows
        # for this year.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        rows = soup.find_all("tr", class_="odd") + soup.find_all("tr", class_="even")
        for row in rows:
            result.append(_parse_dividend_row(row.find_all("td"), roc_year))
        # Pause between consecutive requests.
        sleep(3)
pdf = pd.DataFrame(result).sort_values(['股票代號', '除息交易日'], ascending=False)
    
# Clean the scraped text: strip non-breaking spaces everywhere, blank out the
# period when it contains '不' (otherwise drop full-width spaces), parse the
# numeric columns and convert every date column with toAdDate.
new_pdf = stringReplace(pdf, pdf.columns)
new_pdf['股利所屬期間'] = new_pdf['股利所屬期間'].apply(
    lambda x: '' if '不' in x else x.replace('\u3000', '')
)
new_pdf = strToFloat(
    new_pdf,
    [
        '股票股利盈餘', '股票股利公積',
        '現金股利盈餘', '現金股利公積',
        '現金增資總股數', '現金增資認股比率', '現金增資認購價',
        '參加分派總股數',
    ],
)
for date_column in ['權利分派日', '除權交易日', '除息交易日', '現金股利發放日', '公告時間']:
    new_pdf[date_column] = new_pdf[date_column].apply(toAdDate)

# Output to file: one CSV per year for rows that have a 除權交易日 (stock
# dividends) and one per year for rows that have a 除息交易日 (cash dividends).
# The year is the first '/'-separated field of the respective date.
columns = ['股票代號', '除權交易日', '權利分派日', '股票股利盈餘', '股票股利公積', '公告時間']
stock_dividend = new_pdf[~new_pdf['除權交易日'].isnull()][columns]
stock_dividend = stock_dividend.drop_duplicates()
stock_dividend['年份'] = stock_dividend['除權交易日'].map(lambda x: x.split('/')[0])
for year in stock_dividend['年份'].unique():
    # ``columns=`` instead of a positional axis: pandas 2.0 no longer accepts
    # ``drop(label, 1)``.
    stock_dividend[stock_dividend['年份']==year].drop(columns='年份').to_csv(f'stock_dividend_{year}.csv', index = False)

columns = ['股票代號', '除息交易日', '現金股利發放日', '現金股利盈餘', '現金股利公積', '公告時間']
cash_dividend = new_pdf[~new_pdf['除息交易日'].isnull()][columns]
cash_dividend = cash_dividend.drop_duplicates()
cash_dividend['年份'] = cash_dividend['除息交易日'].map(lambda x: x.split('/')[0])
for year in cash_dividend['年份'].unique():
    cash_dividend[cash_dividend['年份']==year].drop(columns='年份').to_csv(f'cash_dividend_{year}.csv', index = False)

# Daily Update

In [None]:
import logging
import requests
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup
from datetime import datetime

# Configure root logging: records of level INFO and above, each prefixed with
# its timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

def stringReplace(pdf, str_columns):
    """Return a copy of ``pdf`` with non-breaking spaces removed from the given columns.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Source frame; it is copied and never modified.
    str_columns : iterable
        Labels of the columns to clean. Each one is processed through the
        ``.str`` accessor, so it has to hold string data.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    cleaned = pdf.copy()
    nbsp = u'\xa0'
    for name in str_columns:
        # Drop every U+00A0 character from the strings of this column.
        cleaned[name] = cleaned[name].str.replace(nbsp, u'')
    return cleaned

def toNumeric(x):
    """Convert a value to ``float``, returning ``None`` when it is not numeric.

    The value is rendered with ``str()`` and thousands separators (``,``) are
    removed before parsing.

    Parameters
    ----------
    x : object
        Value to convert, typically the text of a table cell.

    Returns
    -------
    float or None
        The parsed number, or ``None`` if the text cannot be parsed as a float
        (empty string, date-like text, ...).
    """
    try:
        return float(str(x).replace(',', ''))
    except (TypeError, ValueError):
        # Only conversion failures mean "no number"; anything else
        # (e.g. KeyboardInterrupt) must keep propagating.
        return None
    
def strToFloat(pdf, num_cols):
    """Return a copy of ``pdf`` whose ``num_cols`` columns are parsed with ``toNumeric``.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Source frame; it is copied and never modified.
    num_cols : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        Copy in which every listed column holds the ``toNumeric`` result of the
        original values (unparseable entries become missing).
    """
    converted = pdf.copy()
    for col in num_cols:
        # Replace the column outright instead of writing through
        # ``.loc[:, col]``: newer pandas versions try to store such an
        # assignment inside the existing (string/object) column, which can
        # leave the parsed numbers in an object-dtype column.
        converted[col] = converted[col].apply(toNumeric)
    return converted

def toAdDate(roc_dt):
    """Shift the year of a ``'/'``-separated date string by +1911.

    The input is expected to look like ``'110/02/01'`` (optionally followed by
    more text after the day, e.g. a time); the first field is the year, which
    is converted with ``int(year) + 1911``. Month and day are passed through
    unchanged.

    Parameters
    ----------
    roc_dt : str or None
        Date text to convert.

    Returns
    -------
    str or None
        ``'<year+1911>/<month>/<day>'``, or ``None`` when the input is missing
        (not a string) or contains nothing but whitespace.

    Raises
    ------
    IndexError
        If the text has fewer than three ``'/'``-separated fields.
    ValueError
        If the year field is not an integer.
    """
    # Missing values and blank cells carry no date. A whitespace-only string
    # occurs when two empty cells are joined with ' ' before conversion.
    if not isinstance(roc_dt, str) or roc_dt.strip() == '':
        return None
    parts = roc_dt.split('/')
    roc_year, month, day = parts[0], parts[1], parts[2]
    ad_year = str(int(roc_year) + 1911)
    return '/'.join([ad_year, month, day])

# Scrape the result page of the current month for each market type and collect
# one record per table row in ``result``.
result = []
type_list = ['sii', 'otc']
# Browser-like user agent sent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
url = "https://mops.twse.com.tw/mops/web/ajax_t108sb27?encodeURIComponent=1&step=1&firstin=1&off=1&keyword4=&code1=&TYPEK2=&checkbtn=&queryName=&TYPEK=%s&co_id_1=&co_id_2=&year=%s&month=%s&b_date=&e_date=&type="
# The endpoint is queried with the calendar year minus 1911 and a two-digit
# month. Both are derived from today's date; they must not be pinned to a
# fixed month, otherwise every run fetches the same page.
now_date = datetime.now()
now_year = now_date.year - 1911
now_month = str(now_date.month).zfill(2)


def _parse_dividend_row(cells, roc_year):
    """Build one record from the ``<td>`` cells of a result row.

    The cell positions depend on the queried year (the value sent in the
    ``year`` query parameter, i.e. calendar year minus 1911):

    * up to 104: cash-dividend cells start at index 11, capital-increase cells
      at index 16, announcement date/time are cells 20 and 21;
    * 105 to 107: cash-dividend cells start at 7, capital-increase cells at 11,
      announcement date/time are cells 14 and 15;
    * 108 and later: as 105-107, plus '參加分派總股數' in cell 14, which moves
      the announcement date/time to cells 15 and 16.

    Raises ``IndexError`` if the row has fewer cells than the layout needs.
    """
    text = [cell.text for cell in cells]
    if roc_year > 104:
        cash_start, capital_start = 7, 11
    else:
        cash_start, capital_start = 11, 16
    if roc_year > 107:
        total_shares, announce_at = text[14], 15
    elif roc_year > 104:
        # This layout has no share-count cell; keep the key present (empty) so
        # every record has the same set of columns.
        total_shares, announce_at = '', 14
    else:
        total_shares, announce_at = '', 20

    # Key order defines the column order of the resulting DataFrame.
    data = dict()
    data['股票代號'] = text[0]
    data['股利所屬期間'] = text[2]
    data['權利分派日'] = text[3]
    data['股票股利盈餘'] = text[4]
    data['股票股利公積'] = text[5]
    data['除權交易日'] = text[6]
    data['現金股利盈餘'] = text[cash_start]
    data['現金股利公積'] = text[cash_start + 1]
    data['除息交易日'] = text[cash_start + 2]
    data['現金股利發放日'] = text[cash_start + 3]
    data['現金增資總股數'] = text[capital_start]
    data['現金增資認股比率'] = text[capital_start + 1]
    data['現金增資認購價'] = text[capital_start + 2]
    data['參加分派總股數'] = total_shares
    data['公告時間'] = text[announce_at] + ' ' + text[announce_at + 1]
    return data


for t in type_list:
    logging.info(f'{t}-{now_year}/{now_month}')
    # A timeout keeps a stalled connection from hanging the update.
    resp = requests.get(url % (t, now_year, now_month), headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of silently recording no rows.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    rows = soup.find_all("tr", class_="odd") + soup.find_all("tr", class_="even")
    for row in rows:
        result.append(_parse_dividend_row(row.find_all("td"), now_year))
    # Pause between consecutive requests.
    sleep(3)
    
def _merge_into_yearly_csv(rows, file_path):
    """Merge ``rows`` into the CSV at ``file_path`` and drop exact duplicate rows.

    The existing file content comes first, followed by ``rows``. If the file
    does not exist yet (first record of a new year) it is created from
    ``rows`` alone.
    """
    try:
        # Read the stock code as text so it keeps any leading zeros instead of
        # being parsed as an integer; otherwise stored and new rows of the
        # same record would not compare equal.
        existing = pd.read_csv(file_path, dtype={'股票代號': str})
    except FileNotFoundError:
        merged = rows
    else:
        merged = pd.concat([existing, rows], axis=0)
    merged.drop_duplicates().to_csv(file_path, index=False)


if len(result) > 0:
    pdf = pd.DataFrame(result).sort_values(['股票代號', '除息交易日'], ascending=False)

    # data processing: strip non-breaking spaces, blank out the period when it
    # contains '不' (otherwise drop full-width spaces), parse numbers and dates.
    new_pdf = stringReplace(pdf, pdf.columns)
    new_pdf['股利所屬期間'] = new_pdf['股利所屬期間'].apply(lambda x: x.replace('\u3000', '') if '不' not in x else '')
    new_pdf = strToFloat(new_pdf, ['股票股利盈餘', '股票股利公積', '現金股利盈餘', '現金股利公積', '現金增資總股數', '現金增資認股比率', '現金增資認購價', '參加分派總股數'])
    for date_column in ['權利分派日', '除權交易日', '除息交易日', '現金股利發放日', '公告時間']:
        new_pdf[date_column] = new_pdf[date_column].apply(toAdDate)

    # Output to file: append the new rows to the per-year CSV files.
    root_path = './'
    columns = ['股票代號', '除權交易日', '權利分派日', '股票股利盈餘', '股票股利公積', '公告時間']
    # .copy() so that adding '年份' below works on an independent frame rather
    # than on a slice of new_pdf.
    stock_dividend = new_pdf[~new_pdf['除權交易日'].isnull()][columns].copy()
    stock_dividend['年份'] = stock_dividend['除權交易日'].map(lambda x: x.split('/')[0])
    for year in stock_dividend['年份'].unique():
        stock_dividend_file_name = f'stock_dividend_{year}.csv'
        # ``columns=`` instead of a positional axis: pandas 2.0 no longer
        # accepts ``drop(label, 1)``.
        new_stock_dividend = stock_dividend[stock_dividend['年份']==year].drop(columns='年份')
        _merge_into_yearly_csv(new_stock_dividend, root_path + stock_dividend_file_name)

    columns = ['股票代號', '除息交易日', '現金股利發放日', '現金股利盈餘', '現金股利公積', '公告時間']
    cash_dividend = new_pdf[~new_pdf['除息交易日'].isnull()][columns].copy()
    cash_dividend['年份'] = cash_dividend['除息交易日'].map(lambda x: x.split('/')[0])
    for year in cash_dividend['年份'].unique():
        cash_dividend_file_name = f'cash_dividend_{year}.csv'
        new_cash_dividend = cash_dividend[cash_dividend['年份']==year].drop(columns='年份')
        _merge_into_yearly_csv(new_cash_dividend, root_path + cash_dividend_file_name)

# Price Correlation

In [None]:
import os
# Load every per-stock price CSV found in root_path into one frame, ordered by
# stock code and date, and add the previous row's close of the same stock.
basic_df_list = []
root_path = '/Users/fang/stock_data/basic_data'
# sorted() makes the load order independent of the file system.
for stock in sorted(os.listdir(root_path)):
    stock_path = os.path.join(root_path, stock)
    # os.listdir also returns hidden entries (e.g. macOS '.DS_Store') and
    # sub-directories; neither can be read as a price CSV.
    if stock.startswith('.') or not os.path.isfile(stock_path):
        continue
    basic_df_list.append(pd.read_csv(stock_path))
basic_df = pd.concat(basic_df_list, axis=0, ignore_index=True, sort=False)
basic_df['日期'] = pd.to_datetime(basic_df['日期'])
basic_df = basic_df.sort_values(['股票代號', '日期'])
basic_df.loc[:, '前一天股價'] = basic_df.groupby('股票代號')['收盤價'].shift(1)

In [None]:
# Per stock, add the close of the previous row and of the next one, two and
# three rows. A positive shift looks back, a negative one looks ahead.
for column_name, offset in [
    ('before_1_day', 1),
    ('after_1_day', -1),
    ('after_2_day', -2),
    ('after_3_day', -3),
]:
    basic_df.loc[:, column_name] = basic_df.groupby('股票代號')['收盤價'].shift(offset)

In [None]:
# Prepare the join keys: stock codes as zero-padded 4-character strings and a
# datetime '日期' column on the dividend frame.
# The code column is rebound rather than written through ``.loc[:, col]``:
# storing strings in place into a column of another dtype (presumably integer
# codes read from CSV -- TODO confirm) is deprecated in recent pandas.
basic_df['股票代號'] = basic_df['股票代號'].map(lambda x: str(x).zfill(4))
# assign() returns a new frame, so this also works when stock_dividend is a
# filtered slice of another frame and the cell can be re-run safely.
stock_dividend = stock_dividend.assign(**{'日期': pd.to_datetime(stock_dividend['除權交易日'])})

In [None]:
# Left join: keep every price row and attach the dividend columns where stock
# code and date match.
joined_df = basic_df.merge(
    stock_dividend,
    how='left',
    on=['股票代號', '日期'],
)
joined_df

In [None]:
# Sum of the two stock-dividend columns divided by the previous close.
joined_df['股票殖利率'] = (
    joined_df['股票股利盈餘']
    .add(joined_df['股票股利公積'])
    .div(joined_df['前一天股價'])
)

In [None]:
from matplotlib import pyplot
# Scatter plot of '股票殖利率' (x axis) against '漲跌幅' (y axis).
pyplot.scatter(
    x=joined_df['股票殖利率'],
    y=joined_df['漲跌幅'],
)

In [None]:
# Keep only the rows that matched a dividend record (non-null '除權交易日').
has_dividend = joined_df['除權交易日'].notna()
joined_df = joined_df[has_dividend]
joined_df

In [None]:
# Show the price and dividend columns of interest side by side.
joined_df.loc[:, [
    '股票代號', '日期', '開盤價', '收盤價', '漲跌幅',
    '除權交易日', '股票股利盈餘',
    'before_1_day', 'after_1_day', 'after_2_day', 'after_3_day',
]]

In [None]:
# Look up the cash-dividend rows for a single '除息交易日'.
# `dividend_df` is not defined anywhere in this notebook, so the cell failed on
# a fresh kernel. `cash_dividend` is the frame that carries this column
# (NOTE(review): confirm this is the intended frame). Its dates are
# 'YYYY/MM/DD' strings as produced by toAdDate, so the literal uses slashes.
cash_dividend[cash_dividend['除息交易日']=='2020/08/14']