In [1]:
import requests
import pandas as pd
import sqlite3

import datetime
import time

In [2]:
#上市資料
def crawl_price(date):
    #設定日期＆get data
    datestr = date.strftime('%Y%m%d')
    
    try:
        df = pd.read_html(f'https://www.twse.com.tw/exchangeReport/MI_INDEX?response=html&date={datestr}&type=ALLBUT0999')
    except:
        return None
        
    #取得only股票的
    newdf = df[8]

    #整理資料
    newdf.columns = newdf.columns.get_level_values(2)
    newdf.drop(columns='本益比', inplace=True)
    newdf['漲跌價差'] = newdf['漲跌(+/-)'] + newdf['漲跌價差'].astype(str)
    newdf.drop(columns='漲跌(+/-)',inplace=True)
    newdf['date'] = date
    newdf.drop(columns=['最後揭示買量','最後揭示賣量'], inplace=True)
    newdf.rename(columns={'證券代號':'stock_id', 
                          '證券名稱':'stock_name', 
                          '成交股數':'volume', 
                          '開盤價':'open', 
                          '最高價':'high',
                          '最低價':'low',
                          '收盤價':'close',
                          '最後揭示買價':'最後買價',                          
                          '最後揭示賣價':'最後賣價',
                          '漲跌價差':'漲跌'
                         }, inplace=True)
    #newdf.set_index(['date', 'stock_id', 'stock_name'], inplace=True)
    dlist = ['date', 'stock_id', 'stock_name', 'volume', 'open', 'high',
            'low', 'close', '漲跌', '成交筆數', '成交金額' ,'最後買價' , '最後賣價' ]
    newdf = newdf[dlist]
    newdf['date'] = pd.to_datetime(newdf.date)
    newdf = newdf.set_index(['stock_id','stock_name','date'])
    return newdf


#上櫃資料
def crawl_price_counter(date):
    date_rc = f"{date.year-1911}/{str(date)[5:7]}/{str(date)[8:10]}"
    df = pd.read_html(f"https://www.tpex.org.tw/web/stock/aftertrading/daily_close_quotes/stk_quote_result.php?l=zh-tw&o=htm&d={date_rc}&s=0,asc,0")[0]

    df.columns = df.columns.get_level_values(1)
    df['收盤'] = pd.to_numeric(df['收盤'],errors='coerce')
    df = df.dropna()
    if len(df)==0:
        #print(f"{date.strftime('%Y-%m-%d')} cannot get data")
        return None

    df = df[df.columns[0:15]]
    df = df.drop(columns='均價')
    df.rename(columns={'代號':'stock_id', 
                           '名稱':'stock_name', 
                           '成交股數':'volume', 
                           '開盤':'open', 
                           '最高':'high',
                           '最低':'low',
                           '收盤':'close',
                           '成交金額(元)':'成交金額'
                               }, inplace=True)
    df['date'] = date
    dlist = ['date', 'stock_id', 'stock_name', 'volume', 'open', 'high',
             'low', 'close', '漲跌', '成交筆數', '成交金額' , '最後買價', '最後賣價']
    df = df[dlist]
    df['date'] = pd.to_datetime(df.date)
    df = df.set_index(['stock_id','stock_name','date'])
    return df





%%time
combine_price_data(datetime.date(2021,7,14))

%%time
crawl_price(datetime.date(2021,7,9))

%%time
pd.concat([crawl_price(datetime.date(2021,7,14)),crawl_price_counter(datetime.date(2021,7,14))])

In [3]:
%%time

#date = datetime.date(2021,7,5)
def combine_price_data(date):
    try: 
        df = pd.concat([crawl_price(date), crawl_price_counter(date)])
        print(f'{date} success')
        return df
    except Exception as e:
        print(f'{date} cannot get data')
        return None
    
def get_price_data(start, end):
    result = pd.DataFrame()
    for d in range(int((end -start).days)+1):
        try:
            date = start+datetime.timedelta(d)
            result = result.append(combine_price_data(date), sort=False)
            time.sleep(3)
        except Exception as e:
            print(e)
            
    return result

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


# 最新版＿更新資料

In [4]:
df = pd.read_csv('price_backup.csv', index_col=['stock_id', 'stock_name','date'])
data_start = datetime.datetime.strptime(str(df.index.get_level_values(2)[-1]), "%Y-%m-%d %H:%M:%S").date()
end = datetime.date.today()
df_new = get_price_data(data_start+datetime.timedelta(1), end)
df = df.append(df_new)

#df = df.sort_index(level=2)
df.to_csv('price_backup.csv')
print('copy_success')


2021-09-08 success
copy_success


# 若出錯的更新資料庫

In [5]:
import pandas as pd
import sqlite3
df = pd.read_csv('price_backup.csv', index_col=['stock_id','stock_name','date'])
con = sqlite3.connect('mydata.db')
df.to_sql('price', con, if_exists='replace')
print('sqlite_success')

sqlite_success


In [6]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,volume,open,high,low,close,漲跌,成交筆數,成交金額,最後買價,最後賣價
stock_id,stock_name,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0015,富邦,2013-01-02 00:00:00,12000.0,7.55,7.64,7.55,7.64,+0.04,7.0,90990.0,7.60,7.63
0050,元大台灣50,2013-01-02 00:00:00,16487837.0,54.00,54.65,53.90,54.40,+0.6,3320.0,896311193.0,54.40,54.45
0051,元大中型100,2013-01-02 00:00:00,277083.0,25.75,26.10,25.75,26.09,+0.34,98.0,7204613.0,26.00,26.09
0052,富邦科技,2013-01-02 00:00:00,26000.0,32.30,32.76,32.30,32.72,+0.77,8.0,848930.0,32.64,32.80
0053,元大電子,2013-01-02 00:00:00,42109.0,23.30,23.30,22.92,23.26,+0.36,34.0,973619.0,23.01,23.23
...,...,...,...,...,...,...,...,...,...,...,...,...
9949,琉園,2021-09-01 00:00:00,45245.0,10.80,10.80,10.65,10.75,-0.05,48.0,486236.0,10.7,10.75
9950,萬國通,2021-09-01 00:00:00,20000.0,8.69,8.69,8.41,8.51,-0.18,13.0,171530.0,8.51,8.67
9951,皇田,2021-09-01 00:00:00,202710.0,87.20,88.40,87.20,88.3,+1.30,207.0,17844331.0,88.3,88.40
9960,邁達康,2021-09-01 00:00:00,56004.0,27.00,27.25,26.85,27.1,+0.35,37.0,1515312.0,27.1,27.15


## check data 

In [43]:
datetime.datetime.strptime(str(df.index.get_level_values(2)[-1]), "%Y-%m-%d %H:%M:%S").date()

datetime.date(2021, 7, 15)

# 最新版＿更新舊資料

In [8]:
df.index.get_level_values(2)[0]

'2019-01-02'

In [None]:
start = datetime.date(2012,5,31)
df = pd.read_csv('price_backup.csv',index_col=['stock_id','stock_name','date' ])
data_end = datetime.datetime.strptime(str(df.index.get_level_values(2)[0]), "%Y-%m-%d %H:%M:%S").date()
df_new = get_price_data(start, data_end-datetime.timedelta(1))
df = df.append(df_new)
df = df.sort_index(level=2)
df.to_csv('price_backup.csv')
print('csv_complete')

try:
    con = sqlite3.connect('mydata.db')
    df.to_sql('price', con, if_exists='replace')
    print('sqlite_complete')
except Exception as e:
    print(e)

# 存入資料庫＿新版

In [None]:
df

In [12]:
import sqlite3
df = pd.read_csv('price_backup.csv', index_col=['stock_id','stock_name','date'])
con = sqlite3.connect('mydata.db')
df.to_sql('price', con, if_exists='replace')

In [11]:
pd.read_sql('SELECT * FROM price', con, index_col=['date', 'stock_id','stock_name']).sort_index(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,volume,open,high,low,close,漲跌,成交筆數,成交金額,最後買價,最後賣價
date,stock_id,stock_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-10-31 00:00:00,0050,元大台灣50,0,2131434,85.00,85.35,84.80,85.20,+0.25,552,181429580,85.15,85.20
2017-10-31 00:00:00,0051,元大中型100,1,8000,32.17,32.17,31.99,32.00,+0.2,4,256240,32.0,32.08
2017-10-31 00:00:00,0052,富邦科技,2,15000,55.70,56.65,55.70,56.65,+1.0,6,839150,56.35,56.65
2017-10-31 00:00:00,0053,元大電子,3,22000,38.60,38.60,38.43,38.60,+0.27,10,847720,38.52,38.60
2017-10-31 00:00:00,0054,元大台商50,4,4000,25.30,25.32,25.30,25.30,+0.01,3,101220,25.31,25.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-28 00:00:00,9951,皇田,4181298,884003,99.50,99.50,95.00,96.4,-2.60,579,85547378,96.40,96.50
2021-07-28 00:00:00,9955,佳龍,4177079,278287,18.55,18.55,17.55,17.90,-0.3,260,4985327,17.85,17.95
2021-07-28 00:00:00,9958,世紀鋼,4177080,2701980,121.50,125.00,118.50,123.00,+2.0,2023,329866609,122.50,123.00
2021-07-28 00:00:00,9960,邁達康,4181299,20000,27.35,27.35,26.80,27.15,-0.15,16,540550,27.10,27.20


# update

In [9]:
import sqlite3
import os


def update(date):
    con = sqlite3.connect('mydata.db')
    df = pd.read_sql('SELECT * FROM price', con, index_col=['stock_id','stock_name', 'date' ]) 
    #df = df.drop_duplicates()
    df = df.sort_values(by='date')
    #get 最後一天日期
    start = df.index[-1][2]
    start = datetime.datetime.strptime(start, "%Y-%m-%d %H:%M:%S").date()
    
    #end = datetime.date(date)
    
    df = df.append(get_price_data(start, date))
    df.drop_duplicates(inplace=True)
    #df.sort_index(level='date', inplace=True)
    df.to_csv('backup.csv')
    try:
        df.to_sql('price',con , if_exists='replace')
    except Exception as e:
        print(e)
    print("Success")
    return df
   
def update_new(date):
    con = sqlite3.connect('mydata.db')
    df = pd.read_sql('SELECT * FROM price', con, index_col=['date']) 
    df = df.sort_index(level='date')
    #get 最後一天日期
    start = df.index[-1][2]
    start = datetime.datetime.strptime(start, "%Y-%m-%d %H:%M:%S").date()
    
    #end = datetime.date(date)
    
    df = df.append(get_price_data(start, date))
    df.drop_duplicates(inplace=True)
    #df.sort_index(level='date', inplace=True)
    df.to_csv('price_backup.csv')
    print("Success")
    return df



In [10]:
%%time
end =datetime.date.today()
update(end)

2021-07-26 success
2021-07-27 success
2021-07-28 cannot get data


InterfaceError: Error binding parameter 2 - probably unsupported type.

## 若出錯，則利用備用資料重新整理

In [2]:
import sqlite3
import pandas as pd
con = sqlite3.connect('mydata.db')
backup = pd.read_csv('backup.csv')
backup.set_index(['stock_id', 'stock_name', 'date'], drop=True, inplace=True)
backup.to_sql('price', con ,if_exists='replace')
df = pd.read_sql('SELECT * FROM price', con, index_col=['stock_id','stock_name', 'date']) 
#df = df.drop_duplicates()
#df = df.sort_values(by='date')
#get 最後一天日期
start = df.index[-1][2]

ParserError: Error tokenizing data. C error: Expected 9 fields in line 249820, saw 14


In [2]:
backup = pd.read_csv('backup.csv')

In [4]:
backup.sort_values('date')

Unnamed: 0,stock_id,stock_name,date,外資買進,外資賣出,外資買賣超,投信買進,投信賣出,投信買賣超
789,4190,佐登-KY,2020-01-02 00:00:00,181.0,42.0,139.0,,,
783,4157,太景*-KY,2020-01-02 00:00:00,76.0,90.0,-14.0,,,
782,4147,中裕,2020-01-02 00:00:00,2.0,95.0,-93.0,,,
781,4144,康聯-KY,2020-01-02 00:00:00,17.0,2.0,15.0,,,
780,4142,國光生,2020-01-02 00:00:00,170.0,784.0,-614.0,,,
...,...,...,...,...,...,...,...,...,...
91,3714,富采,2021-07-28 00:00:00,,,,165.0,0.0,165.0
90,3711,日月光投控,2021-07-28 00:00:00,,,,137.0,111.0,26.0
89,3707,漢磊,2021-07-28 00:00:00,,,,30.0,1073.0,-1043.0
95,4915,致伸,2021-07-28 00:00:00,,,,1200.0,0.0,1200.0


# 更新至最新資料

In [36]:
update_data.reset_index(inplace=True)

In [6]:
backup = pd.read_csv('backup.csv')

In [8]:
con = sqlite3.connect('mydata.db')
backup.to_sql('price', con, if_exists='replace')

# 日期倒帶回去

In [55]:
start = datetime.date(2017,12,25)
end = datetime.datetime.strptime(df.index[0][-1], "%Y-%m-%d %H:%M:%S").date()

In [6]:
def update_back(date):
    con = sqlite3.connect('mydata.db')
    df = pd.read_sql('SELECT * FROM price', con, index_col=['stock_id', 'stock_name', 'date']) 
    #df = df.drop_duplicates()
    df = df.sort_index(level='date')
    
    #get 最前面一天日期
    end = datetime.datetime.strptime(df.index[0][-1], "%Y-%m-%d %H:%M:%S").date()
    
    #end = datetime.date(date)
    df = df.append(get_price_data(date, end))
    df.drop_duplicates(inplace=True)
    df.sort_index(level='date', inplace=True)
    
    df.to_sql('price', con, if_exists='replace')
    
    print("Success")
    df.to_csv('backup.csv')

In [7]:
%%time
update_back(datetime.date(2013,12,31))

2013-12-31 success
2014-01-01 cannot get data
2014-01-02 success
2014-01-03 success
2014-01-04 cannot get data
2014-01-05 cannot get data
2014-01-06 success
2014-01-07 success
2014-01-08 success
2014-01-09 success
2014-01-10 success
2014-01-11 cannot get data
2014-01-12 cannot get data
2014-01-13 success
2014-01-14 success
2014-01-15 success
2014-01-16 success
2014-01-17 success
2014-01-18 cannot get data
2014-01-19 cannot get data
2014-01-20 success
2014-01-21 success
2014-01-22 success
2014-01-23 success
2014-01-24 success
2014-01-25 cannot get data
2014-01-26 cannot get data
2014-01-27 success
2014-01-28 cannot get data
2014-01-29 cannot get data
2014-01-30 cannot get data
2014-01-31 cannot get data
2014-02-01 cannot get data
2014-02-02 cannot get data
2014-02-03 cannot get data
2014-02-04 cannot get data
2014-02-05 success
Success
CPU times: user 3min 55s, sys: 1min 40s, total: 5min 36s
Wall time: 9min 33s


In [7]:
con = sqlite3.connect('mydata.db')
df = pd.read_sql('SELECT * FROM price', con, index_col=['stock_id', 'stock_name', 'date']) 
#df = df.drop_duplicates()
df = df.sort_index(level='date')

#get 最前面一天日期
end = datetime.datetime.strptime(df.index[0][-1], "%Y-%m-%d %H:%M:%S").date()

In [8]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,volume,open,high,low,close,漲跌,成交筆數,成交金額,最後買價,最後賣價
stock_id,stock_name,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0050,元大台灣50,2016-08-01 00:00:00,10407371,68.80,69.45,68.80,69.30,+0.75,1682,720963537,69.25,69.30
0051,元大中型100,2016-08-01 00:00:00,33670,26.31,26.51,26.31,26.40,+0.19,25,890397,26.4,26.46
0052,富邦科技,2016-08-01 00:00:00,17070,42.84,42.96,42.84,42.96,+0.51,7,732917,42.76,42.92
0053,元大電子,2016-08-01 00:00:00,92000,30.05,30.15,30.01,30.15,+0.25,36,2768310,30.15,30.18
0054,元大台商50,2016-08-01 00:00:00,15000,21.78,22.06,21.78,22.00,+0.22,7,329110,21.93,22.00
...,...,...,...,...,...,...,...,...,...,...,...,...
9951,皇田,2021-07-15 00:00:00,467042,97.30,97.90,96.50,96.6,-0.60,421,45349095,96.60,96.70
9955,佳龍,2021-07-15 00:00:00,256031,18.10,18.70,18.05,18.60,+0.6,182,4733216,18.60,18.65
9958,世紀鋼,2021-07-15 00:00:00,2642784,127.00,132.00,126.50,131.00,+4.5,1922,342802180,131.00,131.50
9960,邁達康,2021-07-15 00:00:00,11728,27.45,27.75,27.45,27.7,0.00,12,323620,27.60,27.75


# 整理只有股票的

In [39]:
new = df[df.index.get_level_values(0).str.len()==4]
#new[new.index.get_level_values(0).str.len()==5]

In [43]:
new.to_csv('only_stock.csv')