In [14]:
# Notebook bootstrap: make the parent directory importable, initialise Django,
# then pull in the data/HTTP libraries used by the crawler below.
import sys,os
# Add the parent directory to the import path; this must run before
# django.setup() (presumably so the Django project package can be found —
# TODO confirm the notebook's location relative to the project root).
sys.path.append("..")
import django
# NOTE(review): django.setup() needs the Django settings to be configured
# already (e.g. via the DJANGO_SETTINGS_MODULE environment variable); nothing
# in this cell sets it, so it is assumed to come from the environment.
django.setup()
import pandas as pd
from io import StringIO
import requests
import datetime
# Show every row when a DataFrame is displayed instead of truncating the output.
pd.set_option("display.max_rows", None)

In [35]:
class CrawlStockIndexPriceTW:
    """Crawl one day's Taiwan stock index quotes from TWSE and TPEx.

    Two CSV sources are scraped: the TWSE ``MI_INDEX`` report (``sii``) and
    the TPEx index-summary download (``otc``).  Every crawler returns a
    DataFrame with the columns ``stock_id``, ``date``, ``index_price`` and
    ``quote_change`` (rows with a missing value in any of them are dropped),
    or ``None`` when the response holds no usable table.
    """

    # Seconds to wait on each HTTP request; without a timeout ``requests``
    # blocks forever when the remote server stalls.
    REQUEST_TIMEOUT = 30

    # Column layout shared by every DataFrame the crawlers return.
    OUTPUT_COLUMNS = ['stock_id', 'date', 'index_price', 'quote_change']

    def __init__(self, date):
        """Store the day to crawl.

        Args:
            date: Date-like object providing ``strftime`` (for example
                ``datetime.date`` or ``datetime.datetime``).
        """
        self.date = date
        self.date_str = date.strftime("%Y%m%d")
        self.target_name = "台股指數資訊"
        self.sub_market = ["sii", "otc"]

    def crawl_sii(self):
        """Download and parse the TWSE index report for ``self.date``.

        Returns:
            DataFrame in the ``OUTPUT_COLUMNS`` layout with ``stock_id``
            prefixed by ``'上市'``, or ``None`` if the response contains no
            table rows.

        Raises:
            requests.exceptions.RequestException: On network failure or
                when ``REQUEST_TIMEOUT`` is exceeded.
        """
        url = ('http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date='
               + self.date_str + '&type=IND')
        response = requests.post(url, timeout=self.REQUEST_TIMEOUT)
        return self._parse_sii(response.text)

    def crawl_otc(self):
        """Download and parse the TPEx index summary for ``self.date``.

        Returns:
            DataFrame in the ``OUTPUT_COLUMNS`` layout with ``stock_id``
            prefixed by ``'上櫃'``, or ``None`` if the response contains no
            parseable table.

        Raises:
            requests.exceptions.RequestException: On network failure or
                when ``REQUEST_TIMEOUT`` is exceeded.
        """
        # The endpoint takes the date as year-minus-1911/month/day.
        roc_year = str(int(self.date.strftime("%Y")) - 1911)
        date_str = roc_year + "/" + self.date.strftime("%m") + "/" + self.date.strftime("%d")
        link = ('http://www.tpex.org.tw/web/stock/aftertrading/index_summary/'
                'summary_download.php?l=zh-tw&d=' + date_str + '&s=0,asc,0')
        response = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        return self._parse_otc(response.text)

    def crawl_main(self):
        """Crawl both markets and stack the results into one DataFrame.

        ``pd.concat`` skips a ``None`` entry, so a single missing market
        still yields the other market's rows.

        Returns:
            The concatenated DataFrame, or ``None`` when both crawlers return
            ``None`` or when a ``ValueError`` is raised while crawling or
            concatenating.
        """
        try:
            return pd.concat([self.crawl_sii(), self.crawl_otc()])
        except ValueError:
            return None

    def _parse_sii(self, text):
        """Turn the raw TWSE CSV text into the output layout (``None`` if empty)."""
        content = text.replace('=', '')
        # Keep only lines that contain at least five '",' separators; other
        # lines (titles, notes, blanks) are discarded.
        rows = [line for line in content.split('\n') if len(line.split('",')) > 5]
        if not rows:
            return None

        df = pd.read_csv(StringIO("\n".join(rows)))
        # Cast to str first so the .str accessor works on every column, then
        # strip thousands separators.
        df = df.astype(str).apply(lambda s: s.str.replace(',', ''))
        df = df.rename(columns={'指數': 'stock_id', '收盤指數': 'index_price',
                                '漲跌百分比(%)': 'quote_change'})
        df['stock_id'] = '上市' + df['stock_id']
        return self._finalize(df)

    def _parse_otc(self, text):
        """Turn the raw TPEx CSV text into the output layout (``None`` if no table)."""
        lines = text.replace("\r", "").split("\n")
        # lines[0:2] are skipped, lines[2] is the header row and the data
        # starts at lines[3]; a shorter response cannot contain a table.
        if len(lines) < 4:
            return None
        try:
            df = pd.read_csv(StringIO("\n".join(lines[3:])), header=None)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            return None
        df.columns = [name.replace(" ", "") for name in lines[2].split(",")]
        # Cast to str before using .str: read_csv infers a numeric dtype for a
        # column whose values are all plain numbers, and .str would then raise.
        df = df.apply(lambda s: s.astype(str).str.replace(",", ""))

        # Rows from the second '櫃買指數' entry downwards are return indices.
        # The set of published indices differs between years, so the boundary
        # is located by searching the names rather than by a fixed row number.
        names = df['指數']
        found = names.str.find('櫃買指數').fillna(-1) > -1
        marker_positions = [pos for pos, hit in enumerate(found) if hit]
        if len(marker_positions) >= 2:
            split_at = marker_positions[-1]
        else:
            # NOTE(review): with fewer than two markers there is no second
            # entry to start a return-index section, so every row is kept as
            # a regular index — confirm against data from earlier years.
            split_at = len(df)

        # Normalise each name to '上櫃<name>指數' (regular section) or
        # '上櫃<name>報酬指數' (return section).
        suffixes = ['報酬指數' if pos >= split_at else '指數' for pos in range(len(df))]
        df['stock_id'] = ('上櫃' + names.str.replace('指數', '')
                          + pd.Series(suffixes, index=df.index))

        df = df.rename(columns={'收市指數': 'index_price', '漲跌幅度': 'quote_change'})
        return self._finalize(df)

    def _finalize(self, df):
        """Coerce the numeric columns, stamp the date, select the output columns and drop incomplete rows."""
        numeric_cols = ['index_price', 'quote_change']
        # Values that are not numbers become NaN and are removed by dropna().
        df[numeric_cols] = df[numeric_cols].apply(lambda s: pd.to_numeric(s, errors='coerce'))
        df['date'] = pd.to_datetime(self.date).date()
        return df.loc[:, self.OUTPUT_COLUMNS].dropna()
   
    
    
# Ad-hoc check: crawl the OTC index table for 2020-04-16 (this performs a live
# HTTP request) and display the resulting DataFrame as the cell output.
z=CrawlStockIndexPriceTW(datetime.datetime(2020,4,16)).crawl_otc()   
z

Unnamed: 0,stock_id,date,index_price,quote_change
0,上櫃化學工業指數,2020-04-16,80.88,0.07
1,上櫃文化創意業指數,2020-04-16,98.62,-0.05
2,上櫃半導體業指數,2020-04-16,66.48,-0.2
3,上櫃生技醫療指數,2020-04-16,102.88,1.61
4,上櫃光電業指數,2020-04-16,18.77,1.35
5,上櫃其他指數,2020-04-16,85.46,0.41
6,上櫃其他電子業指數,2020-04-16,94.06,0.71
7,上櫃建材營造指數,2020-04-16,118.63,-0.62
8,上櫃紡織纖維指數,2020-04-16,85.35,-0.45
9,上櫃航運業指數,2020-04-16,71.74,0.94


In [36]:
# Scratch check: len() of a dict is its number of keys.
a = {'s': 1}
len(a)

1