In [2]:
import sys,os
sys.path.append("..")
import django
django.setup()
import pandas as pd
from io import StringIO
import requests
import datetime

In [18]:
class CrawlStockIndexPriceTW:
    """Crawl one day's Taiwan stock index quotes from TWSE and TPEx.

    ``sii_index`` downloads the index CSV from www.twse.com.tw (names are
    prefixed with '上市') and ``otc_index`` the index summary CSV from
    www.tpex.org.tw (names are prefixed with '上櫃'); ``crawl_main`` stacks
    both results.

    Every DataFrame returned by this class has exactly the columns listed
    in ``OUTPUT_COLUMNS``: ``stock_id`` (str), ``date`` (``datetime.date``),
    ``index_price`` (float) and ``quote_change`` (float).  Rows whose price
    or change cannot be parsed as a number are dropped.
    """

    # Column order of every DataFrame handed back to callers.
    OUTPUT_COLUMNS = ['stock_id', 'date', 'index_price', 'quote_change']

    # Seconds to wait on either exchange before requests raises a timeout;
    # without it a stalled server would block the crawler indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, date):
        """Remember the trading day to crawl.

        Args:
            date: a ``datetime.date``/``datetime.datetime``-like object
                (it must provide ``strftime`` and ``year``).
        """
        self.date = date
        self.date_str = date.strftime("%Y%m%d")
        self.target_name = "台股指數資訊"
        self.sub_market = ["sii", "otc"]

    def sii_index(self):
        """Download and parse the TWSE index table for ``self.date``.

        Returns:
            A DataFrame with ``OUTPUT_COLUMNS``, or ``None`` when the
            response contains no index table (presumably a non-trading day).

        Raises:
            requests.exceptions.RequestException: on network failure or
                when the server does not answer within ``REQUEST_TIMEOUT``.
            KeyError: if the table lacks the expected column headers.
        """
        url = ('http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date='
               + self.date_str + '&type=IND')
        r = requests.post(url, timeout=self.REQUEST_TIMEOUT)
        return self._parse_sii(r.text)

    def otc_index(self):
        """Download and parse the TPEx index summary for ``self.date``.

        Returns:
            A DataFrame with ``OUTPUT_COLUMNS``, or ``None`` when the
            response is empty or cannot be read as the expected table.

        Raises:
            requests.exceptions.RequestException: on network failure or
                when the server does not answer within ``REQUEST_TIMEOUT``.
            KeyError: if the table lacks the expected column headers.
        """
        # The endpoint takes the date with an ROC-calendar year
        # (Gregorian year minus 1911), formatted as yyy/mm/dd.
        date_str = str(self.date.year - 1911) + self.date.strftime("/%m/%d")
        link = ('http://www.tpex.org.tw/web/stock/aftertrading/index_summary/'
                'summary_download.php?l=zh-tw&d=' + date_str + '&s=0,asc,0')
        r = requests.get(link, timeout=self.REQUEST_TIMEOUT)
        return self._parse_otc(r.text)

    def crawl_main(self):
        """Crawl both markets and stack the results.

        Each market is fetched independently, so a malformed payload from
        one exchange no longer discards the data of the other.

        Returns:
            The concatenation of the available market frames (TWSE rows
            first), or ``None`` when neither market produced data.
        """
        frames = []
        for fetch in (self.sii_index, self.otc_index):
            try:
                frame = fetch()
            except ValueError:
                # Best effort, as before: pandas parser errors are
                # ValueError subclasses; treat them as "no data here".
                continue
            if frame is not None:
                frames.append(frame)
        if not frames:
            return None
        return pd.concat(frames)

    def _parse_sii(self, text):
        """Turn the raw TWSE CSV text into the normalized index frame."""
        content = text.replace('=', '')
        # The download mixes titles and notes with the table; only lines
        # holding at least five '",' separators are kept as table rows.
        rows = [line for line in content.split('\n')
                if len(line.split('",')) > 5]
        if not rows:
            return None
        # dtype=str keeps every cell textual so thousands separators can be
        # stripped uniformly before the numeric conversion.
        df = pd.read_csv(StringIO("\n".join(rows)), dtype=str)
        df = df.rename(columns={'指數': 'stock_id', '收盤指數': 'index_price',
                                '漲跌百分比(%)': 'quote_change'})
        used = ['stock_id', 'index_price', 'quote_change']
        df[used] = df[used].apply(lambda s: s.str.replace(',', ''))
        df['stock_id'] = '上市' + df['stock_id']
        return self._finalize(df)

    def _parse_otc(self, text):
        """Turn the raw TPEx CSV text into the normalized index frame.

        The first two lines are skipped, the third supplies the column
        names and the rest are data rows.

        NOTE(review): the original code located the last row containing
        '櫃買指數' (its comment said rows from there on are return indexes)
        but then re-joined both halves unchanged, so every row is still
        emitted with the same naming scheme.  That no-op was removed; if
        return indexes need a distinct ``stock_id`` it must be added here.
        """
        lines = text.replace("\r", "").split("\n")
        if len(lines) < 4:
            # No data rows after the two title lines and the header line.
            return None
        try:
            df = pd.read_csv(StringIO("\n".join(lines[3:])), header=None,
                             dtype=str)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            return None
        header = [name.replace(" ", "") for name in lines[2].split(",")]
        if len(header) != df.shape[1]:
            # Header and data disagree: not the table this parser expects.
            return None
        df.columns = header
        df = df.rename(columns={'收市指數': 'index_price',
                                '漲跌幅度': 'quote_change'})
        used = ['指數', 'index_price', 'quote_change']
        df[used] = df[used].apply(lambda s: s.str.replace(",", ""))
        # Normalize names so each ends with exactly one '指數' suffix.
        df['stock_id'] = '上櫃' + df['指數'].str.replace('指數', '') + '指數'
        return self._finalize(df)

    def _finalize(self, df):
        """Convert price columns to floats, stamp the date, drop bad rows."""
        value_cols = ['index_price', 'quote_change']
        # Whole-column assignment (not .loc[:, cols] = ...) so the columns
        # really become float dtype instead of object-holding-floats.
        df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')
        df['date'] = pd.to_datetime(self.date).date()
        return df.loc[:, self.OUTPUT_COLUMNS].dropna()
   
    
    
# Manual check: crawl both markets for 2009-01-06. crawl_main() returns the
# combined DataFrame, or None when nothing could be fetched. The bare `z`
# below is presumably there so the notebook renders the result.
z=CrawlStockIndexPriceTW(datetime.datetime(2009,1,6)).crawl_main()   
z

Unnamed: 0,stock_id,date,index_price,quote_change
0,上市發行量加權股價指數,2009-01-06,4727.26,0.62
1,上市臺灣50指數,2009-01-06,3441.21,0.39
2,上市臺灣中型100指數,2009-01-06,3907.98,1.30
3,上市臺灣資訊科技指數,2009-01-06,4118.34,0.44
4,上市臺灣發達指數,2009-01-06,4032.76,1.56
5,上市臺灣高股息指數,2009-01-06,3423.37,1.35
6,上市未含金融保險股指數,2009-01-06,3934.88,0.97
7,上市未含電子股指數,2009-01-06,7011.89,0.80
8,上市未含金融電子股指數,2009-01-06,5921.49,1.80
9,上市水泥窯製類指數,2009-01-06,360.05,-0.36
