In [1]:
import sys,os
sys.path.append("..")
import django
django.setup()
from io import StringIO
import requests
import datetime
import pandas as pd
import swifter
import time

# from crawlers.models import *
# from crawlers.finlab.import_tools import *

In [9]:
class CrawlStockTiiTW:
    """Download per-stock institutional-investor trading data for one day.

    Three sources are queried (see ``sub_market``): the ``sii`` CSV served by
    tse.com.tw and the ``otc`` / ``rotc`` CSVs served by tpex.org.tw.  Each
    ``crawl_*`` method returns a DataFrame whose source-specific (Chinese)
    column headers are renamed to short English identifiers such as
    ``stock_id``, ``stock_name``, ``ft_net``, ``itc_net``, ``dealer_net`` and
    ``tii_net``, plus a ``date`` column, or ``None`` when the response cannot
    be parsed as CSV.

    Args:
        date: The trading day to fetch.  A ``datetime.datetime`` is what the
            original callers pass; a plain ``datetime.date`` is accepted too.
    """

    # Seconds to wait for each HTTP request; without a timeout ``requests``
    # may block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def __init__(self, date):
        self.date = date
        self.date_str = date.strftime("%Y%m%d")
        self.target_name = "台股三大法人個股買賣超資訊"
        self.sub_market = ["sii", "otc", "rotc"]

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_csv(text, header):
        """Parse CSV ``text``; return the cleaned frame or ``None`` if unparsable.

        Both ``EmptyDataError`` (empty body) and ``ParserError`` (malformed
        body) are treated as "no data".  They are ``ValueError`` subclasses,
        so letting either escape would make ``crawl_main`` discard the data
        of the markets that did parse.
        """
        try:
            frame = pd.read_csv(StringIO(text), header=header)
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            return None
        # Drop columns that are entirely empty, then any row with a gap
        # (title/footer lines end up as such rows).
        return frame.dropna(how='all', axis=1).dropna(how='any')

    def _fetch_csv(self, url, header):
        """GET ``url`` and parse the body as CSV (see ``_parse_csv``).

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when ``REQUEST_TIMEOUT`` elapses.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return self._parse_csv(response.text, header)

    def _trade_date(self):
        """Return ``self.date`` as a ``datetime.date`` (accepts date or datetime)."""
        return pd.to_datetime(self.date).date()

    def _roc_date_str(self):
        """Return ``self.date`` as ``Y/mm/dd`` where ``Y`` is the year minus 1911.

        Presumably this is the ROC (Minguo) calendar date the otc endpoint
        expects -- the code only shows the ``- 1911`` offset.
        """
        return str(self.date.year - 1911) + self.date.strftime("/%m/%d")

    @staticmethod
    def _strip_commas(frame):
        """Return ``frame`` cast to ``str`` with every ``,`` removed from each cell."""
        return frame.astype(str).apply(lambda s: s.str.replace(',', ''))

    # ------------------------------------------------------------------
    # Crawlers
    # ------------------------------------------------------------------
    def crawl_sii(self):
        """Fetch the ``sii`` CSV from tse.com.tw for ``self.date``.

        Returns:
            DataFrame with renamed columns, ``ft_net`` (= ``fm_net`` +
            ``fd_net``) and ``date``; ``None`` if the body is not parsable.
        """
        df = self._fetch_csv(
            'http://www.tse.com.tw/fund/T86?response=csv&date=' + self.date_str
            + '&selectType=ALLBUT0999',
            header=1)
        if df is None:
            return None
        df = self._strip_commas(df)
        # Codes arrive wrapped like ="0050"; keep only the bare code.
        df['證券代號'] = df['證券代號'].str.replace('=', '').str.replace('"', '')
        # Everything after the code/name columns is numeric.
        df[df.columns[2:]] = df[df.columns[2:]].apply(lambda s: pd.to_numeric(s, errors='coerce')).dropna(how='all', axis=1)
        df = df.rename(columns={'證券代號': 'stock_id', '證券名稱': 'stock_name',
                                '外陸資買進股數(不含外資自營商)': 'fm_buy', '外陸資賣出股數(不含外資自營商)': 'fm_sell',
                                '外陸資買賣超股數(不含外資自營商)': 'fm_net', '外資自營商買進股數': 'fd_buy',
                                '外資自營商賣出股數': 'fd_sell', '外資自營商買賣超股數': 'fd_net',
                                '投信買進股數': 'itc_buy', '投信賣出股數': 'itc_sell',
                                '投信買賣超股數': 'itc_net', '自營商買賣超股數': 'dealer_net',
                                '自營商買進股數(自行買賣)': 'dealer_ppt_buy', '自營商賣出股數(自行買賣)': 'dealer_ppt_sell',
                                '自營商買賣超股數(自行買賣)': 'dealer_ppt_net', '自營商買進股數(避險)': 'dealer_hedge_buy',
                                '自營商賣出股數(避險)': 'dealer_hedge_sell', '自營商買賣超股數(避險)': 'dealer_hedge_net',
                                '三大法人買賣超股數': 'tii_net'
                                })
        df['ft_net'] = df['fm_net'] + df['fd_net']
        df["date"] = self._trade_date()
        return df

    def crawl_otc(self):
        """Fetch the ``otc`` CSV from tpex.org.tw for ``self.date``.

        Two column layouts are handled: the one used for dates after
        2018-01-14 (separate ``fm_*`` / ``fd_*`` columns) and the earlier one
        (no ``fd_*`` columns, so ``ft_net`` is just ``fm_net``).

        Returns:
            DataFrame with renamed columns, ``ft_net`` and ``date``; ``None``
            if the body is not parsable.
        """
        df = self._fetch_csv(
            'http://www.tpex.org.tw/web/stock/3insti/daily_trade/3itrade_hedge_result.php?l=zh-tw&o=csv&se=EW&t=D&d='
            + self._roc_date_str() + '&s=0,asc',
            header=1)
        if df is None:
            return None
        df = self._strip_commas(df)
        df['代號'] = df['代號'].str.replace('=', '').str.replace('"', '')
        df[df.columns[2:]] = df[df.columns[2:]].apply(lambda s: pd.to_numeric(s, errors='coerce')).dropna(how='all', axis=1)
        # Compare as Timestamps so a plain datetime.date works as well.
        if pd.to_datetime(self.date) > pd.Timestamp(2018, 1, 14):
            df = df.rename(columns={'代號': 'stock_id', '名稱': 'stock_name',
                                    '外資及陸資(不含外資自營商)-買進股數': 'fm_buy', '外資及陸資(不含外資自營商)-賣出股數': 'fm_sell',
                                    '外資及陸資(不含外資自營商)-買賣超股數': 'fm_net', '外資自營商-買進股數': 'fd_buy',
                                    '外資自營商-賣出股數': 'fd_sell', '外資自營商-買賣超股數': 'fd_net',
                                    '投信-買進股數': 'itc_buy', '投信-賣出股數': 'itc_sell',
                                    '投信-買賣超股數': 'itc_net', '自營商-買賣超股數': 'dealer_net',
                                    '自營商(自行買賣)-買進股數': 'dealer_ppt_buy', '自營商(自行買賣)-賣出股數': 'dealer_ppt_sell',
                                    '自營商(自行買賣)-買賣超股數': 'dealer_ppt_net', '自營商(避險)-買進股數': 'dealer_hedge_buy',
                                    '自營商(避險)-賣出股數': 'dealer_hedge_sell', '自營商(避險)-買賣超股數': 'dealer_hedge_net',
                                    '三大法人買賣超股數合計': 'tii_net'
                                    })
            df['ft_net'] = df['fm_net'] + df['fd_net']
            # These aggregate columns have no renamed counterpart; drop them.
            df = df.drop(columns=['外資及陸資-買進股數', '外資及陸資-賣出股數', '外資及陸資-買賣超股數', '自營商-買進股數', '自營商-賣出股數'])
        else:
            df = df.rename(columns={'代號': 'stock_id', '名稱': 'stock_name',
                                    '外資及陸資買股數': 'fm_buy', '外資及陸資賣股數': 'fm_sell',
                                    '外資及陸資淨買股數': 'fm_net',
                                    '投信買進股數': 'itc_buy', '投信賣股數': 'itc_sell',
                                    '投信淨買股數': 'itc_net', '自營淨買股數': 'dealer_net',
                                    '自營商(自行買賣)買股數': 'dealer_ppt_buy', '自營商(自行買賣)賣股數': 'dealer_ppt_sell',
                                    '自營商(自行買賣)淨買股數': 'dealer_ppt_net', '自營商(避險)買股數': 'dealer_hedge_buy',
                                    '自營商(避險)賣股數': 'dealer_hedge_sell', '自營商(避險)淨買股數': 'dealer_hedge_net',
                                    '三大法人買賣超股數': 'tii_net'
                                    })
            df['ft_net'] = df['fm_net']
        df["date"] = self._trade_date()
        return df

    def crawl_rotc(self):
        """Fetch the ``rotc`` CSV from tpex.org.tw for ``self.date``.

        Two header spellings are handled; whichever is present is renamed to
        ``stock_id``, ``stock_name``, ``ft_net``, ``itc_net``, ``dealer_net``
        and ``tii_net``.

        Returns:
            DataFrame with renamed columns and ``date``; ``None`` if the body
            is not parsable.

        Raises:
            KeyError: if the CSV lacks the ``HEADER`` column or both of the
                expected stock-code columns.
        """
        df = self._fetch_csv(
            'https://www.tpex.org.tw/web/emergingstock/historical/daily/EMDaily_dl.php?l=zh-tw&f=EMdss006.'
            + self.date_str + '-C.csv',
            header=3)
        if df is None:
            return None
        df = df.drop(columns=['HEADER'])
        df = self._strip_commas(df)
        try:
            # Only the first four characters of the code field are kept.
            df['證券代號'] = df['證券代號'].apply(lambda s: s[:4])
            df = df.rename(columns={'證券代號': 'stock_id', '證券名稱': 'stock_name',
                                    '外資(股數)': 'ft_net', '投信(股數)': 'itc_net',
                                    '自營商(股數)': 'dealer_net', '合計買賣超(股數)': 'tii_net',
                                    })
        except KeyError:
            # Alternative header spelling.
            df['股票代號'] = df['股票代號'].apply(lambda s: s[:4])
            df = df.rename(columns={'股票代號': 'stock_id', '名稱': 'stock_name',
                                    '外資': 'ft_net', '投信': 'itc_net',
                                    '自營商': 'dealer_net', '合計買賣超': 'tii_net',
                                    })
        # Convert before adding "date" so the date column is not coerced.
        df[df.columns[2:]] = df[df.columns[2:]].apply(lambda s: pd.to_numeric(s, errors='coerce'))
        df["date"] = self._trade_date()
        # NOTE(review): the test below looks for the literal three characters
        # quote-space-quote, not a bare space -- possibly ' ' was intended.
        # Behaviour is kept as-is; confirm against real data before changing.
        df['stock_id'] = df['stock_id'].apply(lambda s: s[:s.index(' ')] if '" "' in s else s)
        df['stock_name'] = df['stock_name'].apply(lambda s: s[:s.index(' ')] if '" "' in s else s)
        return df

    def crawl_main(self):
        """Crawl all three sub-markets and concatenate the results.

        Returns:
            One DataFrame holding the rows of every sub-market that returned
            data, or ``None`` when ``pd.concat`` raises ``ValueError`` (which
            is what happens when every crawler returned ``None``).
        """
        try:
            df = pd.concat([self.crawl_sii(), self.crawl_otc(), self.crawl_rotc()], sort=False)
        except ValueError:
            return None
        return df
    

import datetime
# Crawl all three sub-markets for 2018-01-15 and display the combined frame.
df = CrawlStockTiiTW(
    date=datetime.datetime(year=2018, month=1, day=15),
).crawl_main()
df

Pandas Loading Time = 2.4277901649475098


Unnamed: 0,stock_id,stock_name,fm_buy,fm_sell,fm_net,fd_buy,fd_sell,fd_net,itc_buy,itc_sell,...,dealer_net,dealer_ppt_buy,dealer_ppt_sell,dealer_ppt_net,dealer_hedge_buy,dealer_hedge_sell,dealer_hedge_net,tii_net,ft_net,date
0,00637L,元大滬深300正2,9312000.0,305000.0,9007000.0,0.0,0.0,0.0,0.0,0.0,...,51836000.0,0.0,50000.0,-50000.0,72967000.0,21081000.0,51886000.0,60843000,9007000.0,2018-01-15
1,00633L,富邦上証正2,8543000.0,0.0,8543000.0,0.0,0.0,0.0,0.0,0.0,...,5536000.0,65000.0,2990000.0,-2925000.0,12992000.0,4531000.0,8461000.0,14079000,8543000.0,2018-01-15
2,2891,中信金,8824000.0,5682880.0,3141120.0,0.0,0.0,0.0,4684000.0,27000.0,...,1767000.0,3150000.0,2144000.0,1006000.0,957000.0,196000.0,761000.0,9565120,3141120.0,2018-01-15
3,00655L,國泰中國A50正2,2089000.0,0.0,2089000.0,0.0,0.0,0.0,0.0,0.0,...,7030000.0,50000.0,0.0,50000.0,9853000.0,2873000.0,6980000.0,9119000,2089000.0,2018-01-15
4,2882,國泰金,10180000.0,3168173.0,7011827.0,0.0,0.0,0.0,1277000.0,18000.0,...,658000.0,279000.0,20000.0,259000.0,645000.0,246000.0,399000.0,8928827,7011827.0,2018-01-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,6586,醣基,,,,,,,,,...,13000.0,,,,,,,-25000,-38000.0,2018-01-15
16,6626,唯數,,,,,,,,,...,,,,,,,,-16000,-16000.0,2018-01-15
17,6664,群翊,,,,,,,,,...,5000.0,,,,,,,5000,,2018-01-15
18,6669,緯穎,,,,,,,,,...,8000.0,,,,,,,8000,,2018-01-15
