In [1]:
import sys,os
sys.path.append("..")
import django
django.setup()
import pandas as pd
from io import StringIO
import requests
import datetime

In [2]:
class CrawlStockPriceTW:
    """Download one day of Taiwan stock quotes and normalise them.

    Three sources are queried, one per method:

    * ``crawl_sii``  - www.twse.com.tw ``MI_INDEX`` report, labelled '上市'
    * ``crawl_otc``  - www.tpex.org.tw daily close quotes, labelled '上櫃'
    * ``crawl_rotc`` - www.tpex.org.tw emerging-stock daily file, labelled '興櫃'

    Every crawler returns a DataFrame with the columns ``stock_id``, ``date``,
    ``stock_name``, ``turnover_vol``, ``turnover_price``, ``open_price``,
    ``close_price``, ``high_price``, ``low_price`` and ``market``, or ``None``
    when the response contains no usable rows.
    """

    # Seconds to wait on each HTTP request before giving up.
    DEFAULT_TIMEOUT = 30

    def __init__(self, date, timeout=DEFAULT_TIMEOUT):
        """Remember the trading day to crawl.

        Args:
            date: a ``datetime.datetime``; the crawlers call both
                ``strftime`` and ``.date()`` on it.
            timeout: seconds passed as ``timeout`` to every ``requests`` call
                so that a stalled server cannot hang the crawl forever.
        """
        self.date = date
        self.date_str = date.strftime("%Y%m%d")
        self.target_name = "台股每日交易資訊"
        self.sub_market = ["sii", "otc", "rotc"]
        self.timeout = timeout

    def crawl_sii(self):
        """Fetch the TWSE ``MI_INDEX`` CSV for ``self.date``.

        Returns:
            The normalised DataFrame with ``market`` set to '上市', or ``None``
            when no line of the response looks like a quote row.
        """
        r = requests.post(
            "http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date=" + self.date_str + "&type=ALLBUT0999",
            timeout=self.timeout)
        # Drop every "=" character from the payload before parsing.
        content = r.text.replace("=", "")
        # Keep only lines that split into more than 10 pieces on '",';
        # shorter lines are not part of the quote table.
        rows = [line for line in content.split("\n") if len(line.split('",')) > 10]
        content = "\n".join(rows)
        if content == "":
            return None
        df = pd.read_csv(StringIO(content))
        df = df.astype(str)
        # Strip thousands separators so the numeric conversion below succeeds.
        df = df.apply(lambda s: s.str.replace(",", ""))
        df.iloc[:, 2:] = df.iloc[:, 2:].apply(lambda s: pd.to_numeric(s, errors="coerce"))
        df["date"] = self.date.date()
        df = df[["證券代號", "date", "證券名稱", "成交股數", "成交金額", "開盤價", "收盤價", "最高價", "最低價"]]
        df = df.rename(columns={"證券代號": "stock_id", "證券名稱": "stock_name",
                                "成交股數": "turnover_vol", "成交金額": "turnover_price",
                                "開盤價": "open_price", "收盤價": "close_price",
                                "最高價": "high_price", "最低價": "low_price"})
        df['market'] = '上市'
        # Replace missing values with None.
        df = df.where(pd.notnull(df), None)
        return df

    @staticmethod
    def select_otc_id(code):
        """Decide whether an OTC security code should be kept.

        Codes of five characters or fewer are always kept. Longer codes are
        rejected when they end in "P" or when their first five characters
        parse as an integer greater than 10000; longer codes whose first five
        characters are not an integer are kept.

        Args:
            code: the security code as a string.

        Returns:
            ``True`` to keep the row, ``False`` to drop it.
        """
        if len(code) <= 5:
            return True
        if code[-1] == "P":
            return False
        try:
            return int(code[:5]) <= 10000
        except ValueError:
            return True

    def crawl_otc(self):
        """Fetch the TPEx daily close quotes for ``self.date``.

        Returns:
            The normalised DataFrame with ``market`` set to '上櫃', or ``None``
            when the body cannot be parsed or contains no data rows.
        """
        # The URL takes the year as (Gregorian year - 1911), i.e. yyy/mm/dd.
        y = str(int(self.date.strftime("%Y")) - 1911)
        date_str = y + "/" + self.date.strftime("%m") + "/" + self.date.strftime("%d")
        link = "http://www.tpex.org.tw/web/stock/aftertrading/daily_close_quotes/stk_quote_download.php?l=zh-tw&d=" \
               + date_str + "&s=0,asc,0"
        r = requests.get(link, timeout=self.timeout)
        lines = r.text.replace("\r", "").split("\n")
        try:
            # Data rows start at the fourth line; the third line is the header.
            df = pd.read_csv(StringIO("\n".join(lines[3:])), header=None)
            df = df.astype(str)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            # An empty or malformed body means there is nothing to return,
            # matching how crawl_rotc treats an empty response.
            return None
        df.columns = [name.replace(" ", "") for name in lines[2].split(",")]
        df = df.apply(lambda s: s.str.replace(",", ""))
        df["stock_id"] = df["代號"]
        # Reuse the "代號" column as a boolean keep/drop mask.
        df["代號"] = df["代號"].apply(self.select_otc_id)
        # .copy() so the assignments below act on an independent frame
        # instead of a slice of the unfiltered one.
        df = df[df["代號"]].copy()
        df["date"] = self.date.date()
        df = df[["stock_id", "date", "名稱", "成交股數", "成交金額(元)", "開盤", "收盤", "最高", "最低"]]
        df = df.rename(columns={"名稱": "stock_name",
                                "成交股數": "turnover_vol", "成交金額(元)": "turnover_price",
                                "開盤": "open_price", "收盤": "close_price",
                                "最高": "high_price", "最低": "low_price"})
        df.iloc[:, 3:] = df.iloc[:, 3:].apply(lambda s: pd.to_numeric(s, errors="coerce"))
        # Rows whose volume failed numeric conversion compare False and drop out.
        df = df[df["turnover_vol"] >= 0].copy()
        df['market'] = '上櫃'
        df = df.where(pd.notnull(df), None)
        return df

    def crawl_rotc(self):
        """Fetch the TPEx emerging-stock daily file for ``self.date``.

        Returns:
            The normalised DataFrame with ``market`` set to '興櫃', or ``None``
            when the response has fewer than four lines or no data rows.
        """
        link = "http://www.tpex.org.tw/web/emergingstock/historical/daily/EMDaily_dl.php?l=zh-tw&f=EMdes010." + \
               self.date_str + "-C.csv"
        r = requests.get(link, timeout=self.timeout)
        lines = r.text.replace("\r", "").split("\n")
        try:
            # The fourth line carries the column names.
            columns_line = lines[3]
        except IndexError:
            return None
        # Keep only lines that split into more than 10 pieces on '",'.
        rows = [line for line in lines if len(line.split('",')) > 10]
        try:
            df = pd.read_csv(StringIO("\n".join(rows)), header=None)
        except pd.errors.EmptyDataError:
            return None
        df.columns = [name.replace(" ", "") for name in columns_line.split(",")]
        df = df.astype(str)
        df = df.apply(lambda s: s.str.replace(",", ""))
        df.iloc[:, 3:] = df.iloc[:, 3:].apply(lambda s: pd.to_numeric(s, errors="coerce"))
        df["date"] = self.date.date()
        # Older files name the security-name column "名稱" instead of "證券名稱".
        name_col = "證券名稱" if "證券名稱" in df.columns else "名稱"
        df = df[["證券代號", "date", name_col, "成交量", "成交金額", "前日均價", "最後", "最高", "最低"]]
        # NOTE(review): "前日均價" is stored as open_price here - confirm that
        # this mapping is intended.
        df = df.rename(columns={"證券代號": "stock_id", name_col: "stock_name",
                                "成交量": "turnover_vol", "成交金額": "turnover_price",
                                "前日均價": "open_price", "最後": "close_price",
                                "最高": "high_price", "最低": "low_price"})
        # When a value contains the literal sequence '" "', keep only the text
        # in front of its first space.
        df['stock_id'] = df['stock_id'].apply(lambda s: s[:s.index(' ')] if '" "' in s else s)
        df['stock_name'] = df['stock_name'].apply(lambda s: s[:s.index(' ')] if '" "' in s else s)
        # Drop the "合計" summary row; .copy() keeps the assignment below off a slice.
        df = df[df["stock_id"] != "合計"].copy()
        df['market'] = '興櫃'
        df = df.where(pd.notnull(df), None)
        return df

    def crawl_main(self):
        """Run all three crawlers and stack their results.

        Returns:
            The concatenated DataFrame (crawlers that returned ``None`` are
            skipped by ``pd.concat``), or ``None`` when a ``ValueError`` is
            raised - which is what ``pd.concat`` does when every crawler
            returned ``None``. A ``ValueError`` raised inside a crawler is
            swallowed the same way.
        """
        try:
            df = pd.concat([self.crawl_sii(), self.crawl_otc(), self.crawl_rotc()])
        except ValueError:
            return None
        return df
    
# Smoke test: crawl the OTC quotes for 2018-01-03 and display the frame.
z = CrawlStockPriceTW(
    datetime.datetime(2018, 1, 3),
).crawl_otc()
z

Unnamed: 0,stock_id,date,stock_name,turnover_vol,turnover_price,open_price,close_price,high_price,low_price,market
0,006201,2018-01-03,元大富櫃50,105000,1.55788e+06,14.72,14.92,14.92,14.72,上櫃
1,00679B,2018-01-03,元大美債20年,948052,3.63606e+07,38.39,38.37,38.4,38.31,上櫃
2,00687B,2018-01-03,國泰20年美債,418000,1.67202e+07,40.06,40.01,40.06,39.98,上櫃
3,00694B,2018-01-03,富邦美債1-3,500075,1.95879e+07,39.17,39.17,39.17,39.17,上櫃
4,00695B,2018-01-03,富邦美債7-10,500100,1.93239e+07,38.64,38.64,38.64,38.64,上櫃
...,...,...,...,...,...,...,...,...,...,...
5233,9950,2018-01-03,萬國通,22293,303848,13.7,13.65,13.7,13.6,上櫃
5234,9951,2018-01-03,皇田,240000,4.0812e+07,172,170.5,173,168,上櫃
5235,9960,2018-01-03,邁達康,37376,1.37085e+06,36.35,37,37,35.8,上櫃
5236,9962,2018-01-03,有益,32917,339295,10.35,10.35,10.4,10.25,上櫃


In [11]:
def select_otc_id(code):
    """Decide whether an OTC security code should be kept.

    Codes of five characters or fewer are always kept. Longer codes are
    rejected when they end in "P" or when their first five characters parse
    as an integer greater than 10000; longer codes whose first five
    characters are not an integer are kept.

    Args:
        code: the security code as a string.

    Returns:
        True to keep the code, False to drop it.
    """
    if len(code) <= 5:
        return True
    if code[-1] == "P":
        return False
    try:
        return int(code[:5]) <= 10000
    except ValueError:
        return True


# Six-character code whose first five digits exceed 10000 -> rejected.
select_otc_id('70006X')

False

In [12]:
# Scratch check: map each "<name>__contain" key back to the name it came from.
get_pk_dict = dict(
    map(lambda name: (name + '__contain', name), ('a', 'b', 'c'))
)
get_pk_dict.keys()
# Intended follow-up, kept for reference (df and pk_columns are not defined here):
# get_pk_contain_dict = {pk + '__contain': df[pk] for pk in pk_columns}

dict_keys(['a__contain', 'b__contain', 'c__contain'])

In [14]:
# Inspect the row labels of the OTC frame `z`. crawl_otc filters rows without
# resetting the index, so the labels are the pre-filter row numbers and can
# have gaps.
z.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            5228, 5229, 5230, 5231, 5232, 5233, 5234, 5235, 5236, 5239],
           dtype='int64', length=752)