In [1]:
# Imports and one-time environment setup for the crawler cells below.
import pandas as pd
from io import StringIO
import requests
import datetime
from etl.utils import year_transfer, last_month, char_filter, symbols_change, url_month
# Wildcard imports: presumably the source of CrawlerProcess and RotcStockPrice
# used in a later cell — TODO confirm and import those names explicitly.
from etl.import_sql import *
from tw_data.models import *
import time
import json
import numpy as np
import sys,os
# NOTE(review): this path tweak runs after the etl / tw_data imports above, so
# it cannot help resolve them on a fresh kernel — confirm the intended order.
sys.path.append("..")
import django
# NOTE(review): the model module is imported above, before django.setup() is
# called — verify that this cell survives Restart Kernel -> Run All.
django.setup()
from django.conf import settings
import logging
# Duplicate of the `import datetime` above (harmless).
import datetime

# Emit INFO-level records so crawler progress shows up in the cell output.
logging.basicConfig(level=logging.INFO)

In [2]:
class StockPriceCrawlerTW:
    """Download one trading day's price table from the Taiwanese exchanges.

    Every ``crawl_*`` method fetches a CSV report for ``self.date`` over HTTP
    and returns a ``pandas.DataFrame`` whose first three columns are
    ``stock_id``, ``date`` and ``stock_name`` followed by numeric columns, or
    ``None`` when the response contains no usable rows.
    """

    # Seconds to wait on the server. Without a timeout ``requests`` can block
    # forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, date):
        """Remember the target day.

        Args:
            date: An object providing ``strftime`` (e.g. ``datetime.datetime``).
        """
        self.date = date
        self.date_str = date.strftime("%Y%m%d")
        self.target_name = "台股每日交易資訊"
        self.sub_market = ["sii", "otc", "rotc"]

    # ------------------------------------------------------------------ helpers

    def _fetch_text(self, url, method="get"):
        """Return the decoded body of ``url`` fetched with the given HTTP method."""
        response = requests.request(method, url, timeout=self.REQUEST_TIMEOUT)
        return response.text

    @staticmethod
    def _strip_commas(df):
        """Remove thousands separators from every (string) column of ``df``."""
        return df.apply(lambda col: col.str.replace(",", "", regex=False))

    @staticmethod
    def _to_numeric(df, first_numeric_col):
        """Convert columns from position ``first_numeric_col`` on to numbers.

        Columns are replaced one by one (instead of ``df.iloc[:, k:] = ...``) so
        they end up with a real numeric dtype on every pandas version.
        Unparseable cells become NaN.
        """
        for name in df.columns[first_numeric_col:]:
            df[name] = pd.to_numeric(df[name], errors="coerce")
        return df

    # ----------------------------------------------------------------- crawlers

    def crawl_sii(self):
        """Fetch the TWSE daily quotes report.

        Returns:
            DataFrame of quotes, or ``None`` when no line of the response looks
            like a quoted table row.
        """
        text = self._fetch_text(
            "http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date=" + self.date_str + "&type=ALLBUT0999",
            method="post")
        # Codes are emitted as ="0050"; drop the "=" so the CSV parser sees "0050".
        content = text.replace("=", "")
        # Keep only wide, quoted rows; titles and footnotes have few '",' separators.
        rows = [line for line in content.split("\n") if len(line.split('",')) > 10]
        if not rows:
            return None
        # dtype=str keeps codes such as "0050" intact instead of parsing them as 50.
        df = pd.read_csv(StringIO("\n".join(rows)), dtype=str)
        df = df.astype(str)
        df = self._strip_commas(df)
        df["date"] = pd.to_datetime(self.date)
        df = df[["證券代號", "date", "證券名稱", "成交股數", "成交筆數", "成交金額", "開盤價", "收盤價", "最高價", "最低價", "最後揭示買價", "最後揭示賣價"]]

        df = df.rename(columns={"證券代號": "stock_id", "證券名稱": "stock_name",
                                "成交股數": "vol", "成交金額": "turnover_price",
                                "開盤價": "open", "收盤價": "close",
                                "最高價": "high", "最低價": "low", "成交筆數": "transactions_number",
                                "最後揭示買價": "finally_reveal_buy_price", "最後揭示賣價": "finally_reveal_sell_price"})
        # Columns 0-2 are stock_id / date / stock_name; the rest are numeric.
        return self._to_numeric(df, 3)

    @staticmethod
    def select_otc_id(code):
        """Decide whether an OTC security code should be kept.

        Codes of at most five characters are always kept. Longer codes are
        dropped when they end in ``"P"`` or when their first five characters
        form an integer above 10000; longer codes whose prefix is not an
        integer are kept.
        """
        if len(code) <= 5:
            return True
        if code[-1] == "P":
            return False
        try:
            prefix = int(code[:5])
        except ValueError:
            return True
        return prefix <= 10000

    def crawl_otc(self):
        """Fetch the TPEx daily close quotes report.

        Returns:
            DataFrame of quotes, or ``None`` when the body cannot be parsed as
            CSV or holds no data rows.
        """
        # The endpoint expects a Republic-of-China year (Gregorian year - 1911).
        roc_year = str(int(self.date.strftime("%Y")) - 1911)
        date_str = roc_year + "/" + self.date.strftime("%m") + "/" + self.date.strftime("%d")
        link = "http://www.tpex.org.tw/web/stock/aftertrading/daily_close_quotes/stk_quote_download.php?l=zh-tw&d=" \
               + date_str + "&s=0,asc,0"
        lines = self._fetch_text(link).replace("\r", "").split("\n")
        try:
            # Line index 2 is the header (applied below); data starts at index 3.
            df = pd.read_csv(StringIO("\n".join(lines[3:])), header=None, dtype=str)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            # An empty body is treated like an unparseable one: no data.
            return None
        df = df.astype(str)
        df.columns = [name.replace(" ", "") for name in lines[2].split(",")]
        df = self._strip_commas(df)
        df["stock_id"] = df["代號"]
        wanted = df["代號"].apply(self.select_otc_id).astype(bool)
        # .copy() so the column assignments below act on an independent frame.
        df = df[wanted].copy()
        df["date"] = pd.to_datetime(self.date)
        df = df[["stock_id", "date", "名稱", "成交股數", "成交金額(元)", "開盤", "收盤", "最高", "最低", "成交筆數", "最後買價", "最後賣價"]]
        df = df.rename(columns={"名稱": "stock_name",
                                "成交股數": "vol", "成交金額(元)": "turnover_price",
                                "開盤": "open", "收盤": "close",
                                "最高": "high", "最低": "low", "成交筆數": "transactions_number",
                                "最後買價": "finally_reveal_buy_price", "最後賣價": "finally_reveal_sell_price"})
        df = self._to_numeric(df, 3)
        # Drop rows with fewer than 7 populated cells.
        df = df.dropna(thresh=7)
        return df

    def crawl_rotc(self):
        """Fetch the TPEx emerging-stock daily report.

        Returns:
            DataFrame of quotes, or ``None`` when the response has fewer than
            four lines or no quoted table rows.
        """
        link = "http://www.tpex.org.tw/web/emergingstock/historical/daily/EMDaily_dl.php?l=zh-tw&f=EMdes010." + \
               self.date_str + "-C.csv"
        lines = self._fetch_text(link).replace("\r", "").split("\n")
        if len(lines) <= 3:
            return None
        # Line index 3 carries the column names.
        columns_line = lines[3]
        rows = [line for line in lines if len(line.split('",')) > 10]
        try:
            df = pd.read_csv(StringIO("\n".join(rows)), header=None, dtype=str)
        except pd.errors.EmptyDataError:
            return None
        df.columns = [name.replace(" ", "") for name in columns_line.split(",")]
        df = df.astype(str)
        df = self._strip_commas(df)
        df["date"] = pd.to_datetime(self.date)
        if "證券名稱" not in df.columns:
            # Layout without 證券名稱: the name column is called 名稱, and the
            # best bid/ask columns are filled with None (NaN after conversion).
            df = df.rename(columns={"名稱": "證券名稱"})
            df['最後最佳報買價'] = None
            df['最後最佳報賣價'] = None

        df = df[["證券代號", "date", "證券名稱", "成交量", "成交金額", "前日均價", "最後", "最高", "最低", "日均價", "筆數", "最後最佳報買價", "最後最佳報賣價"]]
        df = df.rename(columns={"證券代號": "stock_id", "證券名稱": "stock_name",
                                "成交量": "vol", "成交金額": "turnover_price",
                                "前日均價": "open", "最後": "close",
                                "最高": "high", "最低": "low",
                                "日均價": "mean_price", "筆數": "transactions_number",
                                "最後最佳報買價": "finally_reveal_buy_price", "最後最佳報賣價": "finally_reveal_sell_price"})
        df = self._to_numeric(df, 3)
        # Keep only the text before the first double space in id and name.
        df['stock_id'] = df['stock_id'].apply(lambda s: s.split('  ', 1)[0])
        df['stock_name'] = df['stock_name'].apply(lambda s: s.split('  ', 1)[0])
        # Remove the totals row ("合計").
        df = df[df["stock_id"] != "合計"]
        return df

    def crawl_main(self):
        """Return the sii and otc tables stacked into one DataFrame.

        ``pd.concat`` skips a ``None`` entry and raises ``ValueError`` when both
        entries are ``None``; in that case ``None`` is returned.
        NOTE(review): the ``except`` also swallows any ``ValueError`` raised
        inside the two crawlers — confirm that this is intended.
        """
        try:
            return pd.concat([self.crawl_sii(), self.crawl_otc()])
        except ValueError:
            return None

In [None]:
# Manual check: fetch the emerging-stock (rotc) table for one day and show it.
import datetime

date = datetime.datetime(2007, 1, 25)
c = StockPriceCrawlerTW(date)
cc = c.crawl_rotc()
cc


In [4]:
# Pair the crawler's `crawl_rotc` method with the RotcStockPrice model.
# Logging the object prints the table name and its earliest / latest dates.
C1 = CrawlerProcess(StockPriceCrawlerTW, 'crawl_rotc', RotcStockPrice, 'date_range')
logging.info(C1)

# # Crawl an explicit date range
# C1.specified_date_crawl('2007-1-4','2007-1-5')

# Automatic crawl
C1.auto_update_crawl()

INFO:root:rotc_stock_price 
table_earliest_date:2007-01-04 00:00:00
table_latest_date:2020-07-21 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-07-22 to 2020-07-22 bulk_create:249
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-07-22 to 2020-07-22 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-07-22 00:00:00 to 2020-07-22 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-07-23 to 2020-07-23 bulk_create:250
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-07-23 to 2020-07-23 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-07-23 00:00:00 to 2020-07-23 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-07-24 to 2020-07-24 bulk_create:249
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-07-24 to 2020-07-24 bulk_update:0
INFO:etl.impor

INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-08-24 to 2020-08-24 bulk_create:249
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-08-24 to 2020-08-24 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-08-24 00:00:00 to 2020-08-24 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-08-25 to 2020-08-25 bulk_create:248
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-08-25 to 2020-08-25 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-08-25 00:00:00 to 2020-08-25 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-08-26 to 2020-08-26 bulk_create:247
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-08-26 to 2020-08-26 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-08-26 00:00:00 to 2020-08-26 00:00:00
INFO:etl.import_sql:Finish!<cla

INFO:etl.import_sql:Finish!download data from 2020-09-24 00:00:00 to 2020-09-24 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-09-25 to 2020-09-25 bulk_create:245
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-09-25 to 2020-09-25 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-09-25 00:00:00 to 2020-09-25 00:00:00
ERROR:etl.import_sql:Fail!check if 2020-09-26 00:00:00 is a holiday
ERROR:etl.import_sql:Fail!check if 2020-09-27 00:00:00 is a holiday
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-09-28 to 2020-09-28 bulk_create:246
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-09-28 to 2020-09-28 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-09-28 00:00:00 to 2020-09-28 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-09-29 to 2020-09-29 bulk_create:246
INFO:etl.im

INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-10-30 to 2020-10-30 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-10-30 00:00:00 to 2020-10-30 00:00:00
ERROR:etl.import_sql:Fail!check if 2020-10-31 00:00:00 is a holiday
ERROR:etl.import_sql:Fail!check if 2020-11-01 00:00:00 is a holiday
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-11-02 to 2020-11-02 bulk_create:247
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-11-02 to 2020-11-02 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-11-02 00:00:00 to 2020-11-02 00:00:00
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-11-03 to 2020-11-03 bulk_create:247
INFO:etl.import_sql:Finish!<class 'tw_data.models.RotcStockPrice'> date:from 2020-11-03 to 2020-11-03 bulk_update:0
INFO:etl.import_sql:Finish!download data from 2020-11-03 00:00:00 to 2020-11-03 00:00:00
INFO:etl.impo

'Finish!download data from 2020-07-22 00:00:00 to 2020-11-19 00:00:00'