Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 21 additions & 118 deletions scripts/data_collector/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import os
import re
import copy
import importlib
Expand All @@ -27,16 +26,13 @@

HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"

CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"

# A 股指数日历由 Baostock 拉取;海外用 Yahoo 指数代码,避免东方财富 K 线接口
CALENDAR_BENCH_URL_MAP = {
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
"CSI500": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
"CSI100": CALENDAR_URL_BASE.format(market=1, bench_code="000903"),
# NOTE: Use the time series of SH600000 as the sequence of all stocks
"ALL": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
# NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks
"CSI300": "000300.ss",
"CSI500": "000905.ss",
"CSI100": "000903.ss",
"US_ALL": "^GSPC",
"IN_ALL": "^NSEI",
"BR_ALL": "^BVSP",
Expand All @@ -54,70 +50,6 @@
# NOTE: Until 2020-10-20 20:00:00
MINIMUM_SYMBOLS_NUM = 3900

# 沪深 A 股列表:auto=优先东方财富,失败或数量不足则 Baostock;eastmoney=仅用东财;baostock=仅用 Baostock。
_HS_SYMBOLS_SOURCE_ENV = "QLIB_HS_SYMBOLS_SOURCE"


def _baostock_code_to_yahoo_symbol(code: str, trade_status: str):
"""将 Baostock 的 code(如 sh.600000)转为 yahooquery 后缀,规则对齐东财分页结果的过滤。"""
if trade_status == "3":
return None
if "." not in code:
return None
mkt, num = code.split(".", 1)
if not num.isdigit() or len(num) != 6:
return None
if mkt == "sh" and num.startswith("6"):
return f"{num}.ss"
if mkt == "sz" and num.startswith("399"):
return None
if mkt == "sz" and (num.startswith("0") or num.startswith("3")):
return f"{num}.sz"
return None


def _bs_error_ok(code) -> bool:
return code in ("0", 0) or str(code) == "0"


def _get_hs_symbols_from_baostock() -> set:
"""通过 Baostock query_all_stock 获取沪深 A 股 Yahoo 代码集合(开源免费,见 http://baostock.com )。"""
import baostock as bs # pylint: disable=C0415
from datetime import date, timedelta # pylint: disable=C0415

lg = bs.login()
if not _bs_error_ok(lg.error_code):
raise RuntimeError(getattr(lg, "error_msg", "") or "baostock 登录失败")
try:
for i in range(20):
day = (date.today() - timedelta(days=i)).strftime("%Y-%m-%d")
rs = bs.query_all_stock(day=day)
if not _bs_error_ok(rs.error_code):
logger.warning("Baostock query_all_stock({}) 失败: {}", day, getattr(rs, "error_msg", ""))
continue
symbols = set()
row_cap = 20000
n = 0
while rs.next():
n += 1
if n > row_cap:
logger.warning("Baostock query_all_stock 行数超过 {},中止当日解析", row_cap)
break
row = rs.get_row_data()
status = row[1] if len(row) > 1 else "1"
y = _baostock_code_to_yahoo_symbol(row[0], status)
if y:
symbols.add(y)
if len(symbols) >= MINIMUM_SYMBOLS_NUM:
logger.info("Baostock 于 {} 解析得 {} 只 A 股(Yahoo 代码)", day, len(symbols))
return symbols
logger.warning("Baostock {} 仅解析 {} 只股票,尝试更早日期", day, len(symbols))
finally:
bs.logout()
raise ValueError(
f"Baostock 连续尝试后仍不足 {MINIMUM_SYMBOLS_NUM} 只股票,请检查网络或 baostock 服务状态"
)


def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
"""get SH/SZ history calendar list
Expand Down Expand Up @@ -152,19 +84,19 @@ def _get_calendar(end_date):
print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max"))
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
elif bench_code.upper() == "ALL":
import akshare as ak # pylint: disable=C0415

trade_date_df = ak.tool_trade_date_hist_sina()
trade_date_list = trade_date_df["trade_date"].tolist()
trade_date_list = [pd.Timestamp(d) for d in trade_date_list]
dates = pd.DatetimeIndex(trade_date_list)
filtered_dates = dates[(dates >= "2000-01-04") & (dates <= pd.Timestamp.today().normalize())]
calendar = filtered_dates.tolist()
else:
if bench_code.upper() == "ALL":
import akshare as ak # pylint: disable=C0415

trade_date_df = ak.tool_trade_date_hist_sina()
trade_date_list = trade_date_df["trade_date"].tolist()
trade_date_list = [pd.Timestamp(d) for d in trade_date_list]
dates = pd.DatetimeIndex(trade_date_list)
filtered_dates = dates[(dates >= "2000-01-04") & (dates <= pd.Timestamp.today().normalize())]
calendar = filtered_dates.tolist()
else:
end_date = time.strftime("%Y-%m-%d", time.localtime())
calendar = _get_calendar(end_date=end_date)
# CSI300 / CSI500 / CSI100 等与 A 股交易日一致,统一用 Baostock,避免东方财富接口
end_date = time.strftime("%Y-%m-%d", time.localtime())
calendar = _get_calendar(end_date=end_date)
_CALENDAR_MAP[bench_code] = calendar
logger.info(f"end of get calendar list: {bench_code}.")
return calendar
Expand Down Expand Up @@ -331,40 +263,11 @@ def _get_symbol():

if _HS_SYMBOLS is None:
symbols = set()
src = (os.environ.get(_HS_SYMBOLS_SOURCE_ENV) or "auto").strip().lower()
if src == "eastmoney":
logger.info("QLIB_HS_SYMBOLS_SOURCE=eastmoney,仅使用东方财富全市场列表接口")
elif src == "baostock":
logger.info("QLIB_HS_SYMBOLS_SOURCE=baostock,仅使用 Baostock")
try:
symbols = _get_hs_symbols_from_baostock()
except ImportError as e:
raise ImportError("QLIB_HS_SYMBOLS_SOURCE=baostock 时需安装: pip install baostock") from e
else:
# auto:东财优先,异常或少于阈值再尝试 Baostock
logger.info(
"QLIB_HS_SYMBOLS_SOURCE=auto(默认):优先东方财富全市场列表,不足或失败时回退 Baostock"
)
try:
symbols = _get_symbol()
except Exception as e:
logger.warning("东方财富获取 A 股列表失败: {},将尝试 Baostock", e)
symbols = set()
if len(symbols) < MINIMUM_SYMBOLS_NUM:
try:
symbols |= _get_hs_symbols_from_baostock()
if symbols:
logger.info("Baostock 回退/补充后标的数量: {}", len(symbols))
except ImportError:
logger.warning("未安装 baostock(pip install baostock),无法使用 Baostock 回退")
except Exception as e:
logger.warning("Baostock 获取 A 股列表失败: {}", e)

if len(symbols) < MINIMUM_SYMBOLS_NUM:
_retry = 60
while len(symbols) < MINIMUM_SYMBOLS_NUM:
symbols |= _get_symbol()
time.sleep(3)
_retry = 60
# It may take multiple times to get the complete
while len(symbols) < MINIMUM_SYMBOLS_NUM:
symbols |= _get_symbol()
time.sleep(3)

symbol_cache_path = Path("~/.cache/hs_symbols_cache.pkl").expanduser().resolve()
symbol_cache_path.parent.mkdir(parents=True, exist_ok=True)
Expand Down
2 changes: 1 addition & 1 deletion scripts/data_collector/yahoo/run_cn_incremental_update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# 网络说明
# -----------------------------------------------------------------------------
# 本脚本不设置 http(s)_proxy / ALL_PROXY。流量走系统路由(本机 VPN TUN、运营商等)。
# 需能访问:Yahoo(yahooquery)、东方财富或 Baostock(标的列表)等,否则会在 Python 里报错。
# 需能访问:Yahoo(yahooquery)、东方财富(A 股全市场列表分页)等,否则会在 Python 里报错。
#
# -----------------------------------------------------------------------------
# 用法
Expand Down