diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 507d095231..6ffe9e8c8a 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import os import re import copy import importlib @@ -27,16 +26,13 @@ HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}" -CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231" SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}" +# A 股指数日历由 Baostock 拉取;海外用 Yahoo 指数代码,避免东方财富 K 线接口 CALENDAR_BENCH_URL_MAP = { - "CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"), - "CSI500": CALENDAR_URL_BASE.format(market=1, bench_code="000905"), - "CSI100": CALENDAR_URL_BASE.format(market=1, bench_code="000903"), - # NOTE: Use the time series of SH600000 as the sequence of all stocks - "ALL": CALENDAR_URL_BASE.format(market=1, bench_code="000905"), - # NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks + "CSI300": "000300.ss", + "CSI500": "000905.ss", + "CSI100": "000903.ss", "US_ALL": "^GSPC", "IN_ALL": "^NSEI", "BR_ALL": "^BVSP", @@ -54,70 +50,6 @@ # NOTE: Until 2020-10-20 20:00:00 MINIMUM_SYMBOLS_NUM = 3900 -# 沪深 A 股列表:auto=优先东方财富,失败或数量不足则 Baostock;eastmoney=仅用东财;baostock=仅用 Baostock。 -_HS_SYMBOLS_SOURCE_ENV = "QLIB_HS_SYMBOLS_SOURCE" - - -def _baostock_code_to_yahoo_symbol(code: str, trade_status: str): - """将 Baostock 的 code(如 sh.600000)转为 yahooquery 后缀,规则对齐东财分页结果的过滤。""" - if trade_status == "3": - return None - if "." not in code: - return None - mkt, num = code.split(".", 1) - if not num.isdigit() or len(num) != 6: - return None - if mkt == "sh" and num.startswith("6"): - return f"{num}.ss" - if mkt == "sz" and num.startswith("399"): - return None - if mkt == "sz" and (num.startswith("0") or num.startswith("3")): - return f"{num}.sz" - return None - - -def _bs_error_ok(code) -> bool: - return code in ("0", 0) or str(code) == "0" - - -def _get_hs_symbols_from_baostock() -> set: - """通过 Baostock query_all_stock 获取沪深 A 股 Yahoo 代码集合(开源免费,见 http://baostock.com )。""" - import baostock as bs # pylint: disable=C0415 - from datetime import date, timedelta # pylint: disable=C0415 - - lg = bs.login() - if not _bs_error_ok(lg.error_code): - raise RuntimeError(getattr(lg, "error_msg", "") or "baostock 登录失败") - try: - for i in range(20): - day = (date.today() - timedelta(days=i)).strftime("%Y-%m-%d") - rs = bs.query_all_stock(day=day) - if not _bs_error_ok(rs.error_code): - logger.warning("Baostock query_all_stock({}) 失败: {}", day, getattr(rs, "error_msg", "")) - continue - symbols = set() - row_cap = 20000 - n = 0 - while rs.next(): - n += 1 - if n > row_cap: - logger.warning("Baostock query_all_stock 行数超过 {},中止当日解析", row_cap) - break - row = rs.get_row_data() - status = row[1] if len(row) > 1 else "1" - y = _baostock_code_to_yahoo_symbol(row[0], status) - if y: - symbols.add(y) - if len(symbols) >= MINIMUM_SYMBOLS_NUM: - logger.info("Baostock 于 {} 解析得 {} 只 A 股(Yahoo 代码)", day, len(symbols)) - return symbols - logger.warning("Baostock {} 仅解析 {} 只股票,尝试更早日期", day, len(symbols)) - finally: - bs.logout() - raise ValueError( - f"Baostock 连续尝试后仍不足 {MINIMUM_SYMBOLS_NUM} 只股票,请检查网络或 baostock 服务状态" - ) - def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]: """get SH/SZ history calendar list @@ -152,19 +84,19 @@ def _get_calendar(end_date): print(Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")) df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max") calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist() + elif bench_code.upper() == "ALL": + import akshare as ak # pylint: disable=C0415 + + trade_date_df = ak.tool_trade_date_hist_sina() + trade_date_list = trade_date_df["trade_date"].tolist() + trade_date_list = [pd.Timestamp(d) for d in trade_date_list] + dates = pd.DatetimeIndex(trade_date_list) + filtered_dates = dates[(dates >= "2000-01-04") & (dates <= pd.Timestamp.today().normalize())] + calendar = filtered_dates.tolist() else: - if bench_code.upper() == "ALL": - import akshare as ak # pylint: disable=C0415 - - trade_date_df = ak.tool_trade_date_hist_sina() - trade_date_list = trade_date_df["trade_date"].tolist() - trade_date_list = [pd.Timestamp(d) for d in trade_date_list] - dates = pd.DatetimeIndex(trade_date_list) - filtered_dates = dates[(dates >= "2000-01-04") & (dates <= pd.Timestamp.today().normalize())] - calendar = filtered_dates.tolist() - else: - end_date = time.strftime("%Y-%m-%d", time.localtime()) - calendar = _get_calendar(end_date=end_date) + # CSI300 / CSI500 / CSI100 等与 A 股交易日一致,统一用 Baostock,避免东方财富接口 + end_date = time.strftime("%Y-%m-%d", time.localtime()) + calendar = _get_calendar(end_date=end_date) _CALENDAR_MAP[bench_code] = calendar logger.info(f"end of get calendar list: {bench_code}.") return calendar @@ -331,40 +263,11 @@ def _get_symbol(): if _HS_SYMBOLS is None: symbols = set() - src = (os.environ.get(_HS_SYMBOLS_SOURCE_ENV) or "auto").strip().lower() - if src == "eastmoney": - logger.info("QLIB_HS_SYMBOLS_SOURCE=eastmoney,仅使用东方财富全市场列表接口") - elif src == "baostock": - logger.info("QLIB_HS_SYMBOLS_SOURCE=baostock,仅使用 Baostock") - try: - symbols = _get_hs_symbols_from_baostock() - except ImportError as e: - raise ImportError("QLIB_HS_SYMBOLS_SOURCE=baostock 时需安装: pip install baostock") from e - else: - # auto:东财优先,异常或少于阈值再尝试 Baostock - logger.info( - "QLIB_HS_SYMBOLS_SOURCE=auto(默认):优先东方财富全市场列表,不足或失败时回退 Baostock" - ) - try: - symbols = _get_symbol() - except Exception as e: - logger.warning("东方财富获取 A 股列表失败: {},将尝试 Baostock", e) - symbols = set() - if len(symbols) < MINIMUM_SYMBOLS_NUM: - try: - symbols |= _get_hs_symbols_from_baostock() - if symbols: - logger.info("Baostock 回退/补充后标的数量: {}", len(symbols)) - except ImportError: - logger.warning("未安装 baostock(pip install baostock),无法使用 Baostock 回退") - except Exception as e: - logger.warning("Baostock 获取 A 股列表失败: {}", e) - - if len(symbols) < MINIMUM_SYMBOLS_NUM: - _retry = 60 - while len(symbols) < MINIMUM_SYMBOLS_NUM: - symbols |= _get_symbol() - time.sleep(3) + _retry = 60 + # It may take multiple times to get the complete + while len(symbols) < MINIMUM_SYMBOLS_NUM: + symbols |= _get_symbol() + time.sleep(3) symbol_cache_path = Path("~/.cache/hs_symbols_cache.pkl").expanduser().resolve() symbol_cache_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/scripts/data_collector/yahoo/run_cn_incremental_update.sh b/scripts/data_collector/yahoo/run_cn_incremental_update.sh index f4d03e1370..2486e6bfc1 100755 --- a/scripts/data_collector/yahoo/run_cn_incremental_update.sh +++ b/scripts/data_collector/yahoo/run_cn_incremental_update.sh @@ -25,7 +25,7 @@ # 网络说明 # ----------------------------------------------------------------------------- # 本脚本不设置 http(s)_proxy / ALL_PROXY。流量走系统路由(本机 VPN TUN、运营商等)。 -# 需能访问:Yahoo(yahooquery)、东方财富或 Baostock(标的列表)等,否则会在 Python 里报错。 +# 需能访问:Yahoo(yahooquery)、东方财富(A 股全市场列表分页)等,否则会在 Python 里报错。 # # ----------------------------------------------------------------------------- # 用法