In [None]:
!pip install baostock -q

In [None]:
import os

from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive')

# Output folder on the mounted Drive; created on first run, reused afterwards.
SAVE_DIR = '/content/drive/MyDrive/kronos/data'
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
import os, time, pickle
import baostock as bs
import pandas as pd

def save_pickle(data, path):
    """Atomically pickle ``data`` to ``path``.

    The payload is written to ``path + '.tmp'`` first and then moved over
    ``path`` with ``os.replace``, so an interrupted write never leaves a
    half-written file at ``path``.

    Args:
        data: Any picklable object.
        path: Destination file path (``str`` or ``os.PathLike``).

    Raises:
        Any exception from ``open``, ``pickle.dump`` or ``os.replace``. The
        temporary file is removed before the exception propagates, and an
        existing file at ``path`` is left untouched.
    """
    path = os.fspath(path)  # accept pathlib.Path as well as str
    tmp = path + '.tmp'
    try:
        with open(tmp, 'wb') as f:
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(tmp, path)
    except BaseException:
        # Includes KeyboardInterrupt: do not leave a stale partial temp file
        # next to the real pickle.
        try:
            os.remove(tmp)
        except OSError:
            pass  # temp file was never created or is already gone
        raise

def load_pickle(path):
    """Return the object pickled at ``path``, or an empty dict if nothing exists there.

    Args:
        path: File path to read.

    Returns:
        The unpickled object, or ``{}`` when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return {}
    # NOTE: pickle.load can execute arbitrary code; only point this at files
    # you wrote yourself.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Open the baostock session used by the queries in the following cells.
lg = bs.login()
print(f'baostock login: {lg.error_msg}')
# baostock signals failure through error_code rather than by raising, so stop
# here instead of letting every later query fail with a less obvious message.
if lg.error_code != '0':
    raise RuntimeError(f'baostock login failed ({lg.error_code}): {lg.error_msg}')

In [None]:
# Get all A-share stock codes
STOCK_LIST_DAY = '2026-02-25'
rs = bs.query_all_stock(day=STOCK_LIST_DAY)
if rs.error_code != '0':
    raise RuntimeError(f'query_all_stock failed ({rs.error_code}): {rs.error_msg}')
all_stocks = rs.get_data()
if all_stocks.empty:
    # Fail loudly: with no symbols the download cell would silently do nothing.
    # A likely cause is that STOCK_LIST_DAY is not a trading day (TODO confirm).
    raise RuntimeError(f'query_all_stock returned no rows for {STOCK_LIST_DAY}')

# Filter to A-shares only: sh.6xxxxx, sz.0xxxxx, sz.3xxxxx
mask = all_stocks['code'].str.match(r'^(sh\.6|sz\.0|sz\.3)')
# .loc selects rows and column in one step instead of chained indexing.
symbols = all_stocks.loc[mask, 'code'].tolist()
if not symbols:
    raise RuntimeError('No codes matched the A-share prefixes sh.6 / sz.0 / sz.3')
print(f'Found {len(symbols)} A-share stocks')
print(f'Examples: {symbols[:5]}')

In [None]:
# === TEST: download first 5 stocks ===
# Change symbols[:5] to `symbols` for the full run.

SAVE_PATH = f'{SAVE_DIR}/ohlcv_all_a.pkl'
START = '2015-01-01'
END   = '2026-02-26'
FIELDS = 'date,open,high,low,close,volume,amount'
CHECKPOINT_EVERY = 100  # save whenever the stored stock count reaches a multiple of this


def _fetch_daily_qfq(code):
    """Download daily bars for one stock between START and END.

    Args:
        code: baostock security code, e.g. an entry of ``symbols``.

    Returns:
        A DataFrame indexed by ``date`` whose remaining FIELDS columns are
        numeric (unparseable values become NaN), or ``None`` when the query
        succeeded but returned no rows.

    Raises:
        RuntimeError: if baostock reports a non-'0' ``error_code``.
    """
    rs = bs.query_history_k_data_plus(
        code=code, fields=FIELDS,
        start_date=START, end_date=END,
        frequency='d', adjustflag='2',  # qfq
    )
    rows = []
    # `and` short-circuits, so next() is not called once an error is reported.
    while rs.error_code == '0' and rs.next():
        rows.append(rs.get_row_data())
    if rs.error_code != '0':
        # Report query errors as failures instead of as an "empty" result.
        raise RuntimeError(f'baostock error {rs.error_code}: {rs.error_msg}')
    if not rows:
        return None
    df = pd.DataFrame(rows, columns=rs.fields)
    # Convert the string cells to numbers.
    for col in (c for c in FIELDS.split(',') if c != 'date'):
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df['date'] = pd.to_datetime(df['date'])
    return df.set_index('date')


# Resume from a previous run: codes already in the pickle are skipped below.
data = load_pickle(SAVE_PATH)
if data:
    print(f'Resuming: {len(data)} stocks already downloaded')

batch = symbols[:5]  # <-- change to `symbols` for full download
total = len(batch)

try:
    for i, code in enumerate(batch, 1):
        if code in data:
            continue
        try:
            df = _fetch_daily_qfq(code)
            if df is None:
                print(f'[{i}/{total}] {code} — empty')
                continue
            data[code] = df
            rng = f'{df.index.min().date()} to {df.index.max().date()}'
            print(f'[{i}/{total}] {code} — {len(df)} rows ({rng})')
        except Exception as e:
            # Best effort: one bad symbol must not abort the whole batch.
            print(f'[{i}/{total}] {code} — FAILED: {e}')
        else:
            # Reached only after a successful add, so failed or empty symbols
            # never trigger a redundant rewrite of the whole pickle.
            if len(data) % CHECKPOINT_EVERY == 0:
                save_pickle(data, SAVE_PATH)
                print(f'  ** checkpoint ({len(data)} stocks) **')
finally:
    # Runs on normal completion and on interruption (e.g. stopping the cell),
    # so progress since the last checkpoint is not lost.
    save_pickle(data, SAVE_PATH)

print(f'\nDone. {len(data)} stocks saved to {SAVE_PATH}')

In [None]:
# Download CSI500 benchmark (sh.000905)
rs = bs.query_history_k_data_plus(
    code='sh.000905', fields='date,open,high,low,close,volume,amount',
    start_date='2015-01-01', end_date='2026-02-26',
    frequency='d',
)
rows = []
# `and` short-circuits, so next() is not called once an error is reported.
while rs.error_code == '0' and rs.next():
    rows.append(rs.get_row_data())
if rs.error_code != '0':
    raise RuntimeError(f'benchmark query failed ({rs.error_code}): {rs.error_msg}')
if not rows:
    # Refuse to overwrite an existing benchmark pickle with an empty frame.
    raise RuntimeError('benchmark query for sh.000905 returned no rows')

bench = pd.DataFrame(rows, columns=rs.fields)
# Convert the string cells to numbers; unparseable values become NaN.
for col in ['open','high','low','close','volume','amount']:
    bench[col] = pd.to_numeric(bench[col], errors='coerce')
bench['date'] = pd.to_datetime(bench['date'])
bench = bench.set_index('date')

bench_path = f'{SAVE_DIR}/benchmark_000905.pkl'
save_pickle(bench, bench_path)
print(f'Benchmark CSI500: {len(bench)} rows saved to {bench_path}')

In [None]:
# Close the baostock session; the check below only reads the saved pickle.
bs.logout()

# Verify
check = load_pickle(f'{SAVE_DIR}/ohlcv_all_a.pkl')
print(f'Stocks in pickle: {len(check)}')
# Spot-check the first three entries: shape, column names and close dtype.
for sym in list(check)[:3]:
    df = check[sym]
    print(f'  {sym}: {df.shape}, cols={list(df.columns)}, '
          f'dtypes=[{df["close"].dtype}]')