# 基础数据收集与存储

In [6]:
!pip install dotenv
# 配置日志 & 加载路径 & 检查Setting清单
import sys
from pathlib import Path
import logging
import pandas as pd
from datetime import datetime

# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]
)

print("=== 路径设置测试 ===")
print(f"当前工作目录: {Path.cwd()}")
print(f"当前 Python 路径: {sys.path[:3]}...")  # 只显示前3个避免太长

# 正确的路径设置
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

print(f"\n项目根目录: {project_root}")

# 检查系统配置
from src.config.settings import config

print("=== 系统配置 ===")
print(f"特征窗口大小: {config.FEATURE_WINDOW_SIZE}")
print(f"时间窗口设置: {config.TIME_WINDOWS}")
print(f"分类阈值类别数: {len(config.CLASSIFICATION_THRESHOLDS)}")
print()

print("分类标准:")
for class_label, (min_val, max_val) in config.CLASSIFICATION_THRESHOLDS.items():
    print(f"  类别 {class_label}: {min_val}% 到 {max_val}%")

Looking in indexes: https://pypi.org/simple/

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
=== 路径设置测试 ===
当前工作目录: /Users/anthony/Documents/github/technial_analysis_helper/notebooks
当前 Python 路径: ['/Users/anthony/Documents/github/technial_analysis_helper', '/usr/local/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python313.zip', '/usr/local/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13']...

项目根目录: /Users/anthony/Documents/github/technial_analysis_helper
=== 系统配置 ===
特征窗口大小: 300
时间窗口设置: {'short': 12, 'medium': 48, 'long': 192}
分类阈值类别数: 7

分类标准:
  类别 1: -100% 到 -5.0%
  类别 2: -5.0% 到 -2.0%
  类别 3: -2.0% 到 -0.5%
  类别 4: -0.5% 到 0.5%
  类别 5: 0.5% 到 2.0%
  类别 6: 2.0% 到 5.0%
  类别 7: 5.0% 到 100%


In [7]:
!pip install pymongo
# 测试数据库连接
from src.collect.mongodb_handler import mongo_handler

print("正在测试 MongoDB 连接...")

try:
    if mongo_handler.connect():
        print("✓ MongoDB 连接成功")
        
        # 检查现有数据量
        collection = mongo_handler.db[config.MONGODB_COLLECTION]
        record_count = collection.count_documents({})
        print(f"数据库中现有记录数: {record_count:,}")
        
        if record_count > 0:
            # 显示最新几条记录的时间范围
            latest_record = collection.find_one(sort=[("timestamp", -1)])
            earliest_record = collection.find_one(sort=[("timestamp", 1)])
            
            if latest_record and earliest_record:
                latest_time = datetime.fromtimestamp(latest_record['timestamp'] / 1000)
                earliest_time = datetime.fromtimestamp(earliest_record['timestamp'] / 1000)
                print(f"数据时间范围: {earliest_time} 到 {latest_time}")
        
        mongo_handler.close()
    else:
        print("✗ MongoDB 连接失败")
except Exception as e:
    print(f"✗ 数据库连接出错: {e}")

Looking in indexes: https://pypi.org/simple/

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


2026-01-31 00:22:38,574 - src.collect.mongodb_handler - INFO - Connected to MongoDB at mongodb://localhost:27017
2026-01-31 00:22:38,575 - src.collect.mongodb_handler - INFO - Database: technical_analysis
2026-01-31 00:22:38,575 - src.collect.mongodb_handler - INFO - Collection: candlesticks
2026-01-31 00:22:38,599 - src.collect.mongodb_handler - INFO - MongoDB connection closed


正在测试 MongoDB 连接...
✓ MongoDB 连接成功
数据库中现有记录数: 5,300
数据时间范围: 2025-06-16 14:00:00 到 2026-01-23 09:00:00


In [4]:
!pip install requests redis
# 获取最近的市场数据
from src.collect.okex_fetcher import okex_fetcher

print("正在获取最新的行情数据...")

try:
    # 获取最近的 K 线数据
    raw_data = okex_fetcher.fetch_candlesticks(inst_id="ETH-USDT", bar="1H")
    
    if raw_data:
        # 处理数据格式
        processed_data = okex_fetcher._process_candlestick_data(raw_data)
        
        print(f"✓ 成功获取 {len(processed_data)} 条 K 线数据")
        
        # 显示最新几条数据
        df = pd.DataFrame(processed_data[:5])
        print("\n最新 5 条数据:")
        print(df[['inst_id','timestamp', 'open', 'high', 'low', 'close', 'volume']].to_string(index=False))
        
        # 转换时间戳为可读格式
        latest_timestamp = processed_data[0]['timestamp']
        readable_time = datetime.fromtimestamp(latest_timestamp / 1000)
        print(f"\n最新数据时间: {readable_time}")
        
    else:
        print("✗ 未能获取到数据")
        
except Exception as e:
    print(f"✗ 获取数据时出错: {e}")

Looking in indexes: https://pypi.org/simple/
Collecting redis
  Using cached redis-7.1.0-py3-none-any.whl.metadata (12 kB)
Using cached redis-7.1.0-py3-none-any.whl (354 kB)
Installing collected packages: redis
Successfully installed redis-7.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


2026-01-31 00:24:46,193 - src.utils.rate_limiter - INFO - Connected to Redis at localhost:6379
2026-01-31 00:24:46,207 - src.collect.mongodb_handler - INFO - Connected to MongoDB at mongodb://localhost:27017
2026-01-31 00:24:46,208 - src.collect.mongodb_handler - INFO - Database: technical_analysis
2026-01-31 00:24:46,209 - src.collect.mongodb_handler - INFO - Collection: candlesticks
2026-01-31 00:24:46,212 - src.collect.okex_fetcher - INFO - Fetching candlesticks: instId=ETH-USDT, bar=1H, after=None


正在获取最新的行情数据...


2026-01-31 00:24:47,199 - src.collect.okex_fetcher - INFO - Fetched 100 candlestick records


✓ 成功获取 100 条 K 线数据

最新 5 条数据:
      inst_id     timestamp    open    high     low   close       volume
ETH-USDT-SWAP 1769785200000 2737.27 2752.93 2693.66 2729.47 16681.979095
ETH-USDT-SWAP 1769781600000 2730.14 2764.68 2724.64 2737.21 10359.367635
ETH-USDT-SWAP 1769778000000 2740.78 2751.42 2719.54 2730.14  5942.151246
ETH-USDT-SWAP 1769774400000 2759.89 2766.81 2737.59 2740.77  4092.497345
ETH-USDT-SWAP 1769770800000 2736.24 2769.06 2734.65 2759.89  5611.030953

最新数据时间: 2026-01-30 23:00:00


开始准备模型数据之前，可以按需在 MongoDB shell 中执行以下操作（清空数据不可恢复，请谨慎使用）：

### 创建索引
```text
db.candlesticks.createIndex(
    {"inst_id": 1, "timestamp": 1, "bar": 1},
    {
        name: "inst_id_1_timestamp_1_bar_1",
        background: true
    }
);
```

### 清空数据
```text
// Clear all data from candlesticks collection
db.candlesticks.deleteMany({});

// Verify the collection is empty
db.candlesticks.countDocuments();
```

In [12]:
# Pull historical candlesticks and store them in the database.
from src.collect.okex_fetcher import okex_fetcher

# Per the original author's note the fetcher pulls 100,000 records by default;
# a smaller cap keeps test runs short.
okex_fetcher.fetch_historical_data(
    inst_id="ETH-USDT-SWAP",
    bar="1D",
    max_records=1000,
)

2026-01-31 00:32:28,170 - src.collect.okex_fetcher - INFO - Starting historical data fetch and storage, max records: 1000
2026-01-31 00:32:28,171 - src.collect.okex_fetcher - INFO - Fetching candlesticks: instId=ETH-USDT-SWAP, bar=1D, after=None
2026-01-31 00:32:28,565 - src.collect.okex_fetcher - INFO - Fetched 100 candlestick records
2026-01-31 00:32:28,586 - src.collect.okex_fetcher - INFO - Upserted 100 new records, modified 0 existing records
2026-01-31 00:32:28,586 - src.collect.okex_fetcher - INFO - Saved batch of 100 records to MongoDB
2026-01-31 00:32:28,587 - src.collect.okex_fetcher - INFO - Total records processed: 100, Total saved: 100
2026-01-31 00:32:28,588 - src.collect.okex_fetcher - INFO - Fetching candlesticks: instId=ETH-USDT-SWAP, bar=1D, after=1761148799999
2026-01-31 00:32:28,987 - src.collect.okex_fetcher - INFO - Fetched 100 candlestick records
2026-01-31 00:32:29,004 - src.collect.okex_fetcher - INFO - Upserted 100 new records, modified 0 existing records
2026

True

In [7]:
# Check that candlesticks can be read back in array (list-of-records) form.
from src.collect.mongodb_handler import mongo_handler

# Left as the cell's final expression so the notebook displays the returned records.
mongo_handler.get_candlestick_data(
    inst_id="ETH-USDT-SWAP",
    limit=2,
)

[{'_id': ObjectId('6972d8bf9c8191091400ae82'),
  'timestamp': 1750053600000,
  'close': 2606.6,
  'confirm': 1,
  'high': 2619.36,
  'inst_id': 'ETH-USDT-SWAP',
  'low': 2602.01,
  'open': 2606.36,
  'vol_ccy': 181870.47,
  'vol_ccy_quote': 474915200.77498,
  'volume': 1818704.7},
 {'_id': ObjectId('6972d8bf9c8191091400ae81'),
  'timestamp': 1750057200000,
  'close': 2627.7,
  'confirm': 1,
  'high': 2638.0,
  'inst_id': 'ETH-USDT-SWAP',
  'low': 2606.6,
  'open': 2606.6,
  'vol_ccy': 306465.022,
  'vol_ccy_quote': 804662352.06004,
  'volume': 3064650.22}]