### Data processing ( BTC )

In [1]:
import pandas as pd
import os
from pathlib import Path
from typing import Union, List

def load_crypto_data(
    base_path: Union[str, Path],
    symbol: str = "BTCUSDT",
    year: str = "2022",
    freq: str = "1m"
) -> pd.DataFrame:
    """
    加載並合併特定交易對的所有每日CSV文件
    
    Args:
        base_path (str/Path): 數據文件的基礎路徑
        symbol (str): 交易對名稱，如 "BTCUSDT"
        year (str): 年份，如 "2022"
        freq (str): 頻率，如 "1m"
    
    Returns:
        pd.DataFrame: 合併後的DataFrame
    """
    # 確保base_path是Path對象
    base_path = Path(base_path)
    
    # 構建文件匹配模式
    pattern = f"{symbol}_{year}-*_{freq}.csv"
    
    # 獲取所有匹配的文件並排序
    csv_files = sorted(list(base_path.glob(pattern)))
    
    if not csv_files:
        raise ValueError(f"No files found matching pattern: {pattern}")
    
    # 讀取並合併所有CSV文件
    dfs = []
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            dfs.append(df)
        except Exception as e:
            print(f"Error reading file {file}: {e}")
    
    # 合併所有DataFrame
    if not dfs:
        raise ValueError("No data was successfully loaded")
    
    merged_df = pd.concat(dfs, ignore_index=True)
    
    # 確保時間列正確格式化
    merged_df['open_time'] = pd.to_datetime(merged_df['open_time'])
    merged_df['close_time'] = pd.to_datetime(merged_df['close_time'])
    
    # 按時間排序
    merged_df = merged_df.sort_values('open_time').reset_index(drop=True)
    
    # 檢查並移除重複數據
    merged_df = merged_df.drop_duplicates(subset=['open_time'], keep='first')
    
    return merged_df

def check_data_quality(df: pd.DataFrame) -> dict:
    """
    檢查合併後數據的質量
    
    Args:
        df (pd.DataFrame): 待檢查的DataFrame
    
    Returns:
        dict: 包含各種數據質量指標的字典
    """
    quality_report = {
        'total_rows': len(df),
        'date_range': (df['open_time'].min(), df['open_time'].max()),
        'missing_values': df.isnull().sum().to_dict(),
        'duplicate_times': len(df) - len(df['open_time'].unique()),
    }
    
    # 檢查時間間隔
    time_diffs = df['open_time'].diff().value_counts()
    quality_report['time_intervals'] = time_diffs
    
    return quality_report

# 使用示例


In [2]:
if __name__ == "__main__":
    # 假設數據路徑
    data_path = "/Users/mouyasushi/Desktop/crypto_strat/alpha/Quant-Training-Group-G/Alpha-Research/kline/binance/BTCUSDT/1m"
    
    # 加載數據
    btc_data = load_crypto_data(
        base_path=data_path,
        symbol="BTCUSDT",
        year="2022",
        freq="1m"
    )
    
    # 檢查數據質量
    quality_report = check_data_quality(btc_data)
    
    # 輸出基本信息
    print(f"Loaded data shape: {btc_data.shape}")
    print(f"Date range: {quality_report['date_range']}")
    print(f"Total rows: {quality_report['total_rows']}")

Loaded data shape: (518400, 12)
Date range: (Timestamp('2022-01-01 00:00:00'), Timestamp('2022-12-26 23:59:00'))
Total rows: 518400


In [3]:
btc_data

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2022-01-01 00:00:00,46216.93,46271.08,46208.37,46250.00,40.57574,2022-01-01 00:00:59.999,1.875978e+06,796,27.26086,1.260270e+06,0
1,2022-01-01 00:01:00,46250.00,46344.23,46234.39,46312.76,42.38106,2022-01-01 00:01:59.999,1.961908e+06,973,28.92522,1.339049e+06,0
2,2022-01-01 00:02:00,46312.76,46381.69,46292.75,46368.73,51.29955,2022-01-01 00:02:59.999,2.377414e+06,1238,22.74786,1.054135e+06,0
3,2022-01-01 00:03:00,46368.73,46391.49,46314.26,46331.08,30.45894,2022-01-01 00:03:59.999,1.412004e+06,953,16.12179,7.473086e+05,0
4,2022-01-01 00:04:00,46331.07,46336.10,46300.00,46321.34,20.96029,2022-01-01 00:04:59.999,9.710099e+05,591,11.49082,5.323199e+05,0
...,...,...,...,...,...,...,...,...,...,...,...,...
518395,2022-12-26 23:55:00,16931.18,16944.52,16921.54,16925.79,493.19892,2022-12-26 23:55:59.999,8.352277e+06,10669,228.20326,3.864590e+06,0
518396,2022-12-26 23:56:00,16925.79,16939.41,16924.40,16927.61,308.61331,2022-12-26 23:56:59.999,5.225148e+06,6550,155.60933,2.634646e+06,0
518397,2022-12-26 23:57:00,16927.61,16928.43,16916.38,16920.85,166.23704,2022-12-26 23:57:59.999,2.813017e+06,4384,84.82348,1.435348e+06,0
518398,2022-12-26 23:58:00,16920.85,16921.10,16915.71,16918.79,76.21680,2022-12-26 23:58:59.999,1.289493e+06,2815,38.97172,6.593676e+05,0


### Data processing ( ALT ) : ETH 

In [4]:
if __name__ == "__main__":
    # 假設數據路徑
    data_path = "/Users/mouyasushi/Desktop/crypto_strat/alpha/Quant-Training-Group-G/Alpha-Research/kline/binance/ETHUSDT/1m"
    
    # 加載數據
    eth_data = load_crypto_data(
        base_path=data_path,
        symbol="ETHUSDT",
        year="2022",
        freq="1m"
    )
    
    # 檢查數據質量
    quality_report = check_data_quality(btc_data)
    
    # 輸出基本信息
    print(f"Loaded data shape: {btc_data.shape}")
    print(f"Date range: {quality_report['date_range']}")
    print(f"Total rows: {quality_report['total_rows']}")

Loaded data shape: (518400, 12)
Date range: (Timestamp('2022-01-01 00:00:00'), Timestamp('2022-12-26 23:59:00'))
Total rows: 518400


In [5]:
eth_data

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2022-01-01 00:00:00,3676.22,3687.05,3676.22,3684.84,504.3020,2022-01-01 00:00:59.999,1.856132e+06,749,271.3554,9.986197e+05,0
1,2022-01-01 00:01:00,3684.85,3694.20,3681.33,3691.55,273.0180,2022-01-01 00:01:59.999,1.006818e+06,580,181.6745,6.700959e+05,0
2,2022-01-01 00:02:00,3692.50,3694.42,3687.49,3693.62,216.0824,2022-01-01 00:02:59.999,7.976563e+05,460,80.1555,2.959250e+05,0
3,2022-01-01 00:03:00,3693.63,3695.41,3689.55,3690.58,250.2232,2022-01-01 00:03:59.999,9.237721e+05,448,109.0924,4.027651e+05,0
4,2022-01-01 00:04:00,3690.57,3691.03,3688.00,3690.09,119.8314,2022-01-01 00:04:59.999,4.421674e+05,279,55.1763,2.035886e+05,0
...,...,...,...,...,...,...,...,...,...,...,...,...
518395,2022-12-26 23:55:00,1229.71,1231.09,1226.87,1227.81,1471.7586,2022-12-26 23:55:59.999,1.809334e+06,1407,816.9712,1.004298e+06,0
518396,2022-12-26 23:56:00,1227.81,1229.24,1227.81,1228.02,260.7216,2022-12-26 23:56:59.999,3.202682e+05,484,125.2568,1.538509e+05,0
518397,2022-12-26 23:57:00,1228.01,1228.02,1227.34,1227.63,141.5787,2022-12-26 23:57:59.999,1.738122e+05,286,62.6124,7.686638e+04,0
518398,2022-12-26 23:58:00,1227.63,1227.66,1227.63,1227.66,34.0023,2022-12-26 23:58:59.999,4.174255e+04,179,13.5774,1.666818e+04,0


### Caculate indicators for first model 
1. 對數收益率(returns)
2. 已實現波動率(realized volatility)
3. BiPower Variation (BPV)
4. 跳躍指標(jumps)

- 先進行ACF 自相關性分析，確定results = model.fit(cov_type='HAC', cov_kwds={'maxlags': 10})
中的maxlags 如何設定較為合理


In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

class JumpAnalyzer:
    def __init__(self, window_size: int = 20):
        self.window_size = window_size

    def process_crypto_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """主要數據處理函數"""
        # 1. 計算log returns
        df['log_return'] = np.log(df['close']).diff()

        # 2. 計算realized volatility
        df['rv'] = np.sqrt(
            (df['log_return']**2).rolling(
                window=self.window_size
            ).sum()
        )

        # 3. 計算BiPower Variation
        mu1 = np.sqrt(2 / np.pi)
        abs_returns = np.abs(df['log_return'])
        df['bpv'] = abs_returns.rolling(
            window=self.window_size
        ).apply(
            lambda x: np.sum(x[1:] * x[:-1]), raw=True
        ) / (mu1**2)

        # 4. 檢測跳躍
        df['jump_component'] = np.maximum(df['rv'] - df['bpv'], 0)
        df['is_jump'] = (
            df['jump_component'] > 3.0 * np.sqrt(2 * df['bpv'])
        ).astype(int)

        return df

    @staticmethod
    def estimate_base_model(self, btc_df: pd.DataFrame, alt_df: pd.DataFrame) -> pd.DataFrame:
        """估計基礎迴歸模型"""
        # 合併BTC和Alt幣數據
        model_df = pd.merge(
            btc_df[['log_return', 'rv', 'is_jump']], 
            alt_df[['log_return']],
            left_index=True, 
            right_index=True,
            suffixes=('_btc', '_alt')
        )

        # 準備模型變量
        Y = model_df['log_return_alt']
        X = sm.add_constant(pd.DataFrame({
            'btc_jump': model_df['is_jump'],
            'btc_rv': model_df['rv_btc'],
            'btc_return': model_df['log_return_btc']
        }))

        # 估計模型
        model = sm.OLS(Y, X)
        results = model.fit(cov_type='HAC', cov_kwds={'maxlags': 10})

        return results, model_df



In [11]:
# Drop NaN values from both btc_processed and alt_processed
btc_processed = btc_data.dropna()
eth_processed = eth_data.dropna()

# Ensure indices are aligned after dropping
btc_processed = btc_processed.loc[btc_processed.index.intersection(eth_processed.index)]
eth_processed = eth_processed.loc[eth_processed.index.intersection(btc_processed.index)]

eth_processed.head()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume,ignore
0,2022-01-01 00:00:00,3676.22,3687.05,3676.22,3684.84,504.302,2022-01-01 00:00:59.999,1856132.0,749,271.3554,998619.715817,0
1,2022-01-01 00:01:00,3684.85,3694.2,3681.33,3691.55,273.018,2022-01-01 00:01:59.999,1006818.0,580,181.6745,670095.912422,0
2,2022-01-01 00:02:00,3692.5,3694.42,3687.49,3693.62,216.0824,2022-01-01 00:02:59.999,797656.3,460,80.1555,295925.007905,0
3,2022-01-01 00:03:00,3693.63,3695.41,3689.55,3690.58,250.2232,2022-01-01 00:03:59.999,923772.1,448,109.0924,402765.07985,0
4,2022-01-01 00:04:00,3690.57,3691.03,3688.0,3690.09,119.8314,2022-01-01 00:04:59.999,442167.4,279,55.1763,203588.562778,0


In [13]:
analyzer = JumpAnalyzer(window_size=20)

model_results, model_df = analyzer.estimate_base_model(btc_processed, eth_processed)
print(model_results.summary())

TypeError: JumpAnalyzer.estimate_base_model() missing 1 required positional argument: 'alt_df'