In [None]:
import pandas as pd
import numpy as np
import io
import os
import datetime
import IPython

import yfinance as yf

In [None]:
import warnings
# Ignore every FutureWarning raised from this point on in the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Render floats with five decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
def get_trade_date(start_date, end_date):
    """
    Download the '^TWII' market index and derive the trading dates from it.

    :param start_date: str, passed to ``yf.download`` as ``start``
    :param end_date: str, passed to ``yf.download`` as ``end``
    :return: tuple ``(tw_df, trade_dates)`` - the downloaded index DataFrame
             with an extra ``Date`` column copied from its index, and a NumPy
             array holding the same index values
    """
    tw_df = yf.download('^TWII', start=start_date, end=end_date)

    # Recent yfinance releases return (field, ticker) MultiIndex columns even
    # for a single ticker. Keep only the field level so that plain names such
    # as 'Close' and the 'Date' column added below stay usable in merges.
    if isinstance(tw_df.columns, pd.MultiIndex):
        for level in range(tw_df.columns.nlevels):
            level_values = tw_df.columns.get_level_values(level)
            if 'Close' in level_values:
                tw_df.columns = level_values
                break

    # Make the index timezone-naive, consistent with how the CSV loaders in
    # this notebook strip timezones before dates are compared or merged.
    if getattr(tw_df.index, 'tz', None) is not None:
        tw_df.index = tw_df.index.tz_localize(None)

    tw_df['Date'] = tw_df.index
    trade_dates = tw_df.index.to_numpy()
    return tw_df, trade_dates

In [2]:
def get_history_data(folder_path='C:\\Users\\KaiJung\\Desktop\\z_data\\history_data'):
    """
    Load the historical price CSVs of every stock into one DataFrame.

    Each ``*.csv`` file in ``folder_path`` is read, its ``Date`` column is
    parsed and made timezone-naive, and a ``code`` column is added holding the
    part of the file name before the first dot.

    :param folder_path: str, directory containing one CSV per stock; defaults
                        to the original hard-coded location
    :return: DataFrame with all files stacked and a fresh 0..n-1 index;
             an empty DataFrame when the folder holds no CSV file
    """
    frames = []

    # Sorted so the row order does not depend on the OS directory listing.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        data = pd.read_csv(os.path.join(folder_path, filename))
        data['Date'] = pd.to_datetime(data['Date']).dt.tz_localize(None)
        data['code'] = filename.split('.')[0]
        frames.append(data)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
def get_shares(start_date, end_date, folder_path='C:\\Users\\KaiJung\\Desktop\\z_data\\shares_data'):
    """
    Build a per-stock, per-trading-day table of shares outstanding.

    Trading days come from ``get_trade_date(start_date, end_date)``. Each
    ``*.csv`` in ``folder_path`` is expanded to a daily calendar with forward
    fill, restricted to the trading days, and tagged with a ``code`` column
    (file name up to the first dot). Remaining gaps are filled forward and
    then backward within each code, and rows still containing NaN are dropped.

    :param start_date: str, start of the requested range
    :param end_date: str, end of the requested range
    :param folder_path: str, directory with one shares CSV per stock; defaults
                        to the original hard-coded location
    :return: DataFrame with ``Date``, ``code`` and the CSV value columns
             (a column literally named ``'0'`` is renamed to ``Shares``),
             ordered by code and date
    """
    _, trade_dates = get_trade_date(start_date, end_date)
    trade_dates = pd.DatetimeIndex(trade_dates).tz_localize(None)
    requested_start = pd.to_datetime(start_date)

    frames = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue

        single_stock_data = pd.read_csv(os.path.join(folder_path, filename))
        single_stock_data['Date'] = pd.to_datetime(single_stock_data['Date']).dt.tz_localize(None)
        single_stock_data = single_stock_data.set_index('Date')

        # Drop unparseable dates, keep the last record of each date (file
        # order), then sort: reindex(method='ffill') requires a monotonic index.
        single_stock_data = single_stock_data[single_stock_data.index.notna()]
        single_stock_data = single_stock_data[~single_stock_data.index.duplicated(keep='last')]
        single_stock_data = single_stock_data.sort_index()
        if single_stock_data.empty:
            continue

        # Start the daily calendar at the first record when it predates the
        # requested range, otherwise one year before the requested start.
        earliest_data_date = single_stock_data.index.min()
        if earliest_data_date < requested_start:
            extended_start_date = earliest_data_date
        else:
            extended_start_date = requested_start - pd.DateOffset(years=1)

        extended_data_range = pd.date_range(start=extended_start_date, end=end_date)
        extended_data = single_stock_data.reindex(extended_data_range, method='ffill')
        extended_data['code'] = filename.split('.')[0]

        frames.append(extended_data.loc[extended_data.index.intersection(trade_dates)])

    if not frames:
        return pd.DataFrame(columns=['Date', 'Shares', 'code'])

    # Concatenate once instead of growing a DataFrame inside the loop.
    all_shares_data = pd.concat(frames).reset_index()
    all_shares_data = all_shares_data.rename(columns={'index': 'Date', '0': 'Shares'})

    # Fill gaps inside each stock only. Selecting the value columns keeps the
    # 'code' column intact regardless of how groupby treats grouping columns.
    value_cols = [col for col in all_shares_data.columns if col not in ('Date', 'code')]
    if value_cols:
        all_shares_data[value_cols] = all_shares_data.groupby('code', sort=False)[value_cols].ffill()
        all_shares_data[value_cols] = all_shares_data.groupby('code', sort=False)[value_cols].bfill()

    all_shares_data = all_shares_data.sort_values(['code', 'Date']).reset_index(drop=True)
    return all_shares_data.dropna()

In [None]:
def cal_valuation(shares_df, stock_data):
    """
    Compute the market capitalisation of each stock on each date.

    ``shares_df`` is left-joined with ``stock_data`` on ``Date`` and ``code``.
    Missing values are forward-filled within each stock (in date order), and
    ``market_cap`` is ``Shares * Close``.

    Rows of a stock that precede its first available price keep NaN in
    ``Close`` and ``market_cap``; they are never filled from another stock.

    :param shares_df: DataFrame with ``Date``, ``code`` and ``Shares`` columns
    :param stock_data: DataFrame with ``Date``, ``code`` and ``Close`` columns
    :return: merged DataFrame (same row order as ``shares_df``) with an added
             ``market_cap`` column; the inputs are not modified
    """
    # Join shares outstanding with prices on date and stock code.
    valuation_df = pd.merge(shares_df, stock_data, on=['Date', 'code'], how='left')

    # Forward-fill per stock. A frame-wide ffill would carry the last price of
    # one stock into the first rows of the next one.
    fill_cols = [col for col in valuation_df.columns if col not in ('Date', 'code')]
    if fill_cols and len(valuation_df) > 0:
        ordered = valuation_df.sort_values(['code', 'Date'])
        filled = ordered.groupby('code', sort=False)[fill_cols].ffill()
        valuation_df[fill_cols] = filled.reindex(valuation_df.index)

    # Market cap = closing price * shares outstanding.
    valuation_df['market_cap'] = valuation_df['Shares'] * valuation_df['Close']
    return valuation_df

# Market

In [None]:
def cal_rolling_beta(stock_data, market_data, window_size):
    """
    Compute the rolling beta of stock returns against market returns.

    Stock and market rows are inner-joined on ``Date``. For every stock code
    the rows are put in date order, gaps are forward-filled, daily returns are
    taken from ``Close`` (the first return is set to 0), and for each window
    of ``window_size`` rows beta = cov(stock, market) / var(market) using
    sample (ddof=1) statistics. Beta is NaN when the market variance is 0.

    Each code is processed separately, so a frame holding several stocks never
    mixes returns or windows across stocks.

    :param stock_data: DataFrame with ``Date``, ``code`` and ``Close`` columns
    :param market_data: DataFrame with ``Date`` and ``Close`` columns
    :param window_size: int, number of rows per rolling window (>= 1)
    :return: DataFrame indexed by window-end date with columns ``Date``,
             ``code`` and ``RollingBeta``
    :raises ValueError: if ``window_size`` is smaller than 1
    """
    if window_size < 1:
        raise ValueError('window_size must be at least 1')

    # Reset indexes so the merge does not depend on the callers' indexes.
    stock_data = stock_data.reset_index(drop=True)
    market_data = market_data.reset_index(drop=True)

    aligned_data = pd.merge(stock_data, market_data, on='Date', suffixes=('_stock', '_market'))

    dates, codes, betas = [], [], []
    for code, group in aligned_data.groupby('code', sort=False):
        group = group.sort_values('Date', kind='stable').ffill()

        # Gaps were filled above, so no implicit padding is needed here.
        stock_returns = group['Close_stock'].pct_change(fill_method=None).fillna(0).to_numpy(dtype=float)
        market_returns = group['Close_market'].pct_change(fill_method=None).fillna(0).to_numpy(dtype=float)

        for end in range(window_size - 1, len(group)):
            start = end - window_size + 1
            cov_matrix = np.cov(stock_returns[start:end + 1], market_returns[start:end + 1])
            market_var = cov_matrix[1, 1]
            betas.append(cov_matrix[0, 1] / market_var if market_var != 0 else np.nan)
            dates.append(group['Date'].iloc[end])
            codes.append(code)

    # Build the result in one go instead of writing rows one at a time.
    rolling_beta_df = pd.DataFrame({'Date': dates, 'code': codes, 'RollingBeta': betas},
                                   columns=['Date', 'code', 'RollingBeta'])
    rolling_beta_df['RollingBeta'] = rolling_beta_df['RollingBeta'].astype(float)
    return rolling_beta_df.set_index('Date', drop=False)


In [None]:
def cal_downside_beta(stock_data, market_data, window_size):
    """
    Compute the rolling downside beta of stock returns against the market.

    Stock and market rows are inner-joined on ``Date``. For every stock code
    the rows are put in date order, gaps are forward-filled and daily returns
    are taken from ``Close`` (the first return is set to 0). Inside each
    window of ``window_size`` rows only the observations whose market return
    is less than or equal to the average market return are kept, and
    beta = cov(stock, market) / var(market) is computed on them (ddof=1).
    Beta is NaN when fewer than two observations remain or when the market
    variance is 0.

    Note that the threshold is the mean market return over the stock's whole
    aligned sample, not a per-window mean.

    Each code is processed separately, so a frame holding several stocks never
    mixes returns or windows across stocks.

    :param stock_data: DataFrame with ``Date``, ``code`` and ``Close`` columns
    :param market_data: DataFrame with ``Date`` and ``Close`` columns
    :param window_size: int, number of rows per rolling window (>= 1)
    :return: DataFrame indexed by window-end date with columns ``Date``,
             ``code`` and ``DownsideBeta``
    :raises ValueError: if ``window_size`` is smaller than 1
    """
    if window_size < 1:
        raise ValueError('window_size must be at least 1')

    # Reset indexes so the merge does not depend on the callers' indexes.
    stock_data = stock_data.reset_index(drop=True)
    market_data = market_data.reset_index(drop=True)

    aligned_data = pd.merge(stock_data, market_data, on='Date', suffixes=('_stock', '_market'))

    dates, codes, betas = [], [], []
    for code, group in aligned_data.groupby('code', sort=False):
        group = group.sort_values('Date', kind='stable').ffill()

        # Gaps were filled above, so no implicit padding is needed here.
        stock_series = group['Close_stock'].pct_change(fill_method=None).fillna(0)
        market_series = group['Close_market'].pct_change(fill_method=None).fillna(0)

        # Downside threshold: average market return over this stock's sample.
        market_avg_return = market_series.mean()

        stock_returns = stock_series.to_numpy(dtype=float)
        market_returns = market_series.to_numpy(dtype=float)

        for end in range(window_size - 1, len(group)):
            start = end - window_size + 1
            window_stock = stock_returns[start:end + 1]
            window_market = market_returns[start:end + 1]

            downside = window_market <= market_avg_return

            # At least two points are needed for a sample covariance.
            if downside.sum() > 1:
                cov_matrix = np.cov(window_stock[downside], window_market[downside])
                market_var = cov_matrix[1, 1]
                beta = cov_matrix[0, 1] / market_var if market_var != 0 else np.nan
            else:
                beta = np.nan

            betas.append(beta)
            dates.append(group['Date'].iloc[end])
            codes.append(code)

    # Build the result in one go instead of writing rows one at a time.
    rolling_downside_beta_df = pd.DataFrame({'Date': dates, 'code': codes, 'DownsideBeta': betas},
                                            columns=['Date', 'code', 'DownsideBeta'])
    rolling_downside_beta_df['DownsideBeta'] = rolling_downside_beta_df['DownsideBeta'].astype(float)
    return rolling_downside_beta_df.set_index('Date', drop=False)


In [None]:
def MAD_winsorize(x, multiplier=5):
    """
    Winsorize data at median +/- multiplier * MAD to limit extreme values.

    The median and the median absolute deviation (MAD) ignore NaN. Values
    above ``median + multiplier * MAD`` are set to that bound, values below
    ``median - multiplier * MAD`` likewise; NaN entries are left untouched.

    The input is not modified: the function works on a float copy, so list
    and integer inputs are handled as well.

    :param x: array-like (list, ndarray or pandas Series) of numbers
    :param multiplier: number of MADs that defines the allowed band
    :return: float ndarray (or float Series when a Series was passed) with the
             clipped values
    """
    if isinstance(x, pd.Series):
        clipped = x.astype(float)
    else:
        clipped = np.array(x, dtype=float)

    center = np.nanmedian(clipped)
    mad = np.nanmedian(np.abs(clipped - center))
    upper = center + multiplier * mad
    lower = center - multiplier * mad

    # Masked assignment (rather than clip) keeps the data unchanged when the
    # bounds are NaN, e.g. for all-NaN input.
    clipped[clipped > upper] = upper
    clipped[clipped < lower] = lower
    return clipped

def cal_Size(shares_df, stock_data):
    """
    Compute the size-related factor columns for every stock and date.

    Market cap comes from ``cal_valuation``. ``LNCAP`` is ``log(market_cap + 1)``
    and ``sub_MIDCAP`` is ``LNCAP ** 3``. For each date ``sub_MIDCAP`` is
    regressed on a constant and ``LNCAP`` with weights ``sqrt(market_cap)``;
    the residuals are MAD-winsorized (5 MADs), centred and divided by their
    standard deviation, and stored as ``MIDCAP``.

    Rows without a market cap are dropped before the regression. On a date
    whose residuals have zero (or undefined) spread, ``MIDCAP`` is NaN.

    :param shares_df: DataFrame of shares outstanding (see ``cal_valuation``)
    :param stock_data: DataFrame of price history (see ``cal_valuation``)
    :return: DataFrame with columns ``code``, ``Date``, ``market_cap``,
             ``LNCAP``, ``sub_MIDCAP`` and ``MIDCAP``
    """
    def _standardized_residuals(df):
        # Weighted least squares of LNCAP^3 on [1, LNCAP].
        y = df['sub_MIDCAP'].to_numpy(dtype=float)
        X = np.column_stack([np.ones(len(y)), df['LNCAP'].to_numpy(dtype=float)])
        w = np.sqrt(df['market_cap'].to_numpy(dtype=float))

        # Row-scaling X by w equals multiplying by diag(w) without building
        # an n x n matrix.
        Xw = X * w[:, None]
        beta = np.linalg.pinv(Xw.T @ X) @ Xw.T @ y

        resi = MAD_winsorize(y - X @ beta, multiplier=5)
        resi = resi - np.nanmean(resi)
        spread = np.nanstd(resi)
        if spread > 0:
            return resi / spread
        # Zero or undefined spread: standardisation is not defined.
        return np.full_like(resi, np.nan)

    columns = ['code', 'Date', 'market_cap', 'LNCAP', 'sub_MIDCAP', 'MIDCAP']

    # Work on an explicit copy so the column assignments below do not write
    # into a slice of the valuation frame.
    tmp = cal_valuation(shares_df, stock_data)[['code', 'Date', 'market_cap']].copy()

    # A NaN market cap would make the regression matrices NaN.
    tmp = tmp.dropna(subset=['market_cap'])

    # Log transform and cube of the market cap.
    tmp['LNCAP'] = np.log(tmp['market_cap'] + 1)
    tmp['sub_MIDCAP'] = tmp['LNCAP'] ** 3

    result = []
    for _, group in tmp.groupby('Date'):
        # Residuals are in the same row order as `group`.
        result.append(group.assign(MIDCAP=_standardized_residuals(group)))

    if not result:
        return pd.DataFrame(columns=columns)

    return pd.concat(result, ignore_index=True)

In [1]:
def cal_residual_volatility(stock_data, market_data, window_size):
    """
    Compute the rolling residual volatility of a stock against the market.

    Stock and market rows are inner-joined on ``Date``. For every stock code
    the rows are put in date order, gaps are forward-filled and daily returns
    are taken from ``Close`` (the first return is set to 0). In each window of
    ``window_size`` rows the stock return is regressed on a constant and the
    market return by least squares, and the population standard deviation
    (``np.std``) of the regression residuals is stored.

    Each code is processed separately, so a frame holding several stocks never
    mixes returns or windows across stocks.

    :param stock_data: DataFrame with ``Date``, ``code`` and ``Close`` columns
    :param market_data: DataFrame with ``Date`` and ``Close`` columns
    :param window_size: int, number of rows per rolling window (>= 1)
    :return: DataFrame with columns ``Date``, ``code`` and
             ``ResidualVolatility``; the index holds the merged-frame row
             label of each window's last row
    :raises ValueError: if ``window_size`` is smaller than 1
    """
    if window_size < 1:
        raise ValueError('window_size must be at least 1')

    # Reset indexes so the merge does not depend on the callers' indexes.
    stock_data = stock_data.reset_index(drop=True)
    market_data = market_data.reset_index(drop=True)

    aligned_data = pd.merge(stock_data, market_data, on='Date', suffixes=('_stock', '_market'))

    labels, dates, codes, volatilities = [], [], [], []
    for code, group in aligned_data.groupby('code', sort=False):
        group = group.sort_values('Date', kind='stable').ffill()

        # Gaps were filled above, so no implicit padding is needed here.
        stock_returns = group['Close_stock'].pct_change(fill_method=None).fillna(0).to_numpy(dtype=float)
        market_returns = group['Close_market'].pct_change(fill_method=None).fillna(0).to_numpy(dtype=float)

        for end in range(window_size - 1, len(group)):
            start = end - window_size + 1
            y = stock_returns[start:end + 1]
            x = market_returns[start:end + 1]

            # Design-matrix columns are [constant, market return], so lstsq
            # returns the coefficients as [intercept, slope]. Use the vector
            # as returned so each coefficient multiplies its own column.
            design = np.column_stack([np.ones(len(x)), x])
            coefficients = np.linalg.lstsq(design, y, rcond=None)[0]
            residuals = y - design @ coefficients

            volatilities.append(np.std(residuals))
            labels.append(group.index[end])
            dates.append(group['Date'].iloc[end])
            codes.append(code)

    # Build the result in one go instead of writing rows one at a time.
    rolling_residual_volatility = pd.DataFrame(
        {'Date': dates, 'code': codes, 'ResidualVolatility': volatilities},
        columns=['Date', 'code', 'ResidualVolatility'],
        index=labels,
    )
    rolling_residual_volatility['ResidualVolatility'] = (
        rolling_residual_volatility['ResidualVolatility'].astype(float)
    )
    return rolling_residual_volatility


ModuleNotFoundError: No module named 'numpy'

In [None]:
"""
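Compute the liquidity factors of each stock.
:param stock: price/volume data of the stocks (needs Date, code, Volume).
:param shares_df: shares-outstanding data (needs Date, code, Shares).
:param window_size: rolling window size for the annualized traded value ratio.
:param half_life: half-life, in rows, of the exponential weights in that window.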
"""
def cal_Liquidity(stock, shares_df, window_size=252, half_life=63):
    """
    Compute turnover-based liquidity factors for every stock and date.

    ``Volume`` and ``Shares`` are coerced to numbers, the two frames are
    inner-joined on ``Date`` and ``code``, gaps are forward-filled within each
    stock, and the daily turnover rate is ``Volume / Shares`` (exact zeros are
    replaced by 1e-10 so the logarithms below stay finite).

    Factors returned:
    - ``Monthly_turnover``: log of the 21-row rolling sum of turnover
    - ``Quarterly_turnover``: log of the 63-row rolling sum
    - ``Annual_turnover``: log of the 252-row rolling sum
    - ``Annualized_traded_value_ratio``: exponentially weighted mean of
      turnover over ``window_size`` rows (weight halves every ``half_life``
      rows going back in time), multiplied by ``window_size``

    :param stock: DataFrame with ``Date``, ``code`` and ``Volume`` columns
    :param shares_df: DataFrame with ``Date``, ``code`` and ``Shares`` columns
    :param window_size: int, rolling window of the weighted ratio
    :param half_life: number of rows after which a weight has halved
    :return: long DataFrame with ``Date``, ``code`` and the four factor
             columns, sorted by code and date; the inputs are not modified
    """
    # assign() returns new frames, so the callers' DataFrames stay untouched.
    stock = stock.assign(Volume=pd.to_numeric(stock['Volume'], errors='coerce'))
    shares_df = shares_df.assign(Shares=pd.to_numeric(shares_df['Shares'], errors='coerce'))

    merged_df = pd.merge(stock, shares_df, on=['Date', 'code'], how='inner')
    merged_df = merged_df.sort_values(['code', 'Date']).reset_index(drop=True)

    # Forward-fill inside each stock only, never from one stock into the next.
    fill_cols = ['Volume', 'Shares']
    merged_df[fill_cols] = merged_df.groupby('code', sort=False)[fill_cols].ffill()

    merged_df['turnover_rate'] = merged_df['Volume'] / merged_df['Shares']
    turnover_rate = merged_df.pivot(index='Date', columns='code', values='turnover_rate')
    turnover_rate = turnover_rate.replace(0, 1e-10)

    # Monthly turnover
    monthly_turnover = np.log(turnover_rate.rolling(21).sum())
    # Quarterly turnover
    quarterly_turnover = np.log(turnover_rate.rolling(63).sum())
    # Annual turnover
    annual_turnover = np.log(turnover_rate.rolling(252).sum())

    # Weight of an observation i rows back is 0.5 ** (i / half_life).
    # The array is reversed so the newest row of a window gets i = 0.
    decay = 0.5 ** (np.arange(window_size) / half_life)
    weights = decay[::-1] / decay.sum()
    annualized_traded_value_ratio = turnover_rate.rolling(window=window_size).apply(
        lambda window: np.dot(window, weights), raw=True
    ) * window_size

    liquidity_factors = {
        'Monthly_turnover': monthly_turnover.stack().rename('Monthly_turnover'),
        'Quarterly_turnover': quarterly_turnover.stack().rename('Quarterly_turnover'),
        'Annual_turnover': annual_turnover.stack().rename('Annual_turnover'),
        'Annualized_traded_value_ratio': annualized_traded_value_ratio.stack().rename('Annualized_traded_value_ratio')
    }

    liquidity_factors_df = pd.DataFrame(liquidity_factors)
    liquidity_factors_df = liquidity_factors_df.reset_index().rename(columns={'level_1': 'code'})
    liquidity_factors_df['Date'] = pd.to_datetime(liquidity_factors_df['Date'])
    liquidity_factors_df = liquidity_factors_df.sort_values(by=['code', 'Date'])

    return liquidity_factors_df

# Run


In [None]:
# Build the stock, market and shares DataFrames.
# get_trade_date and get_shares both require a start and an end date; calling
# them without arguments raises TypeError.
# NOTE(review): the range below is a placeholder - set it to the period
# covered by the local CSV files.
START_DATE = '2020-01-01'
END_DATE = '2023-12-31'

stock_data = get_history_data()
market_data, date = get_trade_date(START_DATE, END_DATE)
shares_data = get_shares(START_DATE, END_DATE)

if stock_data.empty:
    raise ValueError('get_history_data() returned no rows; check the history data folder')

# Rolling window size (rows)
window_size = 252

# The beta / volatility helpers are documented as taking the data of one
# target stock, so they are applied to each stock code separately and the
# per-stock results are stacked.
stock_groups = [group for _, group in stock_data.groupby('code', sort=False)]

# Compute and show each indicator
# 1. Rolling beta
rolling_beta_df = pd.concat(
    [cal_rolling_beta(group, market_data, window_size) for group in stock_groups]
)
print("滾動Beta:")
print(rolling_beta_df.head())

# 2. Rolling downside beta
rolling_downside_beta_df = pd.concat(
    [cal_downside_beta(group, market_data, window_size) for group in stock_groups]
)
print("\n滾動Downside Beta:")
print(rolling_downside_beta_df.head())

# 3. Residual volatility
rolling_residual_volatility = pd.concat(
    [cal_residual_volatility(group, market_data, window_size) for group in stock_groups]
)
print("\n滾動殘差波動性:")
print(rolling_residual_volatility.head())

# 4. Size factor
size_df = cal_Size(shares_data, stock_data)
print("\n市值因子Size:")
print(size_df.head())