# Sentiment Analysis on

In [122]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import zscore
from statsmodels.tsa.stattools import grangercausalitytests
import jieba
from snownlp import SnowNLP
import tushare as ts
import akshare as ak
from statsmodels.tsa.stattools import adfuller, coint, kpss,grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects, compare
from linearmodels.iv import IV2SLS
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [123]:
def test_stationarity(series, alpha=0.05):
    # ADF检验
    adf_result = adfuller(series.dropna())
    print(f"ADF Statistic: {adf_result[0]:.4f}")
    print(f"ADF p-value: {adf_result[1]:.4f}")
    print("ADF结论: 序列平稳" if adf_result[1] < alpha else "ADF结论: 序列非平稳")

    # KPSS检验
    kpss_result = kpss(series.dropna())
    print(f"\nKPSS Statistic: {kpss_result[0]:.4f}")
    print(f"KPSS p-value: {kpss_result[1]:.4f}")
    print("KPSS结论: 序列趋势平稳" if kpss_result[1] > alpha else "KPSS结论: 序列存在单位根")

In [124]:
def winsorize_series(series, lower=0.01, upper=0.99):
    """
    对收益率序列进行缩尾处理
    :param series: pandas Series, 收益率序列
    :param lower: 下缩尾分位数（默认1%）
    :param upper: 上缩尾分位数（默认99%）
    :return: 缩尾后的Series
    """
    # 计算分位数边界
    lower_bound = series.quantile(lower)
    upper_bound = series.quantile(upper)

    # 执行缩尾（极端值替换为边界值）
    return series.clip(lower=lower_bound, upper=upper_bound)

In [125]:
def sentiment_analysis(text, sentiment_dict):
    words = list(jieba.cut(text))

    # 原始 counts = 0
    nums = 0
    for word in words:
        if word in sentiment_dict:
            nums += 1
    return nums

In [126]:
def SentimentIndex(df, stock):
    trade_date = stock['trade_date'].tolist()
    trade_date = sorted(trade_date)
    for i in range(len(trade_date)-1):
        start_time = pd.to_datetime(trade_date[i].strftime('%Y%m%d')+ ' ' + '15:00')
        end_time = pd.to_datetime((trade_date[i+1].strftime('%Y%m%d') + ' ' + '15:00'))
        df_sentiment = df[(df['post_datetime']>start_time) & (df['post_datetime']<=end_time)]
        if df_sentiment.empty:
            continue
        else:
            sentiment_index = (sum(df_sentiment['pos_num']) - sum(df_sentiment['neg_num']))/sum(df_sentiment['word_num'])
            stock.loc[stock['trade_date'] == pd.to_datetime(trade_date[i+1]).strftime('%Y%m%d'), 'sentiment_index'] = sentiment_index
            #snownlp_index = (sum(df_sentiment['snownlp'])/len(df_sentiment['snownlp']))
            #stock.loc[stock['trade_date'] == pd.to_datetime(trade_date[i+1]).strftime('%Y%m%d'), 'snownlp_index'] = snownlp_index
    return stock

In [127]:
def get_stock_data(stock_id, start_time, end_time):
    ## 个股数据

    pro = ts.pro_api('2876ea85cb005fb5fa17c809a98174f2d5aae8b1f830110a5ead6211')
    # 拉取数据

    # 拉取数据
    stock_daily_basic = pro.daily_basic(**{
        "ts_code": stock_id,
        "trade_date": "",
        "start_date": start_time,
        "end_date": end_time,
        "limit": "",
        "offset": ""
    }, fields=[
        "ts_code",
        "trade_date",
        "turnover_rate",
        "pe",
        "pb"
    ])

    # 拉取数据
    stock_daily = pro.daily(**{
        "ts_code": stock_id,
        "trade_date": "",
        "start_date": start_time,
        "end_date": end_time,
        "offset": "",
        "limit": ""
    }, fields=[
        "ts_code",
        "trade_date",
        "open",
        "high",
        "low",
        "close",
        "vol",
        "amount"
    ])

    if stock_daily.empty:
        print(f'{stock_id} stock 行情数据 is empty')
    if stock_daily_basic.empty:
        print(f'{stock_id} stock 每日指标 is empty')
    stock = stock_daily.merge(stock_daily_basic, how='left', on=['ts_code', 'trade_date'])
    return stock

## 情绪词典处理

In [128]:
txt_file = 'Sentiment dict plus.txt'

# 初始化分类容器
positive_words = []
negative_words = []

# 读取并处理文件
try:
    with open(txt_file , "r", encoding="utf-8") as file:
        for line_num, line in enumerate(file, 1):
            # 清理并分割行内容
            cleaned_line = line.strip()
            if not cleaned_line:
                continue  # 跳过空行

            # 分割词语和数值（兼容空格/制表符分隔）
            parts = cleaned_line.split(maxsplit=1)  # 最多分割一次
            if len(parts) != 2:
                print(f"第 {line_num} 行格式错误：{line}")
                continue

            word, score = parts
            try:
                score = int(score)
            except ValueError:
                print(f"第 {line_num} 行数值格式错误：{score}")
                continue

            # 分类存储
            if score == 1:
                positive_words.append(word)
            elif score == -1:
                negative_words.append(word)
            else:
                print(f"第 {line_num} 行发现无效数值：{score}")

except FileNotFoundError:
    print("错误：未找到文件 financial_words.txt")
except Exception as e:
    print(f"发生未知错误：{str(e)}")

# 输出统计结果
print(f"\n积极词汇（共 {len(positive_words)} 个）：")
print(", ".join(positive_words[:3]) + ("..." if len(positive_words) > 3 else ""))

print(f"\n消极词汇（共 {len(negative_words)} 个）：")
print(", ".join(negative_words[:3]) + ("..." if len(negative_words) > 3 else ""))


积极词汇（共 3645 个）：
安定, 安康, 帮助...

消极词汇（共 6177 个）：
败坏名声, 被没收的, 变节...


## 读取个股股评 csv

In [187]:
stock_id_list = ['000333.SZ', '002594.SZ', '300750.SZ', '600036.SH', '600519.SH', '600900.SH','601318.SH']

In [188]:
stock_df_len = {}


In [189]:
initial_df = pd.DataFrame()

In [190]:
df_000333 = pd.DataFrame()
df_002594 = pd.DataFrame()
df_600519 = pd.DataFrame()
df_600900 = pd.DataFrame()
df_601318 = pd.DataFrame()
df_000300 = pd.DataFrame()
df_300750 = pd.DataFrame()
df_600036 = pd.DataFrame()
stock_dfs = {
    '000333.SZ': df_000333,
    '002594.SZ': df_002594,
    '300750.SZ': df_300750,
    '600519.SH': df_600519,
    '600900.SH': df_600900,
    '601318.SH': df_601318,
    '600036.SH': df_600036
}

In [191]:
for stock_id in stock_id_list:
    stock_id_str = stock_id[:6]
    post_info = f'post_info.post_{stock_id_str}.csv'
    df = pd.read_csv(post_info)
    df = df.dropna()
    if df.empty:
        print(f'{stock_id} csv empty')
    df['post_datetime'] = pd.to_datetime(df["post_date"] + " " + df["post_time"])
    df  = df.sort_values(by=['post_datetime'], ascending=True)
    stock_dfs[stock_id] = df[df['post_url'].str.contains(stock_id_str, na=False)].reset_index(drop=True).copy()
    stock_df_len[stock_id] = len(stock_dfs[stock_id])

In [192]:
stock_df_len

{'000333.SZ': 4631,
 '002594.SZ': 1393,
 '300750.SZ': 1640,
 '600036.SH': 5489,
 '600519.SH': 3681,
 '600900.SH': 4438,
 '601318.SH': 4009}

## 读取 沪深300指数股评csv

In [129]:
post_info = 'post_info.post_zssh000300_old.csv'
initial_df = pd.read_csv(post_info)

In [130]:
initial_df = initial_df[initial_df['post_url'].str.contains('zssh000300', na=False)].copy()

In [131]:
len(initial_df)

8969

In [132]:
initial_df['post_datetime'] = pd.to_datetime(initial_df["post_date"] + " " + initial_df["post_time"])
initial_df  = initial_df.sort_values(by=['post_datetime'], ascending=True)

In [133]:
initial_df = initial_df.dropna()

## 个股数据 sentiment整合 full_stock

In [193]:
full_stock = pd.DataFrame()

In [194]:
stock_id_list = ['000333.SZ', '002594.SZ', '300750.SZ', '600036.SH', '600519.SH', '600900.SH','601318.SH']

In [284]:
stock_id = '601318.SH'

In [285]:
initial_df = stock_dfs[stock_id]

In [286]:
start_time = initial_df['post_datetime'].min().strftime('%Y%m%d')
end_time = initial_df['post_datetime'].max().strftime('%Y%m%d')
stock = get_stock_data(stock_id, start_time, end_time)

In [287]:
stock = stock.sort_values(by=['trade_date'], ascending=True).reset_index(drop=True)
stock['trade_date'] = pd.to_datetime(stock['trade_date'])

In [288]:
result = stock.copy()

In [289]:
result['return'] = np.log(result['close']/result['close'].shift(1))*100
result['return'] = result['return'].fillna(0)
result['volatility'] = (result['high'] - result['low'])/((result['high']+result['low'])/2)*100

In [290]:
result = result.rename(columns={'return': 'returns'}).copy()

In [291]:
result['returns_new'] = winsorize_series(result['returns'])
result['log_volatility'] = np.log(result['volatility']/100 + 1e-5).pipe(lambda x: (x - x.mean())/x.std())*100
result['turnover_rate_new'] = result['turnover_rate'].apply(lambda x: np.log(1+x))
result['pe_new'] = winsorize_series(result['pe'], lower=0.01, upper=0.99)
result['pb_new'] = winsorize_series(result['pb'], lower=0.01, upper=0.99)

In [292]:
result['vol'] = result['vol']/1e6 # (百万)

In [293]:
result.head(2)

Unnamed: 0,ts_code,trade_date,open,high,low,close,vol,amount,turnover_rate,pe,pb,returns,volatility,returns_new,log_volatility,turnover_rate_new,pe_new,pb_new
0,601318.SH,2025-03-03,50.55,50.82,50.06,50.33,0.423161,2135730.015,0.3932,10.6989,1.0285,0.0,1.506741,0.0,19.210973,0.331603,10.6989,1.0285
1,601318.SH,2025-03-04,50.08,50.29,50.0,50.08,0.372432,1867141.471,0.346,10.6458,1.0234,-0.497959,0.578323,-0.497959,-173.401584,0.297137,10.6458,1.0234


In [294]:
len(result)

42

In [295]:
df = initial_df.drop(columns=['_id', 'post_url', 'post_author', 'post_time'])
df['post_date'] = pd.to_datetime(df['post_date'])

In [296]:
df['pos_num'] = df['post_title'].apply(sentiment_analysis, args=(positive_words,))
df['neg_num'] = df['post_title'].apply(sentiment_analysis, args=(negative_words,))
df['word_num'] = df['post_title'].apply(len)

In [297]:
df.head(2)

Unnamed: 0,post_title,post_view,comment_num,post_date,post_datetime,pos_num,neg_num,word_num
0,没跌几毛钱就拉有意义吗？差价都做不了，要么就直接砸到48,172,0,2025-03-03,2025-03-03 14:47:00,0,0,28
1,不会涨的因为小豆丁买了一千多万必须跌到45再拉开个玩笑而已,232,2,2025-03-03,2025-03-03 15:05:00,1,0,29


In [298]:
result = SentimentIndex(df, result)

In [299]:
result['agreement_index'] = result['sentiment_index'].apply(lambda x : 1-np.sqrt(1-x**2))

In [300]:
result = result[~result['sentiment_index'].isna()].reset_index(drop=True)

In [301]:
result['sentiment_index_lag_1'] = result['sentiment_index'].shift(1).fillna(0)

In [302]:
data_processed = result[['ts_code', 'trade_date', 'sentiment_index', 'agreement_index', 'vol', 'returns_new', 'log_volatility', 'pe_new', 'pb_new', 'turnover_rate_new', 'sentiment_index_lag_1']].copy()

In [303]:
for column in ['pe_new', 'pb_new', 'turnover_rate_new', 'vol']:
    data_processed[column] = data_processed[column].diff()

In [304]:
for column in ['pe_new', 'pb_new', 'turnover_rate_new', 'vol']:
    print(f"\n=== {column} 平稳性检验 ===")
    test_stationarity(data_processed[column])


=== pe_new 平稳性检验 ===
ADF Statistic: -6.0425
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.1016
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== pb_new 平稳性检验 ===
ADF Statistic: -6.2046
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.0850
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== turnover_rate_new 平稳性检验 ===
ADF Statistic: -8.8883
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.2602
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== vol 平稳性检验 ===
ADF Statistic: -8.9789
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.3198
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳


look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())


In [305]:
# 初步数据处理，标注化
# 1.1 Z-Score标准化（适用于收益率、波动率）
cols_to_standardize = ['returns_new', 'log_volatility','sentiment_index', 'sentiment_index_lag_1']
data_std = data_processed[cols_to_standardize].apply(lambda x: (x - x.mean())/x.std())


In [363]:
cols_to_minmax = []
# data_minmax = data[cols_to_minmax].apply(lambda x: (x - x.min())/(x.max() - x.min()))

In [364]:
data_processed = pd.concat([data_processed.drop(columns=cols_to_standardize+cols_to_minmax),
                           data_std], axis=1)

In [365]:
data_processed = data_processed.fillna(0)


In [340]:
full_stock = pd.concat([full_stock, data_processed], ignore_index = True)

In [341]:
len(full_stock)

331

In [367]:
len(data_processed)

13

In [342]:
full_stock.head(2)

Unnamed: 0,ts_code,trade_date,agreement_index,vol,pe_new,pb_new,turnover_rate_new,returns_new,log_volatility,sentiment_index,sentiment_index_lag_1
0,600036.SH,2024-10-16,8e-05,0.0,0.0,0.0,0.0,1.846326,1.79256,1.020653,-0.015369
1,600036.SH,2024-10-17,1.9e-05,-0.163626,-0.1738,-0.0265,-0.05466,-1.964721,1.62925,-0.519892,1.017939


In [368]:
full_stock = full_stock.fillna(0)

In [369]:
missing_values = full_stock.isnull().sum()
missing_values

ts_code                  0
trade_date               0
agreement_index          0
vol                      0
pe_new                   0
pb_new                   0
turnover_rate_new        0
returns_new              0
log_volatility           0
sentiment_index          0
sentiment_index_lag_1    0
dtype: int64

In [370]:
data_processed.head(2)

Unnamed: 0,ts_code,trade_date,agreement_index,vol,pe_new,pb_new,turnover_rate_new,returns_new,log_volatility,sentiment_index,sentiment_index_lag_1
0,300750.SZ,2025-04-14,2e-06,0.0,0.0,0.0,0.0,-0.210053,0.360226,-0.7602,-0.947351
1,300750.SZ,2025-04-15,8.3e-05,0.0599,0.3601,-0.2428,0.095589,1.343693,0.336742,0.762126,-0.657808


In [371]:
data_processed.columns

Index(['ts_code', 'trade_date', 'agreement_index', 'vol', 'pe_new', 'pb_new',
       'turnover_rate_new', 'returns_new', 'log_volatility', 'sentiment_index',
       'sentiment_index_lag_1'],
      dtype='object')

In [None]:
y_return = data_processed['returns_new']
y_volume = data_processed['vol']
y_volatility = data_processed['log_volatility']
X = sm.add_constant(data_processed[['sentiment_index', 'agreement_index', 'pe_new', 'pb_new', 'turnover_rate_new']])

model_return_ols = sm.OLS(y_return, X).fit()
model_volume_ols = sm.OLS(y_volume, X).fit()
model_volatility_ols = sm.OLS(y_volatility, X).fit()

print("对数收益率 OLS 回归结果:")
print(model_return_ols.summary())
print("成交量一阶差分 OLS 回归结果:")
print(model_volume_ols.summary())
print("波动率 OLS 回归结果:")
print(model_volatility_ols.summary())

### 000333.SZ 格兰杰结果

In [66]:
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1
ssr based chi2 test:   chi2=0.5711  , p=0.4498  , df=1
likelihood ratio test: chi2=0.5681  , p=0.4510  , df=1
parameter F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2
ssr based chi2 test:   chi2=0.4179  , p=0.8114  , df=2
likelihood ratio test: chi2=0.4163  , p=0.8121  , df=2
parameter F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3
ssr based chi2 test:   chi2=1.6828  , p=0.6408  , df=3
likelihood ratio test: chi2=1.6561  , p=0.6467  , df=3
parameter F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2406  , p=0.9137  , df_d

In [282]:
test_data = data_processed[['sentiment_index', 'returns_new']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=5)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2.2673  , p=0.1374  , df_denom=60, df_num=1
ssr based chi2 test:   chi2=2.3807  , p=0.1228  , df=1
likelihood ratio test: chi2=2.3368  , p=0.1264  , df=1
parameter F test:         F=2.2673  , p=0.1374  , df_denom=60, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.2502  , p=0.1147  , df_denom=57, df_num=2
ssr based chi2 test:   chi2=4.8951  , p=0.0865  , df=2
likelihood ratio test: chi2=4.7115  , p=0.0948  , df=2
parameter F test:         F=2.2502  , p=0.1147  , df_denom=57, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.5695  , p=0.2074  , df_denom=54, df_num=3
ssr based chi2 test:   chi2=5.3190  , p=0.1499  , df=3
likelihood ratio test: chi2=5.0997  , p=0.1646  , df=3
parameter F test:         F=1.5695  , p=0.2074  , df_denom=54, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.5891  , p=0.1914  , df_d

In [283]:
test_data = data_processed[['sentiment_index', 'log_volatility']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=5)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0723  , p=0.7889  , df_denom=60, df_num=1
ssr based chi2 test:   chi2=0.0759  , p=0.7829  , df=1
likelihood ratio test: chi2=0.0759  , p=0.7829  , df=1
parameter F test:         F=0.0723  , p=0.7889  , df_denom=60, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5725  , p=0.5673  , df_denom=57, df_num=2
ssr based chi2 test:   chi2=1.2455  , p=0.5365  , df=2
likelihood ratio test: chi2=1.2332  , p=0.5398  , df=2
parameter F test:         F=0.5725  , p=0.5673  , df_denom=57, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.2514  , p=0.3003  , df_denom=54, df_num=3
ssr based chi2 test:   chi2=4.2407  , p=0.2366  , df=3
likelihood ratio test: chi2=4.0998  , p=0.2509  , df=3
parameter F test:         F=1.2514  , p=0.3003  , df_denom=54, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.9058  , p=0.4676  , df_d

#### '600519.SH'

In [187]:
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0037  , p=0.9518  , df_denom=28, df_num=1
ssr based chi2 test:   chi2=0.0041  , p=0.9489  , df=1
likelihood ratio test: chi2=0.0041  , p=0.9489  , df=1
parameter F test:         F=0.0037  , p=0.9518  , df_denom=28, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1206  , p=0.8869  , df_denom=25, df_num=2
ssr based chi2 test:   chi2=0.2894  , p=0.8653  , df=2
likelihood ratio test: chi2=0.2880  , p=0.8659  , df=2
parameter F test:         F=0.1206  , p=0.8869  , df_denom=25, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.0803  , p=0.9700  , df_denom=22, df_num=3
ssr based chi2 test:   chi2=0.3174  , p=0.9567  , df=3
likelihood ratio test: chi2=0.3157  , p=0.9570  , df=3
parameter F test:         F=0.0803  , p=0.9700  , df_denom=22, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2666  , p=0.8958  , df_d

#### '600900.SH'

In [221]:
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0499  , p=0.8239  , df_denom=60, df_num=1
ssr based chi2 test:   chi2=0.0524  , p=0.8189  , df=1
likelihood ratio test: chi2=0.0524  , p=0.8189  , df=1
parameter F test:         F=0.0499  , p=0.8239  , df_denom=60, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.3332  , p=0.7180  , df_denom=57, df_num=2
ssr based chi2 test:   chi2=0.7249  , p=0.6960  , df=2
likelihood ratio test: chi2=0.7207  , p=0.6974  , df=2
parameter F test:         F=0.3332  , p=0.7180  , df_denom=57, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.9235  , p=0.4357  , df_denom=54, df_num=3
ssr based chi2 test:   chi2=3.1297  , p=0.3721  , df=3
likelihood ratio test: chi2=3.0521  , p=0.3837  , df=3
parameter F test:         F=0.9235  , p=0.4357  , df_denom=54, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.7471  , p=0.5645  , df_d

#### '601318.SH'

In [255]:
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2113  , p=0.6485  , df_denom=36, df_num=1
ssr based chi2 test:   chi2=0.2289  , p=0.6323  , df=1
likelihood ratio test: chi2=0.2282  , p=0.6328  , df=1
parameter F test:         F=0.2113  , p=0.6485  , df_denom=36, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.8544  , p=0.4348  , df_denom=33, df_num=2
ssr based chi2 test:   chi2=1.9676  , p=0.3739  , df=2
likelihood ratio test: chi2=1.9184  , p=0.3832  , df=2
parameter F test:         F=0.8544  , p=0.4348  , df_denom=33, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.5625  , p=0.0734  , df_denom=30, df_num=3
ssr based chi2 test:   chi2=9.4813  , p=0.0235  , df=3
likelihood ratio test: chi2=8.4409  , p=0.0377  , df=3
parameter F test:         F=2.5625  , p=0.0734  , df_denom=30, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.8992  , p=0.1395  , df_d

#### '000333.SZ'

In [289]:
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1
ssr based chi2 test:   chi2=0.5711  , p=0.4498  , df=1
likelihood ratio test: chi2=0.5681  , p=0.4510  , df=1
parameter F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2
ssr based chi2 test:   chi2=0.4179  , p=0.8114  , df=2
likelihood ratio test: chi2=0.4163  , p=0.8121  , df=2
parameter F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3
ssr based chi2 test:   chi2=1.6828  , p=0.6408  , df=3
likelihood ratio test: chi2=1.6561  , p=0.6467  , df=3
parameter F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2406  , p=0.9137  , df_d

#### 固定效应

In [373]:
full_stock = full_stock.set_index(['ts_code', 'trade_date'])  # 设置为面板数据索引

In [295]:
full_stock = full_stock.reset_index()

In [374]:
# 定义变量
dependent_var = full_stock['returns_new']  # 因变量
independent_vars = full_stock[['sentiment_index', 'pe_new', 'pb_new', 'turnover_rate_new']]
independent_vars = sm.add_constant(independent_vars)  # 添加截距项

# 固定效应模型 (控制个体和时间效应)
model_fe = PanelOLS(
    dependent_var,
    independent_vars,
    entity_effects=True,  # 个股固定效应
    time_effects=True     # 时间固定效应（可选）
)
results_fe = model_fe.fit()
print(results_fe.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:            returns_new   R-squared:                        0.2413
Estimator:                   PanelOLS   R-squared (Between):          -1.827e+30
No. Observations:                 331   R-squared (Within):               0.2227
Date:                Wed, May 07 2025   R-squared (Overall):              0.2220
Time:                        15:46:46   Log-likelihood                   -245.43
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      15.263
Entities:                           6   P-value                           0.0000
Avg Obs:                       55.167   Distribution:                   F(4,192)
Min Obs:                       13.000                                           
Max Obs:                       127.00   F-statistic (robust):             15.263
                            

#### 随机效应

In [375]:
model_re = RandomEffects(
    dependent_var,
    independent_vars
)
results_re = model_re.fit()
print(results_re.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:            returns_new   R-squared:                        0.2915
Estimator:              RandomEffects   R-squared (Between):          -8.793e+30
No. Observations:                 331   R-squared (Within):               0.2946
Date:                Wed, May 07 2025   R-squared (Overall):              0.2915
Time:                        15:47:30   Log-likelihood                   -409.61
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      33.531
Entities:                           6   P-value                           0.0000
Avg Obs:                       55.167   Distribution:                   F(4,326)
Min Obs:                       13.000                                           
Max Obs:                       127.00   F-statistic (robust):             33.531
                            

In [376]:
n_stocks = full_stock.index.get_level_values('ts_code').nunique()
n_vars = independent_vars.shape[1]  # 包括截距项
print(f"个股数量：{n_stocks}，解释变量数：{n_vars}")

个股数量：6，解释变量数：5


In [378]:
# 比较固定效应和随机效应
from linearmodels.panel import compare

print(compare({"FE": results_fe, "RE": results_re}))

                     Model Comparison                    
                                     FE                RE
---------------------------------------------------------
Dep. Variable               returns_new       returns_new
Estimator                      PanelOLS     RandomEffects
No. Observations                    331               331
Cov. Est.                    Unadjusted        Unadjusted
R-squared                        0.2413            0.2915
R-Squared (Within)               0.2227            0.2946
R-Squared (Between)          -1.827e+30        -8.793e+30
R-Squared (Overall)              0.2220            0.2915
F-statistic                      15.263            33.531
P-value (F-stat)                 0.0000            0.0000
const                            0.0196            0.0383
                               (0.5322)          (0.8267)
sentiment_index                  0.2512            0.2285
                               (4.9253)          (4.8700)
pe_new        

In [377]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data["Variable"] = independent_vars.columns
vif_data["VIF"] = [variance_inflation_factor(independent_vars.values, i) for i in range(independent_vars.shape[1])]
print(vif_data)

            Variable       VIF
0              const  1.007170
1    sentiment_index  1.013342
2             pe_new  1.632454
3             pb_new  1.605830
4  turnover_rate_new  1.014184


In [379]:
data_processed[['returns_new', 'log_volatility', 'sentiment_index', 'pb_new', 'pe_new', 'turnover_rate_new']].corr()

Unnamed: 0,returns_new,log_volatility,sentiment_index,pb_new,pe_new,turnover_rate_new
returns_new,1.0,0.591862,0.390926,0.00177,0.999826,0.595639
log_volatility,0.591862,1.0,0.320199,0.058841,0.595513,0.651195
sentiment_index,0.390926,0.320199,1.0,0.239429,0.387644,0.290585
pb_new,0.00177,0.058841,0.239429,1.0,0.012088,-0.230339
pe_new,0.999826,0.595513,0.387644,0.012088,1.0,0.597144
turnover_rate_new,0.595639,0.651195,0.290585,-0.230339,0.597144,1.0


## 沪深300指数股评分析

In [68]:
# delete unwanted columns
df = initial_df.drop(columns=['_id', 'post_url', 'post_author', 'post_time'])
df.head(1)

Unnamed: 0,post_title,post_view,comment_num,post_date,post_datetime
10588,人工智能ETF159819行业紧紧把握科技创新发展趋势积极拥抱生成式人工智能相,135,0,2023-08-07,2023-08-07 12:31:00


In [69]:
df['post_date'] = pd.to_datetime(df['post_date'])

In [70]:
df['pos_num'] = df['post_title'].apply(sentiment_analysis, args=(positive_words,))
df['neg_num'] = df['post_title'].apply(sentiment_analysis, args=(negative_words,))
df['word_num'] = df['post_title'].apply(len)

### 沪深300指数daily数据获取

#### M2

In [134]:
ts.set_token('2876ea85cb005fb5fa17c809a98174f2d5aae8b1f830110a5ead6211')
pro = ts.pro_api()
# 拉取数据
m2_yoy = pro.cn_m(**{
    "m": "",
    "start_m": 202301,
    "end_m": 202505,
    "limit": "",
    "offset": ""
}, fields=[
    "month",
    "m2_yoy"
])
print(m2_yoy)



     month  m2_yoy
0   202503     7.0
1   202502     7.0
2   202501     7.0
3   202412     7.3
4   202411     7.1
5   202410     7.5
6   202409     6.8
7   202408     6.3
8   202407     6.3
9   202406     6.2
10  202405     7.0
11  202404     7.2
12  202403     8.3
13  202402     8.7
14  202401     8.7
15  202312     9.7
16  202311    10.0
17  202310    10.3
18  202309    10.3
19  202308    10.6
20  202307    10.7
21  202306    11.3
22  202305    11.6
23  202304    12.4
24  202303    12.7
25  202302    12.9
26  202301    12.6


In [135]:
m2_yoy = m2_yoy.sort_values(by='month', ascending=True)

In [136]:
m2_yoy['m2_yoy_adjusted'] = m2_yoy['m2_yoy'].rolling(window=12, center=True).mean()

In [137]:
m2_yoy['m2_yoy_lag_3'] = m2_yoy['m2_yoy_adjusted'].shift(3)

In [138]:
m2_yoy = m2_yoy.ffill()
m2_yoy = m2_yoy.bfill()

In [139]:
m2_yoy = m2_yoy[['month', 'm2_yoy_lag_3']].copy()
m2_yoy = m2_yoy.rename(columns={'m2_yoy_lag_3': 'm2_yoy'})

In [140]:
m2_yoy = m2_yoy.set_index('month')['m2_yoy'].to_dict()

In [141]:
m2_yoy['202504'] = m2_yoy['202503']

In [142]:
m2_yoy

{'202301': 11.258333333333333,
 '202302': 11.258333333333333,
 '202303': 11.258333333333333,
 '202304': 11.258333333333333,
 '202305': 11.258333333333333,
 '202306': 11.258333333333333,
 '202307': 11.258333333333333,
 '202308': 11.258333333333333,
 '202309': 11.258333333333333,
 '202310': 11.258333333333333,
 '202311': 10.933333333333335,
 '202312': 10.583333333333334,
 '202401': 10.216666666666667,
 '202402': 9.783333333333333,
 '202403': 9.4,
 '202404': 8.975,
 '202405': 8.608333333333333,
 '202406': 8.25,
 '202407': 7.958333333333333,
 '202408': 7.724999999999999,
 '202409': 7.483333333333333,
 '202410': 7.283333333333334,
 '202411': 7.141666666666667,
 '202412': 7.0,
 '202501': 6.891666666666667,
 '202502': 6.891666666666667,
 '202503': 6.891666666666667,
 '202504': 6.891666666666667}

#### 指数日线数据

In [143]:
## HS300指数日线数据
ts.set_token('2876ea85cb005fb5fa17c809a98174f2d5aae8b1f830110a5ead6211')
pro = ts.pro_api()

# 拉取数据
HS300 = pro.index_daily(**{
    "ts_code": "000300.SH",
    "trade_date": "",
    "start_date": 20230808,
    "end_date": 20250418,
    "limit": "",
    "offset": ""
}, fields=[
    "ts_code",
    "trade_date",
    "close",
    "open",
    "high",
    "low",
    "pre_close",
    "change",
    "pct_chg",
    "vol",
    "amount"
])
print(HS300)



       ts_code trade_date      close       open       high        low  \
0    000300.SH   20250418  3772.5230  3760.0425  3782.0443  3754.4072   
1    000300.SH   20250417  3772.2221  3755.0862  3779.1253  3749.5040   
2    000300.SH   20250416  3772.8204  3757.8757  3775.1661  3721.6025   
3    000300.SH   20250415  3761.2348  3756.4788  3764.4860  3737.8495   
4    000300.SH   20250414  3759.1422  3772.0407  3776.6344  3754.0330   
..         ...        ...        ...        ...        ...        ...   
405  000300.SH   20230814  3855.9061  3840.8872  3859.4718  3814.0711   
406  000300.SH   20230811  3884.2538  3977.9834  3977.9834  3884.2538   
407  000300.SH   20230810  3975.7166  3962.8569  3976.4637  3945.3869   
408  000300.SH   20230809  3967.5652  3967.3566  3982.1091  3961.8434   
409  000300.SH   20230808  3979.7322  3980.0423  4002.2904  3963.4632   

     pre_close   change  pct_chg          vol        amount  
0    3772.2221   0.3009   0.0080  102948509.0  1.731094e+08  

In [144]:
HS300 = HS300.sort_values(by=['trade_date'], ascending=True).reset_index(drop=True)

#### 指数 basic 数据

In [145]:
ts.set_token('2876ea85cb005fb5fa17c809a98174f2d5aae8b1f830110a5ead6211')
pro = ts.pro_api()

# 拉取数据
HS300_basic = pro.index_dailybasic(**{
    "trade_date": "",
    "ts_code": "000300.SH",
    "start_date": 20230808,
    "end_date": 20250418,
    "limit": "",
    "offset": ""
}, fields=[
    "ts_code",
    "trade_date",
    "total_mv",
    "float_mv",
    "turnover_rate",
    "pe",
    "pb",
    "total_share"
])
print(HS300_basic)



       ts_code trade_date      total_mv      float_mv  turnover_rate     pe  \
0    000300.SH   20250418  5.593480e+13  4.198855e+13           0.32  12.10   
1    000300.SH   20250417  5.586207e+13  4.197099e+13           0.38  12.09   
2    000300.SH   20250416  5.586239e+13  4.197110e+13           0.48  12.09   
3    000300.SH   20250415  5.550268e+13  4.172010e+13           0.41  12.01   
4    000300.SH   20250414  5.531078e+13  4.160352e+13           0.50  11.96   
..         ...        ...           ...           ...            ...    ...   
405  000300.SH   20230814  4.973195e+13  3.821329e+13           0.38  11.89   
406  000300.SH   20230811  5.007981e+13  3.848109e+13           0.41  11.97   
407  000300.SH   20230810  5.113598e+13  3.934532e+13           0.32  12.23   
408  000300.SH   20230809  5.099382e+13  3.926694e+13           0.31  12.19   
409  000300.SH   20230808  5.114554e+13  3.938285e+13           0.36  12.23   

       pb   total_share  
0    1.31  4.210865e+12  

#### 指数数据结合

In [146]:
HS300 = HS300.merge(HS300_basic, how='left', on=['ts_code', 'trade_date'])

In [147]:
HS300['trade_date'] = pd.to_datetime(HS300['trade_date'])

In [148]:
HS300['m2_yoy'] = HS300['trade_date'].apply(lambda x: m2_yoy[x.strftime('%Y%m')])

#### 数据检查

In [149]:
missing_values = HS300.isnull().sum()
missing_values

ts_code          0
trade_date       0
close            0
open             0
high             0
low              0
pre_close        0
change           0
pct_chg          0
vol              0
amount           0
total_mv         0
float_mv         0
turnover_rate    0
pe               0
pb               0
total_share      0
m2_yoy           0
dtype: int64

### 沪深300指数本数据和股票数据结合

In [150]:
result = SentimentIndex(df, HS300)

In [151]:
result['agreement_index'] = result['sentiment_index'].apply(lambda x : 1-np.sqrt(1-x**2))

In [152]:
result = result[~result['sentiment_index'].isna()].reset_index(drop=True)

In [153]:
result['sentiment_index_lag_1'] = result['sentiment_index'].shift(1).fillna(0)

In [154]:
result['return'] = np.log(result['close']/result['close'].shift(1))*100
result['return'] = result['return'].fillna(0)
result['volatility'] = (result['high'] - result['low'])/((result['high']+result['low'])/2)*100

In [155]:
result = result.rename(columns={'return': 'returns'}).copy()

In [156]:
result['returns_new'] = winsorize_series(result['returns'])
result['log_volatility'] = np.log(result['volatility']  + 1e-5).pipe(lambda x: (x - x.mean()) / x.std())
result['turnover_rate_new'] = result['turnover_rate'].apply(lambda x: np.log(1 + x))
result['pe_new'] = winsorize_series(result['pe'], lower=0.01, upper=0.99)
result['pb_new'] = winsorize_series(result['pb'], lower=0.01, upper=0.99)

In [157]:
result['vol'] = result['vol'] / 1e6  # (百万)

In [158]:
len(result)

409

In [159]:
result['volatility'].describe()

count    409.000000
mean       1.355402
std        0.908618
min        0.360828
25%        0.846813
50%        1.097468
75%        1.499679
max        8.332997
Name: volatility, dtype: float64

In [160]:
result['m2_yoy'].describe()

count    409.000000
mean       8.705949
std        1.614093
min        6.891667
25%        7.141667
50%        8.250000
75%       10.216667
max       11.258333
Name: m2_yoy, dtype: float64

In [161]:
data_processed = result[['trade_date', 'sentiment_index', 'agreement_index', 'vol', 'returns_new', 'volatility','log_volatility', 'pe_new', 'pb_new', 'turnover_rate_new', 'sentiment_index_lag_1', 'm2_yoy']].copy()

In [162]:
for column in data_processed.columns:
    print(f"\n=== {column} 平稳性检验 ===")
    test_stationarity(data_processed[column])


=== trade_date 平稳性检验 ===
ADF Statistic: 5.9313
ADF p-value: 1.0000
ADF结论: 序列非平稳

KPSS Statistic: 3.2465
KPSS p-value: 0.0100
KPSS结论: 序列存在单位根

=== sentiment_index 平稳性检验 ===
ADF Statistic: -4.5751
ADF p-value: 0.0001
ADF结论: 序列平稳

KPSS Statistic: 0.2273
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== agreement_index 平稳性检验 ===
ADF Statistic: -12.2284
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.1822
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== vol 平稳性检验 ===
ADF Statistic: -2.0924
ADF p-value: 0.2476
ADF结论: 序列非平稳

KPSS Statistic: 1.2049
KPSS p-value: 0.0100
KPSS结论: 序列存在单位根

=== returns_new 平稳性检验 ===
ADF Statistic: -11.5261
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.1837
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== volatility 平稳性检验 ===
ADF Statistic: -4.9723
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.2579
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== log_volatility 平稳性检验 ===
ADF Statistic: -4.7732
ADF p-value: 0.0001
ADF结论: 序列平稳

KPSS Statistic: 0.2820
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_result = kpss(series.dropna())
l

In [163]:
for column in ['pe_new', 'pb_new', 'turnover_rate_new', 'vol', 'm2_yoy']:
    data_processed[column] = data_processed[column].diff()

In [164]:
for column in ['pe_new', 'pb_new', 'turnover_rate_new', 'vol', 'm2_yoy']:
    print(f"\n=== {column} 平稳性检验 ===")
    test_stationarity(data_processed[column])


=== pe_new 平稳性检验 ===
ADF Statistic: -8.8101
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.0991
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== pb_new 平稳性检验 ===
ADF Statistic: -12.8356
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.1476
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== turnover_rate_new 平稳性检验 ===
ADF Statistic: -7.6441
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.0761
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== vol 平稳性检验 ===
ADF Statistic: -7.8461
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.0608
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== m2_yoy 平稳性检验 ===
ADF Statistic: -20.8179
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.3147
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳


look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())


In [165]:
data_processed = data_processed.fillna(0)

In [166]:
# 初步数据处理，标注化
# 1.1 Z-Score标准化（适用于收益率、波动率）
cols_to_standardize = ['returns_new', 'log_volatility','sentiment_index', 'sentiment_index_lag_1', 'volatility']
data_std = data_processed[cols_to_standardize].apply(lambda x: (x - x.mean())/x.std())

In [167]:
cols_to_minmax = []
# data_minmax = data[cols_to_minmax].apply(lambda x: (x - x.min())/(x.max() - x.min()))

In [168]:
data_processed = pd.concat([data_processed.drop(columns=cols_to_standardize+cols_to_minmax),
                           data_std], axis=1)

In [169]:
data_processed['log_volatility'].describe()

count    4.090000e+02
mean    -2.605902e-17
std      1.000000e+00
min     -2.393232e+00
25%     -6.663260e-01
50%     -1.414527e-01
75%      4.906389e-01
max      3.962350e+00
Name: log_volatility, dtype: float64

### 相关性检验

In [170]:
data_processed[['returns_new', 'log_volatility', 'sentiment_index', 'pb_new', 'pe_new', 'turnover_rate_new', 'm2_yoy', 'sentiment_index_lag_1']].corr()

Unnamed: 0,returns_new,log_volatility,sentiment_index,pb_new,pe_new,turnover_rate_new,m2_yoy,sentiment_index_lag_1
returns_new,1.0,0.223819,0.165013,0.838278,0.880583,0.323064,-0.047296,0.012967
log_volatility,0.223819,1.0,0.076819,0.115505,0.141544,0.435001,-0.03432,-0.044703
sentiment_index,0.165013,0.076819,1.0,0.089726,0.12256,0.030306,-0.055663,0.167548
pb_new,0.838278,0.115505,0.089726,1.0,0.906296,0.202773,-0.010927,-0.017291
pe_new,0.880583,0.141544,0.12256,0.906296,1.0,0.257529,-0.036617,-0.00907
turnover_rate_new,0.323064,0.435001,0.030306,0.202773,0.257529,1.0,-0.031053,-0.053413
m2_yoy,-0.047296,-0.03432,-0.055663,-0.010927,-0.036617,-0.031053,1.0,0.087494
sentiment_index_lag_1,0.012967,-0.044703,0.167548,-0.017291,-0.00907,-0.053413,0.087494,1.0


### 回归模型

In [173]:
y_return = data_processed['returns_new']
X = sm.add_constant(data_processed[['sentiment_index_lag_1','pe_new','turnover_rate_new']])

model_return_ols = sm.OLS(y_return, X).fit()

print("对数收益率 OLS 回归结果:")
print(model_return_ols.summary())

对数收益率 OLS 回归结果:
                            OLS Regression Results                            
Dep. Variable:            returns_new   R-squared:                       0.786
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     496.0
Date:                Thu, 08 May 2025   Prob (F-statistic):          3.50e-135
Time:                        09:30:58   Log-Likelihood:                -264.51
No. Observations:                 409   AIC:                             537.0
Df Residuals:                     405   BIC:                             553.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const         

In [458]:
y_return = data_processed['returns_new']
X = sm.add_constant(data_processed[['sentiment_index','pe_new', 'pb_new', 'm2_yoy', 'turnover_rate_new']])

model_return_ols = sm.OLS(y_return, X).fit()

print("对数收益率 OLS 回归结果:")
print(model_return_ols.summary())

对数收益率 OLS 回归结果:
                            OLS Regression Results                            
Dep. Variable:            returns_new   R-squared:                       0.800
Model:                            OLS   Adj. R-squared:                  0.798
Method:                 Least Squares   F-statistic:                     322.7
Date:                Wed, 07 May 2025   Prob (F-statistic):          1.94e-138
Time:                        16:24:11   Log-Likelihood:                -250.57
No. Observations:                 409   AIC:                             513.1
Df Residuals:                     403   BIC:                             537.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 

In [457]:
y_return = data_processed['returns_new']
X = sm.add_constant(data_processed[['pe_new', 'pb_new', 'm2_yoy', 'turnover_rate_new']])

model_return_ols = sm.OLS(y_return, X).fit()

print("对数收益率 OLS 回归结果:")
print(model_return_ols.summary())

对数收益率 OLS 回归结果:
                            OLS Regression Results                            
Dep. Variable:            returns_new   R-squared:                       0.796
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     394.8
Date:                Wed, 07 May 2025   Prob (F-statistic):          4.33e-138
Time:                        16:23:26   Log-Likelihood:                -254.49
No. Observations:                 409   AIC:                             519.0
Df Residuals:                     404   BIC:                             539.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 

In [444]:
y_vol = data_processed['vol']
X = sm.add_constant(data_processed[['sentiment_index','pe_new', 'pb_new', 'm2_yoy', 'turnover_rate_new']])

model_return_ols = sm.OLS(y_vol, X).fit()

print("成交量一阶差分 OLS 回归结果:")
print(model_return_ols.summary())

成交量一阶差分 OLS 回归结果:
                            OLS Regression Results                            
Dep. Variable:                    vol   R-squared:                       0.968
Model:                            OLS   Adj. R-squared:                  0.968
Method:                 Least Squares   F-statistic:                     2031.
Date:                Wed, 07 May 2025   Prob (F-statistic):          4.31e-297
Time:                        16:16:10   Log-Likelihood:                -1295.5
No. Observations:                 409   AIC:                             2605.
Df Residuals:                     402   BIC:                             2633.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const               

In [174]:
y_volatility = data_processed['log_volatility']
X = sm.add_constant(data_processed[['sentiment_index','agreement_index','pe_new', 'pb_new','m2_yoy','turnover_rate_new']])

model_return_ols = sm.OLS(y_volatility, X).fit()

print("波动率 回归结果:")
print(model_return_ols.summary())

波动率 回归结果:
                            OLS Regression Results                            
Dep. Variable:         log_volatility   R-squared:                       0.195
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     16.21
Date:                Thu, 08 May 2025   Prob (F-statistic):           9.77e-17
Time:                        09:32:29   Log-Likelihood:                -535.52
No. Observations:                 409   AIC:                             1085.
Df Residuals:                     402   BIC:                             1113.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.0184

In [178]:
y_volatility = data_processed['log_volatility']
X = sm.add_constant(data_processed[['sentiment_index_lag_1','pe_new', 'turnover_rate_new']])

model_return_ols = sm.OLS(y_volatility, X).fit()

print("波动率 回归结果:")
print(model_return_ols.summary())

波动率 回归结果:
                            OLS Regression Results                            
Dep. Variable:         log_volatility   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.185
Method:                 Least Squares   F-statistic:                     31.80
Date:                Thu, 08 May 2025   Prob (F-statistic):           1.78e-18
Time:                        09:35:35   Log-Likelihood:                -536.59
No. Observations:                 409   AIC:                             1081.
Df Residuals:                     405   BIC:                             1097.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const               

In [176]:
# VIF计算
vif_data = pd.DataFrame()
X = data_processed[['sentiment_index_lag_1','pe_new', 'turnover_rate_new']].copy()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

In [121]:
vif_data

Unnamed: 0,Variable,VIF
0,sentiment_index,1.015252
1,pe_new,1.086368
2,turnover_rate_new,1.071033


### 格兰杰因果检验

In [180]:
test_data = data_processed[['returns_new', 'sentiment_index_lag_1']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0014  , p=0.9706  , df_denom=405, df_num=1
ssr based chi2 test:   chi2=0.0014  , p=0.9705  , df=1
likelihood ratio test: chi2=0.0014  , p=0.9705  , df=1
parameter F test:         F=0.0014  , p=0.9706  , df_denom=405, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.7004  , p=0.4970  , df_denom=402, df_num=2
ssr based chi2 test:   chi2=1.4183  , p=0.4921  , df=2
likelihood ratio test: chi2=1.4158  , p=0.4927  , df=2
parameter F test:         F=0.7004  , p=0.4970  , df_denom=402, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.6113  , p=0.6080  , df_denom=399, df_num=3
ssr based chi2 test:   chi2=1.8660  , p=0.6007  , df=3
likelihood ratio test: chi2=1.8617  , p=0.6016  , df=3
parameter F test:         F=0.6113  , p=0.6080  , df_denom=399, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.8134  , p=0.5171  

In [186]:
test_data_reverse = data_processed[['sentiment_index', 'log_volatility']].dropna()
granger_test_reverse = grangercausalitytests(test_data_reverse, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.4282  , p=0.2328  , df_denom=405, df_num=1
ssr based chi2 test:   chi2=1.4388  , p=0.2303  , df=1
likelihood ratio test: chi2=1.4363  , p=0.2307  , df=1
parameter F test:         F=1.4282  , p=0.2328  , df_denom=405, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.9581  , p=0.0531  , df_denom=402, df_num=2
ssr based chi2 test:   chi2=5.9897  , p=0.0500  , df=2
likelihood ratio test: chi2=5.9461  , p=0.0511  , df=2
parameter F test:         F=2.9581  , p=0.0531  , df_denom=402, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.7936  , p=0.0401  , df_denom=399, df_num=3
ssr based chi2 test:   chi2=8.5279  , p=0.0363  , df=3
likelihood ratio test: chi2=8.4396  , p=0.0377  , df=3
parameter F test:         F=2.7936  , p=0.0401  , df_denom=399, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=3.3181  , p=0.0109  

In [181]:
test_data = data_processed[['', 'sentiment_index_lag_1']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2149  , p=0.6432  , df_denom=405, df_num=1
ssr based chi2 test:   chi2=0.2165  , p=0.6417  , df=1
likelihood ratio test: chi2=0.2164  , p=0.6418  , df=1
parameter F test:         F=0.2149  , p=0.6432  , df_denom=405, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.5617  , p=0.2111  , df_denom=402, df_num=2
ssr based chi2 test:   chi2=3.1622  , p=0.2057  , df=2
likelihood ratio test: chi2=3.1500  , p=0.2070  , df=2
parameter F test:         F=1.5617  , p=0.2111  , df_denom=402, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.9975  , p=0.3939  , df_denom=399, df_num=3
ssr based chi2 test:   chi2=3.0451  , p=0.3847  , df=3
likelihood ratio test: chi2=3.0337  , p=0.3865  , df=3
parameter F test:         F=0.9975  , p=0.3939  , df_denom=399, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.7959  , p=0.5284  

In [456]:
test_data = data_processed[['vol', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.8039  , p=0.3705  , df_denom=405, df_num=1
ssr based chi2 test:   chi2=0.8099  , p=0.3682  , df=1
likelihood ratio test: chi2=0.8091  , p=0.3684  , df=1
parameter F test:         F=0.8039  , p=0.3705  , df_denom=405, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.4013  , p=0.6697  , df_denom=402, df_num=2
ssr based chi2 test:   chi2=0.8126  , p=0.6661  , df=2
likelihood ratio test: chi2=0.8118  , p=0.6664  , df=2
parameter F test:         F=0.4013  , p=0.6697  , df_denom=402, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.2053  , p=0.8927  , df_denom=399, df_num=3
ssr based chi2 test:   chi2=0.6266  , p=0.8903  , df=3
likelihood ratio test: chi2=0.6261  , p=0.8904  , df=3
parameter F test:         F=0.2053  , p=0.8927  , df_denom=399, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.1563  , p=0.9601  

In [1]:
data_processed['log_volatility'].describe()

NameError: name 'data_processed' is not defined