# Sentiment Analysis on Stock Forum Posts

In [3]:
import os
import re
from datetime import datetime

import akshare as ak
import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import tushare as ts
from linearmodels.iv import IV2SLS
from linearmodels.panel import PanelOLS, RandomEffects, compare
from scipy import stats
from scipy.stats import zscore
from snownlp import SnowNLP
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.stattools import adfuller, coint, kpss,grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen

In [4]:
def test_stationarity(series, alpha=0.05):
    """
    Print ADF and KPSS statistics, p-values and a verdict for one series.

    :param series: pandas Series; missing values are dropped before testing
    :param alpha: significance level both verdicts are compared against (default 0.05)
    :return: None, everything is printed
    """
    observed = series.dropna()

    # ADF: a p-value below alpha is reported as "stationary"
    adf_stat, adf_pvalue = adfuller(observed)[:2]
    print(f"ADF Statistic: {adf_stat:.4f}")
    print(f"ADF p-value: {adf_pvalue:.4f}")
    if adf_pvalue < alpha:
        print("ADF结论: 序列平稳")
    else:
        print("ADF结论: 序列非平稳")

    # KPSS: a p-value above alpha is reported as "stationary"
    # NOTE(review): kpss is called with its default regression setting, while the
    # printed verdict speaks of *trend* stationarity — confirm which one is intended.
    kpss_stat, kpss_pvalue = kpss(observed)[:2]
    print(f"\nKPSS Statistic: {kpss_stat:.4f}")
    print(f"KPSS p-value: {kpss_pvalue:.4f}")
    if kpss_pvalue > alpha:
        print("KPSS结论: 序列趋势平稳")
    else:
        print("KPSS结论: 序列存在单位根")

In [5]:
def winsorize_series(series, lower=0.01, upper=0.99):
    """
    Winsorize a series: values beyond the chosen quantiles are replaced by those quantiles.

    :param series: pandas Series, e.g. a return series
    :param lower: lower tail quantile (default 1%)
    :param upper: upper tail quantile (default 99%)
    :return: a new Series clipped to [quantile(lower), quantile(upper)]
    """
    # Quantile cut-offs are taken from the series itself
    bounds = {
        "lower": series.quantile(lower),
        "upper": series.quantile(upper),
    }
    # Clip extremes to the cut-offs; values inside the band are untouched
    return series.clip(**bounds)

In [6]:
def sentiment_analysis(text, sentiment_dict):
    """
    Count the jieba tokens of `text` that are contained in `sentiment_dict`.

    Every occurrence counts, so a dictionary word appearing twice adds 2.

    :param text: string to segment with jieba
    :param sentiment_dict: container of words supporting `in` (list, set, dict, ...)
    :return: number of matching tokens (0 when none match)
    """
    return sum(1 for token in jieba.cut(text) if token in sentiment_dict)

In [7]:
def SentimentIndex(df, stock):
    """
    Attach a daily sentiment index to `stock` (modified in place and returned).

    For each pair of consecutive trade dates (d_prev, d_next), posts with
    d_prev 15:00 < post_datetime <= d_next 15:00 are pooled and
    (sum(pos_num) - sum(neg_num)) / sum(word_num) is written to the d_next row.

    :param df: posts DataFrame with columns 'post_datetime', 'pos_num', 'neg_num', 'word_num'
    :param stock: DataFrame with a 'trade_date' column (values are coerced with pd.to_datetime)
    :return: `stock` with a 'sentiment_index' column; rows whose window has no posts,
             or whose pooled word count is zero, stay NaN (this includes the earliest
             trade date, which has no preceding window)
    """
    # Create the column up front so callers can rely on it even if no window contains posts
    if 'sentiment_index' not in stock.columns:
        stock['sentiment_index'] = float('nan')

    # Compare timestamps with timestamps instead of relying on string parsing during row matching
    trade_days = pd.to_datetime(stock['trade_date'])
    ordered_days = sorted(trade_days.tolist())
    cutoff = pd.Timedelta(hours=15)  # the window boundary is 15:00 on each trade date

    for prev_day, next_day in zip(ordered_days, ordered_days[1:]):
        window_start = prev_day.normalize() + cutoff
        window_end = next_day.normalize() + cutoff
        in_window = (df['post_datetime'] > window_start) & (df['post_datetime'] <= window_end)
        posts = df[in_window]
        if posts.empty:
            continue

        total_words = posts['word_num'].sum()
        if total_words == 0:
            # Nothing to normalise by; leave NaN rather than emitting inf/NaN from a division by zero
            continue

        net_hits = posts['pos_num'].sum() - posts['neg_num'].sum()
        same_day = trade_days.dt.normalize() == next_day.normalize()
        stock.loc[same_day, 'sentiment_index'] = net_hits / total_words
    return stock

In [8]:
def get_stock_data(stock_id, start_time, end_time, token=None):
    """
    Download daily quotes and daily indicators for one stock from tushare pro and merge them.

    :param stock_id: tushare code, passed as `ts_code` (e.g. '601318.SH')
    :param start_time: start date, passed as `start_date` (the notebook uses 'YYYYMMDD' strings)
    :param end_time: end date, passed as `end_date`
    :param token: optional tushare API token. When omitted, the TUSHARE_TOKEN environment
                  variable is used; if that is unset too, `ts.pro_api()` is called without a
                  token (presumably falling back to one stored via `ts.set_token` — verify).
    :return: DataFrame of daily quotes left-joined with turnover_rate / pe / pb
             on ['ts_code', 'trade_date']
    """
    # Credentials must not live in the notebook: resolve the token at call time instead.
    # NOTE(review): the token that used to be embedded here should be treated as leaked and rotated.
    api_token = token or os.environ.get('TUSHARE_TOKEN')
    pro = ts.pro_api(api_token) if api_token else ts.pro_api()

    # Both endpoints are queried with the same filter arguments
    query = {
        "ts_code": stock_id,
        "trade_date": "",
        "start_date": start_time,
        "end_date": end_time,
        "limit": "",
        "offset": "",
    }

    # Daily indicators (turnover, valuation ratios)
    stock_daily_basic = pro.daily_basic(**query, fields=[
        "ts_code",
        "trade_date",
        "turnover_rate",
        "pe",
        "pb"
    ])

    # Daily quotes (OHLC, volume, amount)
    stock_daily = pro.daily(**query, fields=[
        "ts_code",
        "trade_date",
        "open",
        "high",
        "low",
        "close",
        "vol",
        "amount"
    ])

    # Empty responses are reported but not treated as errors; the merge below still runs
    if stock_daily.empty:
        print(f'{stock_id} stock 行情数据 is empty')
    if stock_daily_basic.empty:
        print(f'{stock_id} stock 每日指标 is empty')
    stock = stock_daily.merge(stock_daily_basic, how='left', on=['ts_code', 'trade_date'])
    return stock

## 情绪词典处理

In [9]:
txt_file = 'Sentiment dict plus.txt'

# Containers for the two polarity classes
positive_words = []
negative_words = []

# Parse the dictionary file: one "<word> <score>" entry per line,
# where score 1 marks a positive word and -1 a negative word
try:
    with open(txt_file , "r", encoding="utf-8") as file:
        for line_num, line in enumerate(file, 1):
            cleaned_line = line.strip()
            if not cleaned_line:
                continue  # skip blank lines

            # Split once on any whitespace (spaces or tabs) into word and score
            parts = cleaned_line.split(maxsplit=1)
            if len(parts) != 2:
                print(f"第 {line_num} 行格式错误：{line}")
                continue

            word, score = parts
            try:
                score = int(score)
            except ValueError:
                print(f"第 {line_num} 行数值格式错误：{score}")
                continue

            # File the word under its polarity; any other score is reported and ignored
            if score == 1:
                positive_words.append(word)
            elif score == -1:
                negative_words.append(word)
            else:
                print(f"第 {line_num} 行发现无效数值：{score}")

except FileNotFoundError:
    # Report the path that was actually attempted
    print(f"错误：未找到文件 {txt_file}")
except Exception as e:
    # Best effort: any other failure is printed and the (possibly partial) lists are kept
    print(f"发生未知错误：{str(e)}")

# Summary: class sizes plus the first three words of each class
print(f"\n积极词汇（共 {len(positive_words)} 个）：")
print(", ".join(positive_words[:3]) + ("..." if len(positive_words) > 3 else ""))

print(f"\n消极词汇（共 {len(negative_words)} 个）：")
print(", ".join(negative_words[:3]) + ("..." if len(negative_words) > 3 else ""))


积极词汇（共 3645 个）：
安定, 安康, 帮助...

消极词汇（共 6177 个）：
败坏名声, 被没收的, 变节...


## 读取个股股评 csv

In [10]:
# Stock codes (6-digit code + exchange suffix) whose post CSVs are loaded by the loop below
stock_id_list = ['000333.SZ', '002594.SZ', '300750.SZ', '600036.SH', '600519.SH', '600900.SH','601318.SH']

In [11]:
# stock code -> number of posts kept after filtering (filled by the loading loop)
stock_df_len = {}


In [12]:
# Placeholder; rebound later to the post DataFrame of the stock being processed
initial_df = pd.DataFrame()

In [13]:
# Empty placeholders, one per stock code.
# NOTE(review): df_000300 is created but never registered in stock_dfs. The loading loop
# rebinds the stock_dfs entries, so these df_* variables themselves stay empty.
df_000333 = pd.DataFrame()
df_002594 = pd.DataFrame()
df_600519 = pd.DataFrame()
df_600900 = pd.DataFrame()
df_601318 = pd.DataFrame()
df_000300 = pd.DataFrame()
df_300750 = pd.DataFrame()
df_600036 = pd.DataFrame()
# stock code -> DataFrame of that stock's posts
stock_dfs = {
    '000333.SZ': df_000333,
    '002594.SZ': df_002594,
    '300750.SZ': df_300750,
    '600519.SH': df_600519,
    '600900.SH': df_600900,
    '601318.SH': df_601318,
    '600036.SH': df_600036
}

In [14]:
# Load each stock's post CSV, build a timestamp, sort chronologically and keep
# only the rows whose URL mentions the 6-digit stock code
for stock_id in stock_id_list:
    stock_id_str = stock_id[:6]
    post_info = f'post_info.post_{stock_id_str}.csv'
    df = pd.read_csv(post_info).dropna()  # rows with any missing field are discarded
    if df.empty:
        print(f'{stock_id} csv empty')
    df = (
        df.assign(post_datetime=pd.to_datetime(df["post_date"] + " " + df["post_time"]))
        .sort_values(by=['post_datetime'], ascending=True)
    )
    stock_dfs[stock_id] = (
        df.loc[lambda posts: posts['post_url'].str.contains(stock_id_str, na=False)]
        .reset_index(drop=True)
        .copy()
    )
    stock_df_len[stock_id] = len(stock_dfs[stock_id])

In [15]:
# Number of posts retained per stock
stock_df_len

{'000333.SZ': 4631,
 '002594.SZ': 1393,
 '300750.SZ': 1640,
 '600036.SH': 5489,
 '600519.SH': 3681,
 '600900.SH': 4438,
 '601318.SH': 4009}

## 个股数据 sentiment整合 full_stock

In [16]:
# Accumulator for the pooled panel; each processed stock's rows are appended further down
full_stock = pd.DataFrame()

In [17]:
# Redefined without '002594.SZ' (which was in the list used for loading the CSVs).
# NOTE(review): no visible cell iterates this list; stock_id is set by hand below.
stock_id_list = ['000333.SZ', '300750.SZ', '600036.SH', '600519.SH', '600900.SH','601318.SH']

In [189]:
# Stock processed by the cells below.
# NOTE(review): the execution counts suggest this value is edited by hand and the
# following cells re-run once per stock — consider a function plus a loop.
stock_id = '601318.SH'

In [190]:
# Posts of the selected stock
initial_df = stock_dfs[stock_id]

In [191]:
# Request market data for exactly the date span covered by the posts (formatted as YYYYMMDD)
start_time = initial_df['post_datetime'].min().strftime('%Y%m%d')
end_time = initial_df['post_datetime'].max().strftime('%Y%m%d')
stock = get_stock_data(stock_id, start_time, end_time)

In [192]:
# Show the requested date span
print(start_time,end_time)

20250303 20250505


In [193]:
# Order chronologically first, then turn trade_date into real timestamps
stock = (
    stock.sort_values(by=['trade_date'], ascending=True)
    .reset_index(drop=True)
    .assign(trade_date=lambda frame: pd.to_datetime(frame['trade_date']))
)

In [194]:
# Work on a copy so the downloaded frame stays untouched
result = stock.copy()

In [195]:
# Daily log return in percent: 100 * ln(close_t / close_{t-1}); the first row has no predecessor and is NaN
result['return'] = np.log(result['close']/result['close'].shift(1))*100

In [196]:
# Quick look at the first rows after adding the return column
result.head(2)

Unnamed: 0,ts_code,trade_date,open,high,low,close,vol,amount,turnover_rate,pe,pb,return
0,601318.SH,2025-03-03,50.55,50.82,50.06,50.33,423160.83,2135730.015,0.3932,10.6989,1.0285,
1,601318.SH,2025-03-04,50.08,50.29,50.0,50.08,372431.63,1867141.471,0.346,10.6458,1.0234,-0.497959


In [197]:
# The missing first-day return is set to 0
result['return'] = result['return'].fillna(0)
# Intraday range relative to the high/low midpoint, in percent
result['volatility'] = (result['high'] - result['low'])/((result['high']+result['low'])/2)*100

In [198]:
# Rename 'return' to 'returns' for the rest of the pipeline
result = result.rename(columns={'return': 'returns'}).copy()

In [199]:
# Winsorized returns (default 1% / 99% tails)
result['returns_new'] = winsorize_series(result['returns'])
# Log of the range ratio (small offset keeps the log finite), z-scored, then scaled by 100
result['log_volatility'] = (
    np.log(result['volatility'] / 100 + 1e-5)
    .pipe(lambda logged: (logged - logged.mean()) / logged.std())
    * 100
)
# log(1 + turnover) computed on the whole column at once
result['turnover_rate_new'] = np.log(1 + result['turnover_rate'])
# Valuation ratios winsorized at the 1% / 99% quantiles
result['pe_new'] = winsorize_series(result['pe'], 0.01, 0.99)
result['pb_new'] = winsorize_series(result['pb'], 0.01, 0.99)

In [200]:
# Rescale volume by 1e6. NOTE(review): not idempotent — re-running this cell divides again.
result['vol'] = result['vol']/1e6 # (millions)

In [201]:
# Inspect the engineered columns
result.head(2)

Unnamed: 0,ts_code,trade_date,open,high,low,close,vol,amount,turnover_rate,pe,pb,returns,volatility,returns_new,log_volatility,turnover_rate_new,pe_new,pb_new
0,601318.SH,2025-03-03,50.55,50.82,50.06,50.33,0.423161,2135730.015,0.3932,10.6989,1.0285,0.0,1.506741,0.0,19.210973,0.331603,10.6989,1.0285
1,601318.SH,2025-03-04,50.08,50.29,50.0,50.08,0.372432,1867141.471,0.346,10.6458,1.0234,-0.497959,0.578323,-0.497959,-173.401584,0.297137,10.6458,1.0234


In [202]:
# Number of trading days available for this stock
len(result)

42

In [203]:
# Keep only the columns needed for scoring and parse post_date into timestamps
df = (
    initial_df
    .drop(columns=['_id', 'post_url', 'post_author', 'post_time'])
    .assign(post_date=lambda posts: pd.to_datetime(posts['post_date']))
)

In [204]:
# Count dictionary hits per post title. The word lists are converted to sets once per
# column so each token lookup is a hash probe instead of a scan over thousands of
# list entries; the resulting counts are identical.
df['pos_num'] = df['post_title'].apply(sentiment_analysis, args=(set(positive_words),))
df['neg_num'] = df['post_title'].apply(sentiment_analysis, args=(set(negative_words),))
# NOTE(review): despite its name, word_num is the title length in characters (len of the
# string), whereas pos_num / neg_num count jieba tokens — confirm this denominator is intended.
df['word_num'] = df['post_title'].apply(len)

In [205]:
# Inspect the scored posts
df.head(2)

Unnamed: 0,post_title,post_view,comment_num,post_date,post_datetime,pos_num,neg_num,word_num
0,没跌几毛钱就拉有意义吗？差价都做不了，要么就直接砸到48,172,0,2025-03-03,2025-03-03 14:47:00,0,0,28
1,不会涨的因为小豆丁买了一千多万必须跌到45再拉开个玩笑而已,232,2,2025-03-03,2025-03-03 15:05:00,1,0,29


In [206]:
# Aggregate post scores into one sentiment value per trade date (windows run 15:00 to 15:00)
result = SentimentIndex(df, result)

In [207]:
# Agreement index 1 - sqrt(1 - s^2), evaluated on the whole column; NaN sentiment stays NaN
result['agreement_index'] = 1 - np.sqrt(1 - result['sentiment_index'] ** 2)

In [208]:
# Discard trade dates without a sentiment value and renumber the rows
result = result.dropna(subset=['sentiment_index']).reset_index(drop=True)

In [209]:
# Previous row's sentiment; the first row has none and is filled with 0.
# NOTE(review): rows without sentiment were dropped just before, so "lag 1" means the
# previous *retained* row, which need not be the previous trading day.
result['sentiment_index_lag_1'] = result['sentiment_index'].shift(1).fillna(0)

In [210]:
# Modelling frame: identifiers, sentiment measures, outcomes and controls
data_processed = result.loc[:, [
    'ts_code', 'trade_date',
    'sentiment_index', 'agreement_index',
    'vol', 'returns_new', 'log_volatility',
    'pe_new', 'pb_new', 'turnover_rate_new',
    'sentiment_index_lag_1',
]].copy()

In [211]:
# Replace the level series by their first differences (the first row becomes NaN)
data_processed = data_processed.assign(**{
    name: data_processed[name].diff()
    for name in ('pe_new', 'pb_new', 'turnover_rate_new', 'vol')
})

In [212]:
# Run ADF + KPSS on each differenced series and print the verdicts
for column in ['pe_new', 'pb_new', 'turnover_rate_new', 'vol']:
    print(f"\n=== {column} 平稳性检验 ===")
    test_stationarity(data_processed[column])


=== pe_new 平稳性检验 ===
ADF Statistic: -6.0425
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.1016
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== pb_new 平稳性检验 ===
ADF Statistic: -6.2046
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.0850
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== turnover_rate_new 平稳性检验 ===
ADF Statistic: -8.8883
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.2602
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳

=== vol 平稳性检验 ===
ADF Statistic: -8.9789
ADF p-value: 0.0000
ADF结论: 序列平稳

KPSS Statistic: 0.3198
KPSS p-value: 0.1000
KPSS结论: 序列趋势平稳


look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())
look-up table. The actual p-value is greater than the p-value returned.

  kpss_result = kpss(series.dropna())


In [213]:
# Preliminary scaling: z-score standardisation
# (returns, volatility and the two sentiment series)
cols_to_standardize = ['returns_new', 'log_volatility','sentiment_index', 'sentiment_index_lag_1']
data_std = (
    data_processed[cols_to_standardize]
    .pipe(lambda block: (block - block.mean()) / block.std())
)


In [214]:
# No columns are min-max scaled at the moment; the list is still used when dropping columns below
cols_to_minmax = []
# data_minmax = data[cols_to_minmax].apply(lambda x: (x - x.min())/(x.max() - x.min()))

In [215]:
# Swap the raw columns for their standardized versions (they end up as the last columns)
data_processed = pd.concat([data_processed.drop(columns=cols_to_standardize+cols_to_minmax),
                           data_std], axis=1)

In [216]:
# Remaining NaNs (e.g. the first row created by differencing) are set to 0
data_processed = data_processed.fillna(0)


In [217]:
# Append this stock's rows to the pooled panel. Duplicated (ts_code, trade_date) pairs are
# collapsed to the most recently appended row, so re-running the cell for the same stock
# replaces its rows instead of stacking a second copy.
full_stock = (
    pd.concat([full_stock, data_processed], ignore_index = True)
    .drop_duplicates(subset=['ts_code', 'trade_date'], keep='last')
    .reset_index(drop=True)
)

In [218]:
# Rows accumulated in the pooled panel so far
len(full_stock)

331

In [219]:
# Rows contributed by the current stock
len(data_processed)

40

In [220]:
# Inspect the pooled panel
full_stock.head(2)

Unnamed: 0,ts_code,trade_date,agreement_index,vol,pe_new,pb_new,turnover_rate_new,returns_new,log_volatility,sentiment_index,sentiment_index_lag_1
0,600036.SH,2024-10-16,8e-05,0.0,0.0,0.0,0.0,1.846326,1.79256,1.020653,-0.015369
1,600036.SH,2024-10-17,1.9e-05,-0.163626,-0.1738,-0.0265,-0.05466,-1.964721,1.62925,-0.519892,1.017939


In [221]:
# Any NaN left in the pooled panel is replaced by 0
full_stock = full_stock.fillna(0)

In [222]:
# Per-column count of missing values (expected to be all zeros after the fill above)
missing_values = full_stock.isnull().sum()
missing_values

ts_code                  0
trade_date               0
agreement_index          0
vol                      0
pe_new                   0
pb_new                   0
turnover_rate_new        0
returns_new              0
log_volatility           0
sentiment_index          0
sentiment_index_lag_1    0
dtype: int64

In [52]:
# Inspect the per-stock modelling frame (shows whichever stock was processed last)
data_processed.head(2)

Unnamed: 0,ts_code,trade_date,agreement_index,vol,pe_new,pb_new,turnover_rate_new,returns_new,log_volatility,sentiment_index,sentiment_index_lag_1
0,600036.SH,2024-10-16,8e-05,0.0,0.0,0.0,0.0,1.846326,1.79256,1.020653,-0.015369
1,600036.SH,2024-10-17,1.9e-05,-0.163626,-0.1738,-0.0265,-0.05466,-1.964721,1.62925,-0.519892,1.017939


In [53]:
# Column names available for modelling
data_processed.columns

Index(['ts_code', 'trade_date', 'agreement_index', 'vol', 'pe_new', 'pb_new',
       'turnover_rate_new', 'returns_new', 'log_volatility', 'sentiment_index',
       'sentiment_index_lag_1'],
      dtype='object')

In [223]:
# Shared design matrix: sentiment, agreement and the differenced controls plus an intercept
X = sm.add_constant(
    data_processed[['sentiment_index', 'agreement_index', 'pe_new', 'pb_new', 'turnover_rate_new']]
)

# Dependent variables: return, (differenced) volume and volatility
y_return, y_volume, y_volatility = (
    data_processed[target] for target in ('returns_new', 'vol', 'log_volatility')
)

# One OLS fit per dependent variable on the same regressors
model_return_ols, model_volume_ols, model_volatility_ols = (
    sm.OLS(outcome, X).fit() for outcome in (y_return, y_volume, y_volatility)
)

# NOTE(review): the correlation table at the end of the notebook reports
# corr(pe_new, returns_new) ≈ 0.9998, so the return regression is close to an identity
# and its R² (0.968 in the saved output) says little about sentiment — confirm pe_new belongs here.
print("对数收益率 OLS 回归结果:")
print(model_return_ols.summary())
print("成交量一阶差分 OLS 回归结果:")
print(model_volume_ols.summary())
print("波动率 OLS 回归结果:")
print(model_volatility_ols.summary())

对数收益率 OLS 回归结果:
                            OLS Regression Results                            
Dep. Variable:            returns_new   R-squared:                       0.968
Model:                            OLS   Adj. R-squared:                  0.963
Method:                 Least Squares   F-statistic:                     204.0
Date:                Tue, 13 May 2025   Prob (F-statistic):           2.48e-24
Time:                        12:23:20   Log-Likelihood:                 12.428
No. Observations:                  40   AIC:                            -12.86
Df Residuals:                      34   BIC:                            -2.723
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 

### 000333.SZ 格兰杰结果

In [66]:
# grangercausalitytests checks whether the SECOND column helps predict the FIRST
# (statsmodels convention), so this tests sentiment_index -> returns_new for lags 1..8.
# NOTE(review): data_processed holds whichever stock was processed last; the heading
# above is only correct if that stock was selected before this cell ran.
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1
ssr based chi2 test:   chi2=0.5711  , p=0.4498  , df=1
likelihood ratio test: chi2=0.5681  , p=0.4510  , df=1
parameter F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2
ssr based chi2 test:   chi2=0.4179  , p=0.8114  , df=2
likelihood ratio test: chi2=0.4163  , p=0.8121  , df=2
parameter F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3
ssr based chi2 test:   chi2=1.6828  , p=0.6408  , df=3
likelihood ratio test: chi2=1.6561  , p=0.6467  , df=3
parameter F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2406  , p=0.9137  , df_d

In [282]:
# Reverse direction: with sentiment_index as the first column, this tests whether
# returns_new (second column) helps predict sentiment_index, for lags 1..5.
test_data = data_processed[['sentiment_index', 'returns_new']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=5)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2.2673  , p=0.1374  , df_denom=60, df_num=1
ssr based chi2 test:   chi2=2.3807  , p=0.1228  , df=1
likelihood ratio test: chi2=2.3368  , p=0.1264  , df=1
parameter F test:         F=2.2673  , p=0.1374  , df_denom=60, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.2502  , p=0.1147  , df_denom=57, df_num=2
ssr based chi2 test:   chi2=4.8951  , p=0.0865  , df=2
likelihood ratio test: chi2=4.7115  , p=0.0948  , df=2
parameter F test:         F=2.2502  , p=0.1147  , df_denom=57, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.5695  , p=0.2074  , df_denom=54, df_num=3
ssr based chi2 test:   chi2=5.3190  , p=0.1499  , df=3
likelihood ratio test: chi2=5.0997  , p=0.1646  , df=3
parameter F test:         F=1.5695  , p=0.2074  , df_denom=54, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.5891  , p=0.1914  , df_d

In [283]:
# Tests whether log_volatility (second column) helps predict sentiment_index (first column), lags 1..5
test_data = data_processed[['sentiment_index', 'log_volatility']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=5)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0723  , p=0.7889  , df_denom=60, df_num=1
ssr based chi2 test:   chi2=0.0759  , p=0.7829  , df=1
likelihood ratio test: chi2=0.0759  , p=0.7829  , df=1
parameter F test:         F=0.0723  , p=0.7889  , df_denom=60, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.5725  , p=0.5673  , df_denom=57, df_num=2
ssr based chi2 test:   chi2=1.2455  , p=0.5365  , df=2
likelihood ratio test: chi2=1.2332  , p=0.5398  , df=2
parameter F test:         F=0.5725  , p=0.5673  , df_denom=57, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=1.2514  , p=0.3003  , df_denom=54, df_num=3
ssr based chi2 test:   chi2=4.2407  , p=0.2366  , df=3
likelihood ratio test: chi2=4.0998  , p=0.2509  , df=3
parameter F test:         F=1.2514  , p=0.3003  , df_denom=54, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.9058  , p=0.4676  , df_d

#### '600519.SH'

In [187]:
# sentiment_index (second column) -> returns_new (first column), lags 1..8.
# NOTE(review): same code as the other per-stock cells; the result depends on which
# stock data_processed held when the cell was executed.
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0037  , p=0.9518  , df_denom=28, df_num=1
ssr based chi2 test:   chi2=0.0041  , p=0.9489  , df=1
likelihood ratio test: chi2=0.0041  , p=0.9489  , df=1
parameter F test:         F=0.0037  , p=0.9518  , df_denom=28, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1206  , p=0.8869  , df_denom=25, df_num=2
ssr based chi2 test:   chi2=0.2894  , p=0.8653  , df=2
likelihood ratio test: chi2=0.2880  , p=0.8659  , df=2
parameter F test:         F=0.1206  , p=0.8869  , df_denom=25, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.0803  , p=0.9700  , df_denom=22, df_num=3
ssr based chi2 test:   chi2=0.3174  , p=0.9567  , df=3
likelihood ratio test: chi2=0.3157  , p=0.9570  , df=3
parameter F test:         F=0.0803  , p=0.9700  , df_denom=22, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2666  , p=0.8958  , df_d

#### '600900.SH'

In [221]:
# sentiment_index (second column) -> returns_new (first column), lags 1..8.
# NOTE(review): same code as the other per-stock cells; the result depends on which
# stock data_processed held when the cell was executed.
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.0499  , p=0.8239  , df_denom=60, df_num=1
ssr based chi2 test:   chi2=0.0524  , p=0.8189  , df=1
likelihood ratio test: chi2=0.0524  , p=0.8189  , df=1
parameter F test:         F=0.0499  , p=0.8239  , df_denom=60, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.3332  , p=0.7180  , df_denom=57, df_num=2
ssr based chi2 test:   chi2=0.7249  , p=0.6960  , df=2
likelihood ratio test: chi2=0.7207  , p=0.6974  , df=2
parameter F test:         F=0.3332  , p=0.7180  , df_denom=57, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.9235  , p=0.4357  , df_denom=54, df_num=3
ssr based chi2 test:   chi2=3.1297  , p=0.3721  , df=3
likelihood ratio test: chi2=3.0521  , p=0.3837  , df=3
parameter F test:         F=0.9235  , p=0.4357  , df_denom=54, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.7471  , p=0.5645  , df_d

#### '601318.SH'

In [255]:
# sentiment_index (second column) -> returns_new (first column), lags 1..8.
# NOTE(review): same code as the other per-stock cells; the result depends on which
# stock data_processed held when the cell was executed.
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2113  , p=0.6485  , df_denom=36, df_num=1
ssr based chi2 test:   chi2=0.2289  , p=0.6323  , df=1
likelihood ratio test: chi2=0.2282  , p=0.6328  , df=1
parameter F test:         F=0.2113  , p=0.6485  , df_denom=36, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.8544  , p=0.4348  , df_denom=33, df_num=2
ssr based chi2 test:   chi2=1.9676  , p=0.3739  , df=2
likelihood ratio test: chi2=1.9184  , p=0.3832  , df=2
parameter F test:         F=0.8544  , p=0.4348  , df_denom=33, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.5625  , p=0.0734  , df_denom=30, df_num=3
ssr based chi2 test:   chi2=9.4813  , p=0.0235  , df=3
likelihood ratio test: chi2=8.4409  , p=0.0377  , df=3
parameter F test:         F=2.5625  , p=0.0734  , df_denom=30, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=1.8992  , p=0.1395  , df_d

#### '000333.SZ'

In [289]:
# sentiment_index (second column) -> returns_new (first column), lags 1..8.
# NOTE(review): same code as the other per-stock cells; the result depends on which
# stock data_processed held when the cell was executed.
test_data = data_processed[['returns_new', 'sentiment_index']].dropna()
granger_test = grangercausalitytests(test_data, maxlag=8)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1
ssr based chi2 test:   chi2=0.5711  , p=0.4498  , df=1
likelihood ratio test: chi2=0.5681  , p=0.4510  , df=1
parameter F test:         F=0.5394  , p=0.4661  , df_denom=51, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2
ssr based chi2 test:   chi2=0.4179  , p=0.8114  , df=2
likelihood ratio test: chi2=0.4163  , p=0.8121  , df=2
parameter F test:         F=0.1892  , p=0.8282  , df_denom=48, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3
ssr based chi2 test:   chi2=1.6828  , p=0.6408  , df=3
likelihood ratio test: chi2=1.6561  , p=0.6467  , df=3
parameter F test:         F=0.4854  , p=0.6941  , df_denom=45, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.2406  , p=0.9137  , df_d

#### 固定效应

In [228]:
# Index the pooled data by (entity, time) for the panel estimators.
# Guarded so that re-running the cell on an already indexed frame is a no-op
# instead of a KeyError on the missing columns.
if not isinstance(full_stock.index, pd.MultiIndex):
    full_stock = full_stock.set_index(['ts_code', 'trade_date'])

In [226]:
# NOTE(review): executed as In [226], i.e. before the set_index cell above (In [228]).
# Run top-to-bottom, this line undoes the (ts_code, trade_date) index that the panel
# models below rely on — TODO remove this cell or move it ahead of the set_index cell.
full_stock = full_stock.reset_index()

In [229]:
# Build the panel view defensively: the estimator needs an (entity, time) MultiIndex, and
# depending on which of the index cells above ran last, full_stock may or may not have one.
panel_data = (
    full_stock
    if isinstance(full_stock.index, pd.MultiIndex)
    else full_stock.set_index(['ts_code', 'trade_date'])
)

# Variables
dependent_var = panel_data['returns_new']  # dependent variable
independent_vars = panel_data[['sentiment_index', 'pe_new', 'pb_new', 'turnover_rate_new']]
independent_vars = sm.add_constant(independent_vars)  # add the intercept

# Fixed-effects model with both entity (stock) and time effects
model_fe = PanelOLS(
    dependent_var,
    independent_vars,
    entity_effects=True,  # stock fixed effects
    time_effects=True     # time fixed effects (optional)
)
results_fe = model_fe.fit()
print(results_fe.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:            returns_new   R-squared:                        0.2413
Estimator:                   PanelOLS   R-squared (Between):          -1.827e+30
No. Observations:                 331   R-squared (Within):               0.2227
Date:                Tue, May 13 2025   R-squared (Overall):              0.2220
Time:                        12:24:21   Log-likelihood                   -245.43
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      15.263
Entities:                           6   P-value                           0.0000
Avg Obs:                       55.167   Distribution:                   F(4,192)
Min Obs:                       13.000                                           
Max Obs:                       127.00   F-statistic (robust):             15.263
                            

#### 随机效应

In [375]:
# Random-effects model on the same dependent variable and regressors as the fixed-effects fit
model_re = RandomEffects(
    dependent_var,
    independent_vars
)
results_re = model_re.fit()
print(results_re.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:            returns_new   R-squared:                        0.2915
Estimator:              RandomEffects   R-squared (Between):          -8.793e+30
No. Observations:                 331   R-squared (Within):               0.2946
Date:                Wed, May 07 2025   R-squared (Overall):              0.2915
Time:                        15:47:30   Log-likelihood                   -409.61
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      33.531
Entities:                           6   P-value                           0.0000
Avg Obs:                       55.167   Distribution:                   F(4,326)
Min Obs:                       13.000                                           
Max Obs:                       127.00   F-statistic (robust):             33.531
                            

In [376]:
# Count entities from the ts_code values; reset_index() exposes them as a column whether
# they currently live in the index or are already a regular column.
n_stocks = full_stock.reset_index()['ts_code'].nunique()
n_vars = independent_vars.shape[1]  # includes the intercept
print(f"个股数量：{n_stocks}，解释变量数：{n_vars}")

个股数量：6，解释变量数：5


In [378]:
# Compare the fixed-effects and random-effects fits side by side
# NOTE(review): `compare` is already imported in the import cell at the top; this import is redundant.
from linearmodels.panel import compare

print(compare({"FE": results_fe, "RE": results_re}))

                     Model Comparison                    
                                     FE                RE
---------------------------------------------------------
Dep. Variable               returns_new       returns_new
Estimator                      PanelOLS     RandomEffects
No. Observations                    331               331
Cov. Est.                    Unadjusted        Unadjusted
R-squared                        0.2413            0.2915
R-Squared (Within)               0.2227            0.2946
R-Squared (Between)          -1.827e+30        -8.793e+30
R-Squared (Overall)              0.2220            0.2915
F-statistic                      15.263            33.531
P-value (F-stat)                 0.0000            0.0000
const                            0.0196            0.0383
                               (0.5322)          (0.8267)
sentiment_index                  0.2512            0.2285
                               (4.9253)          (4.8700)
pe_new        

In [377]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every regressor (the intercept column included)
vif_data = pd.DataFrame({
    "Variable": independent_vars.columns,
    "VIF": [
        variance_inflation_factor(independent_vars.values, position)
        for position in range(independent_vars.shape[1])
    ],
})
print(vif_data)

            Variable       VIF
0              const  1.007170
1    sentiment_index  1.013342
2             pe_new  1.632454
3             pb_new  1.605830
4  turnover_rate_new  1.014184


In [379]:
# Pairwise correlations for the stock currently held in data_processed.
# NOTE(review): in the saved output pe_new and returns_new correlate at ≈ 0.9998, i.e. the
# differenced P/E is almost the return itself — be careful using it as a control for returns.
data_processed[['returns_new', 'log_volatility', 'sentiment_index', 'pb_new', 'pe_new', 'turnover_rate_new']].corr()

Unnamed: 0,returns_new,log_volatility,sentiment_index,pb_new,pe_new,turnover_rate_new
returns_new,1.0,0.591862,0.390926,0.00177,0.999826,0.595639
log_volatility,0.591862,1.0,0.320199,0.058841,0.595513,0.651195
sentiment_index,0.390926,0.320199,1.0,0.239429,0.387644,0.290585
pb_new,0.00177,0.058841,0.239429,1.0,0.012088,-0.230339
pe_new,0.999826,0.595513,0.387644,0.012088,1.0,0.597144
turnover_rate_new,0.595639,0.651195,0.290585,-0.230339,0.597144,1.0
