In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import zscore
from statsmodels.tsa.stattools import grangercausalitytests
import jieba
from snownlp import SnowNLP
import tushare as ts
import akshare as ak
from statsmodels.tsa.stattools import adfuller, coint, kpss, grangercausalitytests
from statsmodels.tsa.vector_ar.vecm import coint_johansen
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects, compare
from linearmodels.iv import IV2SLS
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
def winsorize_series(series, lower=0.01, upper=0.99):
    """
    Winsorize a series by capping its tails at two quantile cut-offs.

    Values below the ``lower`` quantile are replaced by that quantile and
    values above the ``upper`` quantile are replaced by that quantile; all
    other values pass through unchanged. The input is not modified.

    :param series: pandas Series to winsorize (e.g. a return series)
    :param lower: lower-tail quantile (default 0.01, i.e. 1%)
    :param upper: upper-tail quantile (default 0.99, i.e. 99%)
    :return: new Series with extreme values replaced by the boundary values
    """
    # Evaluate both cut-offs on the untouched input, lower first, then upper.
    floor, ceiling = (series.quantile(q) for q in (lower, upper))

    # Cap everything outside [floor, ceiling] at the nearest boundary.
    return series.clip(lower=floor, upper=ceiling)

In [None]:
def stock_processed(result):
    """
    Derive sentiment, return and valuation features from a stock DataFrame.

    The input frame is copied first, so the caller's DataFrame is left
    untouched (no columns are added to it).

    Columns read: ``sentiment_index``, ``close``, ``high``, ``low``,
    ``turnover_rate``, ``pe``, ``pb``.

    Columns added to the returned frame:
      - ``agreement_index``: 1 - sqrt(1 - sentiment_index**2)
      - ``returns``: log return of ``close`` in percent (first row set to 0)
      - ``volatility``: (high - low) divided by the high/low midpoint, in percent
      - ``returns_new``: ``returns`` winsorized at the 1% / 99% quantiles
      - ``log_volatility``: z-score of log(volatility/100 + 1e-5), times 100
      - ``sentiment_index_lag_1``: ``sentiment_index`` of the previous retained
        row (first row set to 0)
      - ``turnover_rate_new``: log(1 + turnover_rate)
      - ``pe_new`` / ``pb_new``: ``pe`` / ``pb`` winsorized at 1% / 99%

    :param result: pandas DataFrame holding the columns listed above
    :return: new DataFrame restricted to rows whose ``sentiment_index`` is not
             missing, with the index reset and the derived columns appended
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    frame = result.copy()

    sentiment = frame['sentiment_index']
    frame['agreement_index'] = 1 - np.sqrt(1 - sentiment ** 2)

    # Returns and volatility are computed on the full price history, i.e.
    # before rows lacking a sentiment value are dropped below.
    close = frame['close']
    frame['return'] = (np.log(close / close.shift(1)) * 100).fillna(0)

    midpoint = (frame['high'] + frame['low']) / 2
    frame['volatility'] = (frame['high'] - frame['low']) / midpoint * 100

    # Keep only rows with a sentiment value; 'return' becomes 'returns'.
    frame = (
        frame[frame['sentiment_index'].notna()]
        .reset_index(drop=True)
        .rename(columns={'return': 'returns'})
    )

    frame['returns_new'] = winsorize_series(frame['returns'])

    # Standardize log-volatility; the 1e-5 offset keeps log() finite when
    # high == low. If the logged series has zero spread the z-score is NaN.
    log_vol = np.log(frame['volatility'] / 100 + 1e-5)
    frame['log_volatility'] = (log_vol - log_vol.mean()) / log_vol.std() * 100

    # The lag is taken after filtering, so it refers to the previous retained
    # row rather than to the previous row of the original frame.
    frame['sentiment_index_lag_1'] = frame['sentiment_index'].shift(1).fillna(0)

    frame['turnover_rate_new'] = np.log(1 + frame['turnover_rate'])
    frame['pe_new'] = winsorize_series(frame['pe'], lower=0.01, upper=0.99)
    frame['pb_new'] = winsorize_series(frame['pb'], lower=0.01, upper=0.99)

    return frame