In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from scipy import stats

# Load the merged dataset.
df = pd.read_csv('/root/Download/Modis-algae-aliyun/AlgaeBloomForecast-20241107/merged_all_data_filtered.csv')

# Columns whose correlation with the features is analysed.
target_cols = [
    'density_mean_y', 'density_lower_y', 'density_upper_y',
    'chla_mean', 'chla_lower', 'chla_upper',
]

# Candidate features: every numeric column except the targets, the date
# column and the density_*_x columns.
exclude_cols = [*target_cols, 'date', 'density_mean_x', 'density_lower_x', 'density_upper_x']
numeric_cols = df.select_dtypes(include=[np.number]).columns
feature_cols = numeric_cols.difference(exclude_cols)

# 1. Spearman rank correlation over features and targets together.
analysis_cols = [*feature_cols, *target_cols]
spearman_corr = df[analysis_cols].corr(method='spearman')

# 2. Mutual-information score of every feature against every target.
# Each feature is scored in its own call, with a fixed random_state,
# giving mi_scores[target][feature] -> float.
mi_scores = {
    target: {
        feature: mutual_info_regression(df[[feature]], df[target], random_state=42)[0]
        for feature in feature_cols
    }
    for target in target_cols
}

# 3. Lagged correlation analysis.
# For every (target, feature) pair and every lag in 1..max_lag, the feature
# is shifted down by `lag` rows and correlated with the unshifted target.
# Result layout: lag_analysis[target][feature][metric] is a list whose
# element i belongs to lag i + 1.
max_lag = 15
lag_analysis = {}

for target in target_cols:
    lag_analysis[target] = {}
    target_series = df[target]

    for feature in feature_cols:
        metrics = {
            'pearson': [],
            'spearman': [],
            'kendall': [],
            'granger_causality': []
        }
        lag_analysis[target][feature] = metrics

        for lag in range(1, max_lag + 1):
            # Shift once per lag and reuse it for every metric.
            lagged_feature = df[feature].shift(lag)

            # pandas drops incomplete pairs itself for these three.
            pearson = lagged_feature.corr(target_series)
            spearman = lagged_feature.corr(target_series, method='spearman')
            kendall = lagged_feature.corr(target_series, method='kendall')

            # NOTE: despite the key name this is NOT a Granger causality test;
            # it is the Pearson r of the lagged feature against the target,
            # computed with scipy. The key is kept so existing consumers of
            # lag_analysis keep working.
            # Pair the two series on the index and drop incomplete rows so a
            # NaN inside either column can neither misalign the pairs nor
            # produce arrays of different length.
            pairs = pd.concat(
                [lagged_feature, target_series], axis=1, keys=['feature', 'target']
            ).dropna()
            if len(pairs) > lag:
                granger_test = stats.pearsonr(pairs['feature'], pairs['target'])[0]
            else:
                # Too few complete observations for this lag.
                granger_test = np.nan

            metrics['pearson'].append(pearson)
            metrics['spearman'].append(spearman)
            metrics['kendall'].append(kendall)
            metrics['granger_causality'].append(granger_test)

# 4. Composite feature-importance score.
# score = 0.4 * |Spearman| + 0.3 * mutual information + 0.3 * strongest
# lagged |Spearman| (NaN lags are ignored; 0 when every lag is NaN).
feature_importance = {}
for target in target_cols:
    target_scores = {}

    for feature in feature_cols:
        lagged_magnitudes = [
            abs(value)
            for value in lag_analysis[target][feature]['spearman']
            if not np.isnan(value)
        ]
        strongest_lagged = max(lagged_magnitudes, default=0)

        target_scores[feature] = (
            0.4 * abs(spearman_corr.loc[feature, target])
            + 0.3 * mi_scores[target][feature]
            + 0.3 * strongest_lagged
        )

    feature_importance[target] = target_scores

# Report the ten highest-scoring features for every target.
print("关键影响因子分析结果:")
for target in target_cols:
    print(f"\n目标变量 {target} 的top 10关键影响因子:")

    # Rank by composite score, highest first. A NaN score cannot be ordered
    # meaningfully, so it is pushed to the end instead of corrupting the sort.
    sorted_features = sorted(
        feature_importance[target].items(),
        key=lambda item: -np.inf if np.isnan(item[1]) else item[1],
        reverse=True,
    )[:10]

    for feature, score in sorted_features:
        # np.argmax treats NaN as the maximum, so an undefined lag correlation
        # would be reported as the "best" lag. Map NaN to -1 (below any real
        # magnitude) so only defined correlations can win; if every lag is
        # NaN the result falls back to lag 1.
        lag_magnitudes = [
            -1.0 if np.isnan(x) else abs(x)
            for x in lag_analysis[target][feature]['spearman']
        ]
        best_lag = np.argmax(lag_magnitudes) + 1
        print(f"特征: {feature}")
        print(f"- 综合重要性得分: {score:.3f}")
        print(f"- Spearman相关系数: {spearman_corr.loc[feature, target]:.3f}")
        print(f"- 最佳滞后期: {best_lag}天")
        print(f"- 互信息得分: {mi_scores[target][feature]:.3f}")
        print()

# Weather feature analysis: reload the data and index it by parsed date.
df = pd.read_csv('merged_all_data_filtered.csv')
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

def split_weather(weather):
    """Split a '-'-separated weather description into its parts.

    Returns an empty list for a missing value (as judged by ``pd.isna``);
    otherwise the value is converted to ``str`` and split on '-'.
    """
    return [] if pd.isna(weather) else str(weather).split('-')

# Tokenise every weather string exactly once; split_weather returns [] for
# missing values, so those rows contribute no type and get 0 in every
# indicator column below.
weather_tokens = df['weather'].apply(split_weather)

all_weather_types = set()
for tokens in weather_tokens:
    all_weather_types.update(tokens)

# One 0/1 indicator column per weather type. Iterating in sorted order keeps
# the column order (and therefore tie-breaking in later rankings) identical
# from run to run; raw set iteration order is not stable for strings.
for weather_type in sorted(all_weather_types):
    df[f'weather_{weather_type}'] = weather_tokens.apply(
        lambda tokens, wanted=weather_type: 1 if wanted in tokens else 0
    )

weather_features = [col for col in df.columns if col.startswith('weather_')]

# Apply the same correlation analysis to the weather indicator columns.
# weather_importance[target][feature] holds the unlagged Spearman
# correlation, the largest absolute lagged Spearman correlation over lags
# 1..max_lag (NaN counted as 0), and the lag at which it occurs.
weather_importance = {}
for target in target_cols:
    per_feature = {}

    for feature in weather_features:
        base_corr = df[feature].corr(df[target], method='spearman')

        lag_magnitudes = []
        for lag in range(1, max_lag + 1):
            lagged_value = df[feature].shift(lag).corr(df[target], method='spearman')
            lag_magnitudes.append(0 if np.isnan(lagged_value) else abs(lagged_value))

        per_feature[feature] = {
            'correlation': base_corr,
            'max_lag_correlation': max(lag_magnitudes),
            'best_lag': np.argmax(lag_magnitudes) + 1
        }

    weather_importance[target] = per_feature

# Report the five weather types most strongly correlated with each target.
print("\n天气因素影响分析结果:")
for target in target_cols:
    print(f"\n目标变量 {target} 的主要天气影响因子:")

    # Rank by absolute unlagged correlation, strongest first. A NaN
    # correlation (e.g. a constant indicator column) cannot be compared, so
    # it is ranked below every defined value instead of leaving the sort
    # order arbitrary.
    sorted_weather = sorted(
        weather_importance[target].items(),
        key=lambda item: (
            -1.0 if np.isnan(item[1]['correlation']) else abs(item[1]['correlation'])
        ),
        reverse=True,
    )[:5]

    for feature, metrics in sorted_weather:
        print(f"天气类型: {feature.replace('weather_','')}")
        print(f"- 相关系数: {metrics['correlation']:.3f}")
        print(f"- 最佳滞后期: {metrics['best_lag']}天")
        print(f"- 最大滞后相关: {metrics['max_lag_correlation']:.3f}")
        print()


关键影响因子分析结果:

目标变量 density_mean_y 的top 10关键影响因子:
特征: temperature
- 综合重要性得分: 0.533
- Spearman相关系数: 0.561
- 最佳滞后期: 7天
- 互信息得分: 0.450

特征: min_temperature
- 综合重要性得分: 0.476
- Spearman相关系数: 0.546
- 最佳滞后期: 4天
- 互信息得分: 0.282

特征: max_temperature
- 综合重要性得分: 0.469
- Spearman相关系数: 0.533
- 最佳滞后期: 3天
- 互信息得分: 0.267

特征: oxygen
- 综合重要性得分: 0.426
- Spearman相关系数: -0.440
- 最佳滞后期: 15天
- 互信息得分: 0.329

特征: TN
- 综合重要性得分: 0.412
- Spearman相关系数: -0.442
- 最佳滞后期: 4天
- 互信息得分: 0.316

特征: conductivity
- 综合重要性得分: 0.411
- Spearman相关系数: -0.412
- 最佳滞后期: 5天
- 互信息得分: 0.401

特征: pH
- 综合重要性得分: 0.328
- Spearman相关系数: 0.335
- 最佳滞后期: 1天
- 互信息得分: 0.306

特征: aqi
- 综合重要性得分: 0.256
- Spearman相关系数: -0.351
- 最佳滞后期: 1天
- 互信息得分: 0.078

特征: aqiLevel
- 综合重要性得分: 0.247
- Spearman相关系数: -0.330
- 最佳滞后期: 1天
- 互信息得分: 0.095

特征: TP
- 综合重要性得分: 0.241
- Spearman相关系数: 0.272
- 最佳滞后期: 1天
- 互信息得分: 0.216


目标变量 density_lower_y 的top 10关键影响因子:
特征: temperature
- 综合重要性得分: 0.447
- Spearman相关系数: 0.465
- 最佳滞后期: 3天
- 互信息得分: 0.382

特征: min_temperature
- 综合重要性得分: