In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import config
from powerutils import data_processing as dp
from powerutils import data_visualization as dv
from powerutils.accuracy import national_accuracy, southern_accuracy

# Use a CJK font for all figure text; presumably needed so the Chinese column
# names used as labels render correctly (the font must be installed locally).
plt.rcParams['font.family'] = 'Source Han Sans SC'
# Draw figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format = 'retina'

## 数据完整性

In [None]:
# Read the merged training table; the first CSV column becomes the index
# and is parsed as dates.
filepath = config.dirpath_merge / 'train_v1.csv'
df = pd.read_csv(
    str(filepath),
    index_col=0,
    parse_dates=True,
)

# Report on the time index, then on the frame as a whole.
# NOTE(review): the exact report contents are defined in powerutils.data_processing.
dp.print_time_index(df.index)
dp.describe(df)

In [None]:
# Visualise missing values in the loaded frame.
# NOTE(review): the plot type is defined in powerutils.data_visualization — confirm there.
dv.plot_missing(df)

## 时间序列

In [None]:
# Overview plot of the series selected by each keyword.
# NOTE(review): with filter=True the keys are presumably matched against column
# names as keywords — confirm in powerutils.data_visualization.
# Both bounds are None here; set them to restrict the plotted period
# (None presumably means "no bound" — confirm).
window_start, window_end = None, None
dv.plot_multi_timeseries(
    df=df,
    keys=['功率', '风速', '气温', '气压', '湿度'],
    start_time=window_start,
    end_time=window_end,
    filter=True,
)

In [5]:
# Manual cleaning template: set '实际功率' to NaN over a label range of the index.
# Both bounds are empty placeholders; fill them in before running
# (.loc label slices include both endpoints).
mask_start, mask_end = '', ''
df.loc[mask_start:mask_end, '实际功率'] = np.nan

## 功率异常时段

In [7]:
# Count consecutive zeros in '实际功率' and put the result on a regular
# 15-minute grid (asfreq inserts NaN for timestamps that are missing).
# NOTE(review): the counting semantics come from dp.count_consecutive_zeros — confirm there.
# '15min' replaces the alias '15T', which pandas deprecated in 2.2; both mean
# 15 minutes and '15min' is accepted by older versions as well.
count = dp.count_consecutive_zeros(df['实际功率']).asfreq('15min')
# Trailing semicolon keeps the Axes repr out of the cell output.
count.plot(figsize=(10, 4), xlabel='');

In [None]:
# Power (actual vs. short-term forecast) against wind speed (measured vs. forecast)
# on twin axes, for inspecting suspicious periods.
# Both bounds are None here; set them to zoom in on a period
# (None presumably means "no bound" — confirm in powerutils).
window_start, window_end = None, None
dv.plot_twin_timeseries(
    df=df,
    left_key=['实际功率', '短期预测功率'],
    right_key=['实测风速', '预报风速'],
    start_time=window_start,
    end_time=window_end,
)

In [None]:
# Manual cleaning template: set '实际功率' to NaN over a label range of the index.
# Both bounds are empty placeholders; fill them in before running
# (.loc label slices include both endpoints).
mask_start, mask_end = '', ''
df.loc[mask_start:mask_end, '实际功率'] = np.nan

## 相关性

In [None]:
# Correlation with '实际功率', shown as a one-column table rounded to 3 decimals.
# NOTE(review): assumes calc_corr returns a Series (it is converted with .to_frame());
# the correlation method itself is defined in powerutils.data_processing — confirm.
dp.calc_corr(df, key='实际功率').to_frame().round(3)

In [None]:
# Keep only the rows where the absolute '组合' correlation exceeds BOTH the
# absolute '左' and the absolute '右' correlation by more than 0.2.
# NOTE(review): the meaning of the three columns comes from dp.combination_corr — confirm.
corr = dp.combination_corr(df, key='实际功率')
acorr = corr.abs()
gain_over_left = acorr['组合'] - acorr['左']
gain_over_right = acorr['组合'] - acorr['右']
cond = (gain_over_left > 0.2) & (gain_over_right > 0.2)
corr.loc[cond].round(3)

## 散点分布

In [None]:
# Pairwise scatter plots of the power and wind-speed columns (small markers).
pair_columns = ['实际功率', '短期预测功率', '预报风速', '实测风速']
dfa = pd.DataFrame({name: df[name] for name in pair_columns})
sns.pairplot(dfa, plot_kws={'s': 5})

## 准确率

In [None]:
# Score the short-term forecast against the actual power with both accuracy
# formulas, using the capacity from config.
# NOTE(review): the time resolution of the returned series is set by powerutils.accuracy — confirm.
y_true = df['实际功率']
y_pred = df['短期预测功率']
accuracy_columns = {
    '国网准确率': national_accuracy(y_true, y_pred, config.cap, positive=True),
    '南网准确率': southern_accuracy(y_true, y_pred, config.cap),
}
df_acc = pd.DataFrame(accuracy_columns)

# Plot both accuracy series with a dashed reference line at 65 and the
# y-axis capped at 100.
ax = df_acc.plot(figsize=(10, 4), xlabel='')
ax.axhline(65, linestyle='--', linewidth=1, color='k')
ax.set_ylim(top=100)

In [None]:
# Monthly summary of the accuracy table.
# Negative accuracies are floored at 0 before averaging.
acc_clipped = df_acc.clip(0)

# 'MS' (month start) groups rows by calendar month exactly like the former 'M'
# (month end). 'M' is deprecated since pandas 2.2 and its replacement 'ME' is
# rejected by older versions, whereas 'MS' works everywhere. The bin labels
# differ only in the day, which the '%Y-%m' formatting below discards.
df_monthly = acc_clipped.resample('MS').mean()
# Number of non-NaN entries of the first accuracy column in each month.
df_monthly['有效天数'] = df_acc.iloc[:, 0].resample('MS').count()
df_monthly.index = df_monthly.index.strftime('%Y-%m')

# Overall mean over all rows (not a mean of the monthly means). The assigned
# Series has no '有效天数' entry, so that cell is NaN in the 'mean' row.
df_monthly.loc['mean'] = acc_clipped.mean()
df_monthly.round(2)

画出准确率最差的几天。

In [None]:
# The 10 lowest '南网准确率' values in ascending order, rounded to 2 decimals
# (sort_values places NaN last, so missing values are not picked first).
df_acc['南网准确率'].sort_values().head(10).round(2)

In [None]:
# Power (actual vs. short-term forecast) against wind speed (measured vs. forecast)
# on twin axes, for inspecting the low-accuracy days listed before this cell.
# Both bounds are None here; set them to zoom in on a period
# (None presumably means "no bound" — confirm in powerutils).
window_start, window_end = None, None
dv.plot_twin_timeseries(
    df=df,
    left_key=['实际功率', '短期预测功率'],
    right_key=['实测风速', '预报风速'],
    start_time=window_start,
    end_time=window_end,
)

In [None]:
# Manual cleaning template: set '实际功率' to NaN over a label range of the index.
# Both bounds are empty placeholders; fill them in before running
# (.loc label slices include both endpoints).
mask_start, mask_end = '', ''
df.loc[mask_start:mask_end, '实际功率'] = np.nan

## 保存修改

In [15]:
# Write the edited frame (including its index) back to the merge directory as CSV.
# The target previously had no extension ('train_v2'), unlike the 'train_v1.csv'
# file this notebook reads; '.csv' is added so the output follows the same naming.
# NOTE(review): update any downstream reader that still opens 'train_v2' without extension.
filepath = config.dirpath_merge / 'train_v2.csv'
df.to_csv(str(filepath))