<h2>导入包</h2>

In [55]:
from datetime import datetime
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import Dataset
import time
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
from matplotlib.pyplot import plot, show, title

<h2>读取数据</h2>

In [56]:
df_train = pd.read_csv('../data/A榜-训练集_海上风电预测_气象变量及实际功率数据.csv', encoding='gbk')
df_test = pd.read_csv('../data/A榜-测试集_海上风电预测_气象变量数据.csv', encoding='gbk')
info_train = pd.read_csv('../data/A榜-训练集_海上风电预测_基本信息.csv', encoding='gbk')
info_test = pd.read_csv('../data/A榜-测试集_海上风电预测_基本信息.csv', encoding='gbk')

In [57]:
df_train = pd.merge(df_train, info_train[["站点编号", "装机容量(MW)"]], how="left", on="站点编号")
df_test = pd.merge(df_test, info_test[["站点编号", "装机容量(MW)"]], how="left", on="站点编号")

<h2>特征工程</h2>

<h3>处理空值</h3>

In [58]:
df_train = df_train.replace("<NULL>", np.nan)
df_test = df_test.replace("<NULL>", np.nan)

<h3>时间特征</h3>

In [59]:
df_train["时间"] = pd.to_datetime(df_train["时间"])
df_train["年"] = df_train["时间"].dt.year
df_train["月"] = df_train["时间"].dt.month
df_train["日"] = df_train["时间"].dt.day
df_train["分"] = (df_train["时间"].dt.hour * 60 + df_train["时间"].dt.minute) // 15
df_train = df_train.drop(columns=["时间"])

In [60]:
df_test["时间"] = pd.to_datetime(df_test["时间"])
df_test["年"] = df_test["时间"].dt.year
df_test["月"] = df_test["时间"].dt.month
df_test["日"] = df_test["时间"].dt.day
df_test["分"] = (df_test["时间"].dt.hour * 60 + df_test["时间"].dt.minute) // 15
df_test = df_test.drop(columns=["时间"])

<h2>逐个建模</h2>

<h3>lightgbm超参数</h3>

In [61]:
params_lgb = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'mse',
    'metric': 'mse',
    'num_leaves': 64,
    'verbose': -1,
    'seed': 42,
    'n_jobs': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 4,
}

<h3>分组训练</h3>

In [62]:
model_lgb = dict()
for site, df in df_train.groupby("站点编号"):
    df = df.drop(columns=["站点编号"]).astype(np.float32)
    y = df["出力(MW)"]
    x = df.drop(columns=["出力(MW)"])
    x["10米风速（10m/s） - 1"] = x["10米风速（10m/s）"].shift(1)
    x["100m风速（100m/s） - 1"] = x["100m风速（100m/s）"].shift(1)
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)
    trainset = Dataset(x_train, y_train)
    valset = Dataset(x_val, y_val)
    model_lgb[site] = lgb.train(params_lgb, trainset, valid_sets=[trainset, valset], num_boost_round=5000, callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)])
    model_lgb[site] = lgb.train(params_lgb, Dataset(x, y), num_boost_round=model_lgb[site].best_iteration)
    model_lgb[site].save_model("../models/lgb_%s.txt" % site)

Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 6.3883	valid_1's l2: 10.5232
[2000]	training's l2: 3.13315	valid_1's l2: 7.44521
[3000]	training's l2: 1.89543	valid_1's l2: 6.21733
[4000]	training's l2: 1.26768	valid_1's l2: 5.57674
[5000]	training's l2: 0.900339	valid_1's l2: 5.16667
Did not meet early stopping. Best iteration is:
[5000]	training's l2: 0.900339	valid_1's l2: 5.16667
Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 272.705	valid_1's l2: 436.153
[2000]	training's l2: 128.064	valid_1's l2: 300.159
[3000]	training's l2: 74.1102	valid_1's l2: 245.881
[4000]	training's l2: 48.2455	valid_1's l2: 217.762
[5000]	training's l2: 33.1708	valid_1's l2: 200.184
Did not meet early stopping. Best iteration is:
[5000]	training's l2: 33.1708	valid_1's l2: 200.184
Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 6.9709	valid_1's l2: 11.3418
[2000]	training's l2: 3.20898	valid_1's l2

<h2>预测</h2>

In [63]:
submit_file = pd.read_csv("../data/A_submit_example.csv", encoding="utf-8")
for site, df in df_test.groupby("站点编号"):
    df = df.drop(columns=["站点编号"]).astype(np.float32)
    df["10米风速（10m/s） - 1"] = df["10米风速（10m/s）"].shift(1)
    df["100m风速（100m/s） - 1"] = df["100m风速（100m/s）"].shift(1)
    y_pred = model_lgb[site].predict(df)
    submit_file.loc[submit_file["站点编号"] == site, "出力(MW)"] = y_pred

In [64]:
submit_file.to_csv("../data/%s.csv" % datetime.now().strftime("%Y%m%d_%H%M%S"), encoding="utf-8", index=False)