<h2>导入包</h2>

In [93]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import Dataset

<h2>读取数据</h2>

In [47]:
def _read_gbk(path):
    """Load one GBK-encoded competition CSV into a DataFrame."""
    return pd.read_csv(path, encoding="gbk")


# Training split: weather variables, measured power and per-site basic info.
x_train = _read_gbk("../data/A榜-训练集_分布式光伏发电预测_气象变量数据.csv")
y_train = _read_gbk("../data/A榜-训练集_分布式光伏发电预测_实际功率数据.csv")
info_train = _read_gbk("../data/A榜-训练集_分布式光伏发电预测_基本信息.csv")

# Test split: weather variables and per-site basic info.
x_test = _read_gbk("../data/A榜-测试集_分布式光伏发电预测_气象变量数据.csv")
info_test = _read_gbk("../data/A榜-测试集_分布式光伏发电预测_基本信息.csv")

# The example submission file (UTF-8, unlike the other files) is loaded as y_test.
y_test = pd.read_csv("../data/submit_example.csv", encoding="utf-8")

In [48]:
# Per-site columns (site id, installed capacity, longitude, latitude) attached to every weather row.
site_cols = ["光伏用户编号", "装机容量(kW)", "经度", "纬度"]

x_train = x_train.merge(info_train[site_cols], how="left", on="光伏用户编号")
x_test = x_test.merge(info_test[site_cols], how="left", on="光伏用户编号")

# Convert the timestamp column from text to datetime64.
x_train["时间"] = pd.to_datetime(x_train["时间"])
x_test["时间"] = pd.to_datetime(x_test["时间"])

In [49]:
def wide_power_to_long(wide):
    """Reshape a power table with one column per slot into one row per reading.

    Parameters
    ----------
    wide : pd.DataFrame
        Table with the key columns "光伏用户编号", "综合倍率" and "时间", plus one
        column per intra-day slot. Each slot label is one prefix character
        followed by the 1-based slot number (presumably "p1".."p96" --
        confirm against the raw files).

    Returns
    -------
    pd.DataFrame
        Columns "光伏用户编号", "综合倍率", "时间" and "target". "时间" is the row's
        original timestamp shifted forward by (slot number - 1) * 15 minutes.
    """
    keys = ["光伏用户编号", "综合倍率", "时间"]
    # Name the stacked level and the value column explicitly instead of
    # relying on the auto-generated "level_3" / 0 labels.
    tidy = (
        wide.set_index(keys)
        .stack()
        .rename_axis(keys + ["slot"])
        .reset_index(name="target")
    )
    # Strip the one-character prefix to get the 1-based slot number.
    slot_number = tidy["slot"].str[1:].astype(int)
    offset = pd.to_timedelta((slot_number - 1) * 15, unit="min")
    tidy["时间"] = pd.to_datetime(tidy["时间"]) + offset
    return tidy.drop(columns=["slot"])


# The same reshape is applied to the training labels and to the submission template.
y_train = wide_power_to_long(y_train)
y_test = wide_power_to_long(y_test)

In [50]:
# Left-join the power readings onto the weather rows by site and timestamp;
# weather rows with no matching reading end up with NaN in the joined columns.
join_keys = ["光伏用户编号", "时间"]
df_train = x_train.merge(y_train, on=join_keys, how="left")
df_test = x_test.merge(y_test, on=join_keys, how="left")

<h2>特征工程</h2>

<h3>时间特征</h3>

In [51]:
def add_time_features(frame):
    """Return a new frame with calendar features in place of the "时间" column.

    Adds "年" (year), "月" (month), "日" (day of month) and "分", the index of
    the 15-minute slot within the day (hour * 4 + minute // 15), then drops
    the datetime column "时间". The input frame itself is not modified.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a datetime64 column named "时间".

    Returns
    -------
    pd.DataFrame
        ``frame`` plus the four feature columns, minus "时间".
    """
    stamp = frame["时间"].dt
    features = {
        "年": stamp.year,
        "月": stamp.month,
        "日": stamp.day,
        "分": stamp.minute // 15 + stamp.hour * 4,
    }
    return frame.assign(**features).drop(columns=["时间"])


# One shared helper keeps the train and test feature sets identical.
df_train = add_time_features(df_train)
df_test = add_time_features(df_test)

<h2>训练模型</h2>

In [52]:
# LightGBM training parameters.
params_lgb = dict(
    learning_rate=0.02,
    boosting_type='gbdt',
    objective='mse',        # squared-error regression
    metric='mse',           # shows up as "l2" in the training log
    num_leaves=64,
    verbose=-1,
    seed=42,
    n_jobs=-1,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=4,
)

In [53]:
# Train one LightGBM regressor per site ("光伏用户编号") and save each booster to disk.
model_lgb = {}
for site, df in df_train.groupby("光伏用户编号"):
    # Rows without a power reading have no label and cannot be used for fitting.
    df = df.dropna(subset=["target"])
    df = df.drop(columns=["光伏用户编号"]).astype(np.float32)
    y = df.pop("target")
    # Use loop-local names for the split. The original reused x_train / y_train,
    # silently replacing the notebook-level frames loaded in the data cells.
    # NOTE(review): this is a random 80/20 row split, not a chronological one.
    x_fit, x_val, y_fit, y_val = train_test_split(df, y, test_size=0.2, random_state=42)
    trainset = Dataset(x_fit, y_fit)
    # Tie the validation set to the training set, as the Dataset API documents.
    valset = Dataset(x_val, y_val, reference=trainset)
    model_lgb[site] = lgb.train(
        params_lgb,
        trainset,
        valid_sets=[trainset, valset],
        num_boost_round=5000,
        # Stop after 100 rounds without improvement; log metrics every 1000 rounds.
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)],
    )
    model_lgb[site].save_model("../models/lgb_%s.txt" % site)


Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.00839727	valid_1's l2: 0.0188463
[2000]	training's l2: 0.00443911	valid_1's l2: 0.0180348
Early stopping, best iteration is:
[2284]	training's l2: 0.00383934	valid_1's l2: 0.0179457
Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0114937	valid_1's l2: 0.029827
[2000]	training's l2: 0.00612601	valid_1's l2: 0.0283156
[3000]	training's l2: 0.00381735	valid_1's l2: 0.0278973
Early stopping, best iteration is:
[3368]	training's l2: 0.00324705	valid_1's l2: 0.0278525
Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0187837	valid_1's l2: 0.0427794
[2000]	training's l2: 0.0100032	valid_1's l2: 0.041003
Early stopping, best iteration is:
[2645]	training's l2: 0.00718975	valid_1's l2: 0.0407087
Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.00797945	valid_1's l2: 0.0221909
Early stopping, best iter

<h2>预测</h2>

In [54]:
# Fill the "target" column of df_test with each site's model predictions.
for site, df in df_test.groupby("光伏用户编号"):
    if site not in model_lgb:
        # Same exception type as a plain dict lookup, but with a readable message.
        raise KeyError("no trained model for 光伏用户编号 %r" % (site,))
    # Drop the site id and the existing target column; every remaining column
    # is cast to float32 and passed to the model.
    features = df.drop(columns=["光伏用户编号", "target"]).astype(np.float32)
    y_pred = model_lgb[site].predict(features)
    df_test.loc[df_test["光伏用户编号"] == site, "target"] = y_pred

In [55]:
# Keep only the columns needed for the submission. `.copy()` makes this an
# independent frame, so the assignments below do not write into a slice of the
# previous df_test (which triggers pandas' SettingWithCopyWarning).
df_test = df_test[["光伏用户编号", "综合倍率", "年", "月", "日", "分", "target"]].copy()

# Rebuild a day-level timestamp string; month and day are not zero-padded.
# NOTE(review): presumably this matches the date format in submit_example.csv -- confirm.
df_test["时间"] = (
    df_test["年"].astype(str)
    + "-" + df_test["月"].astype(str)
    + "-" + df_test["日"].astype(str)
    + " 0:00"
)

# Turn the numeric slot index into a column label of the form "p<index + 1>".
df_test["分"] = "p" + (df_test["分"] + 1).astype(str)
df_test = df_test.drop(columns=["年", "月", "日"])

In [91]:
# Reshape to one row per (site, multiplier, day) with one column per slot label.
key_cols = ["光伏用户编号", "综合倍率", "时间"]
wide = df_test.pivot(index=key_cols, columns="分", values="target").reset_index()

# pivot orders the slot labels as strings ("p1", "p10", "p11", ..., "p2", ...);
# put them back in numeric order.
# NOTE(review): presumably the submission template lists p1, p2, ... in order -- confirm.
slot_cols = sorted((c for c in wide.columns if c not in key_cols), key=lambda c: int(c[1:]))
result = wide[key_cols + slot_cols]

# Drop rows whose "综合倍率" is missing, then store it as an integer.
# `.copy()` avoids assigning into a filtered slice (SettingWithCopyWarning).
result = result[result["综合倍率"].notnull()].copy()
result["综合倍率"] = result["综合倍率"].astype(int)

In [94]:
# Write the result as UTF-8 CSV, named after the current local time (YYYYmmdd_HHMMSS).
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
result.to_csv("../data/%s.csv" % stamp, encoding="utf-8", index=False)