<h2>导入包</h2>

In [13]:
import logging
from datetime import datetime
import numpy as np
import pandas as pd
from pandas import Series
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from lightgbm import Dataset


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

<h2>读取数据</h2>

In [14]:
x_train = pd.read_csv("../data/A榜-训练集_分布式光伏发电预测_气象变量数据.csv", encoding="gbk")
y_train = pd.read_csv("../data/A榜-训练集_分布式光伏发电预测_实际功率数据.csv", encoding="gbk")
info_train = pd.read_csv("../data/A榜-训练集_分布式光伏发电预测_基本信息.csv", encoding="gbk")
x_test = pd.read_csv("../data/A榜-测试集_分布式光伏发电预测_气象变量数据.csv", encoding="gbk")
y_test = pd.read_csv("../data/submit_example.csv", encoding="utf-8")
info_test = pd.read_csv("../data/A榜-测试集_分布式光伏发电预测_基本信息.csv", encoding="gbk")

In [15]:
x_train = pd.merge(x_train, info_train[["光伏用户编号", "装机容量(kW)", "经度", "纬度"]], how="left", on="光伏用户编号")
x_train["时间"] = pd.to_datetime(x_train["时间"])
x_test = pd.merge(x_test, info_test[["光伏用户编号", "装机容量(kW)", "经度", "纬度"]], how="left", on="光伏用户编号")
x_test["时间"] = pd.to_datetime(x_test["时间"])

In [16]:
y_train = y_train.set_index(["光伏用户编号", "综合倍率", "时间"]).stack().reset_index().rename(columns={0:"target"})
y_train["level_3"] = y_train["level_3"].apply(lambda x: int(x[1:]))
y_train["时间"] = pd.to_datetime(y_train["时间"])
y_train["时间"] = y_train["时间"] + (y_train["level_3"] - 1) * 15 * pd.Timedelta(1, unit="minutes")
y_train = y_train.drop(columns=["level_3"])

y_test = y_test.set_index(["光伏用户编号", "综合倍率", "时间"]).stack().reset_index().rename(columns={0:"target"})
y_test["level_3"] = y_test["level_3"].apply(lambda x: int(x[1:]))
y_test["时间"] = pd.to_datetime(y_test["时间"])
y_test["时间"] = y_test["时间"] + (y_test["level_3"] - 1) * 15 * pd.Timedelta(1, unit="minutes")
y_test = y_test.drop(columns=["level_3"])

In [17]:
df_train = pd.merge(x_train, y_train, on=["光伏用户编号", "时间"], how="left")
df_test = pd.merge(x_test, y_test, on=["光伏用户编号", "时间"], how="left")
df = pd.concat([df_train, df_test], axis=0)

<h2>特征工程</h2>

<h3>时间特征</h3>

In [18]:
df["年"] = df["时间"].dt.year
df["月"] = df["时间"].dt.month
df["日"] = df["时间"].dt.day
df["分"] = df["时间"].dt.minute // 15 + df["时间"].dt.hour * 4

<h3>光伏用户编号</h3>

In [19]:
df["光伏用户编号_"] = df["光伏用户编号"].copy()
df = pd.get_dummies(df, columns=["光伏用户编号_"], prefix_sep="")

<h3>历史值特征</h3>

In [20]:
dfs = []
for site, df_site in df.groupby("光伏用户编号"):
    df_site["辐照强度（J/m2） - 1"] = df_site["辐照强度（J/m2）"].shift(1)
    df_site["辐照强度（J/m2） - 2"] = df_site["辐照强度（J/m2）"].shift(2)
    dfs.append(df_site)
df = pd.concat(dfs, axis=0)

<h3>划分测试集</h3>

In [21]:
df_train = df[df["时间"] <= df_train["时间"].max()]
df_test = df[df["时间"] >= df_test["时间"].min()]

<h2>训练模型</h2>

<h3>评测指标</h3>

In [22]:
def score(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return 1 / (1 + rmse)

<h3>lightgbm模型</h3>

In [23]:
params_lgb = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'mse',
    'metric': 'mse',
    'num_leaves': 64,
    'verbose': -1,
    'seed': 42,
    'n_jobs': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 4,
}

In [24]:
model_lgb = []
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

x = df_train.drop(columns=["光伏用户编号", "时间"]).astype(np.float32)
y = x.pop("target")
mse = 0
for fold, (train_index, val_index) in enumerate(kfold.split(x, y)):
    logging.info(f'############ fold: {fold} ###########')
    x_train, x_val, y_train, y_val = x.iloc[train_index], x.iloc[val_index], y.iloc[train_index], y.iloc[val_index]
    trainset = Dataset(x_train, y_train)
    valset = Dataset(x_val, y_val)
    model = lgb.train(params_lgb, trainset, valid_sets=[trainset, valset], num_boost_round=8000, callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)])
    model.save_model("../models/lgb_%d.txt" % fold)
    model_lgb.append(model)
    val_pred = Series(model.predict(x_val, num_iteration=model.best_iteration), index=y_val.index).fillna(0)
    mse += mean_squared_error(y_val.fillna(0), val_pred)
rmse = np.sqrt(mse)
score = 1 / (1 + rmse)
logging.info(f"--------------本地分数 {score}--------------")

2024-03-08 16:38:22,467 : INFO : ############ fold: 0 ###########


Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0205642	valid_1's l2: 0.0254107
[2000]	training's l2: 0.0156391	valid_1's l2: 0.0227475
[3000]	training's l2: 0.0127196	valid_1's l2: 0.0213536
[4000]	training's l2: 0.0107204	valid_1's l2: 0.0205253
[5000]	training's l2: 0.00923794	valid_1's l2: 0.0200041
[6000]	training's l2: 0.00807261	valid_1's l2: 0.0195981
[7000]	training's l2: 0.00709543	valid_1's l2: 0.0192645
[8000]	training's l2: 0.00633244	valid_1's l2: 0.0190239
Did not meet early stopping. Best iteration is:
[8000]	training's l2: 0.00633244	valid_1's l2: 0.0190239


2024-03-08 16:39:34,931 : INFO : ############ fold: 1 ###########


Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0204672	valid_1's l2: 0.0263589
[2000]	training's l2: 0.0154793	valid_1's l2: 0.0236902
[3000]	training's l2: 0.0125343	valid_1's l2: 0.0223019
[4000]	training's l2: 0.0105182	valid_1's l2: 0.0214586
[5000]	training's l2: 0.00903103	valid_1's l2: 0.0208954
[6000]	training's l2: 0.00785569	valid_1's l2: 0.0204789
[7000]	training's l2: 0.00695066	valid_1's l2: 0.0202028
[8000]	training's l2: 0.00616008	valid_1's l2: 0.0199665
Did not meet early stopping. Best iteration is:
[8000]	training's l2: 0.00616008	valid_1's l2: 0.0199665


2024-03-08 16:40:22,834 : INFO : ############ fold: 2 ###########


Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0207026	valid_1's l2: 0.0248044
[2000]	training's l2: 0.0157271	valid_1's l2: 0.0221588
[3000]	training's l2: 0.0128256	valid_1's l2: 0.020737
[4000]	training's l2: 0.0107956	valid_1's l2: 0.019895
[5000]	training's l2: 0.00934123	valid_1's l2: 0.0193811
[6000]	training's l2: 0.00816371	valid_1's l2: 0.0189939
[7000]	training's l2: 0.00722377	valid_1's l2: 0.0186461
[8000]	training's l2: 0.0064281	valid_1's l2: 0.018402
Did not meet early stopping. Best iteration is:
[8000]	training's l2: 0.0064281	valid_1's l2: 0.018402


2024-03-08 16:41:05,877 : INFO : ############ fold: 3 ###########


Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0206914	valid_1's l2: 0.02517
[2000]	training's l2: 0.0156317	valid_1's l2: 0.0224086
[3000]	training's l2: 0.0127332	valid_1's l2: 0.0211162
[4000]	training's l2: 0.0107501	valid_1's l2: 0.0203588
[5000]	training's l2: 0.0092389	valid_1's l2: 0.0197777
[6000]	training's l2: 0.0080699	valid_1's l2: 0.0193842
[7000]	training's l2: 0.00710109	valid_1's l2: 0.0190671
[8000]	training's l2: 0.00631801	valid_1's l2: 0.0188206
Did not meet early stopping. Best iteration is:
[8000]	training's l2: 0.00631801	valid_1's l2: 0.0188206


2024-03-08 16:42:21,983 : INFO : ############ fold: 4 ###########


Training until validation scores don't improve for 100 rounds
[1000]	training's l2: 0.0207379	valid_1's l2: 0.0249936
[2000]	training's l2: 0.0157035	valid_1's l2: 0.0221958
[3000]	training's l2: 0.0128237	valid_1's l2: 0.0208243
[4000]	training's l2: 0.010751	valid_1's l2: 0.0199443
[5000]	training's l2: 0.00928263	valid_1's l2: 0.0194477
[6000]	training's l2: 0.0081234	valid_1's l2: 0.0190756
[7000]	training's l2: 0.00716806	valid_1's l2: 0.0187613
[8000]	training's l2: 0.00636851	valid_1's l2: 0.0185309
Did not meet early stopping. Best iteration is:
[8000]	training's l2: 0.00636851	valid_1's l2: 0.0185309


2024-03-08 16:43:37,861 : INFO : --------------本地分数 0.7646399860540124--------------


<h2>预测</h2>

In [25]:
x_test = df_test.drop(columns=["光伏用户编号", "时间"]).astype(np.float32)
y_test = x_test.pop("target")
y_pred = np.zeros((df_test.shape[0], ))
for model in model_lgb:
    y_pred += model.predict(x_test, num_iteration=model.best_iteration)
y_pred = y_pred / kfold.n_splits
df_test["target"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [26]:
df_test = df_test[["光伏用户编号", "综合倍率", "年", "月", "日", "分", "target"]]
df_test["时间"] = df_test["年"].astype(str) + "-" + df_test["月"].astype(str) + "-" + df_test["日"].astype(str) + " 0:00"
df_test["分"] = "p" + (df_test["分"] + 1).astype(str)
df_test = df_test.drop(columns=["年", "月", "日"])

In [27]:
result = pd.pivot(df_test, index=["光伏用户编号", "综合倍率", "时间"], columns="分", values="target").reset_index()
result = result[result["综合倍率"].notnull()]
result["综合倍率"] = result["综合倍率"].astype(int)

In [28]:
result.to_csv("../data/%s.csv" % datetime.now().strftime("%Y%m%d_%H%M%S"), encoding="utf-8", index=False)