<h2>导入包</h2>

In [38]:
import logging
import os
import random
from datetime import datetime

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMRegressor, log_evaluation, early_stopping, Dataset
from pandas import Series, DataFrame
from sklearn.model_selection import KFold


# Configure the root logger once: every record is printed as "timestamp : level : message",
# and records at INFO level or above are shown.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [39]:
class Config:
    """Notebook-wide settings, accessed as class attributes (e.g. ``Config.seed``)."""

    # Seed handed to the RNG seeding helper and to the K-fold splitter.
    seed = 2024
    # Directory prefix that is concatenated with the CSV file names.
    path = "../data/"
    # Name of the label column.
    target = '出力(MW)'
    # Number of cross-validation folds (one model is trained per fold).
    num_folds = 12
    
    
# Station id -> installed capacity lookup (presumably in MW, per the name); it stands in
# for the two station-information CSV files.
pos2mw = dict(f1=48, f2=280, f3=48, f4=88, f5=48)

# Fix the random seeds so that results can be reproduced.
def seed_everything(seed):
    """Seed NumPy's global RNG and Python's built-in ``random`` module with ``seed``."""
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
seed_everything(seed=Config.seed)

In [40]:
def feature_engineer(df):
    """Add station, calendar, thermodynamic and wind features to ``df``.

    The frame is modified in place and also returned, so callers may use either.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the station id column '站点编号', the timestamp column '时间'
        and the raw weather columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The same object, with the new feature columns added, remaining gaps
        forward-filled and the '时间' column removed.

    Raises
    ------
    KeyError
        If a station id is not a key of ``pos2mw``.
    """
    # Installed capacity of the station. Unknown ids fail loudly here; a silent NaN
    # would otherwise be papered over by the forward fill at the end.
    unknown_stations = set(df['站点编号'].unique()) - set(pos2mw)
    if unknown_stations:
        raise KeyError(f"unknown station id(s): {sorted(map(str, unknown_stations))}")
    df['站点装机容量'] = df['站点编号'].map(pos2mw)

    # Calendar features from the timestamp (day-of-month is not used as a feature).
    timestamps = pd.to_datetime(df['时间'])
    df['时间'] = timestamps
    df['year'] = timestamps.dt.year
    df['month'] = timestamps.dt.month
    df['hour'] = timestamps.dt.hour
    df['minute'] = timestamps.dt.minute

    # Pressure divided by absolute temperature; must be computed before the Kelvin
    # column is overwritten below.
    df['比大气压'] = df['气压(Pa）'] / df['温度（K）']
    # Convert the temperature to degrees Celsius. The column keeps its original name.
    df['温度（K）'] = df['温度（K）'] - 273.15
    # Clamp relative humidity to [1, 99] percent. `clip` leaves NaN untouched, so a
    # missing reading is handled by the forward fill instead of being turned into 1.
    df['相对湿度（%）'] = df['相对湿度（%）'].clip(lower=1, upper=99)

    # Approximate dew point (°C) = temperature (°C) - (100 - relative humidity (%)) / 5
    df['露点温度'] = df['温度（K）'] - (100 - df['相对湿度（%）']) / 5

    # Wind speed scaled by the sine / cosine of the wind direction, at both heights.
    wind_pairs = (
        ('100m风速（100m/s）', '100m风向（°)'),
        ('10米风速（10m/s）', '10米风向（°)'),
    )
    for speed_col, direction_col in wind_pairs:
        angle = np.pi * df[direction_col] / 180
        df['sin_' + speed_col] = df[speed_col] * np.sin(angle)
        df['cos_' + speed_col] = df[speed_col] * np.cos(angle)

    # Coarse 90-degree direction buckets and whether both heights fall in the same one.
    # NOTE(review): the +1 offset makes the buckets [-1, 89), [89, 179), ..., so
    # directions of 359 degrees and above land in a fifth bucket (4) - confirm intent.
    df['100m风向（°)类别'] = (df['100m风向（°)'] + 1) // 90
    df['10米风向（°)类别'] = (df['10米风向（°)'] + 1) // 90
    df['10米风向（°)_100m风向（°)'] = (df['100m风向（°)类别'] == df['10米风向（°)类别'])

    # One boolean indicator column per observed bucket. NaN buckets are skipped:
    # `== NaN` is never true, so such a column would be all False.
    for col in ['100m风向（°)类别', '10米风向（°)类别']:
        for value in df[col].dropna().unique():
            df[col + "_" + str(value)] = (df[col] == value)

    # Time-series data: a missing value is filled with the value of the previous row.
    # NOTE(review): the fill runs over the whole frame in row order, so it can carry
    # values across station boundaries and into rows that have no label.
    df.ffill(inplace=True)

    df.drop(columns=['时间'], inplace=True)

    return df

<h2>处理数据</h2>

<h3>训练集</h3>

In [41]:
# Load the labelled training set (GBK-encoded CSV).
train_df = pd.read_csv(Config.path+"A榜-训练集_海上风电预测_气象变量及实际功率数据.csv", encoding='gbk')
logging.info(f"len(train_df): {len(train_df)}")
# Missing targets are stored as the literal string '<NULL>': turn them into NaN and cast to float.
train_df[Config.target] = train_df[Config.target].replace('<NULL>', np.nan).astype(float)
# Fill missing targets by linear interpolation, separately for each station. Interpolating
# over the whole stacked column would blend the last value of one station with the first
# value of the next. limit_direction='both' also fills gaps at the very start of a station
# from that station's own first valid value.
train_df[Config.target] = (
    train_df.groupby('站点编号')[Config.target]
    .transform(lambda station_target: station_target.interpolate(limit_direction='both'))
)
train_df.head()

2024-03-12 15:30:48,108 : INFO : len(train_df): 231840


Unnamed: 0,站点编号,时间,气压(Pa）,相对湿度（%）,云量,10米风速（10m/s）,10米风向（°),温度（K）,辐照强度（J/m2）,降水（m）,100m风速（100m/s）,100m风向（°),出力(MW)
0,f1,2022-1-3 0:00,102249.6094,74.8513,0.007812,7.7041,26.5195,286.0695,0.0,8e-06,9.082,27.5093,17.26
1,f1,2022-1-3 0:15,102252.0355,74.753,0.000924,7.771,23.5766,285.8647,0.0,8e-06,9.1374,24.7151,16.78
2,f1,2022-1-3 0:30,102248.59,74.4995,0.003009,7.8272,21.5451,285.6935,0.0,8e-06,9.1856,22.7445,16.25
3,f1,2022-1-3 0:45,102240.4725,74.1432,0.011402,7.8637,20.2394,285.5512,0.0,8e-06,9.2158,21.4374,15.37
4,f1,2022-1-3 1:00,102228.8828,73.7366,0.023438,7.8781,19.487,285.433,0.0,8e-06,9.2237,20.6417,15.64


<h3>测试集</h3>

In [42]:
# Load the test set (GBK-encoded CSV) and log how many rows it has.
test_df = pd.read_csv(
    filepath_or_buffer=Config.path + "A榜-测试集_海上风电预测_气象变量数据.csv",
    encoding="gbk",
)
logging.info(f"len(test_df): {test_df.shape[0]}")
test_df.head()

2024-03-12 15:30:48,221 : INFO : len(test_df): 44160


Unnamed: 0,站点编号,时间,气压(Pa）,相对湿度（%）,云量,10米风速（10m/s）,10米风向（°),温度（K）,辐照强度（J/m2）,降水（m）,100m风速（100m/s）,100m风向（°)
0,f1,2023-5-1 0:00,101309.625,84.3487,0.67969,4.7181,18.4066,290.0291,0.0,0.003588,5.5467,20.4047
1,f1,2023-5-1 0:15,101303.259,84.48,0.65654,4.7575,18.2344,290.003,0.0,0.000739,5.5886,20.1468
2,f1,2023-5-1 0:30,101291.8681,84.1823,0.61713,4.7772,16.0279,289.9786,0.0,0.000641,5.6422,17.9834
3,f1,2023-5-1 0:45,101277.4571,83.5966,0.58752,4.7852,12.8178,289.9495,0.0,0.002015,5.6978,14.9278
4,f1,2023-5-1 1:00,101262.0313,82.8641,0.59375,4.7795,9.6014,289.9096,0.0,0.003588,5.7344,11.924


In [43]:
# Disabled exploratory step, kept for reference: Pearson correlation of every column with
# the target, collecting columns with |r| <= 0.01 into drop_cols.
# NOTE(review): presumably the source of the hard-coded list below - confirm.
# #Pearson correlation coefficient between two variables
# def pearson_corr(x1,x2):
#     """
#     x1,x2:np.array
#     """
#     mean_x1=np.mean(x1)
#     mean_x2=np.mean(x2)
#     std_x1=np.std(x1)
#     std_x2=np.std(x2)
#     pearson=np.mean((x1-mean_x1)*(x2-mean_x2))/(std_x1*std_x2)
#     return pearson
# drop_cols=[]
# for col in train_df.drop([Config.target],axis=1).columns:
#     pearson=pearson_corr(train_df[col].values,train_df[Config.target].values)
#     print(f"col:{col},pearson_corr:{pearson}")
#     if abs(pearson)<=0.01:  # a feature with essentially no correlation is simply dropped
#         drop_cols+=[col]
# Columns that are removed from the model inputs later in the notebook.
drop_cols = ['相对湿度（%）', 'year']
logging.info(f"删除列: {drop_cols}")
# NOTE(review): this logs the column count of the raw test frame, i.e. before any
# engineered feature has been added.
logging.info(f"总特征数: {len(test_df.columns)}")

2024-03-12 15:30:48,239 : INFO : 删除列: ['相对湿度（%）', 'year']
2024-03-12 15:30:48,239 : INFO : 总特征数: 12


<h2>特征工程</h2>

<h3>基础特征工程</h3>

In [44]:
# Stack train and test so both receive identical feature engineering.
# ignore_index=True gives every row a unique label; without it the labels of the test
# rows repeat those of the first train rows, which makes label-based alignment ambiguous.
total_df = pd.concat((train_df, test_df), axis=0, ignore_index=True)
total_df = feature_engineer(total_df)

<h3>gap特征</h3>

In [45]:
# Lagged-difference ("gap") features: for each weather column and each lag, the value
# `gap` rows earlier within the same station minus the current value.
gaps = [1, 2, 4, 7, 15, 30, 50, 80]
weather_cols = ['气压(Pa）', '相对湿度（%）', '云量', '10米风速（10m/s）', '10米风向（°)', '温度（K）', '辐照强度（J/m2）', '降水（m）', '100m风速（100m/s）', '100m风向（°)']
station_key = total_df['站点编号']
for gap in gaps:
    for col in weather_cols:
        logging.info(f"特征{col}的{gap}gap")
        # The shifted series stays a local value instead of a temporary column.
        lagged = total_df[col].groupby(station_key).shift(gap)
        total_df[f"{col}_gap{gap}"] = lagged - total_df[col]

2024-03-12 15:30:48,803 : INFO : 特征气压(Pa）的1gap
2024-03-12 15:30:48,881 : INFO : 特征相对湿度（%）的1gap
2024-03-12 15:30:48,960 : INFO : 特征云量的1gap
2024-03-12 15:30:49,041 : INFO : 特征10米风速（10m/s）的1gap
2024-03-12 15:30:49,124 : INFO : 特征10米风向（°)的1gap
2024-03-12 15:30:49,206 : INFO : 特征温度（K）的1gap
2024-03-12 15:30:49,290 : INFO : 特征辐照强度（J/m2）的1gap
2024-03-12 15:30:49,375 : INFO : 特征降水（m）的1gap
2024-03-12 15:30:49,462 : INFO : 特征100m风速（100m/s）的1gap
2024-03-12 15:30:49,553 : INFO : 特征100m风向（°)的1gap
2024-03-12 15:30:49,645 : INFO : 特征气压(Pa）的2gap
2024-03-12 15:30:49,739 : INFO : 特征相对湿度（%）的2gap
2024-03-12 15:30:49,839 : INFO : 特征云量的2gap
2024-03-12 15:30:49,936 : INFO : 特征10米风速（10m/s）的2gap
2024-03-12 15:30:50,037 : INFO : 特征10米风向（°)的2gap
2024-03-12 15:30:50,138 : INFO : 特征温度（K）的2gap
2024-03-12 15:30:50,241 : INFO : 特征辐照强度（J/m2）的2gap
2024-03-12 15:30:50,352 : INFO : 特征降水（m）的2gap
2024-03-12 15:30:50,461 : INFO : 特征100m风速（100m/s）的2gap
2024-03-12 15:30:50,570 : INFO : 特征100m风向（°)的2gap
2024-03-12 15:30:50,682 

<h3>onehot特征</h3>

In [46]:
# One boolean indicator column per distinct station id; the original id column is removed.
for col in ['站点编号']:
    logging.info(f"特征{col}的onehot")
    for value in total_df[col].unique():
        total_df[f"{col}_{value}"] = total_df[col].eq(value)
    total_df.drop(columns=col, inplace=True)

2024-03-12 15:31:01,310 : INFO : 特征站点编号的onehot


<h3>处理缺失值</h3>

In [47]:
# Split the combined frame back into its train and test parts.
# The boundary is derived from the test frame, whose length is never changed, so the
# cell yields the same split when it is re-run. Using len(train_df) would give a wrong
# boundary on a re-run, because rows are dropped from train_df below.
n_train = len(total_df) - len(test_df)
# Train rows with any NaN are discarded (e.g. rows whose lagged value does not exist).
train_df = total_df.iloc[:n_train].dropna().copy()
test_df = total_df.iloc[n_train:].copy()
test_df.head()

Unnamed: 0,气压(Pa）,相对湿度（%）,云量,10米风速（10m/s）,10米风向（°),温度（K）,辐照强度（J/m2）,降水（m）,100m风速（100m/s）,100m风向（°),出力(MW),站点装机容量,year,month,hour,minute,比大气压,露点温度,sin_100m风速（100m/s）,cos_100m风速（100m/s）,sin_10米风速（10m/s）,cos_10米风速（10m/s）,100m风向（°)类别,10米风向（°)类别,10米风向（°)_100m风向（°),100m风向（°)类别_0.0,100m风向（°)类别_4.0,100m风向（°)类别_3.0,100m风向（°)类别_1.0,100m风向（°)类别_2.0,10米风向（°)类别_0.0,10米风向（°)类别_3.0,10米风向（°)类别_4.0,10米风向（°)类别_1.0,10米风向（°)类别_2.0,气压(Pa）_gap1,相对湿度（%）_gap1,云量_gap1,10米风速（10m/s）_gap1,10米风向（°)_gap1,温度（K）_gap1,辐照强度（J/m2）_gap1,降水（m）_gap1,100m风速（100m/s）_gap1,100m风向（°)_gap1,气压(Pa）_gap2,相对湿度（%）_gap2,云量_gap2,10米风速（10m/s）_gap2,10米风向（°)_gap2,温度（K）_gap2,辐照强度（J/m2）_gap2,降水（m）_gap2,100m风速（100m/s）_gap2,100m风向（°)_gap2,气压(Pa）_gap4,相对湿度（%）_gap4,云量_gap4,10米风速（10m/s）_gap4,10米风向（°)_gap4,温度（K）_gap4,辐照强度（J/m2）_gap4,降水（m）_gap4,100m风速（100m/s）_gap4,100m风向（°)_gap4,气压(Pa）_gap7,相对湿度（%）_gap7,云量_gap7,10米风速（10m/s）_gap7,10米风向（°)_gap7,温度（K）_gap7,辐照强度（J/m2）_gap7,降水（m）_gap7,100m风速（100m/s）_gap7,100m风向（°)_gap7,气压(Pa）_gap15,相对湿度（%）_gap15,云量_gap15,10米风速（10m/s）_gap15,10米风向（°)_gap15,温度（K）_gap15,辐照强度（J/m2）_gap15,降水（m）_gap15,100m风速（100m/s）_gap15,100m风向（°)_gap15,气压(Pa）_gap30,相对湿度（%）_gap30,云量_gap30,10米风速（10m/s）_gap30,10米风向（°)_gap30,温度（K）_gap30,辐照强度（J/m2）_gap30,降水（m）_gap30,100m风速（100m/s）_gap30,100m风向（°)_gap30,气压(Pa）_gap50,相对湿度（%）_gap50,云量_gap50,10米风速（10m/s）_gap50,10米风向（°)_gap50,温度（K）_gap50,辐照强度（J/m2）_gap50,降水（m）_gap50,100m风速（100m/s）_gap50,100m风向（°)_gap50,气压(Pa）_gap80,相对湿度（%）_gap80,云量_gap80,10米风速（10m/s）_gap80,10米风向（°)_gap80,温度（K）_gap80,辐照强度（J/m2）_gap80,降水（m）_gap80,100m风速（100m/s）_gap80,100m风向（°)_gap80,站点编号_f1,站点编号_f2,站点编号_f3,站点编号_f4,站点编号_f5
0,101309.625,84.3487,0.67969,4.7181,18.4066,16.8791,0.0,0.003588,5.5467,20.4047,1.02,48,2023,5,0,0,349.308483,13.74884,1.933851,5.198663,1.489779,4.47672,0.0,0.0,True,True,False,False,False,False,True,False,False,False,False,0.3196,-0.6269,-0.01358,-0.0543,-2.5231,0.0323,0.0,0.006326,-0.0152,-2.2694,-2.4944,-1.5177,-0.06208,-0.0932,-6.6584,0.0691,0.0,0.014664,0.0083,-5.9895,-1.9375,-2.8659,-0.23438,-0.0949,-13.1945,0.1437,0.0,0.029329,0.102,-11.8134,34.6136,-1.1249,-0.54662,-0.1751,-6.6368,0.2259,0.0,0.030901,-0.0843,-6.4017,-47.2293,6.668,-0.650832,-0.5453,13.2007,0.3361,0.0,0.029442,-0.6244,12.1679,-263.4133,14.6513,0.32031,-0.1149,34.2253,0.348,169.3212,0.02933,-0.3034,33.0143,-62.5297,14.6513,0.32023,1.0271,22.7187,0.0805,396.4313,0.029327,0.97,21.2189,-253.8594,13.1017,0.32031,3.5637,28.9353,-0.5495,0.0,0.029327,4.1642,26.7224,True,False,False,False,False
1,101303.259,84.48,0.65654,4.7575,18.2344,16.853,0.0,0.000739,5.5886,20.1468,1.02,48,2023,5,0,15,349.317969,13.749,1.924863,5.246652,1.488647,4.518599,0.0,0.0,True,True,False,False,False,False,True,False,False,False,False,6.366,-0.1313,0.02315,-0.0394,0.1722,0.0261,0.0,0.002848,-0.0419,0.2579,6.6856,-0.7582,0.00957,-0.0937,-2.3509,0.0584,0.0,0.009174,-0.0571,-2.0115,1.8204,-2.4974,-0.11499,-0.14,-10.5983,0.1335,0.0,0.025852,0.0195,-9.3866,27.8917,-2.293,-0.4277,-0.1762,-10.0509,0.2288,0.0,0.035125,-0.0293,-9.193,-23.1266,6.3455,-0.635278,-0.5993,10.3588,0.3562,0.0,0.03239,-0.6738,9.4258,-253.4968,14.52,0.34346,-0.2215,36.3325,0.4036,151.2269,0.032179,-0.4288,35.0966,-65.5145,14.52,0.34339,1.0188,23.5712,0.1378,399.5241,0.032176,0.9599,22.1733,-241.8028,13.4625,0.34346,3.4944,28.248,-0.5239,0.0,0.032183,4.1283,26.2132,True,False,False,False,False
2,101291.8681,84.1823,0.61713,4.7772,16.0279,16.8286,0.0,0.000641,5.6422,17.9834,1.02,48,2023,5,0,30,349.30808,13.66506,1.741981,5.366556,1.319011,4.591498,0.0,0.0,True,True,False,False,False,False,True,False,False,False,False,11.3909,0.2977,0.03941,-0.0197,2.2065,0.0244,0.0,9.9e-05,-0.0536,2.1634,17.7569,0.1664,0.06256,-0.0591,2.3787,0.0505,0.0,0.002947,-0.0955,2.4213,15.2625,-1.3513,0.00048,-0.1523,-4.2797,0.1196,0.0,0.017611,-0.0872,-3.5682,25.5686,-2.6153,-0.27998,-0.1655,-10.4048,0.226,0.0,0.035124,-0.0121,-9.1374,5.3879,6.3556,-0.594563,-0.6339,9.5694,0.3696,0.0,0.032481,-0.7384,8.6672,-236.7118,14.8087,0.32037,-0.2993,39.7102,0.4577,137.2317,0.032276,-0.5526,38.3745,-65.204,14.8177,0.38287,1.0448,26.2531,0.1908,414.3139,0.032274,0.9558,24.8237,-226.8872,14.1706,0.38287,3.5056,29.6024,-0.4943,0.0,0.032281,4.1263,27.5775,True,False,False,False,False
3,101277.4571,83.5966,0.58752,4.7852,12.8178,16.7995,0.0,0.002015,5.6978,14.9278,1.02,48,2023,5,0,45,349.293436,13.51882,1.467763,5.505506,1.061603,4.665955,0.0,0.0,True,True,False,False,False,False,True,False,False,False,False,14.411,0.5857,0.02961,-0.008,3.2101,0.0291,0.0,-0.001375,-0.0556,3.0556,25.8019,0.8834,0.06902,-0.0277,5.4166,0.0535,0.0,-0.001276,-0.1092,5.219,32.4875,0.1252,0.07859,-0.1214,3.0657,0.1119,0.0,0.007898,-0.1663,3.2075,30.2304,-2.1138,-0.14221,-0.162,-7.6057,0.2233,0.0,0.030902,-0.0491,-6.3365,36.4648,6.3559,-0.564082,-0.6251,10.1831,0.3853,0.0,0.030902,-0.7727,9.2241,-215.098,13.7479,0.17451,-0.3744,42.8132,0.5109,125.8533,0.030899,-0.6828,41.3467,-63.4495,15.4034,0.41248,1.0899,29.6857,0.2438,447.2527,0.030899,0.9595,28.1076,-208.979,15.1236,0.41248,3.5411,31.907,-0.457,0.0,0.030903,4.1251,29.7502,True,False,False,False,False
4,101262.0313,82.8641,0.59375,4.7795,9.6014,16.7596,0.0,0.003588,5.7344,11.924,1.02,48,2023,5,1,0,349.2883,13.33242,1.184808,5.610666,0.797186,4.712549,0.0,0.0,True,True,False,False,False,False,True,False,False,False,False,15.4258,0.7325,-0.00623,0.0057,3.2164,0.0399,0.0,-0.001572,-0.0366,3.0038,29.8368,1.3182,0.02338,-0.0023,6.4265,0.069,0.0,-0.002947,-0.0922,6.0594,47.5937,1.4846,0.08594,-0.0614,8.8052,0.1195,0.0,0.0,-0.1877,8.4807,43.0481,-0.8815,-0.0522,-0.162,-1.9653,0.2269,0.0,0.023003,-0.1263,-1.1638,67.832,6.0652,-0.575669,-0.5529,11.4885,0.4116,0.0,0.028908,-0.7346,10.4085,-190.6013,12.4235,-0.05333,-0.4502,44.8601,0.5656,115.1538,0.029325,-0.8044,43.2772,-62.0104,16.1359,0.40625,1.1488,33.007,0.3052,491.0538,0.029327,0.9833,31.222,-187.9454,16.1359,0.40625,3.5557,34.0839,-0.4092,0.0,0.029327,4.1039,31.7221,True,False,False,False,False


<h2>训练模型</h2>

<h3>定义测评指标</h3>

In [48]:
# Evaluation helpers used to score predictions on held-out data.
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equally long 1-D sequences.

    Inputs are converted to float arrays, so values are compared by position.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))


def metric(y_true, y_pred, n_groups=5):
    """Return the mean of the RMSEs of ``n_groups`` contiguous slices.

    The inputs are cut into exactly ``n_groups`` consecutive slices of (near-)equal
    length, so a length that is not a multiple of ``n_groups`` no longer produces an
    extra, tiny remainder slice that counts as much as a full one.
    NOTE(review): presumably each slice is meant to be one station; that only holds
    if rows are grouped by station with equal row counts - confirm against the caller.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values, same length and order.
    n_groups : int, default 5
        Number of slices.

    Raises
    ------
    ValueError
        If the lengths differ or there are fewer samples than ``n_groups``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if len(y_true) != len(y_pred):
        raise ValueError(f"length mismatch: {len(y_true)} true vs {len(y_pred)} predicted values")
    if len(y_true) < n_groups:
        raise ValueError(f"need at least {n_groups} samples, got {len(y_true)}")
    true_parts = np.array_split(y_true, n_groups)
    pred_parts = np.array_split(y_pred, n_groups)
    return np.mean([RMSE(t, p) for t, p in zip(true_parts, pred_parts)])

<h3>lightgbm</h3>

In [49]:
# Time-based hold-out split.
# Training part: all of 2022 plus January 2023.
in_train_period = (train_df['year'] == 2022) | ((train_df['year'] == 2023) & (train_df['month'] == 1))
# Shuffle the rows so they are no longer in time order. The explicit seed makes the
# shuffle identical every time this cell runs, independent of the global RNG state.
train_feats = (
    train_df[in_train_period]
    .sample(frac=1, random_state=Config.seed)
    .reset_index(drop=True)
)
# Validation part: 2023 from February onwards (the original note says February to April).
valid_feats = train_df[(train_df['year'] == 2023) & (train_df['month'] > 1)]
# Year and relative humidity were judged to be essentially unrelated to the target.
train_feats = train_feats.drop(columns=drop_cols)
valid_feats = valid_feats.drop(columns=drop_cols)

# Disabled hyper-parameter search, kept for reference. It tunes LightGBM with Optuna,
# training one model per fold on train_feats and scoring the averaged prediction on
# valid_feats with metric().
# import optuna  # automatic hyper-parameter optimisation framework

# def objective(trial):
#     lgb_params = {
#         "verbosity": -1,'objective': 'regression',
#         'metric': 'rmse','boosting_type': 'gbdt',
#         'random_state': Config.seed,
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),  # log-uniform suggestion
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),  # float
#         'subsample': trial.suggest_float('subsample', 0.5, 1),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),
#         'num_leaves' : trial.suggest_int('num_leaves', 8, 64),  # integer
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#     }
#     X=train_feats.drop([Config.target],axis=1).copy()
#     y=train_feats[Config.target].copy()
#     test_X=valid_feats.drop([Config.target],axis=1).values.copy()
#     test_y=valid_feats[Config.target].values.copy()
#     test_preds=np.zeros((Config.num_folds,len(test_X)))
#     # initialise KFold
#     kf = KFold(n_splits=Config.num_folds, shuffle=True,random_state=Config.seed)
#     # run k-fold cross-validation
#     for fold, (train_index, valid_index) in (enumerate(kf.split(X))):
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

#         model=LGBMRegressor(**lgb_params)
#         model.fit(X_train,y_train)
#         test_preds[fold]=model.predict(test_X)
#     test_preds=test_preds.mean(axis=0)
#     mean_rmse=metric(test_y,test_preds)
#     return mean_rmse
# # name the study and minimise the objective
# study = optuna.create_study(direction='minimize', study_name='Optimize boosting hyperparameters')
# # objective function and number of trials
# study.optimize(objective, n_trials=50)
# lgb_params=study.best_trial.params
#Best is trial 29 with value: 15.058260259234075.
# Hard-coded LightGBM parameters, presumably copied from the best trial noted above.
# NOTE(review): with n_estimators at 75, an early-stopping patience of 100 rounds (as
#   used by the training loop) can never trigger.
# NOTE(review): LightGBM documents that row subsampling only takes effect when a bagging
#   frequency is also set; none is set here - confirm whether `subsample` is meant to apply.
# NOTE(review): random_state repeats the literal 2024 instead of reading Config.seed.
lgb_params = {
    'n_estimators': 75, 
    'reg_alpha': 0.022825982577566684, 
    'reg_lambda': 5.284325352952156, 
    'colsample_bytree': 0.8286196779453388,
    'subsample': 0.8853286861359038, 
    'learning_rate': 0.2484233791090533,   
    'num_leaves': 37, 
    'min_child_samples': 44, 
    'objective': 'regression', 
    'metric': 'rmse',
    'boosting_type': "gbdt", 
    'random_state': 2024
}

In [50]:
# Remove the excluded columns from both frames. errors='ignore' keeps this cell
# re-runnable: on a second run the columns are already gone.
train_df = train_df.drop(columns=drop_cols, errors='ignore')
test_df = test_df.drop(columns=drop_cols, errors='ignore')

# Feature matrix and label vector for training.
x = train_df.drop(columns=[Config.target])
y = train_df[Config.target]

# Make sure the output directory exists before the first model is written.
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)

# One booster per fold; the list is consumed by the prediction step.
model_lgb = []
# NOTE(review): the folds are shuffled although the rows are time-ordered, so neighbouring
# time steps end up on both sides of a split; the validation RMSE is likely optimistic.
kf = KFold(n_splits=Config.num_folds, shuffle=True, random_state=Config.seed)
for fold, (train_index, valid_index) in enumerate(kf.split(x)):
    logging.info(f"########fold:{fold}########")
    x_train, x_valid = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    trainset = Dataset(x_train, y_train)
    valset = Dataset(x_valid, y_valid)
    # NOTE(review): lgb_params sets n_estimators to 75, so the 100-round early-stopping
    # patience never triggers and log_evaluation(1000) never prints.
    model = lgb.train(
        lgb_params,
        trainset,
        valid_sets=[valset],
        callbacks=[lgb.log_evaluation(1000), lgb.early_stopping(100)],
    )
    model.save_model(os.path.join(model_dir, "lgb_%d.txt" % fold))
    model_lgb.append(model)

2024-03-12 15:31:02,403 : INFO : ########fold:0########


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.552821
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:04,992 : INFO : ########fold:1########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 13.83
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.539215
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:07,664 : INFO : ########fold:2########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.2483
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.535811
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:10,306 : INFO : ########fold:3########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.1583
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.528507
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:12,968 : INFO : ########fold:4########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 13.8345
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.561543
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:15,460 : INFO : ########fold:5########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.1631
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.547062
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:18,071 : INFO : ########fold:6########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.127
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.585859
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:20,779 : INFO : ########fold:7########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.0079
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.591897
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:23,488 : INFO : ########fold:8########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.0471
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212154, number of used features: 117
[LightGBM] [Info] Start training from score 36.587312
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:26,250 : INFO : ########fold:9########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 13.7401
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212154, number of used features: 117
[LightGBM] [Info] Start training from score 36.548654
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:28,996 : INFO : ########fold:10########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.0182
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212154, number of used features: 117
[LightGBM] [Info] Start training from score 36.492626
Training until validation scores don't improve for 100 rounds


2024-03-12 15:31:31,733 : INFO : ########fold:11########


Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.5355
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212154, number of used features: 117
[LightGBM] [Info] Start training from score 36.586447
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[75]	valid_0's rmse: 14.2157


<h2>预测</h2>

<h3>预测结果</h3>

In [51]:
# Predict the test set with every fold model.
# The feature DataFrame is passed as-is: `.values` on a frame that mixes bool and float
# columns yields an object array, whereas the models were trained from DataFrames.
test_x = test_df.drop(columns=[Config.target])
# Shape (n_models, n_test_rows). The row count comes from the models actually trained,
# so it cannot disagree with Config.num_folds or kf.n_splits.
test_preds = np.stack([
    fold_model.predict(test_x, num_iteration=fold_model.best_iteration)
    for fold_model in model_lgb
])

<h3>保存结果</h3>

In [52]:
# Use the example submission as the template (it provides the station and time columns).
submission = pd.read_csv(Config.path + "A_submit_example.csv")
# Guard against a stale 1-D array left behind by an earlier run of this cell.
if test_preds.ndim != 2:
    raise ValueError(f"test_preds must have shape (n_models, n_rows), got {test_preds.shape}")
# Average the fold models into a NEW name. Overwriting test_preds would make a re-run of
# this cell average an already averaged vector down to a single scalar.
final_preds = test_preds.mean(axis=0)
if len(final_preds) != len(submission):
    raise ValueError(f"{len(final_preds)} predictions for {len(submission)} submission rows")
submission[Config.target] = final_preds
# Write a timestamped file so earlier submissions are not overwritten.
submission.to_csv(Config.path + "%s.csv" % datetime.now().strftime("%Y%m%d_%H%M%S"), encoding="utf-8", index=False)
submission.head()

Unnamed: 0,站点编号,时间,出力(MW)
0,f1,2023-5-1 0:00,5.667822
1,f1,2023-5-1 0:15,4.777382
2,f1,2023-5-1 0:30,4.970961
3,f1,2023-5-1 0:45,6.183783
4,f1,2023-5-1 1:00,7.146252
