<h2>导入包</h2>

In [36]:
import os
import re
import logging
from datetime import datetime, date
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from lightgbm import Dataset
import matplotlib.pyplot as plt
from astral import LocationInfo
from astral.sun import sunrise, sunset, dawn, noon, dusk


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

<h2>读取数据</h2>

<h3>比赛数据</h3>

In [37]:
x_train = pd.read_csv("../data/A榜-训练集_分布式光伏发电预测_气象变量数据.csv", encoding="gbk")
y_train = pd.read_csv("../data/A榜-训练集_分布式光伏发电预测_实际功率数据.csv", encoding="gbk")
info_train = pd.read_csv("../data/A榜-训练集_分布式光伏发电预测_基本信息.csv", encoding="gbk")
x_test = pd.read_csv("../data/A榜-测试集_分布式光伏发电预测_气象变量数据.csv", encoding="gbk")
y_test = pd.read_csv("../data/submit_example.csv", encoding="utf-8")
info_test = pd.read_csv("../data/A榜-测试集_分布式光伏发电预测_基本信息.csv", encoding="gbk")

In [38]:
x_train = pd.merge(x_train, info_train[["光伏用户编号", "装机容量(kW)", "经度", "纬度"]], how="left", on="光伏用户编号")
x_train["时间"] = pd.to_datetime(x_train["时间"], utc=False).dt.tz_localize('Asia/Shanghai')
x_test = pd.merge(x_test, info_test[["光伏用户编号", "装机容量(kW)", "经度", "纬度"]], how="left", on="光伏用户编号")
x_test["时间"] = pd.to_datetime(x_test["时间"], utc=False).dt.tz_localize('Asia/Shanghai')

In [39]:
y_train = y_train.set_index(["光伏用户编号", "综合倍率", "时间"]).stack().reset_index().rename(columns={0:"target"})
y_train["level_3"] = y_train["level_3"].apply(lambda x: int(x[1:]))
y_train["时间"] = pd.to_datetime(y_train["时间"], utc=False).dt.tz_localize('Asia/Shanghai')
y_train["时间"] = y_train["时间"] + (y_train["level_3"] - 1) * 15 * pd.Timedelta(1, unit="minutes")
y_train = y_train.drop(columns=["level_3"])

y_test = y_test.set_index(["光伏用户编号", "综合倍率", "时间"]).stack().reset_index().rename(columns={0:"target"})
y_test["level_3"] = y_test["level_3"].apply(lambda x: int(x[1:]))
y_test["时间"] = pd.to_datetime(y_test["时间"], utc=False).dt.tz_localize('Asia/Shanghai')
y_test["时间"] = y_test["时间"] + (y_test["level_3"] - 1) * 15 * pd.Timedelta(1, unit="minutes")
y_test = y_test.drop(columns=["level_3"])

In [40]:
df_train = pd.merge(x_train, y_train, on=["光伏用户编号", "时间"], how="left")
df_test = pd.merge(x_test, y_test, on=["光伏用户编号", "时间"], how="left")
df = pd.concat([df_train, df_test], axis=0)
logging.info(df.columns)

2024-03-30 15:32:17,049 : INFO : Index(['光伏用户编号', '时间', '气压(Pa）', '相对湿度（%）', '云量', '10米风速（10m/s）', '10米风向（°)',
       '温度（K）', '辐照强度（J/m2）', '降水（m）', '100m风速（100m/s）', '100m风向（°)',
       '装机容量(kW)', '经度', '纬度', '综合倍率', 'target'],
      dtype='object')


<h3>外部数据</h3>

In [41]:
outer_files = os.listdir("../data/")
outer_files = [x for x in outer_files if re.match(r"^open-meteo-f\d\.csv$", x)]
outer_dfs = list()
for outer_file in outer_files:
    outer_df = pd.read_csv(os.path.join("../data", outer_file), skiprows=3)
    outer_df["time"] = pd.to_datetime(outer_df["time"], utc=False).dt.tz_localize('Asia/Shanghai')
    outer_df = outer_df.set_index("time", drop=True).asfreq("15T")
    outer_df = outer_df.interpolate(method="time", axis=0)

    outer_df = outer_df.reset_index(drop=False).rename(columns={"time": "时间"})
    outer_df["光伏用户编号"] = re.match(r"^open-meteo-(f\d)\.csv$", outer_file).group(1)    
    outer_dfs.append(outer_df)
outer_data = pd.concat(outer_dfs, axis=0)
logging.info(outer_data.columns)

2024-03-30 15:33:18,201 : INFO : Index(['时间', 'temperature_2m_archive_best_match (°C)',
       'relative_humidity_2m_archive_best_match (%)',
       'dew_point_2m_archive_best_match (°C)',
       'apparent_temperature_archive_best_match (°C)',
       'precipitation_archive_best_match (mm)', 'rain_archive_best_match (mm)',
       'snowfall_archive_best_match (cm)', 'snow_depth_archive_best_match (m)',
       'weather_code_archive_best_match (wmo code)',
       ...
       'direct_normal_irradiance_era5_land (W/m²)',
       'global_tilted_irradiance_era5_land (W/m²)',
       'terrestrial_radiation_era5_land (W/m²)',
       'shortwave_radiation_instant_era5_land (W/m²)',
       'direct_radiation_instant_era5_land (W/m²)',
       'diffuse_radiation_instant_era5_land (W/m²)',
       'direct_normal_irradiance_instant_era5_land (W/m²)',
       'global_tilted_irradiance_instant_era5_land (W/m²)',
       'terrestrial_radiation_instant_era5_land (W/m²)', '光伏用户编号'],
      dtype='object', length=22

In [42]:
outer_day_files = os.listdir("../data/")
outer_day_files = [x for x in outer_day_files if re.match(r"^open-meteo-day-f\d\.csv$", x)]
outer_day_dfs = list()
for outer_day_file in outer_day_files:
    outer_day_df = pd.read_csv(os.path.join("../data", outer_day_file), skiprows=3)
    outer_day_df["time"] = pd.to_datetime(outer_day_df["time"], utc=False).dt.tz_localize('Asia/Shanghai')
    outer_day_df = outer_day_df.rename(columns={"time": "时间"})
    outer_day_df["光伏用户编号"] = re.match(r"^open-meteo-day-(f\d)\.csv$", outer_day_file).group(1)    
    outer_day_dfs.append(outer_day_df)
outer_day_data = pd.concat(outer_day_dfs, axis=0)
logging.info(outer_day_data.columns)

2024-03-30 15:33:18,291 : INFO : Index(['时间', 'weather_code (wmo code)', 'temperature_2m_max (°C)',
       'temperature_2m_min (°C)', 'temperature_2m_mean (°C)',
       'apparent_temperature_max (°C)', 'apparent_temperature_min (°C)',
       'apparent_temperature_mean (°C)', 'sunrise (iso8601)',
       'sunset (iso8601)', 'daylight_duration (s)', 'sunshine_duration (s)',
       'precipitation_sum (mm)', 'rain_sum (mm)', 'snowfall_sum (cm)',
       'precipitation_hours (h)', 'wind_speed_10m_max (km/h)',
       'wind_gusts_10m_max (km/h)', 'wind_direction_10m_dominant (°)',
       'shortwave_radiation_sum (MJ/m²)', 'et0_fao_evapotranspiration (mm)',
       '光伏用户编号'],
      dtype='object')


In [43]:
outer_data["日期"] = outer_data["时间"].dt.date
outer_day_data["日期"] = outer_day_data["时间"].dt.date
outer_day_data = outer_day_data.drop(columns=["时间"])
outer_data = pd.merge(outer_data, outer_day_data, on=["光伏用户编号", "日期"], how="left").drop(columns=["日期"])
for column in outer_data.columns:
    logging.info(column)

2024-03-30 15:33:19,936 : INFO : 时间
2024-03-30 15:33:19,936 : INFO : temperature_2m_archive_best_match (°C)
2024-03-30 15:33:19,937 : INFO : relative_humidity_2m_archive_best_match (%)
2024-03-30 15:33:19,937 : INFO : dew_point_2m_archive_best_match (°C)
2024-03-30 15:33:19,938 : INFO : apparent_temperature_archive_best_match (°C)
2024-03-30 15:33:19,938 : INFO : precipitation_archive_best_match (mm)
2024-03-30 15:33:19,939 : INFO : rain_archive_best_match (mm)
2024-03-30 15:33:19,940 : INFO : snowfall_archive_best_match (cm)
2024-03-30 15:33:19,940 : INFO : snow_depth_archive_best_match (m)
2024-03-30 15:33:19,940 : INFO : weather_code_archive_best_match (wmo code)
2024-03-30 15:33:19,941 : INFO : pressure_msl_archive_best_match (hPa)
2024-03-30 15:33:19,941 : INFO : surface_pressure_archive_best_match (hPa)
2024-03-30 15:33:19,941 : INFO : cloud_cover_archive_best_match (%)
2024-03-30 15:33:19,942 : INFO : cloud_cover_low_archive_best_match (%)
2024-03-30 15:33:19,943 : INFO : cloud_

2024-03-30 15:33:19,975 : INFO : soil_temperature_100_to_255cm_era5_seamless (°C)
2024-03-30 15:33:19,976 : INFO : soil_moisture_0_to_7cm_era5_seamless (m³/m³)
2024-03-30 15:33:19,976 : INFO : soil_moisture_7_to_28cm_era5_seamless (m³/m³)
2024-03-30 15:33:19,976 : INFO : soil_moisture_28_to_100cm_era5_seamless (m³/m³)
2024-03-30 15:33:19,976 : INFO : soil_moisture_100_to_255cm_era5_seamless (m³/m³)
2024-03-30 15:33:19,977 : INFO : is_day_era5_seamless ()
2024-03-30 15:33:19,977 : INFO : sunshine_duration_era5_seamless (s)
2024-03-30 15:33:19,977 : INFO : shortwave_radiation_era5_seamless (W/m²)
2024-03-30 15:33:19,978 : INFO : direct_radiation_era5_seamless (W/m²)
2024-03-30 15:33:19,978 : INFO : diffuse_radiation_era5_seamless (W/m²)
2024-03-30 15:33:19,978 : INFO : direct_normal_irradiance_era5_seamless (W/m²)
2024-03-30 15:33:19,978 : INFO : global_tilted_irradiance_era5_seamless (W/m²)
2024-03-30 15:33:19,979 : INFO : terrestrial_radiation_era5_seamless (W/m²)
2024-03-30 15:33:19,9

2024-03-30 15:33:20,015 : INFO : wind_speed_10m_max (km/h)
2024-03-30 15:33:20,015 : INFO : wind_gusts_10m_max (km/h)
2024-03-30 15:33:20,015 : INFO : wind_direction_10m_dominant (°)
2024-03-30 15:33:20,015 : INFO : shortwave_radiation_sum (MJ/m²)
2024-03-30 15:33:20,016 : INFO : et0_fao_evapotranspiration (mm)


<h3>整合数据</h3>

In [44]:
df = pd.merge(df, outer_data[[
    "光伏用户编号", 
    "时间",
    "apparent_temperature_archive_best_match (°C)", 
    "surface_pressure_archive_best_match (hPa)",
    "is_day_archive_best_match ()",
    "relative_humidity_2m_archive_best_match (%)",
    "cloud_cover_archive_best_match (%)",
    "global_tilted_irradiance_instant_archive_best_match (W/m²)",
    "precipitation_archive_best_match (mm)",
    "wind_speed_10m_archive_best_match (km/h)",
    "wind_speed_100m_archive_best_match (km/h)",
    "wind_direction_10m_archive_best_match (°)",
    "wind_direction_100m_archive_best_match (°)",
    "sunshine_duration_archive_best_match (s)",
    "weather_code_archive_best_match (wmo code)",
    "temperature_2m_max (°C)",
    "temperature_2m_min (°C)",
    "temperature_2m_mean (°C)",
    "precipitation_sum (mm)",
    "rain_sum (mm)",
    "snowfall_sum (cm)",
    "apparent_temperature_max (°C)",
    "apparent_temperature_min (°C)",
    "apparent_temperature_mean (°C)",
    "sunrise (iso8601)",
    "sunset (iso8601)"
]], how="left", on=["光伏用户编号", "时间"])

df["温度（K）"] = df["apparent_temperature_archive_best_match (°C)"] + 273.15
df["气压(Pa）"] = df["surface_pressure_archive_best_match (hPa)"] * 100
df["是白天"] = df["is_day_archive_best_match ()"].copy()
df["相对湿度（%）"] = df["relative_humidity_2m_archive_best_match (%)"].copy()
df["云量"] = df["cloud_cover_archive_best_match (%)"] / 100
df["辐照强度（J/m2）"] = df["global_tilted_irradiance_instant_archive_best_match (W/m²)"] * 15 * 60
df["降水（m）"] = df["precipitation_archive_best_match (mm)"] / 1000
df["10米风速（10m/s）"] = df["wind_speed_10m_archive_best_match (km/h)"] / 3600 * 100
df["100m风速（100m/s）"] = df["wind_speed_100m_archive_best_match (km/h)"] / 3600 * 10
df["10米风向（°)"] = df["wind_direction_10m_archive_best_match (°)"].copy()
df["100m风向（°)"] = df["wind_direction_100m_archive_best_match (°)"].copy()
df["sunrise (iso8601)"] = pd.to_datetime(df["sunrise (iso8601)"], utc=False).dt.tz_localize('Asia/Shanghai')
df["sunset (iso8601)"] = pd.to_datetime(df["sunset (iso8601)"], utc=False).dt.tz_localize('Asia/Shanghai')
df["时间-日出时间"] = ((df["时间"] - df["sunrise (iso8601)"]).dt.days * 24 * 3600 + (df["时间"] - df["sunrise (iso8601)"]).dt.seconds) // 60
df["日落时间-时间"] = ((df["sunset (iso8601)"] - df["时间"]).dt.days * 24 * 3600 + (df["sunset (iso8601)"] - df["时间"]).dt.seconds) // 60

df = df.drop(columns=[
    "apparent_temperature_archive_best_match (°C)", 
    "surface_pressure_archive_best_match (hPa)",
    "is_day_archive_best_match ()",
    "relative_humidity_2m_archive_best_match (%)",
    "cloud_cover_archive_best_match (%)",
    "global_tilted_irradiance_instant_archive_best_match (W/m²)",
    "precipitation_archive_best_match (mm)",
    "wind_speed_10m_archive_best_match (km/h)",
    "wind_speed_100m_archive_best_match (km/h)",
    "wind_direction_10m_archive_best_match (°)",
    "wind_direction_100m_archive_best_match (°)",
    "sunrise (iso8601)",
    "sunset (iso8601)"
])

<h2>特征工程</h2>

<h3>时间特征</h3>

In [45]:
df["年"] = df["时间"].dt.year
df["季节"] = df["时间"].dt.quarter
df["月"] = df["时间"].dt.month
df["日"] = df["时间"].dt.day
df["周"] = df["时间"].dt.week
df["分"] = df["时间"].dt.minute // 15 + df["时间"].dt.hour * 4

  """


In [46]:
# df["分_"] = df["分"].copy()
# df = pd.get_dummies(df, columns=["分_"], prefix_sep="")

<h3>根据日出时间日落时间计算时间段</h3>

In [47]:
def solar_time(current_time, dawn_time, sunrise_time, noon_time, sunset_time, dusk_time) -> int:
    """
    根据太阳判断当前时间段\n
    """
    if dawn_time < current_time < sunrise_time:
        return 1
    elif sunrise_time <= current_time < noon_time:
        return 2
    elif noon_time <= current_time < sunset_time:
        return 3
    elif sunset_time <= current_time < dusk_time:
        return 4
    else:
        return 0

In [48]:
# df["地点"] = df.apply(lambda x: LocationInfo(name=x["光伏用户编号"], region="China", timezone="Asia/Shanghai", latitude=x["纬度"], longitude=x["经度"]), axis=1)
# df["黎明时刻"] = df.apply(lambda x: dawn(x["地点"].observer, date=x["时间"], tzinfo=x["地点"].timezone), axis=1).dt.tz_convert("Asia/Shanghai")
# df["日出时刻"] = df.apply(lambda x: sunrise(x["地点"].observer, date=x["时间"], tzinfo=x["地点"].timezone), axis=1).dt.tz_convert("Asia/Shanghai")
# df["正午时刻"] = df.apply(lambda x: noon(x["地点"].observer, date=x["时间"], tzinfo=x["地点"].timezone), axis=1).dt.tz_convert("Asia/Shanghai")
# df["日落时刻"] = df.apply(lambda x: sunset(x["地点"].observer, date=x["时间"], tzinfo=x["地点"].timezone), axis=1).dt.tz_convert("Asia/Shanghai")
# df["黄昏时刻"] = df.apply(lambda x: dusk(x["地点"].observer, date=x["时间"], tzinfo=x["地点"].timezone), axis=1).dt.tz_convert("Asia/Shanghai")
# df["时间段"] = df.apply(lambda x: solar_time(x["时间"], x["黎明时刻"], x["日出时刻"], x["正午时刻"], x["日落时刻"], x["黄昏时刻"]), axis=1)
# df = df.drop(columns=["地点", "黎明时刻", "日出时刻", "正午时刻", "日落时刻", "黄昏时刻"])

<h3>光伏用户编号</h3>

In [49]:
df["光伏用户编号_"] = df["光伏用户编号"].copy()
df = pd.get_dummies(df, columns=["光伏用户编号_"], prefix_sep="")

<h3>气象特征</h3>

In [50]:
df['100m风速（100m/s）'] = df['100m风速（100m/s）'] * np.sin(np.pi * df['100m风向（°)'] / 180)
# df['cos_100m风速（100m/s）'] = df['100m风速（100m/s）'] * np.cos(np.pi * df['100m风向（°)'] / 180)
df['10米风速（10m/s）'] = df['10米风速（10m/s）'] * np.sin(np.pi * df['10米风向（°)'] / 180)
# df['cos_10米风速（10m/s）'] = df['10米风速（10m/s）'] * np.cos(np.pi * df['10米风向（°)'] / 180)

In [51]:
df["光照/温度"] = df["辐照强度（J/m2）"] / df["温度（K）"]

<h3>历史值特征</h3>

In [52]:
dfs = []
for site, df_site in df.groupby("光伏用户编号"):
    df_site = df_site.sort_values("时间")
    df_site["辐照强度（J/m2） - 1"] = df_site["辐照强度（J/m2）"].shift(1) - df_site["辐照强度（J/m2）"]
    df_site["辐照强度（J/m2） - 8"] = df_site["辐照强度（J/m2）"].shift(8) - df_site["辐照强度（J/m2）"]
#     df_site["辐照强度（J/m2） - 2"] = df_site["辐照强度（J/m2）"].shift(2) - df_site["辐照强度（J/m2）"]
    dfs.append(df_site)
df = pd.concat(dfs, axis=0)

<h3>处理异常值</h3>

In [53]:
print(df_train["target"].nsmallest(3))
df_train[df_train["target"] < -8]
df.loc[207628:207633, ["光伏用户编号", "时间", "target"]]

207630   -8.8900
37316    -0.0085
39139    -0.0084
Name: target, dtype: float64


Unnamed: 0,光伏用户编号,时间,target
207628,f6,2022-08-15 20:30:00+08:00,-0.002
207629,f6,2022-08-15 20:45:00+08:00,-0.002
207630,f6,2022-08-15 21:00:00+08:00,-8.89
207631,f6,2022-08-15 21:15:00+08:00,-0.002
207632,f6,2022-08-15 21:30:00+08:00,
207633,f6,2022-08-15 21:45:00+08:00,


In [54]:
df.loc[207630, "target"] = -0.002

<h3>光照与当天最强光照的比值</h3>

In [55]:
df["日期"] = df["时间"].dt.date
day_max_values = df[["光伏用户编号", "日期", "辐照强度（J/m2）"]].groupby(by=["光伏用户编号", "日期"]).max()
day_max_values = day_max_values.rename(columns={x: x + "_max" for x in day_max_values.columns}).reset_index()
df = pd.merge(df, day_max_values, on=["光伏用户编号", "日期"], how="left").drop(columns=["日期"])
df["辐照强度（J/m2）_max"] = df["辐照强度（J/m2）"] / df["辐照强度（J/m2）_max"]

<h3>当天的平均光照</h3>

In [56]:
df["日期"] = df["时间"].dt.date
day_mean_values = df[["光伏用户编号", "日期", "辐照强度（J/m2）"]].groupby(by=["光伏用户编号", "日期"]).mean()
day_mean_values = day_mean_values.rename(columns={x: x + "_mean" for x in day_mean_values.columns}).reset_index()
df = pd.merge(df, day_mean_values, on=["光伏用户编号", "日期"], how="left").drop(columns=["日期"])

<h3>温度与当天最高温最低度的差值</h3>

In [57]:
df["日期"] = df["时间"].dt.date
day_max_values = df[["光伏用户编号", "日期", "温度（K）"]].groupby(by=["光伏用户编号", "日期"]).max()
day_min_values = df[["光伏用户编号", "日期", "温度（K）"]].groupby(by=["光伏用户编号", "日期"]).min()
day_max_values = day_max_values.rename(columns={x: x + "_max" for x in day_max_values.columns}).reset_index()
day_min_values = day_min_values.rename(columns={x: x + "_min" for x in day_min_values.columns}).reset_index()
df = pd.merge(df, day_max_values, on=["光伏用户编号", "日期"], how="left")
df = pd.merge(df, day_min_values, on=["光伏用户编号", "日期"], how="left").drop(columns=["日期"])
df["温度（K）_max"] = df["温度（K）_max"] - df["温度（K）"]
df["温度（K）_min"] = df["温度（K）"] - df["温度（K）_min"]

<h3>划分测试集</h3>

In [58]:
df_train = df[df["时间"] <= df_train["时间"].max()]
df_test = df[df["时间"] >= df_test["时间"].min()]

<h2>训练模型</h2>

<h3>评测指标</h3>

In [59]:
def score(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return 1 / (1 + rmse)

<h3>lightgbm模型</h3>

In [60]:
params_lgb = {
    "num_boost_round": 1000,
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',
    'objective': 'mse',
    'metric': 'mse',
    'num_leaves': 127,
    'verbose': -1,
    'seed': 42,
    'n_jobs': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.9,
    'bagging_freq': 4,
    "early_stopping_round": 100
}

<h3>交叉验证</h3>

In [61]:
model_lgb = []
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

x = df_train.drop(columns=["光伏用户编号", "时间"]).dropna().astype(np.float32)
y = x.pop("target")
mse = 0
for fold, (train_index, val_index) in enumerate(kfold.split(x, y)):
    logging.info(f'############ fold: {fold} ###########')
    x_train, x_val, y_train, y_val = x.iloc[train_index], x.iloc[val_index], y.iloc[train_index], y.iloc[val_index]
    trainset = Dataset(x_train, y_train)
    valset = Dataset(x_val, y_val)
    model = lgb.train(params_lgb, trainset, valid_sets=[trainset, valset], categorical_feature=["分"], callbacks=[lgb.log_evaluation(1000)])
    model.save_model("../models/lgb_%d.txt" % fold)
    model_lgb.append(model)
    val_pred = Series(model.predict(x_val, num_iteration=model.best_iteration), index=y_val.index).fillna(0)
    mse += mean_squared_error(y_val.fillna(0), val_pred)
rmse = np.sqrt(mse)
score = 1 / (1 + rmse)
logging.info(f"--------------本地分数 {score}--------------")

2024-03-30 15:33:23,974 : INFO : ############ fold: 0 ###########
New categorical_feature is ['分']


[1000]	training's l2: 0.012161	valid_1's l2: 0.0212056


2024-03-30 15:33:44,081 : INFO : ############ fold: 1 ###########
New categorical_feature is ['分']


[1000]	training's l2: 0.0123475	valid_1's l2: 0.02018


2024-03-30 15:34:03,995 : INFO : ############ fold: 2 ###########
New categorical_feature is ['分']


[1000]	training's l2: 0.0123074	valid_1's l2: 0.0201818


2024-03-30 15:34:23,988 : INFO : ############ fold: 3 ###########
New categorical_feature is ['分']


[1000]	training's l2: 0.012312	valid_1's l2: 0.0201604


2024-03-30 15:34:43,927 : INFO : ############ fold: 4 ###########
New categorical_feature is ['分']


[1000]	training's l2: 0.0122362	valid_1's l2: 0.0212984


2024-03-30 15:35:03,852 : INFO : --------------本地分数 0.7570155303004272--------------


In [62]:
importance = DataFrame()
importance["特征"] = model.feature_name()
importance["重要性"] = 0
for model in model_lgb:
    importance["重要性"] = importance["重要性"] + model.feature_importance()
importance["重要性"] = importance["重要性"] / kfold.n_splits
importance.sort_values("重要性", ascending=False)[0:50]

Unnamed: 0,特征,重要性
47,辐照强度（J/m2）_mean,7358.2
0,气压(Pa）,7134.2
2,云量,5645.2
31,日,5118.6
1,相对湿度（%）,4710.2
44,辐照强度（J/m2）_-_1,4452.6
49,温度（K）_min,4386.2
19,precipitation_sum_(mm),4358.0
33,分,4294.4
16,temperature_2m_max_(°C),4169.4


<h2>预测</h2>

In [63]:
x_test = df_test.drop(columns=["光伏用户编号", "时间"]).astype(np.float32)
y_test = x_test.pop("target")
y_pred = np.zeros((df_test.shape[0], ))
for model in model_lgb:
    y_pred += model.predict(x_test, num_iteration=model.best_iteration)
y_pred = y_pred / kfold.n_splits
df_test["target"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [64]:
df_test = df_test[["光伏用户编号", "综合倍率", "年", "月", "日", "分", "target"]]
df_test["时间"] = df_test["年"].astype(str) + "-" + df_test["月"].astype(str) + "-" + df_test["日"].astype(str) + " 0:00"
df_test["分"] = "p" + (df_test["分"] + 1).astype(str)
df_test = df_test.drop(columns=["年", "月", "日"])

In [65]:
result = pd.pivot(df_test, index=["光伏用户编号", "综合倍率", "时间"], columns="分", values="target").reset_index()
result = result[result["综合倍率"].notnull()]
result["综合倍率"] = result["综合倍率"].astype(int)

In [66]:
result.to_csv("../data/%s.csv" % datetime.now().strftime("%Y%m%d_%H%M%S"), encoding="utf-8", index=False)