In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import pearsonr
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm
import random
import warnings
import gc
import time
from IPython.core.display import Image
from sklearn.preprocessing import StandardScaler
# Notebook-wide display settings: let wide frames show up to 500 columns
# and long frames up to 100 rows before pandas truncates the rendering.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Remove the currently installed LightGBM and reinstall it with the `--gpu`
# install option; the training parameters in this notebook set 'device': 'gpu'.
# NOTE(review): the package version is not pinned, and `--install-option` is
# believed to be unsupported by recent pip releases -- confirm against the
# pip / LightGBM versions of the target environment before re-running.
!pip uninstall --yes lightgbm && pip install --install-option=--gpu lightgbm

In [None]:
# Load the prepared training / test tables and the per-id missing-data table.
df_missing = pd.read_csv("./data/df_missing.csv")
for_test2 = pd.read_csv("./data/for_test.csv")
for_train2 = pd.read_csv("./data/for_train.csv")

# Distinct ids (cast to int) in order of first appearance in the training
# table; later used to build the per-id feature column names.
unique_id = pd.unique(for_train2["id"].astype(int))

In [None]:
import lightgbm as lgb
import joblib
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold

# Train one GPU LightGBM regressor per cross-validation fold on every month
# except July, store out-of-fold predictions for the training rows in
# train2["pred"], and average the per-fold predictions on the July test rows
# into test2["pred"].

folds = 5
model_paths = []  # pickle path of each fold's booster
te_preds = []     # each fold's predictions on test2
lgb_results = []  # each fold's recorded metric history (kept across folds)
scores2 = pd.DataFrame(columns=["month","train2.shape[0]","train2_mape","valid_mape","test2_mape","train2_rmse","valid_rmse","test2_rmse"])
month_list = [6,5,4,3,2,1,12,11,10,9,8]
# reset_index(drop=True) gives both frames a 0..n-1 index, so the positional
# indices produced by the splitter can be used directly with .iloc below.
train2 = for_train2[for_train2.month.isin(month_list)].reset_index(drop=True)
test2 = for_test2[for_test2.month==7].reset_index(drop=True)

# Base columns plus one "preal-30_<id>" and one "ppred_<id>" column per id.
features1 = (
    ["year","month","day","time","minute","id","pvrate","id_lat","id_lng"]
    + [f"preal-30_{uid}" for uid in unique_id]
    + [f"ppred_{uid}" for uid in unique_id]
)
target = ["nv2"]

params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'device': 'gpu' # GPU!
}

# joblib.dump does not create missing directories.
os.makedirs("./output/multi", exist_ok=True)

# Out-of-fold predictions; every training row is in exactly one validation fold.
oof_pred = np.full(len(train2), np.nan)

# Folds are stratified on "id" so each fold contains rows of every id.
kf = StratifiedKFold(n_splits=folds, shuffle=True,random_state=81)
for fold,(tr_idx, va_idx) in enumerate(kf.split(train2,train2["id"])):
  # Positional selection replaces the repeated O(n) index.isin masks.
  tr_part = train2.iloc[tr_idx]
  va_part = train2.iloc[va_idx]

  train2_set = lgb.Dataset(tr_part[features1], tr_part[target])
  val_set = lgb.Dataset(va_part[features1], va_part[target])

  # Train with early stopping on the validation fold.
  # NOTE: early_stopping_rounds / verbose_eval / evals_result are keyword
  # arguments of the LightGBM 3.x train() API; LightGBM 4.0 removed them in
  # favour of callbacks.
  lgb_result = {}
  model = lgb.train(
      params,
      train2_set,
      num_boost_round = 100000, # 10000
      early_stopping_rounds = 100,
      valid_sets = [train2_set, val_set],
      verbose_eval = 1000,
      evals_result = lgb_result,
  )
  lgb_results.append(lgb_result)

  model_path = f"./output/multi/multi_lgbm_{fold}.pkl"

  model_paths.append(model_path)
  joblib.dump(model, model_path)
  # Predict with the reloaded artifact so the persisted file is what gets scored.
  tt = joblib.load(model_path)

  oof_pred[va_idx] = tt.predict(va_part[features1])
  te_preds.append(tt.predict(test2[features1]))

train2["pred"] = oof_pred
# Test prediction is the plain mean over the fold models.
test2["pred"] = np.mean(np.array(te_preds),axis=0)
  



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 50832
[LightGBM] [Info] Number of data points in the train set: 536025, number of used features: 207
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 207 dense feature groups (106.33 MB) transferred to GPU in 0.183937 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.363154
Training until validation scores don't improve for 100 rounds
[1000]	training's rmse: 0.0479641	valid_1's rmse: 0.0519069
[2000]	training's rmse: 0.0399326	valid_1's rmse: 0.0465442
[3000]	training's rmse: 0.0354213	valid_1's rmse: 0.0441618
[4000]	training's rmse: 0.0322234	valid_1's rmse: 0.042747
[5000]	training's rmse: 0.0296932	valid_1's rmse: 0.0417276
[6000]	training's rmse: 0.0274869	valid_1's rmse: 0.0410055
[7000]	t

In [None]:

# Score the out-of-fold / test predictions, compute per-id metrics on the
# test month for both the model ("pred") and the normalised "env" column
# ("env2", suffix "_op"), and write all result tables to ./output/multi.

def _mape(actual, predicted):
  """Mean absolute percentage error, skipping rows whose actual value is 0."""
  keep = actual != 0
  return np.abs((predicted[keep] - actual[keep]) / actual[keep]).mean()

def _rmse(actual, predicted):
  """Root mean squared error."""
  return np.sqrt(mean_squared_error(actual, predicted))

def _nrmse(actual, predicted):
  """RMSE of the error normalised by the maximum actual value.

  Metric from the paper "Spatio-Temporal Graph Neural Networks for
  Multi-Site PV Power Forecasting".
  """
  return np.sqrt(np.mean(((predicted - actual) / actual.max()) ** 2))

def _nmae(actual, predicted):
  """Summed absolute error divided by the summed actual values (same paper)."""
  return np.abs(predicted - actual).sum() / actual.sum()

# One summary row per run; the valid_* columns are not computed and stay NaN.
score_row = pd.DataFrame([{
    "month": month_list,
    "train2.shape[0]": train2.shape[0],
    "train2_mape": _mape(train2["nv2"], train2["pred"]),
    "test2_mape": _mape(test2["nv2"], test2["pred"]),
    "train2_rmse": _rmse(train2["nv2"], train2["pred"]),
    "test2_rmse": _rmse(test2["nv2"], test2["pred"]),
}], columns=scores2.columns)
# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
scores2 = score_row if scores2.empty else pd.concat([scores2, score_row], ignore_index=True)

# "env" rescaled by observed_max, i.e. onto the same scale as nv2.
# NOTE(review): "env" is presumably a reference forecast -- confirm.
train2["env2"] = train2["env"] / train2["observed_max"]
test2["env2"] = test2["env"] / test2["observed_max"]

test2["observed"] = test2["nv2"]*test2["observed_max"]
test2["fluctuation"] = np.abs((test2["observed"] - test2["preal-30"])/test2["observed_max"])

result_columns = ["id","rmse","mape","mae_per","rmse_op","mape_op","nrmse","nmae","r2","mae_per_op","nrmse_op","nmae_op","r2_op","fluctuation","observed_max","observed_max_test2"]
records = []
# `site_id` rather than `id`, so the builtin id() is not shadowed afterwards.
for site_id, group in test2.groupby("id"):
  actual = group["nv2"]
  model_pred = group["pred"]
  baseline = group["env2"]

  records.append({
      "id": site_id,
      # Model prediction scored against nv2.
      "rmse": _rmse(actual, model_pred),
      "mape": _mape(actual, model_pred),
      "mae_per": np.abs(model_pred - actual).mean(),
      "nrmse": _nrmse(actual, model_pred),
      "nmae": _nmae(actual, model_pred),
      "r2": r2_score(actual, model_pred),
      # Every *_op metric scores env2 against nv2, mirroring the model metrics.
      "rmse_op": _rmse(actual, baseline),
      "mape_op": _mape(actual, baseline),
      "mae_per_op": np.abs(baseline - actual).mean(),
      "nrmse_op": _nrmse(actual, baseline),
      "nmae_op": _nmae(actual, baseline),
      "r2_op": r2_score(actual, baseline),
      "fluctuation": group["fluctuation"].mean(),
      "observed_max": group["observed_max"].iloc[0],
      "observed_max_test2": actual.max(),
  })

# Build the frame once from the collected records instead of appending per row.
test2_result = pd.DataFrame(records, columns=result_columns)

# Keep only ids whose missing_percent_test is 0.
df_test2 = pd.merge(df_missing,test2_result,on="id")
df_test2 = df_test2[df_test2.missing_percent_test==0]

# to_csv does not create missing directories.
os.makedirs("./output/multi", exist_ok=True)
scores2.to_csv("./output/multi/scores_.csv",index=False)
df_test2.to_csv("./output/multi/df_test2_.csv",index=False)
test2.to_csv("./output/multi/test2_.csv",index=False)
train2.to_csv("./output/multi/train2_.csv",index=False)

In [None]:
# Display the accumulated score table (one row per run of the scoring cell).
scores2

Unnamed: 0,month,train2.shape[0],train2_mape,valid_mape,test2_mape,train2_rmse,valid_rmse,test2_rmse
0,"[6, 5, 4, 3, 2, 1, 12, 11, 10, 9, 8]",670032,0.098719,,0.222975,0.037312,,0.084907
