In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import pearsonr
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm
import random
import warnings
import gc
import time
from IPython.core.display import Image

from sklearn.preprocessing import StandardScaler
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Let pandas render up to 500 columns and 100 rows before truncating a displayed frame.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

In [None]:
# Replace the preinstalled LightGBM with a GPU-enabled build; the training cell below
# sets 'device': 'gpu' in its parameters.
# NOTE(review): no version is pinned, so a fresh run installs whatever release is current,
# and recent pip releases are understood to have dropped --install-option — confirm this
# command still works in the target environment and consider pinning the version.
!pip uninstall --yes lightgbm && pip install --install-option=--gpu lightgbm

In [None]:
# Read the three input tables in one pass: the training rows, the test rows and the
# per-site missing-data summary (in that order).
for_train2, for_test2, df_missing = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/for_train2.csv",
        "./data/for_test2.csv",
        "./data/df_missing.csv",
    )
)

In [None]:
import lightgbm as lgb
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


def _mape(frame, actual_col="nv2", pred_col="pred"):
    """Mean absolute percentage error over the rows whose actual value is non-zero."""
    nonzero = frame[frame[actual_col] != 0]
    return np.abs((nonzero[pred_col] - nonzero[actual_col]) / nonzero[actual_col]).mean()


def _rmse(frame, actual_col="nv2", pred_col="pred"):
    """Root mean squared error between the actual and predicted columns."""
    return np.sqrt(mean_squared_error(frame[actual_col], frame[pred_col]))


def _mae(frame, actual_col="nv2", pred_col="pred"):
    """Mean absolute error between the actual and predicted columns."""
    return np.abs(frame[actual_col] - frame[pred_col]).mean()


# --- Configuration (loop-invariant, so defined once instead of on every iteration) ---
folds = 5
# Months used for training; month 7 is held out as the test month.
# `month_list` is also read by the summary cell further down.
month_list = [6, 5, 4, 3, 2, 1, 12, 11, 10, 9, 8]
test_month = 7
target = ["nv2"]
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'device': 'gpu'
}
# Column layout of `scores3`. The valid_* columns are never filled (they stay NaN) and are
# kept only so that the table has the same columns as before.
score_columns = ["id", "month", "train.shape[0]", "train_mape", "valid_mape", "test_mape",
                 "train_rmse", "valid_rmse", "test_rmse", "train_mae", "test_mae"]

# joblib.dump does not create missing directories.
os.makedirs("./output/single", exist_ok=True)

score_records = []  # one dict per site, turned into `scores3` after the loop
test_frames = []    # one test frame (with a "pred" column) per site, concatenated into `test3`

# Train one model per site, skipping sites whose test data is entirely missing.
site_ids = df_missing.loc[df_missing.missing_percent_test != 1, "id"].unique()
for site_id in site_ids:
    site_id = int(site_id)
    train2 = for_train2[(for_train2.id == site_id)
                        & for_train2.month.isin(month_list)].reset_index(drop=True)
    test2 = for_test2[(for_test2.id == site_id)
                      & (for_test2.month == test_month)].reset_index(drop=True)

    features1 = ["year", "month", "day", "time", "minute", "id", "pvrate", "id_lat", "id_lng"] \
        + [f"preal-30_{site_id}"]

    te_preds = []
    kf = KFold(n_splits=folds, shuffle=True, random_state=81)
    # train2 has a fresh 0..n-1 index, so the positional indices from KFold are valid labels.
    for fold, (tr_idx, va_idx) in enumerate(kf.split(train2)):
        train_set = lgb.Dataset(train2.loc[tr_idx, features1], train2.loc[tr_idx, target[0]])
        val_set = lgb.Dataset(train2.loc[va_idx, features1], train2.loc[va_idx, target[0]])

        # Early stopping and logging go through callbacks: the early_stopping_rounds /
        # verbose_eval / evals_result keyword arguments were removed in LightGBM 4.0.
        # (log_evaluation requires LightGBM >= 3.3.)
        model = lgb.train(
            params,
            train_set,
            num_boost_round=100000,
            valid_sets=[train_set, val_set],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100),
                lgb.log_evaluation(period=100),
            ],
        )

        joblib.dump(model, f"./output/single/single_lgbm_{site_id}_{fold}.pkl")

        # Out-of-fold prediction: every training row is predicted by the one model that
        # did not see it during fitting.
        train2.loc[va_idx, "pred"] = model.predict(train2.loc[va_idx, features1])
        te_preds.append(model.predict(test2[features1]))

    # The test prediction is the average over the fold models.
    test2["pred"] = np.mean(np.array(te_preds), axis=0)

    # NOTE: the train_* metrics are computed from the out-of-fold predictions above, so they
    # are effectively cross-validation scores; the key names are kept for compatibility.
    score_records.append({
        "id": site_id,
        "month": month_list,
        "train.shape[0]": train2.shape[0],
        "train_mape": _mape(train2),
        "test_mape": _mape(test2),
        "train_rmse": _rmse(train2),
        "test_rmse": _rmse(test2),
        "train_mae": _mae(train2),
        "test_mae": _mae(test2),
    })
    test_frames.append(test2)

# Build the result tables once instead of growing them inside the loop.
scores3 = pd.DataFrame(score_records, columns=score_columns)
test3 = pd.concat(test_frames, axis=0) if test_frames else pd.DataFrame()

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[100]	training's rmse: 0.0646037	valid_1's rmse: 0.0859403
Early stopping, best iteration is:
[43]	training's rmse: 0.0709153	valid_1's rmse: 0.0857377
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 317
[LightGBM] [Info] Number of data points in the train set: 5414, number of used features: 6
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 6 dense feature groups (0.04 MB) transferred to GPU in 0.000476 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.371594
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.066527	valid_1's rmse: 0.0817637
Early stopping, best iteration is:
[51]	training's rmse: 0.0713251	valid_1's rmse: 0.0810734
[LightGBM] [Info] This is th

In [None]:
from sklearn.metrics import r2_score


def _forecast_metrics(actual, forecast):
    """Return the error metrics of `forecast` against `actual` (both pandas Series).

    Keys: rmse, mape (rows with actual != 0 only), mae_per, nrmse (errors scaled by
    actual.max()), nmae (sum of absolute errors / sum of actual) and r2.
    """
    error = forecast - actual
    nonzero = actual != 0
    return {
        "rmse": np.sqrt(mean_squared_error(actual, forecast)),
        "mape": np.abs(error[nonzero] / actual[nonzero]).mean(),
        "mae_per": np.abs(error).mean(),
        "nrmse": np.sqrt(np.mean((error / actual.max()) ** 2)),
        "nmae": np.abs(error).sum() / actual.sum(),
        "r2": r2_score(actual, forecast),
    }


# Derived columns: "env" scaled by "observed_max", "nv2" scaled back up by "observed_max",
# and the absolute gap between that value and "preal-30", again scaled by "observed_max".
test3["env2"] = test3["env"] / test3["observed_max"]
test3["observed"] = test3["nv2"] * test3["observed_max"]
test3["fluctuation"] = np.abs((test3["observed"] - test3["preal-30"]) / test3["observed_max"])

result_columns = ["id", "rmse", "mape", "mae_per", "rmse_op", "mape_op", "nrmse", "nmae", "r2",
                  "mae_per_op", "nrmse_op", "nmae_op", "r2_op", "fluctuation", "observed_max",
                  "observed_max_test"]

result_records = []
for site_id, group in test3.groupby("id"):
    record = {"id": site_id}
    # Model prediction vs. observed value.
    record.update(_forecast_metrics(group["nv2"], group["pred"]))
    # "env2" vs. observed value, stored under the same names with an "_op" suffix.
    # mae_per_op and r2_op used to compare "pred" with "env2"; they now follow the same
    # nv2-vs-env2 convention as the other *_op metrics.
    record.update({f"{name}_op": value
                   for name, value in _forecast_metrics(group["nv2"], group["env2"]).items()})
    record["fluctuation"] = group["fluctuation"].mean()
    record["observed_max"] = group["observed_max"].iloc[0]
    record["observed_max_test"] = group.nv2.max()
    result_records.append(record)

# Build the table in one go (DataFrame.append was removed in pandas 2.0).
test3_result = pd.DataFrame(result_records, columns=result_columns)

# Attach the missing-data statistics and keep only sites with no missing test data.
df_test3 = pd.merge(df_missing, test3_result, on="id")
df_test3 = df_test3[df_test3.missing_percent_test == 0]

In [None]:
# Overall test metrics pooled over all sites (one-row summary table).
# Built directly from a single record because DataFrame.append was removed in pandas 2.0.
test3_nonzero = test3[test3.nv2 != 0]  # MAPE is undefined where the actual value is 0
scores_ = pd.DataFrame([{
    "month": month_list,
    "test_mape": np.abs((test3_nonzero["pred"] - test3_nonzero["nv2"]) / test3_nonzero["nv2"]).mean(),
    "test_rmse": np.sqrt(mean_squared_error(test3["nv2"], test3["pred"])),
}])

In [None]:
# Display the one-row summary of pooled test metrics.
scores_

Unnamed: 0,month,test_mape,test_rmse
0,"[6, 5, 4, 3, 2, 1, 12, 11, 10, 9, 8]",0.258194,0.097654


In [None]:
# Write the row-level test predictions and the per-site metric table to CSV, without the index.
for frame, csv_path in (
    (test3, "./output/single/test3.csv"),
    (df_test3, "./output/single/df_test3.csv"),
):
    frame.to_csv(csv_path, index=False)