In [1]:
###-- Google Driveに同期 --###
from google.colab import drive
drive.mount('/content/drive')

In [2]:
###------------------------------###
###           入力項目            ###
###------------------------------###
##-- CSVファイル名（訓練データ、テストデータ）
train_filename, test_filename = "train_data.csv", "test_data.csv"
##-- Data PATH
PATH_data = "drive/My Drive/data/"
##-- Data Submit
PATH_submit = "drive/My Drive/submit/"
##-- Rondom state
random_state = 99

In [2]:
###-------------------------------------------###
###        必要なライブラリの読み込み             ###
###-------------------------------------------###
##-- Pandas(CSVファイルの読み込み/処理)
import pandas as pd
from pandas import Series, DataFrame
pd.set_option('max_columns', 300)
pd.set_option('display.max_columns', 300)
pd.set_option('max_rows', 300)
pd.set_option('display.max_rows', 300)
##-- Numpy(数値計算用の配列)
import numpy as np
##-- Matplotlib(グラフの描画)
import matplotlib.pylab as plt
import matplotlib.cm as cm # グラデーション
import seaborn as sns
##-- Scikit-learn(重回帰処理)
import sklearn  #-- print(sklearn.__version__)

##-- Warning非表示
import warnings
warnings.filterwarnings('ignore')

plt.style.use('bmh')
from itertools import cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

In [4]:
###---------------------------------------------###
###        CSVファイルからプロセスを読み込む         ###
###---------------------------------------------###
f = pd.read_csv(PATH_data+"/"+train_filename, encoding="utf-8")
g = pd.read_csv(PATH_data+"/"+test_filename, encoding="utf-8")

##-- 対数をとっている
f["y"] = np.log(f["y"])

# f.head(5)
# f.dtypes

前処理

In [5]:
mean_ = f[["categoryId", "y"]].groupby("categoryId").mean().reset_index().rename({"y":"mean"}, axis=1)
max_ = f[["categoryId", "y"]].groupby("categoryId").max().reset_index().rename({"y":"max"}, axis=1)
min_ = f[["categoryId", "y"]].groupby("categoryId").min().reset_index().rename({"y":"min"}, axis=1)
std_ = f[["categoryId", "y"]].groupby("categoryId").std().reset_index().rename({"y":"std"}, axis=1)
count_ = f[["categoryId", "y"]].groupby("categoryId").count().reset_index().rename({"y":"count"}, axis=1)
q1_ = f[["categoryId", "y"]].groupby("categoryId").quantile(0.1).reset_index().rename({"y":"q1"}, axis=1)
q25_ = f[["categoryId", "y"]].groupby("categoryId").quantile(0.25).reset_index().rename({"y":"q25"}, axis=1)
q5_ = f[["categoryId", "y"]].groupby("categoryId").quantile(0.5).reset_index().rename({"y":"q5"}, axis=1)
q75_ = f[["categoryId", "y"]].groupby("categoryId").quantile(0.75).reset_index().rename({"y":"q75"}, axis=1)
q9_ = f[["categoryId", "y"]].groupby("categoryId").quantile(0.9).reset_index().rename({"y":"q9"}, axis=1)


def is_japanese(string):
    for ch in string:
        try:
            name = unicodedata.name(ch) 
            if "CJK UNIFIED" in name \
            or "HIRAGANA" in name \
            or "KATAKANA" in name:
                return True
        except:
          continue
    return False

Y = f["y"]
del f["y"]

##-- Dataを結合
data = pd.concat([f, g])
print(data.shape)  # 19720 + 29582 = 49302

data["tags"].fillna("[none]", inplace=True)
tagdic = dict(pd.Series("|".join(list(data["tags"])).split("|")).value_counts().sort_values())



(49302, 16)


In [6]:
##-- 正規表現 https://docs.python.org/3/library/re.html
import re

def bool_to_int(df):
  df["comments_disabled"] = df["comments_disabled"].astype(np.int16)
  df["ratings_disabled"] = df["ratings_disabled"].astype(np.int16)
  return df

def create_features(df):
  ##--  like dislike comment
  df["likes2"] = df["likes"]**2
  df["loglikes"] = np.log(df["likes"]+1)
  df["dislikes2"] = df["dislikes"]**2
  df["logdislikes"] = np.log(df["dislikes"]+1)
  df["logcomment_count"] = np.log(df["comment_count"]+1)
  df["sqrtlikes"] = np.sqrt(df["likes"])
  df["like_dislike_ratio"] = df["likes"]/(df["dislikes"]+1)
  df["comments_like_ratio"] = df["comment_count"]/(df["likes"]+1)
  df["comments_dislike_ratio"] = df["comment_count"]/(df["dislikes"]+1)

  ##-- likes comments diable
  df["likes_com"] = df["likes"] * df["comments_disabled"]
  df["dislikes_com"] = df["dislikes"] * df["comments_disabled"]
  df["comments_likes"] = df["comment_count"] * df["ratings_disabled"]

  ##-- tags
  df["num_tags"] = df["tags"].astype(str).apply(lambda x: len(x.split("|")))
  df["length_tags"] = df["tags"].astype(str).apply(lambda x: len(x))
  df["tags_point"] = df["tags"].apply(lambda tags: sum([tagdic[tag] for tag in tags.split("|")]))
  df["count_en_tag"] = df["tags"].apply(lambda x: sum([bool(re.search(r'[a-zA-Z0-9]', x_)) for x_ in x.split("|")]))
  df["count_ja_tag"] = df["tags"].apply(lambda x: sum([is_japanese(x_) for x_ in x.split("|")]))

  ##-- publishedAt
  df["publishedAt"] = pd.to_datetime(df["publishedAt"], utc=True)
  df["publishedAt_year"] = df["publishedAt"].apply(lambda x: x.year)
  df["publishedAt_month"] = df["publishedAt"].apply(lambda x: x.month)
  df["publishedAt_day"] = df["publishedAt"].apply(lambda x: x.day)
  df["publishedAt_hour"] = df["publishedAt"].apply(lambda x: x.hour)
  df["publishedAt_minute"] = df["publishedAt"].apply(lambda x: x.minute)
  df["publishedAt_second"] = df["publishedAt"].apply(lambda x: x.second)
  df["publishedAt_dayofweek"] = df["publishedAt"].apply(lambda x: x.dayofweek)

  ##-- collection_date
  df["collection_date_year"] = df["collection_date"].apply(lambda x: int(x[0:2]))
  ##-- 20 --> 2020年に直す
  df["collection_date_year"] = df["collection_date_year"] + 2000
  ##--
  df["collection_date_month"] = df["collection_date"].apply(lambda x: int(x[3:5]))
  df["collection_date_day"] = df["collection_date"].apply(lambda x: int(x[6:8]))
  df["collection_date"] = pd.to_datetime("20"+df["collection_date"], format="%Y.%d.%m", utc=True)

  ##-- delta
  df["delta"] = (df["collection_date"] - df["publishedAt"]).apply(lambda x: x.days)
  df["logdelta"] = np.log(df["delta"])
  df["sqrtdelta"] = np.sqrt(df["delta"])
  df["published_delta"] = (df["publishedAt"] - df["publishedAt"].min()).apply(lambda x: x.days)
  df["collection_delta"] = (df["collection_date"] - df["collection_date"].min()).apply(lambda x: x.days)

  df["description"].fillna(" ", inplace=True)
  df["ishttp_in_dis"] = df["description"].apply(lambda x: x.lower().count("http"))
  df["len_description"] = df["description"].apply(lambda x: len(x))

  df["title"].fillna(" ", inplace=True)
  df["len_title"] = df["title"].apply(lambda x: len(x))

  ##-- is japanese
  df["isJa_title"] = df["title"].apply(lambda x: is_japanese(x))
  df["isJa_tags"] = df["tags"].apply(lambda x: is_japanese(x))
  df["isJa_description"] = df["description"].apply(lambda x: is_japanese(x))

  ##-- is englosh
  df["onEn_tags"] = df["tags"].apply(lambda x: x.encode('utf-8').isalnum())
  df["onEn_description"] = df["description"].apply(lambda x: x.encode('utf-8').isalnum())

  ##-- cotain englosh
  df["conEn_title"] = df["title"].apply(lambda x: len(re.findall(r'[a-zA-Z0-9]', x.lower())))
  df["conEn_tags"] = df["tags"].apply(lambda x: len(re.findall(r'[a-zA-Z0-9]', x.lower())))
  df["conEn_description"] = df["description"].apply(lambda x: len(re.findall(r'[a-zA-Z0-9]', x.lower())))


  
  df = df.merge(mean_, how='left', on=["categoryId"])
  df = df.merge(max_, how='left', on=["categoryId"])
  df = df.merge(min_, how='left', on=["categoryId"])
  df = df.merge(std_, how='left', on=["categoryId"])
  df = df.merge(q1_, how='left', on=["categoryId"])
  df = df.merge(q25_, how='left', on=["categoryId"])
  df = df.merge(q5_, how='left', on=["categoryId"])
  df = df.merge(q75_, how='left', on=["categoryId"])
  df = df.merge(q9_, how='left', on=["categoryId"])

  # 出現頻度
  for col in ["categoryId", "channelTitle"]:
    freq = df[col].value_counts()
    df["freq_"+col] = df[col].map(freq)

  return df

In [7]:
data = bool_to_int(data)
data = create_features(data)

In [8]:
###-- 特徴量（公開されている時間（hour））, Google検索値を使用
data["period_hour"] = 8760*(data["collection_date_year"] - data["publishedAt_year"]) \
                      + 730*(data["collection_date_month"] - data["publishedAt_month"]) \
                      + 24*(data["collection_date_day"] - data["publishedAt_day"])

In [9]:
del data["channelId"]
del data["video_id"]
del data["title"]
del data["description"]
del data["thumbnail_link"]
del data["channelTitle"]
del data["tags"]
del data["publishedAt"]
del data["collection_date"]
del data["id"]
##-- LightGBMで"Feature importance"がゼロだったcolumnsを削除
del data["logdislikes"]
del data["dislikes2"]
del data["loglikes"]
del data["count_ja_tag"]
del data["logcomment_count"]
del data["likes2"]
del data["isJa_description"]
del data["isJa_tags"]
del data["isJa_title"]
del data["sqrtlikes"]
del data["sqrtdelta"]
del data["logdelta"]

訓練・検証データセットの準備

In [10]:
##-- 数値変数
nume_cols = [\
              #-- "***"(int64)
              "likes",\
              #-- "***"(int64)
              "dislikes",\
              #-- "***"(int64)
              "comment_count",\
              "like_dislike_ratio",\
              "comments_like_ratio",\
              "comments_dislike_ratio",\
              "num_tags",\
              "tags_point",\
              "count_en_tag",\
              "len_description",\
              "len_title",\
              "conEn_title",\
              "conEn_tags",\
              "conEn_description",\
              #-- "***"(float64)
              "length_tags",\
              #-- "***"(float64)
              "period_hour",\
             ]
##-- カテゴリカル変数
cat_cols = [\
              #-- "***"(int64)
              "categoryId",\
              "freq_categoryId",\
              "freq_channelTitle",\
              # "isJa_title",\
              # "isJa_tags",\
              # "isJa_description",\
              "onEn_tags",\
              "onEn_description",\
              "ishttp_in_dis",\
            ]

In [11]:
##-- 交互作用の作成
def interaction(df, nume_cols):
  k = 0
  for i in range(len(nume_cols)):
      for j in range(len(nume_cols)):
          if i == j:
              pass
          else:
              k += 1
              ##-- 積
              name_seki = "feature_seki" + str(k)
              df[name_seki] = df[nume_cols[i]]*df[nume_cols[j]]
              ##-- 商
              name_shou = "feature_shou" + str(k)
              df[name_shou] = df[nume_cols[i]]*df[nume_cols[j]]
              ##-- 差
              name_sa = "feature_sa" + str(k)
              df[name_sa] = df[nume_cols[i]]*df[nume_cols[j]]
  return df
data = interaction(data, nume_cols)
# X.head()

In [12]:
data.head()

Unnamed: 0,categoryId,likes,dislikes,comment_count,comments_disabled,ratings_disabled,like_dislike_ratio,comments_like_ratio,comments_dislike_ratio,likes_com,dislikes_com,comments_likes,num_tags,length_tags,tags_point,count_en_tag,publishedAt_year,publishedAt_month,publishedAt_day,publishedAt_hour,publishedAt_minute,publishedAt_second,publishedAt_dayofweek,collection_date_year,collection_date_month,collection_date_day,delta,published_delta,collection_delta,ishttp_in_dis,len_description,len_title,onEn_tags,onEn_description,conEn_title,conEn_tags,conEn_description,mean,max,min,std,q1,q25,q5,q75,q9,freq_categoryId,freq_channelTitle,period_hour,feature_seki1,feature_shou1,feature_sa1,feature_seki2,feature_shou2,feature_sa2,feature_seki3,feature_shou3,feature_sa3,feature_seki4,feature_shou4,feature_sa4,feature_seki5,feature_shou5,feature_sa5,feature_seki6,feature_shou6,feature_sa6,feature_seki7,feature_shou7,feature_sa7,feature_seki8,feature_shou8,feature_sa8,feature_seki9,feature_shou9,feature_sa9,feature_seki10,feature_shou10,feature_sa10,feature_seki11,feature_shou11,feature_sa11,feature_seki12,feature_shou12,feature_sa12,feature_seki13,feature_shou13,feature_sa13,feature_seki14,feature_shou14,feature_sa14,feature_seki15,feature_shou15,feature_sa15,feature_seki16,feature_shou16,feature_sa16,feature_seki17,feature_shou17,feature_sa17,feature_seki18,feature_shou18,feature_sa18,feature_seki19,feature_shou19,feature_sa19,feature_seki20,feature_shou20,feature_sa20,feature_seki21,feature_shou21,feature_sa21,feature_seki22,feature_shou22,feature_sa22,feature_seki23,feature_shou23,feature_sa23,feature_seki24,feature_shou24,feature_sa24,feature_seki25,feature_shou25,feature_sa25,feature_seki26,feature_shou26,feature_sa26,feature_seki27,feature_shou27,feature_sa27,feature_seki28,feature_shou28,feature_sa28,feature_seki29,feature_shou29,feature_sa29,feature_seki30,feature_shou30,feature_sa30,feature_seki31,feature_shou31,feature_sa31,feature_seki32,feature_shou32,feature_sa32,feature_seki33,feature_shou33,feature_sa33,feature_seki34,feature_shou34,...,feature_seki191,feature_shou191,feature_sa191,feature_seki192,feature_shou192,feature_sa192,feature_seki193,feature_shou193,feature_sa193,feature_seki194,feature_shou194,feature_sa194,feature_seki195,feature_shou195,feature_sa195,feature_seki196,feature_shou196,feature_sa196,feature_seki197,feature_shou197,feature_sa197,feature_seki198,feature_shou198,feature_sa198,feature_seki199,feature_shou199,feature_sa199,feature_seki200,feature_shou200,feature_sa200,feature_seki201,feature_shou201,feature_sa201,feature_seki202,feature_shou202,feature_sa202,feature_seki203,feature_shou203,feature_sa203,feature_seki204,feature_shou204,feature_sa204,feature_seki205,feature_shou205,feature_sa205,feature_seki206,feature_shou206,feature_sa206,feature_seki207,feature_shou207,feature_sa207,feature_seki208,feature_shou208,feature_sa208,feature_seki209,feature_shou209,feature_sa209,feature_seki210,feature_shou210,feature_sa210,feature_seki211,feature_shou211,feature_sa211,feature_seki212,feature_shou212,feature_sa212,feature_seki213,feature_shou213,feature_sa213,feature_seki214,feature_shou214,feature_sa214,feature_seki215,feature_shou215,feature_sa215,feature_seki216,feature_shou216,feature_sa216,feature_seki217,feature_shou217,feature_sa217,feature_seki218,feature_shou218,feature_sa218,feature_seki219,feature_shou219,feature_sa219,feature_seki220,feature_shou220,feature_sa220,feature_seki221,feature_shou221,feature_sa221,feature_seki222,feature_shou222,feature_sa222,feature_seki223,feature_shou223,feature_sa223,feature_seki224,feature_shou224,feature_sa224,feature_seki225,feature_shou225,feature_sa225,feature_seki226,feature_shou226,feature_sa226,feature_seki227,feature_shou227,feature_sa227,feature_seki228,feature_shou228,feature_sa228,feature_seki229,feature_shou229,feature_sa229,feature_seki230,feature_shou230,feature_sa230,feature_seki231,feature_shou231,feature_sa231,feature_seki232,feature_shou232,feature_sa232,feature_seki233,feature_shou233,feature_sa233,feature_seki234,feature_shou234,feature_sa234,feature_seki235,feature_shou235,feature_sa235,feature_seki236,feature_shou236,feature_sa236,feature_seki237,feature_shou237,feature_sa237,feature_seki238,feature_shou238,feature_sa238,feature_seki239,feature_shou239,feature_sa239,feature_seki240,feature_shou240,feature_sa240
0,20,114,0,7,0,0,114.0,0.06087,7.0,0,0,0,48,315,5434,47,2011,1,9,5,50,33,6,2020,1,2,3309,2086,41,0,61,42,False,False,19,258,38,12.746174,16.78797,3.871201,1.794815,10.331027,11.707977,12.980648,13.965945,14.772192,2275,1,78672,0,0,0,798,798,798,12996.0,12996.0,12996.0,6.93913,6.93913,6.93913,798.0,798.0,798.0,5472,5472,5472,619476,619476,619476,5358,5358,5358,6954,6954,6954,4788,4788,4788,2166,2166,2166,29412,29412,29412,4332,4332,4332,35910,35910,35910,8968608,8968608,8968608,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,798,798,798,0,0,0,798.0,798.0,798.0,0.426087,0.426087,...,10836,10836,10836,4902,4902,4902,9804,9804,9804,81270,81270,81270,20297376,20297376,20297376,4332,4332,4332,0,0,0,266,266,266,4332.0,4332.0,4332.0,2.313043,2.313043,2.313043,266.0,266.0,266.0,1824,1824,1824,206492,206492,206492,1786,1786,1786,2318,2318,2318,1596,1596,1596,722,722,722,9804,9804,9804,11970,11970,11970,2989536,2989536,2989536,35910,35910,35910,0,0,0,2205,2205,2205,35910.0,35910.0,35910.0,19.173913,19.173913,19.173913,2205.0,2205.0,2205.0,15120,15120,15120,1711710,1711710,1711710,14805,14805,14805,19215,19215,19215,13230,13230,13230,5985,5985,5985,81270,81270,81270,11970,11970,11970,24781680,24781680,24781680,8968608,8968608,8968608,0,0,0,550704,550704,550704,8968608.0,8968608.0,8968608.0,4788.730435,4788.730435,4788.730435,550704.0,550704.0,550704.0,3776256,3776256,3776256,427503648,427503648,427503648,3697584,3697584,3697584,4798992,4798992,4798992,3304224,3304224,3304224,1494768,1494768,1494768,20297376,20297376,20297376,2989536,2989536,2989536,24781680,24781680,24781680
1,10,2885,50,111,0,0,56.568627,0.038462,2.176471,0,0,0,19,129,76,5,2012,7,23,3,0,9,0,2020,8,2,2755,2646,48,1,261,35,False,False,18,38,54,13.155115,20.987221,1.791759,2.345023,10.297022,11.692096,13.236976,14.651161,15.922321,17253,5,70306,144250,144250,144250,320235,320235,320235,163200.490196,163200.490196,163200.490196,110.961538,110.961538,110.961538,6279.117647,6279.117647,6279.117647,54815,54815,54815,219260,219260,219260,14425,14425,14425,752985,752985,752985,100975,100975,100975,51930,51930,51930,109630,109630,109630,155790,155790,155790,372165,372165,372165,202832810,202832810,202832810,144250,144250,144250,5550,5550,5550,2828.431373,2828.431373,2828.431373,1.923077,1.923077,1.923077,108.823529,108.823529,108.823529,950,950,950,3800,3800,3800,250,250,250,13050,13050,13050,1750,1750,1750,900,900,900,1900,1900,1900,2700,2700,2700,6450,6450,6450,3515300,3515300,3515300,320235,320235,320235,5550,5550,5550,6279.117647,6279.117647,6279.117647,4.269231,4.269231,...,1330,1330,1330,684,684,684,2052,2052,2052,4902,4902,4902,2671628,2671628,2671628,155790,155790,155790,2700,2700,2700,5994,5994,5994,3054.705882,3054.705882,3054.705882,2.076923,2.076923,2.076923,117.529412,117.529412,117.529412,1026,1026,1026,4104,4104,4104,270,270,270,14094,14094,14094,1890,1890,1890,972,972,972,2052,2052,2052,6966,6966,6966,3796524,3796524,3796524,372165,372165,372165,6450,6450,6450,14319,14319,14319,7297.352941,7297.352941,7297.352941,4.961538,4.961538,4.961538,280.764706,280.764706,280.764706,2451,2451,2451,9804,9804,9804,645,645,645,33669,33669,33669,4515,4515,4515,2322,2322,2322,4902,4902,4902,6966,6966,6966,9069474,9069474,9069474,202832810,202832810,202832810,3515300,3515300,3515300,7803966,7803966,7803966,3977114.0,3977114.0,3977114.0,2704.076923,2704.076923,2704.076923,153018.941176,153018.941176,153018.941176,1335814,1335814,1335814,5343256,5343256,5343256,351530,351530,351530,18349866,18349866,18349866,2460710,2460710,2460710,1265508,1265508,1265508,2671628,2671628,2671628,3796524,3796524,3796524,9069474,9069474,9069474
2,24,133,17,14,0,0,7.388889,0.104478,0.777778,0,0,0,9,52,49,6,2007,7,26,13,54,9,3,2020,14,1,4554,823,23,1,45,22,False,False,11,33,36,12.621979,19.241236,1.098612,2.240565,9.784028,11.27116,12.789547,14.163681,15.277146,6873,1,118390,2261,2261,2261,1862,1862,1862,982.722222,982.722222,982.722222,13.895522,13.895522,13.895522,103.444444,103.444444,103.444444,1197,1197,1197,6517,6517,6517,798,798,798,5985,5985,5985,2926,2926,2926,1463,1463,1463,4389,4389,4389,4788,4788,4788,6916,6916,6916,15745870,15745870,15745870,2261,2261,2261,238,238,238,125.611111,125.611111,125.611111,1.776119,1.776119,1.776119,13.222222,13.222222,13.222222,153,153,153,833,833,833,102,102,102,765,765,765,374,374,374,187,187,187,561,561,561,612,612,612,884,884,884,2012630,2012630,2012630,1862,1862,1862,238,238,238,103.444444,103.444444,103.444444,1.462687,1.462687,...,726,726,726,363,363,363,1188,1188,1188,1716,1716,1716,3906870,3906870,3906870,4788,4788,4788,612,612,612,504,504,504,266.0,266.0,266.0,3.761194,3.761194,3.761194,28.0,28.0,28.0,324,324,324,1764,1764,1764,216,216,216,1620,1620,1620,792,792,792,396,396,396,1188,1188,1188,1872,1872,1872,4262040,4262040,4262040,6916,6916,6916,884,884,884,728,728,728,384.222222,384.222222,384.222222,5.432836,5.432836,5.432836,40.444444,40.444444,40.444444,468,468,468,2548,2548,2548,312,312,312,2340,2340,2340,1144,1144,1144,572,572,572,1716,1716,1716,1872,1872,1872,6156280,6156280,6156280,15745870,15745870,15745870,2012630,2012630,2012630,1657460,1657460,1657460,874770.6,874770.6,874770.6,12369.104478,12369.104478,12369.104478,92081.111111,92081.111111,92081.111111,1065510,1065510,1065510,5801110,5801110,5801110,710340,710340,710340,5327550,5327550,5327550,2604580,2604580,2604580,1302290,1302290,1302290,3906870,3906870,3906870,4262040,4262040,4262040,6156280,6156280,6156280
3,22,287,51,173,0,0,5.519231,0.600694,3.326923,0,0,0,1,6,2,1,2005,5,15,2,38,43,6,2019,22,12,5333,20,0,0,30,20,True,False,17,6,21,11.934383,17.548095,2.079442,2.559307,8.288986,10.549596,12.19682,13.755893,15.00633,2207,10,134978,14637,14637,14637,49651,49651,49651,1584.019231,1584.019231,1584.019231,172.399306,172.399306,172.399306,954.826923,954.826923,954.826923,287,287,287,574,574,574,287,287,287,8610,8610,8610,5740,5740,5740,4879,4879,4879,1722,1722,1722,6027,6027,6027,1722,1722,1722,38738686,38738686,38738686,14637,14637,14637,8823,8823,8823,281.480769,281.480769,281.480769,30.635417,30.635417,30.635417,169.673077,169.673077,169.673077,51,51,51,102,102,102,51,51,51,1530,1530,1530,1020,1020,1020,867,867,867,306,306,306,1071,1071,1071,306,306,306,6883878,6883878,6883878,49651,49651,49651,8823,8823,8823,954.826923,954.826923,954.826923,103.920139,103.920139,...,120,120,120,102,102,102,126,126,126,36,36,36,809868,809868,809868,6027,6027,6027,1071,1071,1071,3633,3633,3633,115.903846,115.903846,115.903846,12.614583,12.614583,12.614583,69.865385,69.865385,69.865385,21,21,21,42,42,42,21,21,21,630,630,630,420,420,420,357,357,357,126,126,126,126,126,126,2834538,2834538,2834538,1722,1722,1722,306,306,306,1038,1038,1038,33.115385,33.115385,33.115385,3.604167,3.604167,3.604167,19.961538,19.961538,19.961538,6,6,6,12,12,12,6,6,6,180,180,180,120,120,120,102,102,102,36,36,36,126,126,126,809868,809868,809868,38738686,38738686,38738686,6883878,6883878,6883878,23351194,23351194,23351194,744974.7,744974.7,744974.7,81080.534722,81080.534722,81080.534722,449061.423077,449061.423077,449061.423077,134978,134978,134978,269956,269956,269956,134978,134978,134978,4049340,4049340,4049340,2699560,2699560,2699560,2294626,2294626,2294626,809868,809868,809868,2834538,2834538,2834538,809868,809868,809868
4,10,178,6,17,0,0,25.428571,0.094972,2.428571,0,0,0,12,83,850,2,2007,9,9,9,52,47,6,2020,8,1,4503,868,17,2,138,13,False,False,0,10,79,13.155115,20.987221,1.791759,2.345023,10.297022,11.692096,13.236976,14.651161,15.922321,17253,2,112958,1068,1068,1068,3026,3026,3026,4526.285714,4526.285714,4526.285714,16.905028,16.905028,16.905028,432.285714,432.285714,432.285714,2136,2136,2136,151300,151300,151300,356,356,356,24564,24564,24564,2314,2314,2314,0,0,0,1780,1780,1780,14062,14062,14062,14774,14774,14774,20106524,20106524,20106524,1068,1068,1068,102,102,102,152.571429,152.571429,152.571429,0.569832,0.569832,0.569832,14.571429,14.571429,14.571429,72,72,72,5100,5100,5100,12,12,12,828,828,828,78,78,78,0,0,0,60,60,60,474,474,474,498,498,498,677748,677748,677748,3026,3026,3026,102,102,102,432.285714,432.285714,432.285714,1.614525,1.614525,...,130,130,130,0,0,0,790,790,790,830,830,830,1129580,1129580,1129580,14062,14062,14062,474,474,474,1343,1343,1343,2008.857143,2008.857143,2008.857143,7.502793,7.502793,7.502793,191.857143,191.857143,191.857143,948,948,948,67150,67150,67150,158,158,158,10902,10902,10902,1027,1027,1027,0,0,0,790,790,790,6557,6557,6557,8923682,8923682,8923682,14774,14774,14774,498,498,498,1411,1411,1411,2110.571429,2110.571429,2110.571429,7.882682,7.882682,7.882682,201.571429,201.571429,201.571429,996,996,996,70550,70550,70550,166,166,166,11454,11454,11454,1079,1079,1079,0,0,0,830,830,830,6557,6557,6557,9375514,9375514,9375514,20106524,20106524,20106524,677748,677748,677748,1920286,1920286,1920286,2872361.0,2872361.0,2872361.0,10727.854749,10727.854749,10727.854749,274326.571429,274326.571429,274326.571429,1355496,1355496,1355496,96014300,96014300,96014300,225916,225916,225916,15588204,15588204,15588204,1468454,1468454,1468454,0,0,0,1129580,1129580,1129580,8923682,8923682,8923682,9375514,9375514,9375514


In [13]:
##-- Label Encoding for categorical variable
from sklearn import preprocessing

for name in cat_cols:
  ##-- 学習データに基づいて定義する
  le = preprocessing.LabelEncoder()
  ##-- 訓練とテスト用の併せたカテゴリーにする
  # X_fit = X[name].append(X_pre[name])
  # le.fit(X_fit)
  le.fit(data[name])

  data[name] = le.transform(data[name])

data.head()

Unnamed: 0,categoryId,likes,dislikes,comment_count,comments_disabled,ratings_disabled,like_dislike_ratio,comments_like_ratio,comments_dislike_ratio,likes_com,dislikes_com,comments_likes,num_tags,length_tags,tags_point,count_en_tag,publishedAt_year,publishedAt_month,publishedAt_day,publishedAt_hour,publishedAt_minute,publishedAt_second,publishedAt_dayofweek,collection_date_year,collection_date_month,collection_date_day,delta,published_delta,collection_delta,ishttp_in_dis,len_description,len_title,onEn_tags,onEn_description,conEn_title,conEn_tags,conEn_description,mean,max,min,std,q1,q25,q5,q75,q9,freq_categoryId,freq_channelTitle,period_hour,feature_seki1,feature_shou1,feature_sa1,feature_seki2,feature_shou2,feature_sa2,feature_seki3,feature_shou3,feature_sa3,feature_seki4,feature_shou4,feature_sa4,feature_seki5,feature_shou5,feature_sa5,feature_seki6,feature_shou6,feature_sa6,feature_seki7,feature_shou7,feature_sa7,feature_seki8,feature_shou8,feature_sa8,feature_seki9,feature_shou9,feature_sa9,feature_seki10,feature_shou10,feature_sa10,feature_seki11,feature_shou11,feature_sa11,feature_seki12,feature_shou12,feature_sa12,feature_seki13,feature_shou13,feature_sa13,feature_seki14,feature_shou14,feature_sa14,feature_seki15,feature_shou15,feature_sa15,feature_seki16,feature_shou16,feature_sa16,feature_seki17,feature_shou17,feature_sa17,feature_seki18,feature_shou18,feature_sa18,feature_seki19,feature_shou19,feature_sa19,feature_seki20,feature_shou20,feature_sa20,feature_seki21,feature_shou21,feature_sa21,feature_seki22,feature_shou22,feature_sa22,feature_seki23,feature_shou23,feature_sa23,feature_seki24,feature_shou24,feature_sa24,feature_seki25,feature_shou25,feature_sa25,feature_seki26,feature_shou26,feature_sa26,feature_seki27,feature_shou27,feature_sa27,feature_seki28,feature_shou28,feature_sa28,feature_seki29,feature_shou29,feature_sa29,feature_seki30,feature_shou30,feature_sa30,feature_seki31,feature_shou31,feature_sa31,feature_seki32,feature_shou32,feature_sa32,feature_seki33,feature_shou33,feature_sa33,feature_seki34,feature_shou34,...,feature_seki191,feature_shou191,feature_sa191,feature_seki192,feature_shou192,feature_sa192,feature_seki193,feature_shou193,feature_sa193,feature_seki194,feature_shou194,feature_sa194,feature_seki195,feature_shou195,feature_sa195,feature_seki196,feature_shou196,feature_sa196,feature_seki197,feature_shou197,feature_sa197,feature_seki198,feature_shou198,feature_sa198,feature_seki199,feature_shou199,feature_sa199,feature_seki200,feature_shou200,feature_sa200,feature_seki201,feature_shou201,feature_sa201,feature_seki202,feature_shou202,feature_sa202,feature_seki203,feature_shou203,feature_sa203,feature_seki204,feature_shou204,feature_sa204,feature_seki205,feature_shou205,feature_sa205,feature_seki206,feature_shou206,feature_sa206,feature_seki207,feature_shou207,feature_sa207,feature_seki208,feature_shou208,feature_sa208,feature_seki209,feature_shou209,feature_sa209,feature_seki210,feature_shou210,feature_sa210,feature_seki211,feature_shou211,feature_sa211,feature_seki212,feature_shou212,feature_sa212,feature_seki213,feature_shou213,feature_sa213,feature_seki214,feature_shou214,feature_sa214,feature_seki215,feature_shou215,feature_sa215,feature_seki216,feature_shou216,feature_sa216,feature_seki217,feature_shou217,feature_sa217,feature_seki218,feature_shou218,feature_sa218,feature_seki219,feature_shou219,feature_sa219,feature_seki220,feature_shou220,feature_sa220,feature_seki221,feature_shou221,feature_sa221,feature_seki222,feature_shou222,feature_sa222,feature_seki223,feature_shou223,feature_sa223,feature_seki224,feature_shou224,feature_sa224,feature_seki225,feature_shou225,feature_sa225,feature_seki226,feature_shou226,feature_sa226,feature_seki227,feature_shou227,feature_sa227,feature_seki228,feature_shou228,feature_sa228,feature_seki229,feature_shou229,feature_sa229,feature_seki230,feature_shou230,feature_sa230,feature_seki231,feature_shou231,feature_sa231,feature_seki232,feature_shou232,feature_sa232,feature_seki233,feature_shou233,feature_sa233,feature_seki234,feature_shou234,feature_sa234,feature_seki235,feature_shou235,feature_sa235,feature_seki236,feature_shou236,feature_sa236,feature_seki237,feature_shou237,feature_sa237,feature_seki238,feature_shou238,feature_sa238,feature_seki239,feature_shou239,feature_sa239,feature_seki240,feature_shou240,feature_sa240
0,6,114,0,7,0,0,114.0,0.06087,7.0,0,0,0,48,315,5434,47,2011,1,9,5,50,33,6,2020,1,2,3309,2086,41,0,61,42,0,0,19,258,38,12.746174,16.78797,3.871201,1.794815,10.331027,11.707977,12.980648,13.965945,14.772192,12,0,78672,0,0,0,798,798,798,12996.0,12996.0,12996.0,6.93913,6.93913,6.93913,798.0,798.0,798.0,5472,5472,5472,619476,619476,619476,5358,5358,5358,6954,6954,6954,4788,4788,4788,2166,2166,2166,29412,29412,29412,4332,4332,4332,35910,35910,35910,8968608,8968608,8968608,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,798,798,798,0,0,0,798.0,798.0,798.0,0.426087,0.426087,...,10836,10836,10836,4902,4902,4902,9804,9804,9804,81270,81270,81270,20297376,20297376,20297376,4332,4332,4332,0,0,0,266,266,266,4332.0,4332.0,4332.0,2.313043,2.313043,2.313043,266.0,266.0,266.0,1824,1824,1824,206492,206492,206492,1786,1786,1786,2318,2318,2318,1596,1596,1596,722,722,722,9804,9804,9804,11970,11970,11970,2989536,2989536,2989536,35910,35910,35910,0,0,0,2205,2205,2205,35910.0,35910.0,35910.0,19.173913,19.173913,19.173913,2205.0,2205.0,2205.0,15120,15120,15120,1711710,1711710,1711710,14805,14805,14805,19215,19215,19215,13230,13230,13230,5985,5985,5985,81270,81270,81270,11970,11970,11970,24781680,24781680,24781680,8968608,8968608,8968608,0,0,0,550704,550704,550704,8968608.0,8968608.0,8968608.0,4788.730435,4788.730435,4788.730435,550704.0,550704.0,550704.0,3776256,3776256,3776256,427503648,427503648,427503648,3697584,3697584,3697584,4798992,4798992,4798992,3304224,3304224,3304224,1494768,1494768,1494768,20297376,20297376,20297376,2989536,2989536,2989536,24781680,24781680,24781680
1,2,2885,50,111,0,0,56.568627,0.038462,2.176471,0,0,0,19,129,76,5,2012,7,23,3,0,9,0,2020,8,2,2755,2646,48,1,261,35,0,0,18,38,54,13.155115,20.987221,1.791759,2.345023,10.297022,11.692096,13.236976,14.651161,15.922321,17,4,70306,144250,144250,144250,320235,320235,320235,163200.490196,163200.490196,163200.490196,110.961538,110.961538,110.961538,6279.117647,6279.117647,6279.117647,54815,54815,54815,219260,219260,219260,14425,14425,14425,752985,752985,752985,100975,100975,100975,51930,51930,51930,109630,109630,109630,155790,155790,155790,372165,372165,372165,202832810,202832810,202832810,144250,144250,144250,5550,5550,5550,2828.431373,2828.431373,2828.431373,1.923077,1.923077,1.923077,108.823529,108.823529,108.823529,950,950,950,3800,3800,3800,250,250,250,13050,13050,13050,1750,1750,1750,900,900,900,1900,1900,1900,2700,2700,2700,6450,6450,6450,3515300,3515300,3515300,320235,320235,320235,5550,5550,5550,6279.117647,6279.117647,6279.117647,4.269231,4.269231,...,1330,1330,1330,684,684,684,2052,2052,2052,4902,4902,4902,2671628,2671628,2671628,155790,155790,155790,2700,2700,2700,5994,5994,5994,3054.705882,3054.705882,3054.705882,2.076923,2.076923,2.076923,117.529412,117.529412,117.529412,1026,1026,1026,4104,4104,4104,270,270,270,14094,14094,14094,1890,1890,1890,972,972,972,2052,2052,2052,6966,6966,6966,3796524,3796524,3796524,372165,372165,372165,6450,6450,6450,14319,14319,14319,7297.352941,7297.352941,7297.352941,4.961538,4.961538,4.961538,280.764706,280.764706,280.764706,2451,2451,2451,9804,9804,9804,645,645,645,33669,33669,33669,4515,4515,4515,2322,2322,2322,4902,4902,4902,6966,6966,6966,9069474,9069474,9069474,202832810,202832810,202832810,3515300,3515300,3515300,7803966,7803966,7803966,3977114.0,3977114.0,3977114.0,2704.076923,2704.076923,2704.076923,153018.941176,153018.941176,153018.941176,1335814,1335814,1335814,5343256,5343256,5343256,351530,351530,351530,18349866,18349866,18349866,2460710,2460710,2460710,1265508,1265508,1265508,2671628,2671628,2671628,3796524,3796524,3796524,9069474,9069474,9069474
2,9,133,17,14,0,0,7.388889,0.104478,0.777778,0,0,0,9,52,49,6,2007,7,26,13,54,9,3,2020,14,1,4554,823,23,1,45,22,0,0,11,33,36,12.621979,19.241236,1.098612,2.240565,9.784028,11.27116,12.789547,14.163681,15.277146,16,0,118390,2261,2261,2261,1862,1862,1862,982.722222,982.722222,982.722222,13.895522,13.895522,13.895522,103.444444,103.444444,103.444444,1197,1197,1197,6517,6517,6517,798,798,798,5985,5985,5985,2926,2926,2926,1463,1463,1463,4389,4389,4389,4788,4788,4788,6916,6916,6916,15745870,15745870,15745870,2261,2261,2261,238,238,238,125.611111,125.611111,125.611111,1.776119,1.776119,1.776119,13.222222,13.222222,13.222222,153,153,153,833,833,833,102,102,102,765,765,765,374,374,374,187,187,187,561,561,561,612,612,612,884,884,884,2012630,2012630,2012630,1862,1862,1862,238,238,238,103.444444,103.444444,103.444444,1.462687,1.462687,...,726,726,726,363,363,363,1188,1188,1188,1716,1716,1716,3906870,3906870,3906870,4788,4788,4788,612,612,612,504,504,504,266.0,266.0,266.0,3.761194,3.761194,3.761194,28.0,28.0,28.0,324,324,324,1764,1764,1764,216,216,216,1620,1620,1620,792,792,792,396,396,396,1188,1188,1188,1872,1872,1872,4262040,4262040,4262040,6916,6916,6916,884,884,884,728,728,728,384.222222,384.222222,384.222222,5.432836,5.432836,5.432836,40.444444,40.444444,40.444444,468,468,468,2548,2548,2548,312,312,312,2340,2340,2340,1144,1144,1144,572,572,572,1716,1716,1716,1872,1872,1872,6156280,6156280,6156280,15745870,15745870,15745870,2012630,2012630,2012630,1657460,1657460,1657460,874770.6,874770.6,874770.6,12369.104478,12369.104478,12369.104478,92081.111111,92081.111111,92081.111111,1065510,1065510,1065510,5801110,5801110,5801110,710340,710340,710340,5327550,5327550,5327550,2604580,2604580,2604580,1302290,1302290,1302290,3906870,3906870,3906870,4262040,4262040,4262040,6156280,6156280,6156280
3,7,287,51,173,0,0,5.519231,0.600694,3.326923,0,0,0,1,6,2,1,2005,5,15,2,38,43,6,2019,22,12,5333,20,0,0,30,20,1,0,17,6,21,11.934383,17.548095,2.079442,2.559307,8.288986,10.549596,12.19682,13.755893,15.00633,11,9,134978,14637,14637,14637,49651,49651,49651,1584.019231,1584.019231,1584.019231,172.399306,172.399306,172.399306,954.826923,954.826923,954.826923,287,287,287,574,574,574,287,287,287,8610,8610,8610,5740,5740,5740,4879,4879,4879,1722,1722,1722,6027,6027,6027,1722,1722,1722,38738686,38738686,38738686,14637,14637,14637,8823,8823,8823,281.480769,281.480769,281.480769,30.635417,30.635417,30.635417,169.673077,169.673077,169.673077,51,51,51,102,102,102,51,51,51,1530,1530,1530,1020,1020,1020,867,867,867,306,306,306,1071,1071,1071,306,306,306,6883878,6883878,6883878,49651,49651,49651,8823,8823,8823,954.826923,954.826923,954.826923,103.920139,103.920139,...,120,120,120,102,102,102,126,126,126,36,36,36,809868,809868,809868,6027,6027,6027,1071,1071,1071,3633,3633,3633,115.903846,115.903846,115.903846,12.614583,12.614583,12.614583,69.865385,69.865385,69.865385,21,21,21,42,42,42,21,21,21,630,630,630,420,420,420,357,357,357,126,126,126,126,126,126,2834538,2834538,2834538,1722,1722,1722,306,306,306,1038,1038,1038,33.115385,33.115385,33.115385,3.604167,3.604167,3.604167,19.961538,19.961538,19.961538,6,6,6,12,12,12,6,6,6,180,180,180,120,120,120,102,102,102,36,36,36,126,126,126,809868,809868,809868,38738686,38738686,38738686,6883878,6883878,6883878,23351194,23351194,23351194,744974.7,744974.7,744974.7,81080.534722,81080.534722,81080.534722,449061.423077,449061.423077,449061.423077,134978,134978,134978,269956,269956,269956,134978,134978,134978,4049340,4049340,4049340,2699560,2699560,2699560,2294626,2294626,2294626,809868,809868,809868,2834538,2834538,2834538,809868,809868,809868
4,2,178,6,17,0,0,25.428571,0.094972,2.428571,0,0,0,12,83,850,2,2007,9,9,9,52,47,6,2020,8,1,4503,868,17,2,138,13,0,0,0,10,79,13.155115,20.987221,1.791759,2.345023,10.297022,11.692096,13.236976,14.651161,15.922321,17,1,112958,1068,1068,1068,3026,3026,3026,4526.285714,4526.285714,4526.285714,16.905028,16.905028,16.905028,432.285714,432.285714,432.285714,2136,2136,2136,151300,151300,151300,356,356,356,24564,24564,24564,2314,2314,2314,0,0,0,1780,1780,1780,14062,14062,14062,14774,14774,14774,20106524,20106524,20106524,1068,1068,1068,102,102,102,152.571429,152.571429,152.571429,0.569832,0.569832,0.569832,14.571429,14.571429,14.571429,72,72,72,5100,5100,5100,12,12,12,828,828,828,78,78,78,0,0,0,60,60,60,474,474,474,498,498,498,677748,677748,677748,3026,3026,3026,102,102,102,432.285714,432.285714,432.285714,1.614525,1.614525,...,130,130,130,0,0,0,790,790,790,830,830,830,1129580,1129580,1129580,14062,14062,14062,474,474,474,1343,1343,1343,2008.857143,2008.857143,2008.857143,7.502793,7.502793,7.502793,191.857143,191.857143,191.857143,948,948,948,67150,67150,67150,158,158,158,10902,10902,10902,1027,1027,1027,0,0,0,790,790,790,6557,6557,6557,8923682,8923682,8923682,14774,14774,14774,498,498,498,1411,1411,1411,2110.571429,2110.571429,2110.571429,7.882682,7.882682,7.882682,201.571429,201.571429,201.571429,996,996,996,70550,70550,70550,166,166,166,11454,11454,11454,1079,1079,1079,0,0,0,830,830,830,6557,6557,6557,9375514,9375514,9375514,20106524,20106524,20106524,677748,677748,677748,1920286,1920286,1920286,2872361.0,2872361.0,2872361.0,10727.854749,10727.854749,10727.854749,274326.571429,274326.571429,274326.571429,1355496,1355496,1355496,96014300,96014300,96014300,225916,225916,225916,15588204,15588204,15588204,1468454,1468454,1468454,0,0,0,1129580,1129580,1129580,8923682,8923682,8923682,9375514,9375514,9375514


In [14]:
###-- データセットのメモリ削減 --###
##-- Ref. https://www.kaggle.com/fabiendaniel/elo-world
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

print('start size: {:5.2f} Mb'.format(data.memory_usage().sum() / 1024**2))
data = reduce_mem_usage(data)

start size: 289.07 Mb
Mem. usage decreased to 148.01 Mb (48.8% reduction)


In [15]:
##-- 説明変数のみ取り出す
# data = data[input_name]

##-- 訓練データ(X)とテストデータ(X_test)に再分割
X = data.iloc[:len(Y), :]
X_pre = data.iloc[len(Y):, :]

print(X.shape[0], X_pre.shape[0]) ##-- 19720 and 29582

del data

19720 29582


In [3]:
###-- Import XGboost and MultiClass logloss --###
import xgboost as xgb
from sklearn.metrics import log_loss, mean_squared_log_error


###-- 全データ投入して学習 --###
###-- KFold Cross validation --###
num_splits=5
from sklearn.model_selection import KFold
kf = KFold(n_splits=num_splits, shuffle=True, random_state=random_state)

###--  Set params  --###
##-- Ref. https://qiita.com/FJyusk56/items/0649f4362587261bd57a
##-- Ref. http://kamonohashiperry.com/archives/209
params = {
          "nthread": -1,
          "booster": 'gbtree',
          'objective': 'reg:squarederror',
          'eval_metric': 'rmse',
          'tree_method': 'hist',
          "importance_type": 'gain',
          "min_child_weight": 1,
          "colsample_bytree": 0.8, #--使用する説明変数の比率(これがないとrandom_seedが機能しない)
          ##-- Optuna target
          ##-- Should be optimized
          "eta": 0.05, #-- "lr"の加速率 0.05
          # "learning_rate": 0.01,
          "max_depth": 7, #-- 精度に効いてくる
          'lambda': 1., #-- 1.8e-08
          "gamma": 1., #-- 0.002
          "reg_alpha": 0.9,
          "reg_lambda": 5e-06,
          ##--
          "silent": 0, #-- Message is 0: ON, 1: OFF
          # "seed": 99,
          }

n_estimators=1000

models, loss_list = [], []
num_models = 1
i = 0
for i in range(num_models):
  params_seed = {'seed': random_state + i}
  params.update(params_seed)
  print()
  print("Random seed:", params["seed"])
  print()
  for train_idx, val_idx in kf.split(X):
    ###-- Set the dataset --###
    X_kfold_train, Y_kfold_train = X.iloc[train_idx, :], Y.iloc[train_idx]
    X_kfold_val, Y_kfold_val = X.iloc[val_idx, :], Y.iloc[val_idx]

    ###-- Set dataset --###
    train_data_set = xgb.DMatrix(X_kfold_train, label=Y_kfold_train)
    test_data_set = xgb.DMatrix(X_kfold_val, Y_kfold_val)
    X_val_xgbm = xgb.DMatrix(X_kfold_val)

    watchlist = [(train_data_set, "train"), (test_data_set, "eval")]

    xgbm = xgb.train(
                      params, 
                      train_data_set,
                      n_estimators,
                      evals=[(train_data_set, 'train'),(test_data_set, 'eval')],
                      early_stopping_rounds=50, 
                      verbose_eval=5000,
                    )
    models.append(xgbm)

    ##-- Confirm Scores
    Y_val_pre = xgbm.predict(X_val_xgbm)

    ##-- loss: RMSE  np.sqrt(mean_squared_log_error(np.exp(y_val), oof)) = 
    loss_rmsle = np.sqrt( mean_squared_log_error(np.exp(Y_val_pre), np.exp(Y_kfold_val)) )
    loss_list.append(loss_rmsle)
    ##-- RMSLE lossを出力
    print()
    print("RMSLE", loss_rmsle)
    print()



In [18]:
print("train-average loss:", np.array(loss_list).sum() / len(models))

train-average loss: 0.7977200363039035


In [19]:
def predict_ensemble(models, num_model, X_pre):
    Y_pre_list = []
    for i in range(num_model):
        Y_pre = models[i].predict(X_pre)
        Y_pre_list.append(Y_pre)

    Y_pre_list = np.array(Y_pre_list)

    Y_pre_submit_tem = Y_pre_list[0]
    for i in range(1, num_model):
        Y_pre_submit_tem += Y_pre_list[i]

    Y_pre_submit = Y_pre_submit_tem / float(num_model)

    return Y_pre_submit


###------------------------------------###
###        結果ファイルへの書き出し        ###
###------------------------------------###
from pathlib import Path
root = Path(PATH_submit)

##-- Prediction for test data
Y_pre_submit = np.exp(predict_ensemble(models, len(models), xgb.DMatrix(X_pre)))

submit = pd.DataFrame({'y': Y_pre_submit})
submit.index.name = 'id'
submit.index = submit.index + 1
submit.to_csv(root.joinpath("submission_xgboost_5kfold_Seed"+str(random_state)+".csv"), index=True)