# 056_Model_nb053ベースでハイパラチューニング

In [None]:
# basic
import os
import gc
import sys
import yaml
import warnings
import random
from pathlib import Path
from glob import glob
from tqdm import tqdm_notebook as tqdm
import hashlib
import pickle
warnings.filterwarnings('ignore')

# usual
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import re

# preprocess
from fasttext import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import texthero as hero
import nltk
import cv2
from gensim.models import word2vec, KeyedVectors

# LightGBM
import lightgbm as lgb
#import optuna.integration.lightgbm as lgb  # チューニング用

# visualization
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from pandas_profiling import ProfileReport  # profile report を作る用

# preprocessing
from sklearn.preprocessing import LabelEncoder

# plot settings
# Do not force an outline around patches (bars, histogram bins, ...).
plt.rcParams["patch.force_edgecolor"] = False
# The generic family is spelled 'sans-serif' (hyphen); 'sans_serif' is not a
# family name matplotlib knows and would only produce font lookup warnings.
plt.rcParams['font.family'] = 'sans-serif'
sns.set(style="whitegrid",  palette="muted", color_codes=True, rc={'grid.linestyle': '--'})
# Named colours taken from seaborn's xkcd table for reuse in plots.
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

# plot extensions
import japanize_matplotlib
from matplotlib_venn import venn2

from tqdm import tqdm


In [None]:
# Fix the random seeds used by this notebook
SEED = 2021


def set_seed(seed=2021):
    """Seed the random number sources used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both NumPy's
    global generator and Python's ``random`` module with ``seed``.

    Args:
        seed: integer seed value.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


set_seed(SEED)

In [None]:
# Register tqdm with pandas so that `progress_apply` is available on Series/DataFrames.
tqdm.pandas()

In [None]:
# Build the trial ID: a manually chosen notebook prefix plus the current timestamp.
trial_prefix = 'nb056'  # <- set manually
dttm_now = datetime.now().strftime('%Y%m%d_%H%M%S')
trial_id = f'{trial_prefix}_{dttm_now}'

print(trial_prefix)
print(trial_id)

In [None]:
# Output directory for this notebook (created, including parents, if missing).
OUTPUT_DIR = Path(f"../02_outputs/{trial_prefix}")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Directory holding the input CSV files (kept as a str; file names are appended with `+`).
DATA_DIR = '../00_input/atmacup10_dataset/'

In [None]:
# Execution flags for the optional processing steps
RUN_W2V = True
#RUN_TFIDF = True
RUN_FEAT_SELECTION = True
#RUN_FEAT_SELECTION_IMPORTANCE = False
RUN_OPTUNA = True

# Root directory where generated features are stored
FEAT_DIR = Path(f"../03_feature/")

# Notebook IDs used to locate cached inputs and to name this notebook's outputs
feat_input = 'nb036'
feat_output = trial_prefix

feat_selection_input = 'nb042'
feat_selection_output = trial_prefix

tuned_model_input = 'nb042'
tuned_model_output = trial_prefix

In [None]:
# train/test
train_base = pd.read_csv(DATA_DIR + 'train.csv')
test_base = pd.read_csv(DATA_DIR + 'test.csv')

# Attributes of the artworks (join on object_id)
color = pd.read_csv(DATA_DIR + 'color.csv')
palette = pd.read_csv(DATA_DIR + 'palette.csv')
material = pd.read_csv(DATA_DIR + 'material.csv')
historical_person = pd.read_csv(DATA_DIR + 'historical_person.csv')
object_collection = pd.read_csv(DATA_DIR + 'object_collection.csv')
production_place = pd.read_csv(DATA_DIR + 'production_place.csv')
technique = pd.read_csv(DATA_DIR + 'technique.csv')

# Maker information (join on name)
maker = pd.read_csv(DATA_DIR + 'maker.csv')

# Makers associated with each work (joins to the works on object_id, to maker info on name)
principal_maker = pd.read_csv(DATA_DIR + 'principal_maker.csv')

# How each maker was involved in the work (not sure how to use this yet; on hold)
principal_maker_occupation = pd.read_csv(DATA_DIR + 'principal_maker_occupation.csv')

# sample_submission
atmacup10__sample_submission = pd.read_csv(DATA_DIR + 'atmacup10__sample_submission.csv')

# 1. 事前確認

# 2. 特徴作成

# merge tables

In [None]:
def merge_table(input_df, right_df, left_on, right_on, right_prefix=None):
    """Left-join ``right_df`` onto ``input_df`` and drop the redundant join key.

    Every column of ``right_df`` is renamed with ``right_prefix`` before the
    join, so the joined columns cannot collide with columns of ``input_df``.

    Args:
        input_df: left table; all of its rows are kept.
        right_df: table to join.
        left_on: key column name in ``input_df``.
        right_on: key column name in ``right_df`` (before prefixing).
        right_prefix: prefix for the columns of ``right_df``; ``None`` means
            no prefix.

    Returns:
        A new DataFrame; neither input is modified.
    """
    # The default of None used to fail on string concatenation; treat it as "no prefix".
    prefix = '' if right_prefix is None else right_prefix
    right_key = prefix + right_on

    join_df = right_df.add_prefix(prefix)
    out_df = input_df.merge(join_df, left_on=left_on, right_on=right_key, how='left')

    # When both keys share one name the merge yields a single key column,
    # which must stay; otherwise the right-hand key is a duplicate.
    if right_key != left_on:
        out_df = out_df.drop(columns=right_key)
    return out_df

In [None]:
# Attach the maker table to train/test, matching `principal_maker` against the maker `name`;
# the joined columns receive the `principal_maker_` prefix.
train = merge_table(train_base, maker, left_on='principal_maker', right_on='name', right_prefix='principal_maker_')
test  = merge_table(test_base, maker, left_on='principal_maker', right_on='name', right_prefix='principal_maker_')

## merge check

In [None]:
# Inspect the merged train table: shape and the first rows (transposed).
print(train.shape)
train.head().T

In [None]:
# Inspect the merged test table: shape and the first rows (transposed).
print(test.shape)
test.head().T

ok!

## 前処理＆特徴量作成

In [None]:
# Directories for cached Word2Vec features: IN is read from, OUT is written to (and created here).
FEAT_W2V_DIR_IN = FEAT_DIR/f'w2v/{feat_input}'
FEAT_W2V_DIR_OUT = FEAT_DIR/f'w2v/{feat_output}'
FEAT_W2V_DIR_OUT.mkdir(exist_ok=True, parents=True)

### preprocess: color

In [None]:
# https://www.guruguru.science/competitions/16/discussions/a69d0d70-3e50-4eb7-9d65-22bc87e380af/
# Parse the `hex` colour code of every row into separate R, G, B columns.
import pandas as pd
from PIL import ImageColor

color_rgb = pd.DataFrame(color['hex'].str.strip().map(ImageColor.getrgb)\
                         .values.tolist(), columns=['color_r', 'color_g', 'color_b'])
# Column-wise concat: rows are matched by index label, then the raw hex column is dropped.
color_rgb_df = pd.concat([color, color_rgb], axis=1).drop('hex', axis=1)

In [None]:
import colorsys

def rgb_encoder(input_df: pd.DataFrame, idcols: list, prefix: str):
    """Expand RGB columns into several colour-space feature columns.

    Args:
        input_df: frame with ``color_r``, ``color_g``, ``color_b`` columns and
            the columns listed in ``idcols``.
        idcols: columns of ``input_df`` copied to the output unchanged.
        prefix: string prepended to every generated feature column name.

    Returns:
        ``(out_df, color_scales_cols)`` where ``out_df`` holds
        ``input_df[idcols]`` followed by the prefixed feature columns and
        ``color_scales_cols`` is the list of those feature column names.
    """
    rgb_values = input_df[['color_r', 'color_g', 'color_b']].to_numpy()
    index = input_df.index

    # --- conversions via colorsys ---
    # NOTE(review): the colorsys documentation describes its inputs as floats
    # in [0, 1], while the channels are passed here unscaled (they are cast to
    # uint8 below, so presumably 0-255). Left as-is to keep the existing
    # feature values -- confirm whether scaling by 1/255 was intended.
    def _convert(func):
        converted = [func(r, g, b) for r, g, b in rgb_values]
        return np.array(converted, dtype=float).reshape(-1, 3)

    scale_csys = pd.DataFrame(
        np.hstack([_convert(colorsys.rgb_to_hsv),
                   _convert(colorsys.rgb_to_yiq),
                   _convert(colorsys.rgb_to_hls)]),
        columns=['hsv_h', 'hsv_s', 'hsv_v',
                 'yiq_y', 'yiq_i', 'yiq_q',
                 'hls_h', 'hls_l', 'hls_s'],
        index=index)

    # --- conversions via OpenCV ---
    # cvtColor works on images, so the N colours become an N x 1 uint8 image.
    pixels = rgb_values[:, np.newaxis, :].astype(np.uint8)

    # Each frame is built from its own conversion result. Previously the Luv
    # and XYZ columns were filled with copies of the Lab array.
    lab = cv2.cvtColor(pixels, cv2.COLOR_RGB2Lab)
    lab_df = pd.DataFrame(lab[:, 0, :], columns=['lab_L', 'lab_a', 'lab_b'], index=index)
    luv = cv2.cvtColor(pixels, cv2.COLOR_RGB2Luv)
    luv_df = pd.DataFrame(luv[:, 0, :], columns=['Luv_L', 'Luv_u', 'Luv_v'], index=index)
    xyz = cv2.cvtColor(pixels, cv2.COLOR_RGB2XYZ)
    xyz_df = pd.DataFrame(xyz[:, 0, :], columns=['XYZ_X', 'XYZ_Y', 'XYZ_Z'], index=index)
    gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    gray_df = pd.DataFrame(gray, columns=['grayscale'], index=index)
    # A YCrCb conversion used to be computed here but was never part of the
    # output; it is omitted so the returned column set stays unchanged.

    scale_cv = pd.concat([lab_df, luv_df, xyz_df, gray_df], axis=1)

    color_scales = pd.concat([scale_csys, scale_cv], axis=1).add_prefix(f'{prefix}')
    color_scales_cols = color_scales.columns.tolist()

    out_df = pd.concat([input_df[idcols], color_scales], axis=1)

    return out_df, color_scales_cols

In [None]:
# Build the colour-space features for every colour row, keeping object_id and percentage.
color_df, color_cols = rgb_encoder(color_rgb_df, ['object_id', 'percentage'], 'color_')
print(color_cols)
# Scale `percentage` by 0.01 into `ratio`, the weight column read by color_transform.
color_df['ratio'] = color_df['percentage'] * 0.01
color_df.head()

In [None]:
def color_transform(input_df: pd.DataFrame, color_df: pd.DataFrame, tgcols):
    """Aggregate per-colour rows into one feature row per row of ``input_df``.

    For every column in ``tgcols`` the result contains the ratio-weighted sum
    (``mean_*``), the variance (``var_*``) and the standard deviation
    (``std_*``) over the colour rows of each ``object_id``, plus the variance
    of ``ratio`` itself (``var_color_ratio``).

    Args:
        input_df: frame with an ``object_id`` column.
        color_df: frame with ``object_id``, ``ratio`` and the ``tgcols`` columns.
        tgcols: list of column names in ``color_df`` to aggregate.

    Returns:
        DataFrame with one row per row of ``input_df`` (left join on
        ``object_id``), without the ``object_id`` column. Intermediate column
        lists are printed along the way.
    """
    features = input_df[['object_id']].copy()
    colors = color_df.copy()
    mean_names = [f'mean_{name}' for name in tgcols]

    # Ratio-weighted sum per object
    weighted = colors.copy()
    for name, mean_name in zip(tgcols, mean_names):
        weighted[mean_name] = weighted['ratio'] * weighted[name]
    print(weighted.columns)
    weighted_sum = weighted.groupby('object_id')[mean_names].sum().reset_index()
    features = features.merge(weighted_sum, on="object_id", how="left")
    print(features.columns)

    # Variance first, then standard deviation, each prefixed with its own name
    for stat in ('var', 'std'):
        per_object = getattr(colors.groupby("object_id")[tgcols], stat)()
        per_object = per_object.add_prefix(f'{stat}_').reset_index()
        features = features.merge(per_object, on="object_id", how="left")
        print(features.columns)

    # Variance of the ratio column
    ratio_variance = colors.groupby('object_id')['ratio'].var()
    features['var_color_ratio'] = features['object_id'].map(ratio_variance)

    return features.drop(columns=['object_id'])


In [None]:
# Aggregate the colour features per object for train and test.
color_train = color_transform(train, color_df, color_cols)
color_test = color_transform(test, color_df, color_cols)

# Remember the names of the generated colour feature columns.
color_columns = color_train.columns.tolist()

# Append the features column-wise (rows are matched by index label).
train = pd.concat([train, color_train], axis=1)
test = pd.concat([test, color_test], axis=1)

In [None]:
# Check the table shapes after adding the colour features.
print(train.shape)
print(test.shape)

### preprocess: palette_hsv_yiq

In [None]:
# Convert the palette RGB columns to HSV / YIQ / HLS with colorsys.
import colorsys

palette_hsv_yiq = palette.copy()
# NOTE(review): the colorsys documentation describes its inputs as floats in [0, 1];
# the palette channels are passed unscaled here -- confirm their range.
palette_hsv_yiq['hsv'] = palette_hsv_yiq[['color_r', 'color_g', 'color_b']]\
                    .apply(lambda x: colorsys.rgb_to_hsv(x[0], x[1],x[2]), axis=1)

palette_hsv_yiq['yiq'] = palette_hsv_yiq[['color_r', 'color_g', 'color_b']]\
                    .apply(lambda x: colorsys.rgb_to_yiq(x[0], x[1],x[2]), axis=1)
palette_hsv_yiq['hls'] = palette_hsv_yiq[['color_r', 'color_g', 'color_b']]\
                    .apply(lambda x: colorsys.rgb_to_hls(x[0], x[1],x[2]), axis=1)

# Split each converted tuple into one column per component.
palette_hsv_yiq['hsv_h'] = palette_hsv_yiq['hsv'].map(lambda x: x[0])
palette_hsv_yiq['hsv_s'] = palette_hsv_yiq['hsv'].map(lambda x: x[1])
palette_hsv_yiq['hsv_v'] = palette_hsv_yiq['hsv'].map(lambda x: x[2])

palette_hsv_yiq['yiq_y'] = palette_hsv_yiq['yiq'].map(lambda x: x[0])
palette_hsv_yiq['yiq_i'] = palette_hsv_yiq['yiq'].map(lambda x: x[1])
palette_hsv_yiq['yiq_q'] = palette_hsv_yiq['yiq'].map(lambda x: x[2])

palette_hsv_yiq['hls_h'] = palette_hsv_yiq['hls'].map(lambda x: x[0])
palette_hsv_yiq['hls_l'] = palette_hsv_yiq['hls'].map(lambda x: x[1])
palette_hsv_yiq['hls_s'] = palette_hsv_yiq['hls'].map(lambda x: x[2])


# Components kept as features; the HLS components are computed above but not selected.
color_sys_cols = ['hsv_h','hsv_s','hsv_v',
                          'yiq_y','yiq_i','yiq_q'
                          #'hls_h','hls_l','hls_s'
                 ]

# Keep only the weight, the join key and the selected components.
palette_hsv_yiq = palette_hsv_yiq[['ratio',
                          'object_id',
                          *color_sys_cols]]

In [None]:
def create_palette_hsv_yiq(input_df, tgcols=color_sys_cols):
    """Aggregate the HSV/YIQ palette columns to one row per row of ``input_df``.

    Reads the module-level ``palette_hsv_yiq`` table. For every column in
    ``tgcols`` the result holds the ratio-weighted sum (``mean_*``), the
    variance (``var_*``) and the standard deviation (``std_*``) over the
    palette rows of each ``object_id``.

    Args:
        input_df: frame with an ``object_id`` column.
        tgcols: palette columns to aggregate.

    Returns:
        DataFrame with one row per row of ``input_df`` (left join on
        ``object_id``), without the ``object_id`` column. Intermediate column
        lists are printed along the way.
    """
    features = input_df[['object_id']].copy()
    palette_rows = palette_hsv_yiq.copy()
    mean_names = [f'mean_{name}' for name in tgcols]

    # Ratio-weighted sum per object
    weighted = palette_rows.copy()
    for name, mean_name in zip(tgcols, mean_names):
        weighted[mean_name] = weighted['ratio'] * weighted[name]
    print(weighted.columns)
    weighted_sum = weighted.groupby('object_id')[mean_names].sum().reset_index()
    features = features.merge(weighted_sum, on="object_id", how="left")
    print(features.columns)

    # Variance first, then standard deviation
    for stat in ('var', 'std'):
        per_object = getattr(palette_rows.groupby("object_id")[tgcols], stat)()
        per_object = per_object.add_prefix(f'{stat}_').reset_index()
        features = features.merge(per_object, on="object_id", how="left")
        print(features.columns)

    return features.drop(columns=['object_id'])


In [None]:
# Preview the palette features for train; the result is only displayed, not stored.
create_palette_hsv_yiq(train)

In [None]:
# Build the HSV/YIQ palette features for train and test.
palette_train_hsvyiq = create_palette_hsv_yiq(train)
palette_test_hsvyiq = create_palette_hsv_yiq(test)

# Remember the names of the generated feature columns.
feat_palette_hsvyiq = palette_train_hsvyiq.columns.tolist()

# Append the features column-wise (rows are matched by index label).
train = pd.concat([train, palette_train_hsvyiq], axis=1)
test = pd.concat([test, palette_test_hsvyiq], axis=1)

In [None]:
# Display the list of HSV/YIQ palette feature names.
feat_palette_hsvyiq

In [None]:
# Display the first rows of the train palette features (transposed).
palette_train_hsvyiq.head().T

### preprocess: palette_lab

In [None]:
def rgb_to_scale_opencv():
    """Convert the module-level ``palette`` RGB columns with OpenCV.

    Returns:
        ``(palette_out, palette_cv_cols)`` where ``palette_out`` is ``palette``
        with the RGB columns replaced by Lab, Luv, XYZ and grayscale columns,
        and ``palette_cv_cols`` lists the names of those new columns.
    """
    rgb_cols = ['color_r', 'color_g', 'color_b']
    index = palette.index

    # cvtColor works on images, so the N colours become an N x 1 uint8 image.
    pixels = palette[rgb_cols].to_numpy()[:, np.newaxis, :].astype(np.uint8)

    # Each frame is built from its own conversion result. Previously the Luv
    # and XYZ columns were filled with copies of the Lab array.
    lab = cv2.cvtColor(pixels, cv2.COLOR_RGB2Lab)
    lab_df = pd.DataFrame(lab[:, 0, :], columns=['lab_L', 'lab_a', 'lab_b'], index=index)
    luv = cv2.cvtColor(pixels, cv2.COLOR_RGB2Luv)
    luv_df = pd.DataFrame(luv[:, 0, :], columns=['Luv_L', 'Luv_u', 'Luv_v'], index=index)
    xyz = cv2.cvtColor(pixels, cv2.COLOR_RGB2XYZ)
    xyz_df = pd.DataFrame(xyz[:, 0, :], columns=['XYZ_X', 'XYZ_Y', 'XYZ_Z'], index=index)
    gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)
    gray_df = pd.DataFrame(gray, columns=['grayscale'], index=index)
    # A YCrCb conversion used to be computed here but was never part of the
    # output; it is omitted so the returned column set stays unchanged.

    palette_cv = pd.concat([lab_df, luv_df, xyz_df, gray_df], axis=1)
    palette_cv_cols = palette_cv.columns.tolist()
    palette_out = pd.concat([palette, palette_cv], axis=1).drop(rgb_cols, axis=1)

    return palette_out, palette_cv_cols


def create_palette_feature_opencv(input_df, palette_df, tgcols):
    """Aggregate palette rows into one feature row per row of ``input_df``.

    The result contains, per ``object_id``:
      * ``ratio`` and ``max_*``: the palette row with the largest ``ratio``
        (the first such row when several share the maximum),
      * ``mean_*``: ratio-weighted sum of each column in ``tgcols``,
      * ``var_*`` / ``std_*``: variance and standard deviation of each column
        in ``tgcols``.

    Args:
        input_df: frame with an ``object_id`` column.
        palette_df: frame with ``object_id``, ``ratio`` and feature columns.
        tgcols: columns of ``palette_df`` used for the mean/var/std features.

    Returns:
        DataFrame with one row per row of ``input_df`` (left join on
        ``object_id``), without the ``object_id`` column. Intermediate column
        lists are printed along the way.
    """
    features = input_df[['object_id']].copy()

    # Palette row with the largest ratio for each object
    top_ratio = palette_df.groupby('object_id')['ratio'].max().reset_index()
    dominant = top_ratio.merge(palette_df, on=['object_id', 'ratio'], how='left')
    dominant = dominant.set_index(['object_id', 'ratio']).add_prefix('max_').reset_index()
    # Ties on the maximum ratio: keep only the first row per object.
    # reset_index() leaves a helper 'index' column that is removed at the end.
    dominant = dominant.drop_duplicates(subset='object_id').reset_index()

    features = features.merge(dominant, on="object_id", how="left")
    print(features.columns)

    # Ratio-weighted sum per object
    mean_names = [f'mean_{name}' for name in tgcols]
    weighted = palette_df.copy()
    for name, mean_name in zip(tgcols, mean_names):
        weighted[mean_name] = weighted['ratio'] * weighted[name]
    print(weighted.columns)
    weighted_sum = weighted.groupby('object_id')[mean_names].sum().reset_index()
    features = features.merge(weighted_sum, on="object_id", how="left")
    print(features.columns)

    # Variance first, then standard deviation
    for stat in ('var', 'std'):
        per_object = getattr(palette_df.groupby("object_id")[tgcols], stat)()
        per_object = per_object.add_prefix(f'{stat}_').reset_index()
        features = features.merge(per_object, on="object_id", how="left")
        print(features.columns)

    return features.drop(columns=['index', 'object_id'])


In [None]:
# Convert the palette colours with OpenCV, then aggregate them per object for train and test.
palette_cv, palette_cv_cols = rgb_to_scale_opencv()

palette_train = create_palette_feature_opencv(train, palette_cv, palette_cv_cols)
palette_test = create_palette_feature_opencv(test, palette_cv, palette_cv_cols)

# Remember the names of the generated feature columns.
feat_palette_cv = palette_train.columns.tolist()

# Append the features column-wise (rows are matched by index label).
train = pd.concat([train, palette_train], axis=1)
test = pd.concat([test, palette_test], axis=1)

### preprocess: W2V_items

In [None]:
# Embedding dimensionality for each Word2Vec corpus.
# Chosen roughly, based on the size of the original vocabulary.
model_size = {
    "material": 20,
    "technique": 8,
    "collection": 3,
    "material_collection": 20,
    "material_technique": 20,
    "collection_technique": 10,
    "material_collection_technique": 25,
    "historical_person": 30, 
    "production_place": 20,
    "historical_person__production_place":50 
}

# Value passed as Word2Vec's `iter` argument in create_w2v.
n_iter = 100


In [None]:
def hashfxn(x):
    """Return the MD5 digest of ``str(x)`` as a (128-bit) integer.

    Passed to Word2Vec as ``hashfxn``; unlike the built-in ``hash`` for
    strings, the result does not depend on the interpreter's hash seed.
    """
    digest = hashlib.md5(str(x).encode()).digest()
    return int.from_bytes(digest, byteorder='big')

def create_w2v(df_list:list, df_nm_list:list):
    """Train one Word2Vec model per table and embed every object.

    For each table the ``name`` values of an ``object_id`` form one sentence.
    A Word2Vec model is trained on those sentences and each object is
    represented by the mean of its word vectors.

    Reads the module-level ``model_size`` (vector size per table name),
    ``SEED`` and ``n_iter``.

    Args:
        df_list: tables with ``object_id`` and ``name`` columns.
        df_nm_list: name of each table, used for the ``model_size`` lookup and
            as the column-name prefix.

    Returns:
        ``(w2v_dfs, w2v_cols)``: one DataFrame per table, indexed by
        ``object_id``, and the flat list of all generated column names.
    """
    w2v_dfs, w2v_cols = [], []
    for table, table_name in zip(df_list, df_nm_list):
        vector_dim = model_size[table_name]
        grouped = table.groupby("object_id")["name"].apply(list).reset_index()
        sentences = grouped["name"]

        # Train Word2Vec. seed / workers=1 / hashfxn are fixed, presumably so
        # that repeated runs give the same vectors.
        w2v_model = word2vec.Word2Vec(sentences.values.tolist(),
                                      size=vector_dim,
                                      min_count=1,
                                      window=1,
                                      seed=SEED,
                                      workers=1,
                                      hashfxn=hashfxn,
                                      iter=n_iter)

        # Sentence vector = mean of the vectors of its words
        mean_vectors = sentences.progress_apply(
            lambda words: np.mean([w2v_model.wv[word] for word in words], axis=0))
        feature_names = [f"{table_name}_w2v_{i}" for i in range(vector_dim)]
        feature_df = pd.DataFrame(np.vstack(mean_vectors.tolist()), columns=feature_names)
        feature_df.index = grouped["object_id"]

        w2v_dfs.append(feature_df)
        w2v_cols += feature_names
    return w2v_dfs, w2v_cols


def add_w2v_dfs(input_df:pd.DataFrame, w2v_dfs:list):
    """Left-join every embedding frame in ``w2v_dfs`` onto a copy of ``input_df``.

    Args:
        input_df: frame with an ``object_id`` column; it is not modified.
        w2v_dfs: frames indexed by ``object_id``.

    Returns:
        Copy of ``input_df`` with the columns of all frames appended; objects
        missing from a frame get NaN in its columns.
    """
    merged = input_df.copy()
    for feature_df in w2v_dfs:
        # The index holds object_id; turn it into a column to join on.
        merged = merged.merge(feature_df.reset_index(), on='object_id', how='left')
    return merged

In [None]:
# Either train the item-level Word2Vec features and cache them, or load a cached copy.
if RUN_W2V == True:
    # Combined corpora: rows of several attribute tables stacked on top of each other.
    mat_col = pd.concat([material, object_collection], axis=0).reset_index(drop=True)
    mat_tec = pd.concat([material, technique], axis=0).reset_index(drop=True)
    col_tec = pd.concat([object_collection, technique], axis=0).reset_index(drop=True)
    mat_col_tec = pd.concat([material, object_collection, technique], axis=0).reset_index(drop=True)
    # NOTE(review): this corpus is registered below as
    # "historical_person__production_place" but also includes `technique` --
    # confirm whether that is intended.
    per_plc = pd.concat([historical_person, production_place, technique], axis=0).reset_index(drop=True)

    # df_list and df_nm_list are paired by position.
    df_list = [material, object_collection, technique, mat_col, mat_tec, col_tec, mat_col_tec,
              historical_person, production_place, per_plc]
    df_nm_list = [
                "material", "collection", "technique",
                "material_collection",
                "material_technique",
                "collection_technique",
                "material_collection_technique",
                "historical_person", "production_place",
                "historical_person__production_place"
            ]

    w2v_dfs, w2v_cols = create_w2v(df_list, df_nm_list)

    # Cache the results under this notebook's output directory.
    with open(FEAT_W2V_DIR_OUT/f'w2v_dfs.pkl', 'wb') as f:
        pickle.dump(w2v_dfs , f)

    with open(FEAT_W2V_DIR_OUT/f'w2v_cols.pkl', 'wb') as f:
        pickle.dump(w2v_cols , f)

else:    
    # Load the cache written under the input notebook's directory.
    with open(FEAT_W2V_DIR_IN/f'w2v_dfs.pkl', 'rb') as f:
        w2v_dfs = pickle.load(f)

    with open(FEAT_W2V_DIR_IN/f'w2v_cols.pkl', 'rb') as f:
        w2v_cols = pickle.load(f)

# Join the embeddings onto train/test by object_id.
train = add_w2v_dfs(train, w2v_dfs)
test = add_w2v_dfs(test, w2v_dfs)

### preprocess: W2V_Text

In [None]:
# Embedding dimensionality for each text column.
# NOTE: this rebinds the module-level `model_size` that create_w2v also reads.
model_size = {
    "title": 30,
    "description": 30,
    "more_title": 15,
    "long_title": 15
    ,'acquisition_credit_line':15
}

# Text columns for which sentence embeddings are built.
text_cols = ['title', 'description', 'more_title', 'long_title', 'acquisition_credit_line']

# Value passed as Word2Vec's `iter` argument in create_w2v_single_table.
n_iter = 100

def text_normalization(text):
    """Clean a text Series with a fixed texthero pipeline.

    The steps, in order: fill missing values, lowercase, remove digits,
    remove punctuation, remove diacritics, remove stop words.

    Args:
        text: the text Series handed to ``hero.clean``.

    Returns:
        Whatever ``hero.clean`` returns for that pipeline.
    """
    # Dutch and English stop words are removed together.
    custom_stopwords = nltk.corpus.stopwords.words('dutch') + nltk.corpus.stopwords.words('english')

    def _drop_stopwords(series):
        return hero.preprocessing.remove_stopwords(series, stopwords=custom_stopwords)

    cleaning_steps = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        _drop_stopwords,
    ]
    return hero.clean(text, pipeline=cleaning_steps)

def hashfxn(x):
    """Hash ``x`` via the MD5 digest of ``str(x)``, returned as an integer.

    Passed to Word2Vec as ``hashfxn``; the value does not depend on the
    interpreter's string hash seed.
    """
    encoded = str(x).encode()
    hex_digest = hashlib.md5(encoded).hexdigest()
    return int(hex_digest, base=16)

def create_w2v_single_table(input_df:pd.DataFrame, columns:list):
    """Train one Word2Vec model per text column and embed every row.

    Each non-missing text is normalised with ``text_normalization`` and split
    on spaces. A Word2Vec model is trained on the resulting token lists and
    each text is represented by the mean of its word vectors.

    Reads the module-level ``model_size`` (vector size per column), ``SEED``
    and ``n_iter``.

    Args:
        input_df: frame with ``object_id`` and the text columns.
        columns: names of the text columns to embed.

    Returns:
        ``(w2v_dfs, w2v_cols)``: one DataFrame per column, indexed by
        ``object_id``, and the flat list of all generated column names. Rows
        whose text is missing or contains no token after normalisation are
        absent from the frames.
    """
    w2v_dfs = []
    w2v_cols = []
    for col_ in columns:
        vector_dim = model_size[col_]
        tmp_df = input_df[['object_id', col_]].dropna(axis=0).copy()
        # Splitting on ' ' yields empty strings for consecutive spaces; drop them.
        tmp_df['name'] = text_normalization(tmp_df[col_]).map(
            lambda text: [token for token in text.split(' ') if token])

        # A text made only of digits/stop words leaves no tokens. Its mean
        # vector would be a scalar NaN and break np.vstack below, so such rows
        # are dropped (positional mask: the index may contain duplicates).
        has_tokens = tmp_df['name'].map(len) > 0
        tmp_df = tmp_df.loc[has_tokens.to_numpy()]

        # Train Word2Vec
        w2v_model = word2vec.Word2Vec(tmp_df['name'].values.tolist(),
                                      size=vector_dim,
                                      min_count=1,
                                      window=5,
                                      seed=SEED,
                                      workers=1,
                                      hashfxn=hashfxn,
                                      iter=n_iter)

        # Sentence vector = mean of the vectors of its words
        sentence_vectors = tmp_df['name'].progress_apply(
            lambda x: np.mean([w2v_model.wv[e] for e in x], axis=0))
        sentence_vectors = np.vstack(sentence_vectors.tolist())
        sentence_vector_df = pd.DataFrame(sentence_vectors,
                                          columns=[f"{col_}_w2v_{i}"
                                                   for i in range(vector_dim)])
        sentence_vector_df.index = tmp_df["object_id"]
        w2v_dfs.append(sentence_vector_df)
        w2v_cols += sentence_vector_df.columns.tolist()
    return w2v_dfs, w2v_cols


def add_w2v_dfs(input_df:pd.DataFrame, w2v_dfs:list):
    """Append the columns of every frame in ``w2v_dfs`` to a copy of ``input_df``.

    Each frame is indexed by ``object_id`` and is left-joined on that key, so
    rows without a match get NaN.

    Args:
        input_df: frame with an ``object_id`` column; it is not modified.
        w2v_dfs: embedding frames indexed by ``object_id``.

    Returns:
        The extended copy of ``input_df``.
    """
    result = input_df.copy()
    for embedding in w2v_dfs:
        right = embedding.reset_index()  # object_id: index -> column
        result = result.merge(right, how='left', on='object_id')
    return result

In [None]:
# Either train the text Word2Vec features and cache them, or load a cached copy.
if RUN_W2V == True:
    # Train on train and test texts together (index labels are not reset by this concat).
    whole_df = pd.concat([train_base, test_base])
    w2v_dfs_text, w2v_cols_text = create_w2v_single_table(whole_df, text_cols)

    # Cache the results under this notebook's output directory.
    with open(FEAT_W2V_DIR_OUT/f'w2v_dfs_text.pkl', 'wb') as f:
        pickle.dump(w2v_dfs_text , f)

    with open(FEAT_W2V_DIR_OUT/f'w2v_cols_text.pkl', 'wb') as f:
        pickle.dump(w2v_cols_text , f)
    
else:
    # Load the cache written under the input notebook's directory.
    with open(FEAT_W2V_DIR_IN/f'w2v_dfs_text.pkl', 'rb') as f:
        w2v_dfs_text = pickle.load(f)

    with open(FEAT_W2V_DIR_IN/f'w2v_cols_text.pkl', 'rb') as f:
        w2v_cols_text = pickle.load(f)

# Join the text embeddings onto train/test by object_id.
train = add_w2v_dfs(train, w2v_dfs_text)
test = add_w2v_dfs(test, w2v_dfs_text)

## preprocess: date_transform
- 日付系の変換
- acquisition_date

In [None]:
# Split acquisition_date into numeric parts
def acquisition_date_transform(input_df:pd.DataFrame, tg_col='acquisition_date'):
    """Extract numeric year/month/day features from a date-string column.

    The string is sliced by position: characters 0-3 are the year, 5-6 the
    month and 8-9 the day. Missing values stay NaN.

    Args:
        input_df: frame containing ``tg_col``.
        tg_col: name of the date-string column.

    Returns:
        DataFrame of floats with the columns ``{tg_col}_year``,
        ``{tg_col}_month``, ``{tg_col}_day``, ``{tg_col}_ym`` (year and month
        digits concatenated) and ``{tg_col}_ymd`` (year, month and day digits
        concatenated).
    """
    # Missing dates are marked with the sentinel string 'nan'.
    raw = input_df[tg_col].fillna('nan')

    slicers = {
        'year': lambda s: s[:4],
        'month': lambda s: s[5:7],
        'day': lambda s: s[8:10],
        'ym': lambda s: s[:4] + s[5:7],
        'ymd': lambda s: s[:4] + s[5:7] + s[8:10],
    }

    out_df = pd.DataFrame()
    for suffix, take in slicers.items():
        # `take=take` binds the current slicer to this lambda.
        digits = raw.map(lambda s, take=take: take(s) if s != 'nan' else np.nan)
        out_df[f'{tg_col}_{suffix}'] = digits.astype('float')
    return out_df

In [None]:
# Append the acquisition-date features column-wise.
train = pd.concat([train, acquisition_date_transform(train)], axis=1)
test = pd.concat([test, acquisition_date_transform(test)], axis=1)

In [None]:
# Check the table shapes after adding the date features.
print(train.shape)
print(test.shape)

## preprocess: sub_title

In [None]:
def sub_title_extract(input_df:pd.DataFrame):
    """Extract object dimensions from the ``sub_title`` text, in millimetres.

    For each axis letter (``h``, ``w``, ``t``, ``d``) the first occurrence of
    "<axis> <number><cm|mm>" is parsed. Centimetre values are multiplied by 10
    so that every column is in millimetres; rows without a match are NaN.

    Args:
        input_df: frame with a string ``sub_title`` column.

    Returns:
        DataFrame with the float columns ``size_h``, ``size_w``, ``size_t``
        and ``size_d``, indexed like ``input_df``.
    """
    out_df = pd.DataFrame()

    for axis in ['h', 'w', 't', 'd']:
        column_name = f'size_{axis}'
        # Group 0: the number (possibly an empty string), group 1: the unit.
        size_info = input_df['sub_title'].str.extract(r'{} (\d*|\d*\.\d*)(cm|mm)'.format(axis))
        value = size_info[0].replace('', np.nan).astype(float)  # extracted as object dtype
        unit = size_info[1]
        # Vectorised unit conversion: keep mm (and unmatched) values, scale cm by 10.
        out_df[column_name] = value.where(unit != 'cm', value * 10)

    return out_df

In [None]:
# Append the size features parsed from sub_title.
train = pd.concat([train, sub_title_extract(train)], axis=1)
test = pd.concat([test, sub_title_extract(test)], axis=1)

In [None]:
# Check the table shapes after adding the size features.
print(train.shape)
print(test.shape)

## preprocess: diff
- 差分
- 掛け算

In [None]:
# Subtraction
def diff_features(input_df:pd.DataFrame, col_left:str, col_right:str, new_colname=None):
    """Return a one-column frame holding ``col_left - col_right``.

    Args:
        input_df: frame containing both columns.
        col_left: minuend column name.
        col_right: subtrahend column name.
        new_colname: name of the result column; defaults to
            ``diff_{col_left}__{col_right}``.
    """
    column = f'diff_{col_left}__{col_right}' if new_colname is None else new_colname
    difference = input_df[col_left] - input_df[col_right]
    return pd.DataFrame({column: difference})

In [None]:
# Multiplication
def products_feats(input_df:pd.DataFrame, col_left:str, col_right:str, new_colname=None):
    """Return a one-column frame holding ``col_left * col_right``.

    Args:
        input_df: frame containing both columns.
        col_left: first factor column name.
        col_right: second factor column name.
        new_colname: name of the result column; defaults to
            ``products_{col_left}__{col_right}``.
    """
    column = f'products_{col_left}__{col_right}' if new_colname is None else new_colname
    product = input_df[col_left] * input_df[col_right]
    return pd.DataFrame({column: product})

# Division
def division_feats(input_df:pd.DataFrame, col_left:str, col_right:str, new_colname=None):
    """Return a one-column frame holding ``col_left / col_right``.

    Args:
        input_df: frame containing both columns.
        col_left: numerator column name.
        col_right: denominator column name.
        new_colname: name of the result column; defaults to
            ``division_{col_left}__{col_right}``.
    """
    out_df = pd.DataFrame()
    if new_colname is None:
        # The default used to carry the 'products_' prefix (copied from
        # products_feats), so both helpers produced the same default name.
        new_colname = f'division_{col_left}__{col_right}'
    out_df[new_colname] = input_df[col_left] / input_df[col_right]
    return out_df

In [None]:
# Column pairs for the arithmetic features, one dict per feature.
# 'new_colname': None means the helper's default column name is used.

# Differences (col_l - col_r)
diff_dic_lst = [
           {'col_l':'dating_year_late',
            'col_r':'dating_year_early',
            'new_colname':'dating_presenting_year_num'},
           {'col_l':'acquisition_date_year',
            'col_r':'dating_year_late',
            'new_colname':None}
]

# Products (col_l * col_r)
prd_dic_lst = [
           {'col_l':'size_h',
            'col_r':'size_w',
            'new_colname':'size_square'}
]

# Quotients (col_l / col_r)
div_dic_lst = [
           {'col_l':'size_w',
            'col_r':'size_h',
            'new_colname':'size_aspect_ratio'}
]

In [None]:
# Append one column per configured pair: differences first, then products, then quotients.
for dic_ in diff_dic_lst:
    train = pd.concat([train,
                       diff_features(train, dic_['col_l'] , dic_['col_r'] , dic_['new_colname'])],
                       axis=1)
    test = pd.concat([test,
                      diff_features(test, dic_['col_l'] , dic_['col_r'] , dic_['new_colname'])],
                      axis=1)
    
for dic_ in prd_dic_lst:
    train = pd.concat([train,
                       products_feats(train, dic_['col_l'] , dic_['col_r'] , dic_['new_colname'])],
                       axis=1)
    test = pd.concat([test,
                      products_feats(test, dic_['col_l'] , dic_['col_r'] , dic_['new_colname'])],
                      axis=1)
    
for dic_ in div_dic_lst:
    train = pd.concat([train,
                       division_feats(train, dic_['col_l'] , dic_['col_r'] , dic_['new_colname'])],
                       axis=1)
    test = pd.concat([test,
                      division_feats(test, dic_['col_l'] , dic_['col_r'] , dic_['new_colname'])],
                      axis=1)

In [None]:
# Check the table shapes after adding the arithmetic features.
print(train.shape)
print(test.shape)

In [None]:
# Display the current train column names.
train.columns

## preprocess: language

In [None]:
# fastText model loaded from a local binary; lang_transform stores its predicted labels.
model = load_model("../99_bin/lid.176.bin")


def lang_transform(input_df:pd.DataFrame, tg_col:str):
    """Predict a label for every text in ``tg_col`` with the fastText ``model``.

    Missing texts are treated as empty strings and newlines are removed before
    prediction.

    Args:
        input_df: frame containing ``tg_col``.
        tg_col: name of the text column.

    Returns:
        One-column DataFrame ``{tg_col}_lang_ft`` holding the first label
        returned by ``model.predict`` for each row.
    """
    def _top_label(text):
        labels = model.predict(text.replace("\n", ""))[0]
        return labels[0]

    texts = input_df[tg_col].fillna("")
    return pd.DataFrame({f"{tg_col}_lang_ft": texts.map(_top_label)})

In [None]:
# Append the predicted-label feature for each of the four text columns.
train = pd.concat([train, lang_transform(train, 'title')], axis=1)
test = pd.concat([test, lang_transform(test, 'title')], axis=1)

train = pd.concat([train, lang_transform(train, 'long_title')], axis=1)
test = pd.concat([test, lang_transform(test, 'long_title')], axis=1)

train = pd.concat([train, lang_transform(train, 'more_title')], axis=1)
test = pd.concat([test, lang_transform(test, 'more_title')], axis=1)

train = pd.concat([train, lang_transform(train, 'description')], axis=1)
test = pd.concat([test, lang_transform(test, 'description')], axis=1)

In [None]:
# Check the table shapes after adding the language features.
print(train.shape)
print(test.shape)

## preprocess: technique

In [None]:
# Technique features, wrapped in a function
# One-hot columns kept as features (prefixed technique names).
tg_techcol = ['technique_etching',
 'technique_engraving',
 'technique_albumen print',
 'technique_gelatin silver print',
 'technique_letterpress printing',
 'technique_drypoint',
 'technique_salted paper print',
 'technique_slide',
 'technique_painting',
 'technique_steel engraving']

def technique_encoder(input_df:pd.DataFrame, left_df=technique, tg_col=tg_techcol):
    """Build count and one-hot features from the technique table.

    Args:
        input_df: frame with an ``object_id`` column.
        left_df: table with ``object_id`` and ``name`` columns.
        tg_col: prefixed one-hot columns to keep; an empty list keeps all.

    Returns:
        DataFrame with one row per row of ``input_df`` containing ``tech_cnt``
        (rows per object in ``left_df``, -1 when the object is absent) and the
        selected ``technique_*`` flags (0 when the object is absent).
    """
    # Number of techniques per object. The `left_df` argument is honoured
    # here; the global `technique` table used to be read regardless.
    out_df = input_df[['object_id']].copy()
    vc = left_df['object_id'].value_counts()
    out_df['tech_cnt'] = input_df['object_id'].map(vc).fillna(-1)

    # Flags for the selected techniques
    _ohe = pd.concat([left_df['object_id'], pd.get_dummies(left_df['name'])], axis=1)
    _ohe = _ohe.groupby('object_id').max().add_prefix(f'technique_')

    # The `tg_col` argument is honoured here (the global `tg_techcol` used to
    # be read regardless), mirroring the other *_encoder functions.
    if len(tg_col) != 0:
        _ohe = _ohe[tg_col]

    out_df = out_df.merge(_ohe, left_on='object_id', right_index=True, how='left').fillna(0)

    return out_df.drop('object_id', axis=1)

In [None]:
# Append the technique features.
train = pd.concat([train, technique_encoder(train)], axis=1)
test = pd.concat([test, technique_encoder(test)], axis=1)

In [None]:
# Check the table shapes after adding the technique features.
print(train.shape)
print(test.shape)

## preprocess: object_collection

In [None]:
# Object-collection features, wrapped in a function (final version)
# One-hot columns kept as features (prefixed collection names).
obj_coll_tgcol = [
 'obj_coll_prints',
 'obj_coll_paintings',
 'obj_coll_Navy Model Room',
 'obj_coll_paper',
 'obj_coll_drawings',
 'obj_coll_dollhouse',
 'obj_coll_lace',
 'obj_coll_musical instruments'
]


def obj_coll_encoder(input_df:pd.DataFrame, left_df=object_collection, tg_col=obj_coll_tgcol):
    """Build count and one-hot features from the object-collection table.

    Args:
        input_df: frame with an ``object_id`` column.
        left_df: table with ``object_id`` and ``name`` columns.
        tg_col: prefixed one-hot columns to keep; an empty list keeps all.

    Returns:
        DataFrame with one row per row of ``input_df`` containing
        ``obj_coll_cnt`` (rows per object in ``left_df``, -1 when the object is
        absent) and the selected ``obj_coll_*`` flags (0 when absent).
    """
    object_ids = input_df['object_id']
    features = input_df[['object_id']].copy()

    # Number of collection rows per object
    rows_per_object = left_df['object_id'].value_counts()
    features['obj_coll_cnt'] = object_ids.map(rows_per_object).fillna(-1)

    # One flag per collection name: 1 if the object has at least one such row
    dummies = pd.get_dummies(left_df['name'])
    flags = pd.concat([left_df['object_id'], dummies], axis=1).groupby('object_id').max()
    flags = flags.add_prefix(f'obj_coll_')

    if len(tg_col) != 0:
        flags = flags[tg_col]

    merged = features.merge(flags, left_on='object_id', right_index=True, how='left').fillna(0)
    return merged.drop(columns='object_id')

In [None]:
# Append the object-collection features.
train = pd.concat([train, obj_coll_encoder(train)], axis=1)
test = pd.concat([test, obj_coll_encoder(test)], axis=1)

In [None]:
# Check the table shapes after adding the object-collection features.
print(train.shape)
print(test.shape)

## preprocess: material

In [None]:
# Material features, wrapped in a function
# One-hot columns kept as features (prefixed material names).
tg_material_col = [
 'material_paper',
 'material_photographic paper',
 'material_oil paint (paint)',
 'material_cardboard',
 'material_canvas',
 'material_panel',
 'material_glass',
 'material_copper (metal)',
 'material_baryta paper']

def material_encoder(input_df:pd.DataFrame, left_df=material, tg_col=tg_material_col, prefix='material_'):
    """Build count and one-hot features from the material table.

    Args:
        input_df: frame with an ``object_id`` column.
        left_df: table with ``object_id`` and ``name`` columns.
        tg_col: prefixed one-hot columns to keep; an empty list keeps all.
        prefix: prefix of every generated column.

    Returns:
        DataFrame with one row per row of ``input_df`` containing
        ``{prefix}cnt`` (rows per object in ``left_df``, -1 when the object is
        absent) and the selected one-hot flags (0 when absent).
    """
    object_ids = input_df['object_id']
    features = input_df[['object_id']].copy()

    # Number of material rows per object
    rows_per_object = left_df['object_id'].value_counts()
    features[f'{prefix}cnt'] = object_ids.map(rows_per_object).fillna(-1)

    # One flag per material name: 1 if the object has at least one such row
    dummies = pd.get_dummies(left_df['name'])
    flags = pd.concat([left_df['object_id'], dummies], axis=1).groupby('object_id').max()
    flags = flags.add_prefix(f'{prefix}')

    if len(tg_col) != 0:
        flags = flags[tg_col]

    merged = features.merge(flags, left_on='object_id', right_index=True, how='left').fillna(0)
    return merged.drop(columns='object_id')

In [None]:
# Append the material features.
train = pd.concat([train, material_encoder(train)], axis=1)
test = pd.concat([test, material_encoder(test)], axis=1)

In [None]:
# Check the table shapes after adding the material features.
print(train.shape)
print(test.shape)

## preprocess: production_place

In [None]:
# 関数化
tg_production_place_col = [
 'production_place_Amsterdam',
 'production_place_Netherlands',
 'production_place_Northern Netherlands',
 'production_place_Antwerp',
 'production_place_Paris',
 'production_place_France',
 'production_place_The Hague',
 'production_place_unknown',
 'production_place_Haarlem',
 'production_place_Cologne',
 'production_place_Rotterdam',
 'production_place_Utrecht',
 'production_place_Germany',
 'production_place_Suriname',
 'production_place_? Netherlands',
 'production_place_Venice',
 'production_place_London',
 'production_place_Southern Netherlands']

def production_place_encoder(input_df:pd.DataFrame,
                             left_df=production_place,
                             tg_col=tg_production_place_col,
                             prefix='production_place_'):
    """Build per-object production-place features.

    Args:
        input_df: Frame containing an ``object_id`` column.
        left_df: Long-format table with ``object_id`` and ``name`` columns.
        tg_col: Prefixed one-hot column names to keep; an empty list keeps all of them.
        prefix: String prepended to every generated column name.

    Returns:
        DataFrame with ``{prefix}cnt``, ``{prefix}question`` and the selected
        one-hot flags (``object_id`` itself is dropped).
    """
    def _starts_with_question(name):
        return 1 if name[:1] == '?' else 0

    object_ids = input_df['object_id']
    out_df = input_df[['object_id']].copy()

    # Row count per object; objects absent from left_df get -1.
    rows_per_object = left_df['object_id'].value_counts()
    out_df[f'{prefix}cnt'] = object_ids.map(rows_per_object).fillna(-1)

    # 1 if any name of the object starts with '?', 0 if none does, -1 if the object is absent.
    flagged = left_df.copy()
    flagged['production_place_question'] = left_df['name'].map(_starts_with_question)
    has_question = flagged.groupby('object_id')['production_place_question'].max()
    out_df[f'{prefix}question'] = object_ids.map(has_question).fillna(-1)

    # One-hot flag per name, collapsed to one row per object.
    dummies = pd.get_dummies(left_df['name'])
    flags = pd.concat([left_df['object_id'], dummies], axis=1)
    flags = flags.groupby('object_id').max().add_prefix(f'{prefix}')
    if len(tg_col) != 0:
        flags = flags[tg_col]

    merged = out_df.merge(flags, left_on='object_id', right_index=True, how='left').fillna(0)
    return merged.drop('object_id', axis=1)

In [None]:
train = pd.concat([train, production_place_encoder(train)], axis=1)
test = pd.concat([test, production_place_encoder(test)], axis=1)

In [None]:
print(train.shape)
print(test.shape)

## preprocess: Text_length
- テキスト系カラムの長さを返す

In [None]:
def text_length_transformer(input_df:pd.DataFrame,
                            columns:list,
                            prefix='text_length_'):
    """Return the character length of each requested text column.

    Args:
        input_df: Frame holding the text columns.
        columns: Names of the columns to measure.
        prefix: String prepended to each source column name in the output.

    Returns:
        DataFrame with one ``{prefix}{column}`` column per entry in ``columns``;
        missing values and empty strings yield NaN.
    """
    def _length_or_nan(text):
        # Empty strings (which includes filled-in missing values) have no length.
        if text == '':
            return np.nan
        return len(text)

    out_df = pd.DataFrame()
    for name in columns:
        filled = input_df[name].fillna('')
        out_df[f'{prefix}{name}'] = filled.map(_length_or_nan)

    # Echo the generated column names for the notebook log.
    print(out_df.columns.tolist())

    return out_df

In [None]:
tgcol_length = ['title',
                'description',
                'more_title',
                'long_title',
                'principal_maker',
                'principal_or_first_maker',
                'acquisition_credit_line',
                ]

train = pd.concat([train, text_length_transformer(train, tgcol_length)], axis=1)
test = pd.concat([test, text_length_transformer(test, tgcol_length)], axis=1)

In [None]:
print(train.shape)
print(test.shape)

## preprocess: historical_person

In [None]:
tg_historical_person_col = [
 'historical_person_WillemIIIprinsvanOranjeenkoningvanEngelandSchotlandenIerland',
 'historical_person_LodewijkXIVkoningvanFrankrijk',
 'historical_person_VerenigdeOostindischeCompagnie',
 'historical_person_WillemIprinsvanOranje',
 'historical_person_lvarezdeToledoFernando3ehertogvanAlva',
 'historical_person_WilhelminakoninginderNederlanden',
 'historical_person_WittJohande',
 'historical_person_WittCornelisde',
 'historical_person_Wehrmacht',
 'historical_person_DjatirotoSuikeronderneming']

def clean_words(list_:list, stopwords=('',)):
    """Drop stopwords and strip every character that is not ``[A-Za-z0-9_]``.

    Args:
        list_: Iterable of strings. Passing a single string iterates it
            character by character.
        stopwords: Items to discard before cleaning. The default is an immutable
            tuple so it cannot be shared and mutated across calls.

    Returns:
        List with one cleaned string per kept item (possibly empty strings).
    """
    return [re.sub(r'[^A-Za-z0-9_]+', '', x) for x in list_ if x not in stopwords]


def historical_person_encoder(input_df:pd.DataFrame,
                             left_df=historical_person,
                             tg_col=tg_historical_person_col,
                             prefix='historical_person_'):
    """Build per-object historical-person features.

    Names are first reduced to ``[A-Za-z0-9_]`` characters via ``clean_words``
    so that they form usable column names.

    Args:
        input_df: Frame containing an ``object_id`` column.
        left_df: Long-format table with ``object_id`` and ``name`` columns.
        tg_col: Prefixed one-hot column names to keep; an empty list keeps all of them.
        prefix: String prepended to every generated column name.

    Returns:
        DataFrame with ``{prefix}cnt`` and the selected one-hot flags
        (``object_id`` itself is dropped).
    """
    def _normalize(name):
        return ''.join(clean_words(name))

    persons = left_df.copy()
    persons['name'] = persons['name'].map(_normalize)

    # Row count per object; objects absent from the table get -1.
    rows_per_object = persons['object_id'].value_counts()
    out_df = input_df[['object_id']].copy()
    out_df[f'{prefix}cnt'] = input_df['object_id'].map(rows_per_object).fillna(-1)

    # One-hot flag per cleaned name, collapsed to one row per object.
    dummies = pd.get_dummies(persons['name'])
    flags = pd.concat([persons['object_id'], dummies], axis=1)
    flags = flags.groupby('object_id').max().add_prefix(f'{prefix}')
    if len(tg_col) != 0:
        flags = flags[tg_col]

    merged = out_df.merge(flags, left_on='object_id', right_index=True, how='left').fillna(0)
    return merged.drop('object_id', axis=1)

In [None]:
train = pd.concat([train, historical_person_encoder(train)], axis=1)
test = pd.concat([test, historical_person_encoder(test)], axis=1)

In [None]:
print(train.shape)
print(test.shape)

## feature: bool columns

In [None]:
# Pass-through pattern: the flag columns are used as they are.
def boolean_converter(data, bool_columns):
    """Return the ``bool_columns`` selection of ``data`` unchanged."""
    selected = data[bool_columns]
    return selected

In [None]:
# Pattern that turns one specific label into a 0/1 flag.

# Each entry of boolcol_labels is a dict holding the column name and the label string.

def boolean_word_converter(data, boolcol_labels):
    """Flag rows whose column value equals a given label.

    Args:
        data: Source frame.
        boolcol_labels: List of dicts with keys ``'col'`` (column name) and
            ``'label'`` (value to flag).

    Returns:
        DataFrame with one column per entry, named after the source column,
        holding 1 where the value equals the label and 0 otherwise.
    """
    out_df = pd.DataFrame()
    for spec in boolcol_labels:
        column = spec['col']
        wanted = spec['label']

        def _is_wanted(value):
            return 1 if value == wanted else 0

        out_df[column] = data[column].apply(_is_wanted)
    return out_df

In [None]:
# check
bool_columns = [*tg_techcol,
                *obj_coll_tgcol,
                *tg_material_col,
                *tg_production_place_col]

boolcol_labels = [
           {'col':'principal_maker_nationality',
            'label':'dutch'
           }
]

assert len(train) == len(boolean_converter(train, bool_columns))
assert len(train) == len(boolean_word_converter(train, boolcol_labels))

## feature: null_or_exists

## feature: Category Encoding
- LE, CE

In [None]:
# Base class that the encoding-style feature blocks inherit from.
class BaseBlock(object):
    """Minimal fit/transform interface for feature blocks."""

    def fit(self, input_df, y=None):
        """Default fit: nothing to learn, so just return ``transform(input_df)``."""
        features = self.transform(input_df)
        return features

    def transform(self, input_df):
        """Produce the features for ``input_df``; subclasses must override this."""
        raise NotImplementedError()

In [None]:
# Label encoding
class LabelBlock(BaseBlock):
    """Label-encode one column with an encoder fitted on ``whole_df``.

    Missing values are replaced by the string ``'nan'`` before encoding.
    """

    def __init__(self, column: str, whole_df: pd.DataFrame):
        self.whole_df = whole_df
        self.le = LabelEncoder()
        self.column = column

    def fit(self, input_df, y=None):
        """Fit the encoder on ``whole_df`` and return the encoding of ``input_df``."""
        reference_values = self.whole_df[self.column].fillna('nan')
        self.le.fit(reference_values)
        return self.transform(input_df)

    def transform(self, input_df):
        """Return a single-column frame named ``LE_<column>`` with integer codes."""
        values = input_df[self.column].fillna('nan')
        codes = self.le.transform(values).astype('int')
        out_df = pd.DataFrame()
        out_df[self.column] = codes
        return out_df.add_prefix('LE_')

In [None]:
# Count encoding
class CountEncodingBlock(BaseBlock):
    """Replace each value of a column by its frequency in ``whole_df``."""

    def __init__(self, column, whole_df: pd.DataFrame):
        self.whole_df = whole_df
        self.column = column

    def transform(self, input_df):
        """Return a single-column frame named ``CE_<column>``.

        Values that do not occur in ``whole_df`` (including missing values)
        map to NaN.
        """
        name = self.column
        frequencies = self.whole_df[name].value_counts()

        output_df = pd.DataFrame()
        output_df[name] = input_df[name].map(frequencies)
        return output_df.add_prefix('CE_')

In [None]:
# check
check_categ = 'principal_maker'
assert len(train) == len(LabelBlock(check_categ, whole_df=train).fit(train))
assert len(train) == len(CountEncodingBlock(check_categ, whole_df=train).fit(train))

## feature: numerics

In [None]:
def numeric_converter(data, numerics):
    """Return the ``numerics`` selection of ``data`` unchanged."""
    selected = data[numerics]
    return selected

In [None]:
# check
numerics = ['dating_sorting_date']
assert len(train) == len(numeric_converter(train, numerics))

## feature: tfidf

In [None]:
# Reference: https://www.guruguru.science/competitions/16/discussions/556029f7-484d-40d4-ad6a-9d86337487e2/
# Implemented as-is from the reference for now; to be studied in more depth later.

def text_normalization(text):
    """Clean a text Series with a fixed texthero pipeline.

    The steps, in order: fill missing values, lowercase, remove digits,
    remove punctuation, remove diacritics, remove Dutch and English stopwords.
    """
    # Dutch and English stopwords, combined into one list.
    dutch_words = nltk.corpus.stopwords.words('dutch')
    english_words = nltk.corpus.stopwords.words('english')
    custom_stopwords = dutch_words + english_words

    def _drop_stopwords(series):
        return hero.preprocessing.remove_stopwords(series, stopwords=custom_stopwords)

    steps = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        _drop_stopwords,
    ]
    return hero.clean(text, pipeline=steps)

class TfidfBlock(BaseBlock):
    """Block that compresses a text column with TF-IDF followed by truncated SVD."""
    def __init__(self, column: str):
        """
        args:
            column: str
                Name of the column to transform.
        """
        self.column = column

    def preprocess(self, input_df):
        """Return the normalized text Series for ``self.column``."""
        x = text_normalization(input_df[self.column])
        return x

    def get_master(self, input_df):
        """Return the corpus the TF-IDF / SVD pipeline is fitted on.

        By default this is the dataframe passed to ``fit``; subclasses may
        override it to fit on a different corpus.
        """
        return input_df

    def fit(self, input_df, y=None, n_components=50):
        """Fit the pipeline on the master corpus and return the features of ``input_df``.

        args:
            input_df: dataframe holding ``self.column``.
            y: unused.
            n_components: number of SVD components to keep.
        """
        master_df = self.get_master(input_df)
        # Fit on the master corpus. The previous code computed master_df but then
        # preprocessed input_df, which made the get_master hook a no-op.
        text = self.preprocess(master_df)
        # The attribute keeps its original spelling ("pileline_") in case other
        # code refers to it.
        self.pileline_ = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=10000)),
            ('svd', TruncatedSVD(n_components=n_components, random_state=SEED)),
        ])

        self.pileline_.fit(text)
        return self.transform(input_df)

    def transform(self, input_df):
        """Return the SVD components as columns ``<column>_tfidf_<k>``."""
        text = self.preprocess(input_df)
        z = self.pileline_.transform(text)

        out_df = pd.DataFrame(z)
        return out_df.add_prefix(f'{self.column}_tfidf_')


In [None]:
# check: fit once, then confirm that transform is deterministic for the same input
block = TfidfBlock('title')
block.fit(train_base)

# NOTE(review): this exact assertion used to appear twice; the second copy may have
# been meant for a different frame (e.g. the test set) — confirm.
assert block.transform(train_base).equals(block.transform(train_base))

## feature: Groupby

In [None]:
# Group-by features
# Reference: https://github.com/Ynakatsuka/kaggle_utils
def preprocess(data):
    """Hook applied to the reference frame of GroupbyBlock; currently the identity."""
    unchanged = data
    return unchanged
        
class GroupbyBlock(BaseBlock):
    """Group-by aggregation features.

    ``param_dict`` is a list of dicts. Each dict has:
        'key': list of columns to group by,
        'var': list of columns to aggregate,
        'agg': list of aggregations (names or callables),
        'on' : optional merge key(s); defaults to 'key'.

    Aggregates are computed on ``whole_df`` and left-joined onto the frame
    passed to ``transform``.
    """
    def __init__(self, param_dict, whole_df: pd.DataFrame):
        self.param_dict = param_dict
        self.whole_df = preprocess(whole_df)

    def transform(self, input_df):
        """Return all aggregate columns for ``input_df``, with missing values filled with 0."""
        feature_frames = []
        for param_dict in self.param_dict:
            key, var, agg, on = self._get_params(param_dict)
            all_features = list(set(key + var))
            new_features = self._get_feature_names(key, var, agg)
            features = self.whole_df[all_features].groupby(key)[
                var].agg(agg).reset_index()
            # Flatten the (var, agg) column MultiIndex into the generated names.
            features.columns = key + new_features
            print(key)
            feat_df = pd.merge(input_df[key], features, on=on, how='left').drop(columns=key)
            feature_frames.append(feat_df)

        if not feature_frames:
            return pd.DataFrame()
        # Concatenate and fill once, rather than rebuilding the growing frame on
        # every iteration.
        return pd.concat(feature_frames, axis=1).fillna(0)

    def _get_params(self, p_dict):
        """Unpack one parameter dict into ``(key, var, agg, on)``.

        A missing 'var' or 'agg' falls back to ``self.var`` / ``self.agg``.
        This class never sets those attributes, so a subclass has to provide
        them; otherwise an AttributeError is raised.
        """
        key = p_dict['key']
        var = p_dict['var'] if 'var' in p_dict else self.var
        agg = p_dict['agg'] if 'agg' in p_dict else self.agg
        on = p_dict.get('on', key)
        return key, var, agg, on

    def _get_feature_names(self, key, var, agg):
        """Return ``<agg>_<var>_groupby_<key...>`` names, variable-major then aggregation."""
        _agg = [a if isinstance(a, str) else a.__name__ for a in agg]
        return ['_'.join([a, v, 'groupby'] + key) for v in var for a in _agg]

In [None]:
# check
param_dict_grp = [
            # acquisition_date
            {# フラグ系
                'key': ['acquisition_date'], 
                'var': ['dating_presenting_date'
                       ], 
                'agg': ['nunique']
            }]
assert len(train) == len(GroupbyBlock(param_dict_grp, whole_df=train).fit(train))

## 特徴作成_パラメタ定義

In [None]:
# boolean_そのまま渡すパターン
bool_columns = [*tg_techcol,
                *obj_coll_tgcol,
                *tg_material_col,
                *tg_production_place_col,
                *tg_historical_person_col]

# boolean_特定の文字列をフラグとするパターン
boolcol_labels = [
           {'col':'principal_maker_nationality',
            'label':'dutch'
           }
]

In [None]:
# LE, CE
le_categories = ['principal_maker',
              'principal_or_first_maker',
              'copyright_holder', 
              'acquisition_method', 
              #'acquisition_date', 
              #'dating_presenting_date',
              'principal_maker_place_of_birth',
              'principal_maker_date_of_birth',
              'principal_maker_date_of_death',
              'principal_maker_place_of_death',
             'title_lang_ft',
             'description_lang_ft'
             #'long_title_lang_ft'
             #'more_title_lang_ft'
                ]

ce_categories = ['art_series_id',
                 'title',
                 'description',
                 'long_title',
                 'principal_maker',
                 'principal_or_first_maker',
                 'sub_title',
                 'copyright_holder',
                 'more_title',
                 'acquisition_date',
                 'acquisition_credit_line',
                 'dating_presenting_date',
                 'title_lang_ft',
                 'description_lang_ft',
                 #'long_title_lang_ft'
                 #'more_title_lang_ft'
                 ]

In [None]:
# feat_palette
feat_palette = [
 'max_ratio',
 'max_palette_r',
 'max_palette_g',
 'max_palette_b',
 #'ratio_x',
 'mean_palette_r',
 'mean_palette_g',
 'mean_palette_b',
 #'ratio_y',
 'var_palette_r',
 'var_palette_g',
 'var_palette_b']


In [None]:
# Numerics
numerics = ['dating_sorting_date',
            'dating_period', 
            'dating_year_early', 
            'dating_year_late',
            'dating_presenting_year_num',
            'tech_cnt',
            'obj_coll_cnt',
            'material_cnt',
            'production_place_question',
            'production_place_cnt',
            'acquisition_date_year',
            'acquisition_date_ym',
            'acquisition_date_month',
            'diff_acquisition_date_year__dating_year_late',
            'size_h', 'size_w', 'size_t', 'size_d',
            'size_square',
            'text_length_title',
            'text_length_description',
            'text_length_more_title',
            'text_length_long_title',
            'text_length_principal_maker',
            'text_length_principal_or_first_maker',
            'text_length_acquisition_credit_line'
            #'size_aspect_ratio',
            #*feat_palette,
            ]

In [None]:
# テキスト特徴
text_cols = ['title',
            'description',
            'more_title',
            'long_title',
            'acquisition_credit_line'
]

In [None]:
# Group-by definitions (long; ideally these would be defined externally).
# Every grouping key gets the same three aggregation families, in this order:
# flag columns, numeric columns, categorical columns.
_grp_keys = [
    'acquisition_date',
    'dating_presenting_date',
    'principal_or_first_maker',
    'principal_maker_place_of_birth',
    'principal_maker_date_of_birth',
    'principal_maker',
]

_grp_flag_vars = [*tg_techcol,
                  *obj_coll_tgcol,
                  *tg_material_col,
                  *tg_production_place_col]

# De-duplicate the LE / CE categories while keeping first-seen order. Iterating a
# set of strings instead gives an order that can change between interpreter
# runs (str hashes are randomized per process), which would reorder the
# generated feature columns from run to run.
_grp_categ_vars = list(dict.fromkeys([*le_categories, *ce_categories]))

param_dict_grp = []
for _grp_key in _grp_keys:
    param_dict_grp.extend([
        {  # flag columns
            'key': [_grp_key],
            'var': [*_grp_flag_vars],
            'agg': ['sum', 'mean']
        },
        {  # numeric columns
            'key': [_grp_key],
            'var': [*numerics],
            'agg': ['min', 'max', 'mean', 'median', 'sum', 'std']
        },
        {  # categorical columns (the grouping key itself is excluded)
            'key': [_grp_key],
            'var': [c for c in _grp_categ_vars if c != _grp_key],
            'agg': ['nunique']
        },
    ])

In [None]:
# カラー特徴(paletteベース)
feat_palette_cv = ['ratio',
 'max_lab_L',
 'max_lab_a',
 'max_lab_b',
 'mean_lab_L',
 'mean_lab_a',
 'mean_lab_b',
 'var_lab_L',
 'var_lab_a',
 'var_lab_b',
 'std_lab_L',
 'std_lab_a',
 'std_lab_b'
                  ]


## 特徴作成_実行

In [None]:
# feat用df初期化
train_feat = pd.DataFrame()
test_feat = pd.DataFrame()

In [None]:
# boolean
train_feat = pd.concat([train_feat, boolean_converter(train, bool_columns)], axis=1)
test_feat  = pd.concat([test_feat, boolean_converter(test, bool_columns)], axis=1)
train_feat = pd.concat([train_feat, boolean_word_converter(train, boolcol_labels)], axis=1)
test_feat  = pd.concat([test_feat, boolean_word_converter(test, boolcol_labels)], axis=1)

In [None]:
# Run: label encoding
# Build the train+test reference frame once instead of once per column.
_whole_df = pd.concat([train, test], axis=0)

le_blocks = [LabelBlock(c, whole_df=_whole_df) for c in le_categories]
le_dic = {}  # Keep the fitted encoders per category; used for SHAP labels.

for cur_categ, block in zip(le_categories, le_blocks):
    le_feat = block.fit(train)
    le_dic[cur_categ] = block.le
    train_feat = pd.concat([train_feat, le_feat], axis=1)
    # The encoder was already fitted on the whole frame by fit(train);
    # transform is enough for the test set.
    test_feat = pd.concat([test_feat, block.transform(test)], axis=1)

# Run: count encoding
ce_blocks = [CountEncodingBlock(c, whole_df=_whole_df) for c in ce_categories]
for block in ce_blocks:
    train_feat = pd.concat([train_feat, block.fit(train)], axis=1)
    test_feat = pd.concat([test_feat, block.transform(test)], axis=1)

In [None]:
# 実行_numerics
train_feat = pd.concat([train_feat, numeric_converter(train, numerics)], axis=1)
test_feat  = pd.concat([test_feat, numeric_converter(test, numerics)], axis=1)

In [None]:
# 実行_tfidf
for col_ in text_cols:
    block = TfidfBlock(col_)
    block.fit(pd.concat([train, test], axis=0))

    train_feat = pd.concat([train_feat, block.transform(train)], axis=1)
    test_feat  = pd.concat([test_feat, block.transform(test)], axis=1)
    print(col_)

In [None]:
# Numeric columns that should NOT be fed into the group-by features are added here.
# They are passed through unchanged, in this order:
#   W2V (w2v_cols, w2v_cols_text), palette (feat_palette_cv, feat_palette_hsvyiq), color (color_columns)
for _passthrough_cols in (w2v_cols,
                          w2v_cols_text,
                          feat_palette_cv,
                          feat_palette_hsvyiq,
                          color_columns):
    train_feat = pd.concat([train_feat, boolean_converter(train, _passthrough_cols)], axis=1)
    test_feat  = pd.concat([test_feat, boolean_converter(test, _passthrough_cols)], axis=1)

In [None]:
# Run: group-by features
# One block (and one train+test concat) serves both frames; transform only
# reads the block's reference frame and parameters.
_grp_block = GroupbyBlock(param_dict_grp, whole_df=pd.concat([train, test]))
train_grp = _grp_block.fit(train)
test_grp = _grp_block.fit(test)
train_feat = pd.concat([train_feat, train_grp], axis=1)
test_feat = pd.concat([test_feat, test_grp], axis=1)

In [None]:
train_feat.shape

## feature_selection

In [None]:
# 特徴出力用カラム指定＆作成
FEAT_SELECT_DIR_IN = FEAT_DIR/f'feat_select/{feat_input}'
FEAT_SELECT_DIR_OUT = FEAT_DIR/f'feat_select/{feat_output}'
FEAT_SELECT_DIR_OUT.mkdir(exist_ok=True, parents=True)

### feature_selection: stdが0のカラム、他と完全相関しているカラムを除外
- ノイズとなる特徴を除外する処理。
- 特徴を(手動で)選ぶ上でのノイズを減らしたいための処理で、実際のところ精度は全く変わらなかった。

In [None]:
full_feat = pd.concat([train_feat, test_feat], axis=0)

In [None]:
if RUN_FEAT_SELECTION == True:
    # Collect the columns whose standard deviation is 0 (constant columns).
    _feat_std = full_feat.std()
    cols_std0 = _feat_std[_feat_std == 0].index.tolist()

    # Persist the list so later runs can skip the selection step.
    with open(FEAT_SELECT_DIR_OUT/f'cols_std0.pkl', 'wb') as f:
        pickle.dump(cols_std0 , f)

else:
    # Reuse the list produced by an earlier run.
    with open(FEAT_SELECT_DIR_IN/f'cols_std0.pkl', 'rb') as f:
        cols_std0 = pickle.load(f)

# Remove the constant columns from full_feat.
full_feat = full_feat.drop(cols_std0, axis=1)
print(cols_std0)

In [None]:
if RUN_FEAT_SELECTION == True:
    # Compute pairwise correlations and find perfectly correlated columns.
    full_corr = full_feat.corr()

    def return_perfect_correlation(sr_:pd.Series):
        """Return the other columns whose correlation with ``sr_.name`` is exactly 1.

        Returns the marker string 'NoColumns' when there is none.
        """
        perf_cols = sr_[sr_ == 1].index.drop(sr_.name).tolist()
        if len(perf_cols) == 0:
            return 'NoColumns'
        else:
            return perf_cols

    full_corr_ = full_corr.apply(return_perfect_correlation, axis='columns')
    full_corr_perf = full_corr_[full_corr_ != 'NoColumns']

    # Collect the columns to drop: for each column still present, drop its
    # perfectly correlated partners and keep the column itself.
    cols_perfcorr = []

    for idx_, list_ in zip(full_corr_perf.index, full_corr_perf.values):
        if idx_ not in full_feat.columns:
            # Already removed as a duplicate of an earlier column.
            continue
        for tgcol_ in list_:
            if tgcol_ not in full_feat.columns:
                # Already removed via another column; dropping it again would raise KeyError.
                continue
            full_feat = full_feat.drop(tgcol_, axis='columns')
            cols_perfcorr.append(tgcol_)
            print(tgcol_)

    # Persist the list so later runs can skip the selection step.
    with open(FEAT_SELECT_DIR_OUT/f'cols_perfcorr.pkl', 'wb') as f:
        pickle.dump(cols_perfcorr , f)

else:
    # Reuse the list produced by an earlier run.
    with open(FEAT_SELECT_DIR_IN/f'cols_perfcorr.pkl', 'rb') as f:
        cols_perfcorr = pickle.load(f)



In [None]:
# 上記で求めた不要カラムを除外
print(train_feat.shape)
print(test_feat.shape)

train_feat = train_feat.drop(cols_perfcorr + cols_std0, axis=1)
test_feat = test_feat.drop(cols_perfcorr + cols_std0, axis=1)

print(train_feat.shape)
print(test_feat.shape)

# train & predict

In [None]:
# モデル格納用フォルダ
TUNED_MODEL_DIR_IN = FEAT_DIR/f'tuned_models/{tuned_model_input}'
TUNED_MODEL_DIR_OUT = FEAT_DIR/f'tuned_models/{tuned_model_output}'
TUNED_MODEL_DIR_OUT.mkdir(exist_ok=True, parents=True)

In [None]:
# When tuning is skipped, load the models from an earlier tuning run
# and collect their parameters (one entry per model, in stored order).
if RUN_OPTUNA == False:
    with open(TUNED_MODEL_DIR_IN/f'models.pkl', 'rb') as f:
        models = pickle.load(f)

    param_tuned = [tuned_model.params for tuned_model in models]

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

def kfold_cv(X, y, n_splits=5, random_state=0):
    """Return shuffled K-fold splits as a list of ``(train_index, valid_index)`` pairs.

    ``random_state`` is now forwarded to the splitter; it used to be ignored
    in favour of a hard-coded 0 (which is still the default).
    """
    folds = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    return list(folds.split(X, y))

def stratified_kfold_cv(X, y, n_splits=5, random_state=0):
    """Return shuffled stratified K-fold splits as a list of ``(train_index, valid_index)`` pairs.

    ``random_state`` is now forwarded to the splitter; it used to be ignored
    in favour of a hard-coded 0 (which is still the default).
    """
    folds = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    return list(folds.split(X, y))

In [None]:
def target_encoding_for_cv(tr_x, tr_y, va_x, te_x, cat_cols, id_cols, seed=SEED):
    """Add mean target-encoding columns for one cross-validation fold.

    The raw ``id_cols`` are joined in by index from the module-level ``train``
    (for ``tr_x`` / ``va_x``) and ``test`` (for ``te_x``) frames, used for the
    encoding, and dropped again before returning.

    Args:
        tr_x: Training features of the fold.
        tr_y: Training target of the fold (index-aligned with ``tr_x``).
        va_x: Validation features of the fold.
        te_x: Test features.
        cat_cols: Columns to encode; each yields a ``TGE_Mean_<col>`` column.
        id_cols: Raw columns to join in temporarily.
        seed: Random state of the inner 4-fold split.

    Returns:
        Tuple ``(tr_x_out, va_x_out, te_x_out)`` with the encoded columns appended.
    """
    tr_x_out = pd.merge(tr_x, train[id_cols], left_index=True, right_index=True, how='left')
    va_x_out = pd.merge(va_x, train[id_cols], left_index=True, right_index=True, how='left')
    te_x_out = pd.merge(te_x, test[id_cols], left_index=True, right_index=True, how='left')

    # Target-encode each categorical column in turn.
    for c in cat_cols:
        # Mean target per category over the whole training part of the fold.
        data_tmp = pd.DataFrame({c: tr_x_out[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()

        # Validation and test rows use the means from the whole training part.
        va_x_out.loc[:, f'TGE_Mean_{c}'] = va_x_out[c].map(target_mean)
        te_x_out.loc[:, f'TGE_Mean_{c}'] = te_x_out[c].map(target_mean)

        # Training rows are encoded out-of-fold so that a row never sees its own target.
        tmp = np.repeat(np.nan, tr_x_out.shape[0])
        kf_encoding = KFold(n_splits=4, shuffle=True, random_state=seed)
        for idx_1, idx_2 in kf_encoding.split(tr_x_out):
            # Category means computed on the other inner folds only.
            inner_target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            # Store the encoded values for the held-out inner fold.
            tmp[idx_2] = tr_x_out[c].iloc[idx_2].map(inner_target_mean)

        tr_x_out.loc[:, f'TGE_Mean_{c}'] = tmp
    tr_x_out = tr_x_out.drop(id_cols, axis=1)
    va_x_out = va_x_out.drop(id_cols, axis=1)
    te_x_out = te_x_out.drop(id_cols, axis=1)

    return tr_x_out, va_x_out, te_x_out

In [None]:
target = 'likes'
#cv = kfold_cv(train_feat, train[target], n_splits=3)
cv = kfold_cv(train_feat, train[target])
#cv = stratified_kfold_cv(train_feat, train[target])

In [None]:
# RMSLE対策に対数変換をかける
train_target = np.log1p(train[target])

In [None]:
# チューニングする場合の処理
if RUN_OPTUNA == True:
    params = {
        'objective': 'regression',
        'metrics': 'rmse',
        'seed': SEED
    }

    import optuna.integration.lightgbm as lgb  # lgbをoptunaで再呼び出し

In [None]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [None]:
# Raw columns joined in for target encoding, and the columns that get target-encoded.
id_cols = ['principal_maker']
cat_cols = ['principal_maker']

oof_preds = np.zeros(len(train_feat))
test_preds = np.zeros(len(test_feat))

importances = pd.DataFrame()
scores = []
models = []

# Test predictions are averaged over however many folds `cv` actually holds
# (this used to be a hard-coded 5, which is wrong for any other n_splits).
n_folds = len(cv)

# Label-encoded columns are passed to LightGBM as categorical features.
le_feature_names = [col_ for col_ in train_feat.columns if col_[:3] == 'LE_']

for i, (train_index, valid_index) in enumerate(cv):
    print(f'\nFold {i + 1}')
    trn_x, trn_y = train_feat.iloc[train_index], train_target.iloc[train_index]
    val_x, val_y = train_feat.iloc[valid_index], train_target.iloc[valid_index]
    trn_x, val_x, test_feat_tge = target_encoding_for_cv(trn_x, trn_y, val_x, test_feat, cat_cols, id_cols)

    dtrain = lgb.Dataset(trn_x, trn_y, categorical_feature = list(le_feature_names))
    dvalid = lgb.Dataset(val_x, val_y, categorical_feature = list(le_feature_names))

    if RUN_OPTUNA == False:
        params = param_tuned[i] # Without tuning, use the previously tuned parameters of this fold.

    model = lgb.train(
        params,
        train_set=dtrain,
        num_boost_round=100000,
        valid_sets=[dtrain, dvalid],
        valid_names=['training', 'valid'],
        early_stopping_rounds=20,
        verbose_eval=50
    )

    val_preds = model.predict(val_x)
    oof_preds[valid_index] = val_preds
    test_preds += model.predict(test_feat_tge) / n_folds

    val_score = model.best_score['valid']['rmse']
    scores.append(val_score)
    models.append(model)

    imp_df = pd.DataFrame({
        'feature': model.feature_name(),
        'gain': model.feature_importance(importance_type='gain'),
        'fold': i+1
    })

    importances = pd.concat([importances, imp_df], axis=0)

mean_score = np.mean(scores)
std_score  = np.std(scores)
# RMSE over all out-of-fold predictions (the target is already log1p-transformed).
all_score  = np.sqrt(mean_squared_error(train_target, oof_preds))
metrics_name = 'RMSE'
print(f'Mean {metrics_name}: {mean_score}, std: {std_score}, All {metrics_name}: {all_score}')

In [None]:
if RUN_OPTUNA == True:
    # Persist the models trained in this run.
    with open(TUNED_MODEL_DIR_OUT/f'models.pkl', 'wb') as f:
        pickle.dump(models , f)

    # Also persist just their parameters, one entry per model.
    params_tuned = [trained_model.params for trained_model in models]

    with open(TUNED_MODEL_DIR_OUT/f'params_tuned.pkl', 'wb') as f:
        pickle.dump(params_tuned , f)

### 履歴
#### nb056
Mean RMSE: 0.9795419784472446, std: 0.02287319064231735, All RMSE: 0.9798083364376317

#### Tuned
Mean RMSE: 0.970350615683453, std: 0.021076563253025746, All RMSE: 0.9705787352541676

In [None]:
plt.figure(figsize=(8, 10))
sns.barplot(x='gain', y='feature', data=importances.sort_values('gain', ascending=False)[:100]);
plt.savefig(os.path.join(OUTPUT_DIR, 'feature_importance.png'))

In [None]:
# CVごとのブレをboxen plotとして表現
# 参考: https://www.guruguru.science/competitions/13/discussions/d8f2d66a-aeee-4789-8b3d-d5935c26b1b7/
order = importances.groupby('feature')\
    .sum()[['gain']]\
    .sort_values('gain', ascending=False).index[:50]

fig, ax = plt.subplots(figsize=(max(6, len(order) * .4), 7))
sns.boxenplot(data=importances, x='feature', y='gain', order=order, ax=ax, palette='viridis')
ax.tick_params(axis='x', rotation=90)
ax.grid()
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, 'feature_importance_boxen.png'))

In [None]:
# SHAPによる可視化。
# 参考その1: https://github.com/slundberg/shap/issues/337
# 参考その2: https://github.com/slundberg/shap/issues/630
import shap

shap_values = []
for model_ in models:
    explainer = shap.TreeExplainer(model_)
    shap_values.append(explainer.shap_values(test_feat_tge))
    
shap_mean = np.mean(shap_values, axis=0)

In [None]:
# summary_plot
# 参考_画像の出力について: https://github.com/slundberg/shap/issues/153
shap.summary_plot(shap_mean, test_feat_tge, show=False)
plt.subplots_adjust(left=0.4, right=1.0)  # 保存画像のラベルが欠けるのを防ぐ
plt.savefig(os.path.join(OUTPUT_DIR, 'shap_summary_plot.png'))

In [None]:
# dependence_plot
# 参考1: https://github.com/slundberg/shap/issues/1206
# 参考2: https://slundberg.github.io/shap/notebooks/plots/dependence_plot.html

def dependence_plot_le(feat_disp:str):
    """Draw and save a SHAP dependence plot for a label-encoded feature.

    The integer codes of ``LE_<feat_disp>`` are mapped back to the original
    category values with the encoder stored in ``le_dic`` so the axis shows
    readable labels. The figure is written to
    ``OUTPUT_DIR/shap_dependence_plot_<feat_disp>.png``.

    Args:
        feat_disp: Category name without the ``LE_`` prefix (a key of ``le_dic``).
    """
    # Restore the label-encoded values to the original categories.
    test_disp = test_feat_tge.copy()
    feat_disp_prefix = 'LE_' + feat_disp
    test_disp[feat_disp_prefix] = le_dic[feat_disp].inverse_transform(test_feat[feat_disp_prefix])

    # Draw without showing, so the figure is still open when it is saved;
    # showing it first can leave savefig with an empty figure.
    shap.dependence_plot(feat_disp_prefix, shap_mean, test_disp, show=False)
    plt.subplots_adjust(left=0.4, right=1.0)  # Keep the labels from being cut off in the saved image.
    plt.savefig(os.path.join(OUTPUT_DIR, f'shap_dependence_plot_{feat_disp}.png'))
    plt.show()

In [None]:
sub = atmacup10__sample_submission.copy()

In [None]:
sub.head()

In [None]:
test_preds.shape

In [None]:
sub['likes'] = np.expm1(test_preds)

In [None]:
# マイナス値は0とする
sub['likes'] = sub['likes'].map(lambda x: 0 if x < 0 else x)

In [None]:
sub.head()

In [None]:
sub.to_csv(os.path.join(OUTPUT_DIR, 'submission.csv'), index=False)

In [None]:
# 分布(train_vs_oof)
fig, ax = plt.subplots(figsize=(8, 8))
sns.distplot(train_target, label='Train', ax=ax, color='C1')
sns.distplot(oof_preds, label='Out Of Fold', ax=ax, color='C2')
ax.legend()
ax.grid()
plt.savefig(os.path.join(OUTPUT_DIR, 'train_vs_oof.png'))

In [None]:
# Distribution: out-of-fold predictions vs. test predictions
# Both are on the log1p scale the models were trained on, so test_preds is
# plotted as-is. Applying log1p again would compare mismatched scales (the
# submission cell converts test_preds back with expm1).
fig, ax = plt.subplots(figsize=(8, 8))
sns.distplot(oof_preds, label='Out Of Fold', ax=ax, color='C2')
sns.distplot(test_preds, label='Test Predict', ax=ax, color='black')
ax.legend()
ax.grid()
plt.savefig(os.path.join(OUTPUT_DIR, 'oof_vs_test.png'))

In [None]:
pd.Series(oof_preds)[pd.Series(oof_preds) <= 1].count()