# exp 003

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import plotly
from plotly import express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import texthero as hero
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold, StratifiedKFold
from gensim.models import word2vec, KeyedVectors
from sklearn.pipeline import Pipeline
from pathlib import Path
from typing import Union, Tuple, List
from tqdm import tqdm
import lightgbm as lgb
import category_encoders as ce
import wandb
from wandb.lightgbm import wandb_callback

import sys
# Make the project-local helper modules under ../../src importable.
sys.path.append('../../src')
import utils

# NOTE(review): the logger is disabled here, yet create_feature below expects a
# module-level `logger`; nothing else in this file defines that name.
# logger = utils.get_logger()

# Register tqdm's pandas integration (Series.progress_apply is used by W2vBlock).
tqdm.pandas()

# Directory layout, relative to this notebook.
INPUT_DIR = "../../input"
OUTPUT_DIR = "../../output"
# Experiment name; also used as the submission file name at the end of the file.
EXP_NAME = "exp_003"

# Default join key used by merge_by_key.
OBJECT_ID = "object_id"

# Experiment tracking is currently disabled.
# wandb.init(project="atmacup-10", name=EXP_NAME)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')




In [None]:
class ArtDataset(object):
    """Loads every competition CSV from one directory into DataFrame attributes.

    Attributes:
        target: name of the target column in ``train_df`` (``'likes'``).
        file_path: directory the CSV files were read from.
        *_df / submission: one DataFrame per CSV file (see ``_TABLES``).
    """

    # (attribute name, CSV file name) pairs, in load order.
    _TABLES = (
        ('train_df', 'train.csv'),
        ('test_df', 'test.csv'),
        ('color_df', 'color.csv'),
        ('historical_df', 'historical_person.csv'),
        ('maker_df', 'maker.csv'),
        ('material_df', 'material.csv'),
        ('object_df', 'object_collection.csv'),
        ('palette_df', 'palette.csv'),
        ('principal_occupation_df', 'principal_maker_occupation.csv'),
        ('principal_maker_df', 'principal_maker.csv'),
        ('production_df', 'production_place.csv'),
        ('technique_df', 'technique.csv'),
        ('submission', 'sample_submission.csv'),
    )

    def __init__(self, file_path: Path):
        """Read all tables found under ``file_path``."""
        self.target = 'likes'
        self.file_path = file_path
        for attr_name, csv_name in self._TABLES:
            setattr(self, attr_name, pd.read_csv(self.file_path / csv_name))

    def get_whole_df(self):
        """Return train rows followed by test rows, with a fresh 0..n-1 index."""
        stacked = pd.concat([self.train_df, self.test_df], axis=0)
        return stacked.reset_index(drop=True)

    def get_target(self, log: bool = False):
        """Return the training target as an array, ``log1p``-transformed if ``log``."""
        raw_target = self.train_df[self.target].values
        if log:
            return np.log1p(raw_target)
        return raw_target

# Load every table once; several feature blocks below read from this module-level instance.
art_ds = ArtDataset(file_path=Path(INPUT_DIR))

## table / aggregation feature

In [None]:
class BaseBlock(object):
    """Base class for feature blocks.

    A block turns an input DataFrame into a DataFrame of features with the
    same number of rows. Stateless blocks only override ``transform``;
    blocks that learn something from the training data override ``fit`` as well.
    """

    def fit(self, input_df: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on the training frame and return its features.

        The default implementation has nothing to learn and simply delegates
        to ``transform``; ``y`` is accepted for interface compatibility.
        """
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Return the features for ``input_df``.

        Raises:
            NotImplementedError: always, unless overridden by a subclass.
        """
        # Raise (not return) so that a missing override fails loudly instead
        # of handing an exception instance to the caller as if it were data.
        raise NotImplementedError()


class OrdinalEncodingBlock(BaseBlock):
    """Ordinal-encode categorical columns with ``category_encoders.OrdinalEncoder``.

    Output columns carry the ``OE_`` prefix and are cast to int.
    """

    def __init__(self, cat_cols: list):
        self.cat_cols = cat_cols
        self.encoder = None  # created in fit()

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on ``input_df`` and return the encoded training frame."""
        self.encoder = encoder = ce.OrdinalEncoder(
            handle_unknown="value",
            handle_missing="value",
        )
        encoder.fit(input_df[self.cat_cols])
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Encode ``input_df`` with the already fitted encoder."""
        encoded = self.encoder.transform(input_df[self.cat_cols])
        prefixed = encoded.add_prefix("OE_")
        return prefixed.astype(int)


class CountEncodingBlock(BaseBlock):
    """Count-encode categorical columns with ``category_encoders.CountEncoder``.

    Output columns carry the ``CE_`` prefix.
    """

    def __init__(self, cat_cols: list):
        self.cat_cols = cat_cols
        self.encoder = None  # created in fit()

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit the encoder on ``input_df`` and return the encoded training frame."""
        self.encoder = encoder = ce.CountEncoder(
            handle_unknown=-1,
            handle_missing="count",
        )
        encoder.fit(input_df[self.cat_cols])
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Encode ``input_df`` with the already fitted encoder."""
        encoded = self.encoder.transform(input_df[self.cat_cols])
        return encoded.add_prefix("CE_")


class TargetEncodingBlock(BaseBlock):
    """Out-of-fold target (mean) encoding.

    - ``fit`` returns out-of-fold encodings for the training frame: every row
      gets the target mean of its category computed on the other folds.
    - ``mapping_df`` holds, per column, the category means computed on the
      whole training frame; ``transform`` applies them (used for the test set).
    - xfeat's TargetEncoding yields 0 for levels that only occur in the
      held-out part of a fold; here such levels are filled with the mean of
      that fold's category means instead.
    - Smoothing is not implemented yet.

    Args:
        columns: categorical columns to encode.
        target_column: column of the training frame holding the target.
        cv: list of ``(train_idx, valid_idx)`` pairs of *positional* indices
            (they are used with ``iloc`` and to index a plain ndarray).
    """

    def __init__(self, columns: List[str], target_column: str, cv: List[np.ndarray]):
        self.columns = columns
        self.target_column = target_column
        self.cv = cv
        self.num_fold = len(self.cv)

    def create_mapping(self, input_df: pd.DataFrame, y: np.ndarray = None):
        """Compute out-of-fold encodings and the full-data category means.

        ``y`` is accepted for interface compatibility but is not used: all
        statistics are derived from ``input_df[self.target_column]`` so that
        the encodings and the fallback value share one scale.

        Returns:
            DataFrame with one out-of-fold encoded column per entry of
            ``self.columns`` (un-prefixed, default integer index).
        """
        self.mapping_df = {}
        # Fallback for categories never seen during fit. It must live on the
        # same scale as the encoded values, i.e. be the mean of the encoded
        # target column -- not of a separately (e.g. log-) transformed `y`.
        self.y_mean = input_df[self.target_column].mean()
        output_df = pd.DataFrame()

        for col in self.columns:
            keys = input_df[col].unique()
            oof = np.zeros(len(input_df), dtype=float)

            for train_idx, valid_idx in self.cv:
                fold_means = input_df.iloc[train_idx].groupby([col])[self.target_column].mean()
                # Expose every known level, then fill the ones missing from
                # this fold's training part with the mean of the fold means.
                fold_means = fold_means.reindex(keys)
                fold_means = fold_means.fillna(fold_means.mean())
                oof[valid_idx] = input_df.iloc[valid_idx][col].map(fold_means.to_dict())

            output_df[col] = oof

            # Means over the whole training frame, used by transform().
            self.mapping_df[col] = input_df.groupby([col])[self.target_column].mean()

        return output_df

    def fit(self, input_df: pd.DataFrame, y=None):
        """Return ``TE_``-prefixed out-of-fold encodings for the training frame."""
        output_df = self.create_mapping(input_df, y)

        return output_df.add_prefix("TE_")

    def transform(self, input_df: pd.DataFrame):
        """Encode ``input_df`` with the full-data means; unseen levels get ``y_mean``."""
        output_df = pd.DataFrame()

        for col in self.columns:
            output_df[col] = input_df[col].map(self.mapping_df[col]).fillna(self.y_mean)

        return output_df.add_prefix("TE_")


def merge_by_key(left: Union[pd.DataFrame, pd.Series], right: pd.DataFrame, on=OBJECT_ID) -> pd.DataFrame:
    """Left-join ``right`` onto the key values of ``left`` and drop the key.

    Args:
        left: a Series of key values, or a DataFrame whose ``on`` column is used.
        right: table to look up; joined on ``on``.
        on: name of the join key.

    Returns:
        The columns of ``right`` for each key of ``left``, without the key column.
    """
    key_values = left if isinstance(left, pd.Series) else left[on]
    joined = pd.merge(key_values, right, on=on, how='left')
    return joined.drop(columns=[on])


class OtherTableCountBlock(BaseBlock):
    """Per-object crosstab counts for an auxiliary table.

    Intended for material, technique, production_place, historical_person and
    object_collection; the ``name`` column is what gets counted. Only names
    occurring at least ``minimum_freq`` times in the table are kept.
    """

    def __init__(self, category_df: pd.DataFrame, df_name: str, minimum_freq: int) -> None:
        self.category_df = category_df
        self.df_name = df_name
        self.minimum_freq = minimum_freq

    def fit(self, input_df: pd.DataFrame, y=None):
        """Build the object_id x name count table and return the training features."""
        names = self.category_df["name"]
        name_counts = names.value_counts()
        frequent_names = name_counts.loc[name_counts >= self.minimum_freq].index

        kept_rows = self.category_df[names.isin(frequent_names)].reset_index(drop=True)
        counts = pd.crosstab(kept_rows["object_id"], kept_rows["name"])
        self.agg_df = counts.reset_index()

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Look up the counts for each row; objects absent from the table get 0."""
        looked_up = merge_by_key(input_df, self.agg_df)
        as_int = looked_up.fillna(0).astype(int)
        return as_int.add_prefix(f"OtherTableCount_{self.df_name}_")


def detect_size_from_subtitle(input_df: pd.DataFrame):
    """Extract object sizes from the ``sub_title`` column, in millimetres.

    For each axis letter (``h``, ``w``, ``t``, ``d``) the first occurrence of
    ``"<axis> <number><cm|mm>"`` is parsed; centimetre values are multiplied
    by 10 so every column is in mm. Rows without a match get NaN.

    Args:
        input_df: frame with a string column ``sub_title``.

    Returns:
        DataFrame with float columns ``size_h``, ``size_w``, ``size_t`` and
        ``size_d``, indexed like ``input_df``.
    """
    output_df = pd.DataFrame()
    for axis in ['h', 'w', 't', 'd']:
        column_name = f'size_{axis}'
        # Group 0 is the number, group 1 the unit.
        extracted = input_df['sub_title'].str.extract(r'{} (\d*|\d*\.\d*)(cm|mm)'.format(axis))
        # The number group may match an empty string; treat that as missing.
        # The extracted values are strings (object dtype), hence the cast.
        value = extracted[0].replace('', np.nan).astype(float)
        is_cm = (extracted[1] == 'cm').fillna(False).astype(bool)
        # Unify the unit to mm (vectorized instead of a row-wise apply).
        output_df[column_name] = value.where(~is_cm, value * 10)

    return output_df


class SubtitleSizeBlock(BaseBlock):
    """Size features parsed from ``sub_title``.

    Produces the four sizes (h, w, t, d), the area, the aspect ratio and the
    number of missing values per row, all prefixed with ``Subtitle_``.
    """

    def transform(self, input_df: pd.DataFrame):
        sizes = detect_size_from_subtitle(input_df)
        width = sizes["size_w"]
        height = sizes["size_h"]
        sizes["area"] = width * height
        sizes["aspect"] = width / height
        # Counted after area/aspect were added, so their NaNs are included too.
        sizes["missing_attributes"] = sizes.isnull().sum(axis=1)

        return sizes.add_prefix("Subtitle_")


def parse_year(s: str):
    """Parse the year out of a maker's date of birth / death.

    Args:
        s: ``None``, a number (returned unchanged, which lets NaN pass
            through), or a string such as ``"1606"`` or ``"1606-07-15"``.
            A leading minus sign (e.g. ``"-0100-01-01"``) is read as a
            negative year.

    Returns:
        ``None`` for ``None``, the number itself for numeric input, otherwise
        the year as ``int``.

    Raises:
        ValueError: if the year part of a string is not an integer.
    """
    if s is None:
        return None

    # Numbers (including NaN and NumPy scalars) are already "parsed".
    if isinstance(s, (int, float, np.integer, np.floating)):
        return s

    text = s.strip()
    sign = 1
    if text.startswith('-'):
        # Keep the sign out of the way so that splitting on '-' below does not
        # produce an empty year part.
        sign = -1
        text = text[1:]

    return sign * int(text.split('-')[0])


class MakerAgeBlock(BaseBlock):
    """Birth/death year of the principal maker and the maker's age at the
    early and late dating year of the object.
    """

    def fit(self, input_df: pd.DataFrame, y=None):
        """Build the per-maker year table from ``art_ds.maker_df``."""
        makers = art_ds.maker_df
        birth_year = makers["date_of_birth"].map(parse_year)
        death_year = makers["date_of_death"].map(parse_year)

        year_table = makers[["name"]].copy()
        year_table["birth_year"] = birth_year
        year_table["death_year"] = death_year
        year_table["living_year"] = death_year - birth_year
        self.agg_df = year_table

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Join the year table on the principal maker's name and derive ages."""
        maker_names = input_df["principal_maker"].rename("name")
        features = merge_by_key(maker_names, self.agg_df, on="name")

        born = features["birth_year"]
        features["age_in_dating_early"] = input_df["dating_year_early"] - born
        features["age_in_dating_late"] = input_df["dating_year_late"] - born

        return features.add_prefix("MakerAge_")


class ObjectYearMetaBlock(BaseBlock):
    """Year-related features of an object: acquisition year, dating columns,
    and the differences between them.
    """

    def transform(self, input_df: pd.DataFrame):
        acquisition_year = pd.to_datetime(input_df["acquisition_date"]).dt.year
        output_df = pd.DataFrame({"acquisition_year": acquisition_year})

        # Columns copied through unchanged.
        passthrough = [
            "dating_sorting_date",
            "dating_period",
            "dating_year_early",
            "dating_year_late",
        ]
        for name in passthrough:
            output_df[name] = input_df[name]

        late = output_df["dating_year_late"]
        output_df["diff_dating_year"] = late - output_df["dating_year_early"]
        output_df["diff_acquisition_dating"] = output_df["acquisition_year"] - late

        return output_df


class PrincipalMakerCountByObjectBlock(BaseBlock):
    """Crosstab counts of principal-maker attributes and occupations per ``object_id``."""

    def fit(self, input_df: pd.DataFrame, y=None):
        """Build the per-object count table from the principal maker tables of ``art_ds``."""
        maker_df = art_ds.principal_maker_df
        occupation_df = art_ds.principal_occupation_df

        crosstabs = [
            pd.crosstab(maker_df["object_id"], maker_df[col]).add_prefix(f"{col}_")
            for col in ["qualification", "roles", "productionPlaces"]
        ]
        # Concatenate once instead of growing a frame inside the loop. The
        # list is reversed because each new table used to be prepended, which
        # keeps the resulting column order unchanged.
        agg_df = pd.concat(crosstabs[::-1], axis=1)

        # Occupations are keyed by the maker row id; attach the object id first.
        occupation_maker_df = occupation_df.merge(maker_df[["id", "object_id"]], on="id", how="left")
        occupation_count_df = pd.crosstab(occupation_maker_df["object_id"], occupation_maker_df["name"]).add_prefix("occupation_")
        agg_df = agg_df.merge(occupation_count_df, on="object_id", how="left")

        self.agg_df = agg_df

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Look up the counts per row; missing combinations become 0."""
        output_df = merge_by_key(input_df, self.agg_df).fillna(0).astype(int)

        return output_df.add_prefix("PrincipalMakerByObject_")


class PrincipalMakerCountByMakerBlock(BaseBlock):
    """Crosstab counts of principal-maker attributes and occupations per maker name."""

    def fit(self, input_df: pd.DataFrame, y=None):
        """Build the per-maker count table from the principal maker tables of ``art_ds``."""
        maker_df = art_ds.principal_maker_df
        occupation_df = art_ds.principal_occupation_df

        crosstabs = [
            pd.crosstab(maker_df["maker_name"], maker_df[col]).add_prefix(f"{col}_")
            for col in ["qualification", "roles", "productionPlaces"]
        ]
        # Concatenate once instead of growing a frame inside the loop. The
        # list is reversed because each new table used to be prepended, which
        # keeps the resulting column order unchanged.
        agg_df = pd.concat(crosstabs[::-1], axis=1)

        # Occupations are keyed by the maker row id; attach the maker name first.
        occupation_maker_df = occupation_df.merge(maker_df[["id", "maker_name"]], on="id", how="left")
        occupation_count_df = pd.crosstab(occupation_maker_df["maker_name"], occupation_maker_df["name"]).add_prefix("occupation_")
        agg_df = agg_df.merge(occupation_count_df, on="maker_name", how="left")

        # Rename the index so that it can be joined on the `principal_maker` column.
        agg_df.index = agg_df.index.rename("principal_maker")
        self.agg_df = agg_df

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Look up the counts per row; missing combinations become 0."""
        output_df = merge_by_key(input_df, self.agg_df, on="principal_maker").fillna(0).astype(int)

        return output_df.add_prefix("PrincipalMakerByMaker_")




## text feature

In [None]:
class StringLengthBlock(BaseBlock):
    """Length of the string in each given column (missing values count as 0)."""

    def __init__(self, columns: list):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        lengths = {
            col: input_df[col].fillna("").str.len()
            for col in self.columns
        }
        return pd.DataFrame(lengths).add_prefix("StringLength_")


class WordCountBlock(BaseBlock):
    """Number of whitespace-separated words in each given column
    (missing values count as 0).
    """

    def __init__(self, columns: list):
        self.columns = columns

    def transform(self, input_df: pd.DataFrame):
        def count_words(text):
            return len(text.split())

        word_counts = {
            col: input_df[col].fillna("").apply(count_words)
            for col in self.columns
        }
        return pd.DataFrame(word_counts).add_prefix("WordCount_")

    
def text_normalization(text):
    """Clean a text Series with a texthero preprocessing pipeline.

    Steps, in order: fill missing values, lowercase, remove digits, remove
    punctuation, remove diacritics, remove Dutch and English stopwords (from
    nltk), and clean up whitespace.
    """
    # Dutch + English stopwords from nltk.
    dutch_words = nltk.corpus.stopwords.words('dutch')
    english_words = nltk.corpus.stopwords.words('english')
    custom_stopwords = dutch_words + english_words

    def drop_stopwords(series):
        return hero.preprocessing.remove_stopwords(series, stopwords=custom_stopwords)

    steps = [
        hero.preprocessing.fillna,
        hero.preprocessing.lowercase,
        hero.preprocessing.remove_digits,
        hero.preprocessing.remove_punctuation,
        hero.preprocessing.remove_diacritics,
        drop_stopwords,
        hero.preprocessing.remove_whitespace,
    ]

    return hero.clean(text, pipeline=steps)


class TfidfBlock(BaseBlock):
    """TF-IDF features reduced with truncated SVD.

    ``column`` may name several columns joined with ``&``; their normalized
    texts are concatenated with a single space before vectorizing.
    """

    def __init__(self, column: str, n_components: int = 50):
        self.column = column
        self.n_components = n_components

    def get_text_series(self, input_df: pd.DataFrame) -> pd.Series:
        """Return the normalized text of ``input_df`` as one Series.

        For an ``&``-joined column spec the per-column texts are concatenated,
        separated by a space.
        """
        normalized_parts = [
            text_normalization(input_df[col]).astype(str)
            for col in self.column.split('&')
        ]

        combined = normalized_parts[0]
        for extra in normalized_parts[1:]:
            combined = combined + ' ' + extra

        return combined

    def fit(self, input_df: pd.DataFrame, y=None):
        """Fit TF-IDF + SVD and cache the vectors per ``object_id``.

        The pipeline is fitted on train and test together
        (``art_ds.get_whole_df()``), not on ``input_df``.
        """
        whole_df = art_ds.get_whole_df()
        corpus = self.get_text_series(whole_df)

        steps = [
            ('tfidf', TfidfVectorizer(max_features=20000)),
            ('svd', TruncatedSVD(n_components=self.n_components, random_state=42)),
        ]
        self.pipeline = Pipeline(steps)

        vectors = pd.DataFrame(self.pipeline.fit_transform(corpus))
        object_ids = whole_df[[OBJECT_ID]].copy()
        self.agg_df = pd.concat([object_ids, vectors], axis=1)

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Look up the cached vectors for the objects in ``input_df``."""
        looked_up = merge_by_key(input_df, self.agg_df)
        return looked_up.add_prefix(f"{self.column}_Tfidf_")


def get_w2v_input_df():
    """Build the input tables for the word2vec blocks.

    Every returned DataFrame has one row per ``object_id`` plus a single
    column holding the list of values collected for that object.

    Returns:
        dict mapping a table name (``"w2v_..."``) to its DataFrame.
    """
    def values_by_object(df, column):
        # One row per object_id, with all values of `column` gathered in a list.
        return df.groupby(["object_id"])[column].apply(list).reset_index()

    def stack(*tables):
        # Row-wise concatenation with a fresh index.
        return pd.concat(list(tables), axis=0).reset_index(drop=True)

    # whole_df (train/test)
    whole_df = art_ds.get_whole_df()
    for text_col in ("title", "description", "more_title"):
        whole_df[text_col] = whole_df[text_col].astype(str)
    whole_df["title-description"] = whole_df["title"].astype(str) + ' ' + whole_df["description"].astype(str)

    # auxiliary tables; their `name` column is what gets collected
    material_df = art_ds.material_df.copy()
    object_df = art_ds.object_df.copy()
    technique_df = art_ds.technique_df.copy()

    return {
        "w2v_whole_title": values_by_object(whole_df, "title"),
        "w2v_whole_description": values_by_object(whole_df, "description"),
        "w2v_whole_more_title": values_by_object(whole_df, "more_title"),
        "w2v_whole_title_description": values_by_object(whole_df, "title-description"),
        "w2v_material": values_by_object(material_df, "name"),
        "w2v_object": values_by_object(object_df, "name"),
        "w2v_technique": values_by_object(technique_df, "name"),
        "w2v_material_object": values_by_object(stack(material_df, object_df), "name"),
        "w2v_material_technique": values_by_object(stack(material_df, technique_df), "name"),
        "w2v_object_technique": values_by_object(stack(object_df, technique_df), "name"),
        "w2v_material_object_technique": values_by_object(
            stack(material_df, object_df, technique_df), "name"
        ),
    }


class W2vBlock(BaseBlock):
    """Sentence vectors obtained by averaging Word2Vec word vectors (SWEM-aver).

    Args:
        sentences_df: frame with an ``object_id`` column and one more column
            holding, per object, a list of strings (or a single string).
        df_name: prefix of the generated feature columns.
        vector_size, min_count, window, epochs: passed to gensim ``Word2Vec``.
    """

    def __init__(self, sentences_df: pd.DataFrame, df_name: str, vector_size: int, min_count: int, window: int, epochs: int):
        self.sentences_df = sentences_df
        self.df_name = df_name
        self.vector_size = vector_size
        self.min_count = min_count
        self.window = window
        self.epochs = epochs

    def fit(self, input_df: pd.DataFrame, y=None):
        """Train Word2Vec on ``sentences_df`` and cache one vector per ``object_id``."""
        self.agg_df = art_ds.get_whole_df()[['object_id']]
        cat_col = self.sentences_df.columns.drop("object_id")[0]

        # text normalization
        # The normalizer works on strings, so a list of values is first joined
        # into one text. The result is kept local: the caller's sentences_df
        # is not modified.
        raw_text = self.sentences_df[cat_col].map(
            lambda v: " ".join(map(str, v)) if isinstance(v, (list, tuple)) else v
        )
        # Word2Vec expects every sentence to be a list of tokens. Handing it
        # plain strings makes it iterate over characters, so split into words.
        # NOTE(review): multi-word names (e.g. a material name) end up as
        # separate tokens -- confirm that this is the intended granularity.
        tokens = text_normalization(raw_text).str.split()

        self.w2v_model = word2vec.Word2Vec(
            tokens.values.tolist(),
            vector_size=self.vector_size,
            min_count=self.min_count,
            window=self.window,
            epochs=self.epochs,
            sg=1,
        )

        word_vectors = self.w2v_model.wv
        # Used when no token of a sentence is in the vocabulary (empty text
        # after cleaning, or every token dropped by min_count). NaN matches
        # what objects absent from sentences_df get from the left merge below.
        no_vector = np.full(self.vector_size, np.nan)

        def average_vector(words):
            known = [word_vectors[w] for w in words if w in word_vectors]
            return np.mean(known, axis=0) if known else no_vector

        # element-wise average (SWEM-aver)
        sentence_vectors = tokens.progress_apply(average_vector)
        sentence_vectors = np.vstack(sentence_vectors.tolist())
        sentence_vectors_df = pd.DataFrame(sentence_vectors, columns=[f"{self.df_name}_{i}" for i in range(self.vector_size)])
        sentence_vectors_df.index = self.sentences_df["object_id"]
        self.agg_df = self.agg_df.merge(sentence_vectors_df, on='object_id', how='left')

        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Look up the cached sentence vectors for the objects in ``input_df``."""
        return merge_by_key(input_df, self.agg_df)

## palette feature

## create feature

In [None]:
def create_feature(train_df: pd.DataFrame, test_df: pd.DataFrame, y, blocks: list, logger=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Run every feature block and assemble the train / test feature frames.

    Each block is first fitted on the training frame (``block.fit``), then
    applied to the test frame (``block.transform``). The outputs are
    concatenated column-wise in block order.

    Args:
        train_df: training frame.
        test_df: test frame.
        y: training target, forwarded to every ``block.fit``.
        blocks: feature blocks to run.
        logger: logger handed to ``utils.timer``. When omitted, a module-level
            ``logger`` is used if one exists, otherwise ``None`` is passed.
            NOTE(review): assumes ``utils.timer`` accepts ``logger=None`` -- confirm.

    Returns:
        ``(train_feat_df, test_feat_df)``; both are empty for an empty ``blocks``.

    Raises:
        AssertionError: if a block returns a frame whose length differs from its input.
    """
    if logger is None:
        # Fall back to a module-level logger without failing when none is defined.
        logger = globals().get("logger")

    train_blocks = []
    for block in blocks:
        with utils.timer(name=f"{block}_fit", logger=logger):
            try:
                out_train_block = block.fit(train_df, y=y)
            except Exception:
                print(f'Error on {block} fit. ')
                raise  # re-raise unchanged, keeping the original traceback

            assert len(out_train_block) == len(train_df), block

        train_blocks.append(out_train_block)

    test_blocks = []
    for block in blocks:
        with utils.timer(name=f"{block}_transform", logger=logger):
            out_test_block = block.transform(test_df)
            assert len(out_test_block) == len(test_df), block

        test_blocks.append(out_test_block)

    # Concatenate once at the end instead of re-copying a growing frame per block.
    train_feat_df = pd.concat(train_blocks, axis=1) if train_blocks else pd.DataFrame()
    test_feat_df = pd.concat(test_blocks, axis=1) if test_blocks else pd.DataFrame()

    return train_feat_df, test_feat_df

## CV

In [None]:
def get_group_k_fold(train_df: pd.DataFrame, group_col: str, n_splits: int = 5, seed: int = 42) -> List[tuple]:
    """Split rows into folds such that a group never spans train and validation.

    The unique values of ``group_col`` are shuffled and split with ``KFold``;
    each row then follows its group.

    Args:
        train_df: training frame.
        group_col: column holding the group of each row.
        n_splits: number of folds.
        seed: random state of the shuffle.

    Returns:
        List of ``(train_idx, valid_idx)`` pairs of *positional* row indices,
        usable with ``iloc`` and with plain arrays whatever the frame's index is.
    """
    group_series = train_df[group_col]
    group_key = group_series.unique()

    splitter = KFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=seed
    )

    fold = []

    for tr_group_idx, val_group_idx in splitter.split(group_key):
        is_tr = group_series.isin(group_key[tr_group_idx]).to_numpy()
        is_val = group_series.isin(group_key[val_group_idx]).to_numpy()

        # Positions, not index labels: consumers index with iloc / ndarrays.
        fold.append((np.flatnonzero(is_tr), np.flatnonzero(is_val)))

    return fold


def get_stratified_k_fold(train_df: pd.DataFrame, y: np.ndarray, n_splits: int = 5, seed: int = 42) -> List[tuple]:
    """Split rows with ``StratifiedKFold``, stratifying on the rounded target.

    Returns:
        List of ``(train_idx, valid_idx)`` pairs as produced by the splitter.
    """
    # Rounding turns the continuous target into discrete strata.
    strata = y.round()
    features = train_df.copy()

    splitter = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=seed
    )

    return [
        (train_idx, valid_idx)
        for train_idx, valid_idx in splitter.split(features, strata)
    ]

## models

In [None]:
class LightGBMTrainer:
    """Trains LightGBM with cross-validation for several seeds and averages the results.

    Args:
        X_train: training features.
        y_train: training target, indexable by positional index arrays.
        X_test: test features, with the same columns as ``X_train``.
        params: LightGBM parameters; left untouched, the seed is set on a copy.
        cv: list of ``(train_idx, valid_idx)`` positional index pairs.
        seeds: one full cross-validation run is done per seed.
    """

    def __init__(
        self,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame,
        X_test: pd.DataFrame,
        params: dict,
        cv: List[tuple],
        seeds: list,
    ):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.params = params
        self.cv = cv
        self.seeds = seeds
        self.models = []

    def fit(self):
        """Train one model per (seed, fold).

        Returns:
            ``(y_oof, models)``: the out-of-fold predictions averaged over the
            seeds, and the list of trained models.
        """
        # Start from a clean slate so that calling fit() again does not mix
        # the models of an earlier run into predict().
        self.models = []
        oof = np.zeros((len(self.seeds), len(self.y_train)))

        for i, seed in enumerate(self.seeds):
            oof_ = np.zeros((len(self.y_train)))
            # Per-seed copy: the caller's params dict must not be modified.
            params = {**self.params, "seed": seed}

            for train_idx, valid_idx in self.cv:
                X_train_fold = self.X_train.iloc[train_idx].values
                X_valid_fold = self.X_train.iloc[valid_idx].values

                y_train_fold = self.y_train[train_idx]
                y_valid_fold = self.y_train[valid_idx]

                train_set = lgb.Dataset(X_train_fold, y_train_fold)
                valid_set = lgb.Dataset(X_valid_fold, y_valid_fold, reference=train_set)

                # NOTE(review): `verbose_eval` is not accepted by every
                # LightGBM version -- confirm against the installed one.
                model = lgb.train(
                    train_set=train_set,
                    valid_sets=[train_set, valid_set],
                    params=params,
                    verbose_eval=100,
                    # callbacks=[wandb_callback()]
                )

                oof_[valid_idx] = model.predict(
                    X_valid_fold,
                    num_iteration=model.best_iteration,
                )

                self.models.append(model)

            oof[i, :] = oof_

        y_oof = np.mean(oof, axis=0)

        return y_oof, self.models

    def predict(self):
        """Return the test predictions averaged over all trained models."""
        # The models were trained on plain arrays; predict on the same representation.
        X_test = np.asarray(self.X_test)
        y_pred = np.mean(
            [model.predict(X_test) for model in self.models], axis=0
        )

        return y_pred

## Training / Inference

In [None]:
# config
# LightGBM parameters shared by every fold; LightGBMTrainer sets "seed" for each run.
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 32,
    "lambda_l1": 0.01,
    "lambda_l2": 0.01,
    "bagging_fraction": 0.9,
    "bagging_freq": 3,
    "feature_fraction": 0.9,
    "min_data_in_leaf": 20,
    "num_threads": 8,
    "verbosity": -1,
    "num_iterations": 10000,
    "early_stopping_round": 100,
}
# wandb.config.update(lgb_params)
# One full cross-validation run is trained per seed.
seeds = [42, 2021, 2434, 1123, 98]

# feature engineering
train_df = art_ds.train_df
test_df = art_ds.test_df
# The model target is log1p(likes).
y_train = art_ds.get_target(log=True)
# 5 folds, stratified on the rounded log target, seed 42.
cv = get_stratified_k_fold(train_df, y_train, 5, 42)

# Blocks run in this order; their outputs are concatenated column-wise by create_feature.
blocks = [
    # table/aggregate features
    OrdinalEncodingBlock(
        cat_cols=[
            "principal_maker",
            "principal_or_first_maker",
            "copyright_holder",
            "acquisition_method",
        ]
    ),
    CountEncodingBlock(
        cat_cols=[
            "principal_maker",
            "principal_or_first_maker",
            "acquisition_method",
        ]
    ),
    # NOTE(review): this encodes the raw `likes` column, while the model target
    # (y_train) is log1p(likes) -- confirm the raw scale is intended.
    TargetEncodingBlock(
        columns=["principal_maker"],
        target_column="likes",
        cv=cv,
    ),
    # One count block per auxiliary table, each with a minimum name frequency of 30.
    *[OtherTableCountBlock(
    category_df=df,
    df_name=name,
    minimum_freq=min_freq,
    )
    for df, name, min_freq in zip(
        [art_ds.material_df, art_ds.technique_df, art_ds.production_df, art_ds.historical_df, art_ds.object_df],
        ["material", "technique", "production_place", "historical_person", "object_collection"],
        [30, 30, 30, 30, 30])
    ],
    SubtitleSizeBlock(),
    MakerAgeBlock(),
    ObjectYearMetaBlock(),
    PrincipalMakerCountByObjectBlock(),
    PrincipalMakerCountByMakerBlock(),
    # text feature
    StringLengthBlock(
        columns=[
            'title',
            'description',
            'long_title',
            'more_title',
            'sub_title',
        ]
    ),
    WordCountBlock(
        columns=[
            'title',
            'description',
            'long_title',
            'more_title',
            'sub_title',
        ]
    ),
    # One TF-IDF block per column spec; "&" joins several columns into one text.
    *[TfidfBlock(
        column=column,
        n_components=50) for column in [
            "title",
            "description",
            "long_title",
            "more_title",
            "title&description&long_title&more_title",
            "acquisition_credit_line",
        ]
    ],
    # One word2vec block per table returned by get_w2v_input_df().
    *[W2vBlock(
        sentences_df=sentences_df,
        df_name=df_name,
        vector_size=50,
        min_count=1,
        window=5,
        epochs=100
        ) for df_name, sentences_df in get_w2v_input_df().items()
    ],
]

# Fit every block on train, then transform test.
X_train, X_test = create_feature(train_df, test_df, y_train, blocks)

In [None]:
# training/inference
# One model is trained per (seed, fold). y_oof is the seed-averaged out-of-fold
# prediction and y_pred the model-averaged test prediction; both are on the
# log1p scale of y_train.
trainer = LightGBMTrainer(X_train, y_train, X_test, lgb_params, cv, seeds)
y_oof, models = trainer.fit()
y_pred = trainer.predict()

In [None]:
def show_feature_importance(models, feat_train_df) -> plotly.graph_objects.Figure:
    """Box plot of LightGBM gain importance for the 50 most important features.

    Args:
        models: trained LightGBM models; each one contributes one sample per feature.
        feat_train_df: training feature frame; its columns name the features.

    Returns:
        A plotly figure with the features ordered by total importance.
    """
    per_model = [
        pd.DataFrame({
            'feature_importance': model.feature_importance(importance_type="gain"),
            'feature': feat_train_df.columns,
            'fold': model_no,
        })
        for model_no, model in enumerate(models, start=1)
    ]
    if per_model:
        feature_importance_df = pd.concat(per_model, axis=0, ignore_index=True)
    else:
        feature_importance_df = pd.DataFrame()

    # Rank the features by their importance summed over all models.
    totals = feature_importance_df.groupby('feature').sum()[['feature_importance']]
    order = totals.sort_values('feature_importance', ascending=False).index[:50]

    top_df = feature_importance_df.query("feature in @order")
    fig = px.box(
        top_df,
        x="feature_importance",
        y="feature",
        category_orders={"feature": order},
        width=1250,
        height=900,
        title="Top 50 feature importance",
    )
    fig.update_yaxes(showgrid=True)

    return fig


def show_oof_predict_distribution(y_train: np.ndarray, y_oof: np.ndarray, y_pred: np.ndarray) -> plotly.graph_objects.Figure:
    """Overlaid histograms (with marginal box plots) of the train target, the
    out-of-fold predictions and the test predictions.
    """
    labelled = []
    for phase, value in (('train', y_train), ('oof', y_oof), ('pred', y_pred)):
        labelled.append(pd.DataFrame({'phase': phase, 'value': value}))
    df = pd.concat(labelled, axis=0)

    fig = px.histogram(df, x="value", color="phase", width=1250, marginal="box")
    fig.update_layout(barmode="overlay")
    bar_outline = dict(line=dict(width=2,color="DarkSlateGrey"))
    fig.update_traces(opacity=0.6, marker=bar_outline)

    return fig

In [None]:
# Gain importance of the top 50 features across all trained models.
fig = show_feature_importance(models, X_train)
fig.show()

In [None]:
# Compare the distributions of the target, the out-of-fold predictions and the test predictions.
fig = show_oof_predict_distribution(y_train, y_oof, y_pred)
fig.show()

In [None]:
def revert_log_target(y_log: np.ndarray) -> np.ndarray:
    """Map ``log1p``-scale values back to the original scale.

    Applies ``expm1`` and replaces negative results with 0.
    """
    restored = np.expm1(y_log)
    return np.where(restored < 0, 0, restored)


def make_submit(submission: pd.DataFrame, y_pred: np.ndarray, path: str, f_name: str):
    """Write a submission CSV holding ``y_pred`` in the ``likes`` column.

    Args:
        submission: template frame; it is copied, not modified.
        y_pred: predictions, one per row of ``submission``.
        path: output directory; created if it does not exist.
        f_name: file name without the ``.csv`` extension.

    Raises:
        ValueError: if ``y_pred`` and ``submission`` differ in length.
    """
    # Validate before assigning, so that a mismatch is reported clearly and
    # nothing is written.
    if len(submission) != len(y_pred):
        raise ValueError(
            f"Length of y_pred ({len(y_pred)}) does not match submission ({len(submission)})"
        )

    dir_path = Path(path)
    dir_path.mkdir(parents=True, exist_ok=True)

    submission_df = submission.copy()
    submission_df["likes"] = y_pred

    submission_df.to_csv(dir_path / f"{f_name}.csv", index=False)

In [None]:
# Predictions are on the log1p scale; map them back to like counts before writing the submission.
revert_pred = revert_log_target(y_pred)
make_submit(art_ds.submission, revert_pred, OUTPUT_DIR, EXP_NAME)