In [1]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import time


from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_percentage_error
from tqdm.auto import tqdm

import warnings
warnings.simplefilter("ignore")

import unicodedata
%pip install -U lightgbm
import lightgbm as lgb



Collecting lightgbm
  Downloading lightgbm-4.0.0-py3-none-manylinux_2_28_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 3.3.2
    Uninstalling lightgbm-3.3.2:
      Successfully uninstalled lightgbm-3.3.2
Successfully installed lightgbm-4.0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    filename = "exp011"
    seed = 42
    greedy_seed = 778
    n_splits = 5
    data_dir = "/kaggle/input/signate-studentcup2023/"
    year_bins = 20
    num_boost_round = 10000
    stopping_rounds = 1500
    n_trials = 1500
    save_dir = "/kaggle/working/"
    num_cores = 4 # kaggleの方と統一
    categorical_features = [
        "fuel", "title_status", "type", "state", "region", "manufacturer", "condition", "cylinders", "transmission", "drive", "size", "paint_color"
        ]
    candidates = [col + "_category" for col in categorical_features] +\
    ["log_odometer", "sqrt_odometer", "elapsed_years", "log_elapsed_years", "sqrt_elapsed_years", "elapsed_years*odometer",
    "elapsed_years*log_odometer", "elapsed_years*sqrt_odometer", "log_elapsed_years*odometer", "log_elapsed_years*log_odometer", 
     "log_elapsed_years*sqrt_odometer", "sqrt_elapsed_years*odometer", "sqrt_elapsed_years*log_odometer", "sqrt_elapsed_years*sqrt_odometer",
    "region_latitude", "region_longitude", "state_latitude", "state_longitude"]
    use_features = ["odometer", "year"]

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """fix random factors"""
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(CFG.seed)
    

def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100


def kfold(CFG, all_data: pd.DataFrame):
    """kfold

    Args:
        CFG : config
        all_data (pd.DataFrame)

    Returns:
        train, test
    """
    train = all_data[all_data["flag"] == "train"].reset_index(drop=True)
    test = all_data[all_data["flag"] == "test"].reset_index(drop=True)
    # priceを小さい順に各foldに振り分ける
    train.sort_values(by="price", ignore_index=True, inplace=True)
    train["fold"] = [i for i in range(CFG.n_splits)] * (train.shape[0] // CFG.n_splits) + [i for i in range(train.shape[0] % CFG.n_splits)]
    train.sort_values(by="id", ignore_index=True, inplace=True)
    print(train["fold"].value_counts())
    print("The variance of the mean of the folds: ", train.groupby("fold")["price"].mean().std())
    return train

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [4]:
# ===================================================================
#  Data Loading
# ===================================================================
train = pd.read_csv(CFG.data_dir+"train.csv")
test = pd.read_csv(CFG.data_dir+"test.csv")

region_coor = pd.read_csv(CFG.data_dir+"region_coordinate.csv")
state_coor = pd.read_csv(CFG.data_dir+"state_coordinate.csv")

train["flag"] = "train"
test["flag"] = "test"
all_data = pd.concat([train, test], ignore_index=True)

In [5]:
# ===================================================================
#  feature_engineering
# ===================================================================
def preprocessing(all_data: pd.DataFrame):
    """
    train, testデータで共通の前処理のコード
    
    ・yearの異常値を直す
    ・manufacturerの表記を統一する
    ・sizeの表記を統一する
    ・regionの欠損値をtrain dataの(state, region)の組み合わせから補完する。残った欠損値は調べて補完する。
    ・title_statusとtypeの欠損値処理はとりあえず放置

    Args:
        all_data (pd.DataFrame): pd.concat([train, test], ignore_index=True)
    """
    # year
    year_dict = {
        2999:1999,
        3008:2008,
        3011:2011,
        3015:2015,
        3017:2017,
        3019:2019,
    }
    all_data["year"] = all_data["year"].replace(year_dict)
    
    
    # manufacturer
    all_data["manufacturer"] = all_data["manufacturer"].str.lower().apply(lambda x: unicodedata.normalize('NFKC', x))
    manufacturer_map = {
        'niѕsan':'nissan',
        'nisѕan':'nissan',
        'subαru':'subaru',
        'toyotа':'toyota',
        'sαturn':'saturn',
        'аcura':'acura',
        'vоlkswagen':'volkswagen',
        'lexuѕ':'lexus',
        'ᴄhrysler':'chrysler',
    }
    all_data["manufacturer"] = all_data["manufacturer"].replace(manufacturer_map)
    
    
    # size
    size_dict = {
        "fullーsize":"full-size",
        "midーsize":"mid-size",
        "subーcompact":"sub-compact",
        "full−size":"full-size",
        "mid−size":"mid-size"
    }
    all_data["size"] = all_data["size"].replace(size_dict)
        
    
    # 地域
    ## region -> stateが一意に定まることを確認
    region_state = {region:{} for region in all_data[all_data["flag"]=="train"]['region'].unique()}
    for row, value in all_data[all_data["flag"]=="train"].iterrows():
        if not pd.isna(value['state']):
            if value['state'] not in region_state[value['region']]:
                region_state[value['region']][value['state']] = 1
            else:
                region_state[value['region']][value['state']] += 1
    for region, state_dict in region_state.items():
        if len(state_dict) > 1 or state_dict == {}:
            region_state[region] = pd.NA
        else:
            region_state[region] = list(state_dict.keys())[0]

    ## regionからstateを決定
    all_data['state'] = [region_state[region] if pd.isna(state) else state for region, state in zip(all_data['region'], all_data['state'])]
    all_data.loc[all_data["region"] == "northwest KS", "state"] = "ks"
    all_data.loc[all_data["region"] == "ashtabula", "state"] = "oh"
    all_data.loc[all_data["region"] == "southern WV", "state"] = "wv"
    
    all_data = pd.merge(all_data, region_coor, on="region", how="left")
    all_data = pd.merge(all_data, state_coor, on="state", how="left")
    
    
    # type
    ## 欠損値 train: 456, test: 229
    
    # title_status
    ## 欠損値 train: 456, test: 229
    
    # fuel
    ## 欠損値 train: 1239, test: 1495
    
    
    all_data["elapsed_years"] = 2023 - all_data["year"]
    all_data["log_elapsed_years"] = np.log(all_data["elapsed_years"])
    all_data["sqrt_elapsed_years"] = np.sqrt(all_data["elapsed_years"])
    
    return all_data

all_data = preprocessing(all_data)


In [6]:
# ===================================================================
#  Cross Validation
# ===================================================================
train = all_data[all_data["flag"] == "train"].reset_index(drop=True)
test = all_data[all_data["flag"] == "test"].reset_index(drop=True)

train.sort_values(by="id", ignore_index=True, inplace=True)

# priceを小さい順に各foldに振り分ける
train.sort_values(by="price", ignore_index=True, inplace=True)
train["fold"] = [i for i in range(CFG.n_splits)] * (train.shape[0] // CFG.n_splits) + [i for i in range(train.shape[0] % CFG.n_splits)]
train.sort_values(by="id", ignore_index=True, inplace=True)
print("The variance of the mean of the folds: ", train.groupby("fold")["price"].mean().std())

The variance of the mean of the folds:  4.695908800203918


In [7]:
def preprocessing_per_fold(CFG, train:pd.DataFrame, fold: int = 0):
    """foldごとの前処理: leakageを防ぐ

    Args:
        CFG :config
        train (pd.DataFrame): 学習データ
        test (pd.DataFrame, optional): test data Defaults to None.
        fold (int, optional): Defaults to 0.
        predict (bool, optional): 予測するか否か. Defaults to False.
    """
    X_train = train[train["fold"] != fold].reset_index(drop=True)
    X_valid = train[train["fold"] == fold].reset_index(drop=True)    

    # odometerの補正
    ## odometerが100以下or400000以上を異常値と考えて補完する
    ## year_mapがodometerの分散が大きくなる特徴量だったのでこれを利用してodometerを補完する
    fillna_map = X_train[(X_train["odometer"] > 100)&(X_train["odometer"] < 400000)].groupby(["region"])["odometer"].mean().reset_index()
    
    def replace_odometer(df: pd.DataFrame, fillna_map: pd.DataFrame)-> pd.DataFrame:
        """odometerの異常値をfillna_mapを利用して補完する

        Args:
            df (pd.DataFrame): 補完前のデータ
            fillna_map (pd.DataFrame): 補完するデータ

        Returns:
            pd.DataFrame: 補完後のデータ
        """
        df_1 = df[(df["odometer"] < 100)|(df["odometer"] > 400000)].reset_index(drop=True)
        df_2 = df[(df["odometer"] >= 100)&(df["odometer"] <= 400000)].reset_index(drop=True)
        df_1.drop("odometer", inplace=True, axis=1)
        df_1 = pd.merge(df_1, fillna_map, on="region", how="left")
        df = pd.concat([df_1, df_2])
        return df.sort_values("id", ignore_index=True)
    
    
    X_train = replace_odometer(X_train, fillna_map)
    X_valid = replace_odometer(X_valid, fillna_map)
    X_train["odometer"].fillna(X_train["odometer"].mean(), inplace=True)
    X_valid["odometer"].fillna(X_train["odometer"].mean(), inplace=True)
        

    # 交互作用
    def apply_fe(df: pd.DataFrame) -> pd.DataFrame:
        """foldごとの特徴量作成

        Args:
            df (pd.DataFrame)

        Returns:
            pd.DataFrame: 特徴量作成後のdf
        """
        df["log_odometer"] = np.log(df["odometer"])
        df["sqrt_odometer"] = np.sqrt(df["odometer"])
        
        df["elapsed_years*odometer"] = df["elapsed_years"] * df["odometer"]
        df["elapsed_years*log_odometer"] = df["elapsed_years"] * df["log_odometer"]
        df["elapsed_years*sqrt_odometer"] = df["elapsed_years"] * df["sqrt_odometer"]
        
        df["log_elapsed_years*odometer"] = df["log_elapsed_years"] * df["odometer"]
        df["log_elapsed_years*log_odometer"] = df["log_elapsed_years"] * df["log_odometer"]
        df["log_elapsed_years*sqrt_odometer"] = df["log_elapsed_years"] * df["sqrt_odometer"]
        
        df["sqrt_elapsed_years*odometer"] = df["sqrt_elapsed_years"] * df["odometer"]
        df["sqrt_elapsed_years*log_odometer"] = df["sqrt_elapsed_years"] * df["log_odometer"]
        df["sqrt_elapsed_years*sqrt_odometer"] = df["sqrt_elapsed_years"] * df["sqrt_odometer"]
        return df
    X_train = apply_fe(X_train)
    X_valid = apply_fe(X_valid)
        
        
    # カウントエンコーディング
    for col in CFG.categorical_features:
        count_map = X_train[col].value_counts().to_dict()
        X_train[col+"_count_encoding"] = X_train[col].map(count_map)
        X_valid[col+"_count_encoding"] = X_valid[col].map(count_map)
        if fold == 0:
            CFG.candidates.append(col+"_count_encoding")
        
        
    # 集約特徴量を用いたエンコーディング
    for col in CFG.categorical_features:
        for agg_ in ["mean", "std", "max", "min", "median"]:
            fillna_map = X_train.groupby(col)["price"].agg(agg_)
            X_train[col+f"_{agg_}_encoding"] = X_train[col].map(fillna_map)
            X_valid[col+f"_{agg_}_encoding"] = X_valid[col].map(fillna_map)
            if fold == 0:
                CFG.candidates.append(col+f"_{agg_}_encoding")
                
    # 集約特徴量を用いたエンコーディング
    for col in CFG.categorical_features:
        for agg_ in ["mean", "std", "max", "min", "median"]:
            fillna_map = X_train.groupby(col)["odometer"].agg(agg_)
            X_train[col+f"_{agg_}_odometer_encoding"] = X_train[col].map(fillna_map)
            X_valid[col+f"_{agg_}_odometer_encoding"] = X_valid[col].map(fillna_map)
            if fold == 0:
                CFG.candidates.append(col+f"_{agg_}_odometer_encoding")
            if agg_ == "median" or agg_ == "mean":
                X_train[col+f"_{agg_}_odometer_encoding_diff"] = X_train[col+f"_{agg_}_odometer_encoding"] - X_train["odometer"]
                X_valid[col+f"_{agg_}_odometer_encoding_diff"] = X_valid[col+f"_{agg_}_odometer_encoding"] - X_valid["odometer"]
                if fold == 0:
                    CFG.candidates.append(col+f"_{agg_}_odometer_encoding_diff")
                    
                    
    # 集約特徴量を用いたエンコーディング
    for col in CFG.categorical_features:
        for agg_ in ["mean", "std", "max", "min", "median"]:
            fillna_map = X_train.groupby(col)["elapsed_years"].agg(agg_)
            X_train[col+f"_{agg_}_elapsed_years_encoding"] = X_train[col].map(fillna_map)
            X_valid[col+f"_{agg_}_elapsed_years_encoding"] = X_valid[col].map(fillna_map)
            if fold == 0:
                CFG.candidates.append(col+f"_{agg_}_elapsed_years_encoding")
            if agg_ == "median" or agg_ == "mean":
                X_train[col+f"_{agg_}_elapsed_years_encoding_diff"] = X_train[col+f"_{agg_}_elapsed_years_encoding"] - X_train["elapsed_years"]
                X_valid[col+f"_{agg_}_elapsed_years_encoding_diff"] = X_valid[col+f"_{agg_}_elapsed_years_encoding"] - X_valid["elapsed_years"]
                if fold == 0:
                    CFG.candidates.append(col+f"_{agg_}_elapsed_years_encoding_diff")

                
    """
    # target encodingしたやつらの交互作用
    te = [col for col in X_train.columns if "_mean_" in col and "_diff" not in col] 
    for i, col1 in enumerate(tqdm(te)):
        for col2 in te[i:]:
            X_train[f"{col1}*{col2}"] = X_train[col1] * X_train[col2]
            X_valid[f"{col1}*{col2}"] = X_valid[col1] * X_valid[col2]
            if fold == 0:
                CFG.candidates.append(f"{col1}*{col2}")
    """

            
    # OrdinalEncoder: これはfoldごとではなくともよい
    oe = OrdinalEncoder(categories="auto",
                        handle_unknown="use_encoded_value",
                        unknown_value=999,
                        encoded_missing_value=np.nan, # QUESTION: 欠損値は-1に変換する -> NaNに??
                        )
    CFG.categorical_features_ = [feature + "_category" for feature in CFG.categorical_features]
    X_train[CFG.categorical_features_] = oe.fit_transform(X_train[CFG.categorical_features].values)
    X_valid[CFG.categorical_features_] = oe.transform(X_valid[CFG.categorical_features].values)
    return X_train, X_valid

In [8]:
# ===================================================================
#  preprocessing_per_fold
# ===================================================================
X_train0, X_valid0 = preprocessing_per_fold(CFG, train, 0)
X_train1, X_valid1 = preprocessing_per_fold(CFG, train, 1)
X_train2, X_valid2 = preprocessing_per_fold(CFG, train, 2)
X_train3, X_valid3 = preprocessing_per_fold(CFG, train, 3)
X_train4, X_valid4 = preprocessing_per_fold(CFG, train, 4)

In [9]:
# ===================================================================
#  check
# ===================================================================
len(CFG.candidates), len(set(CFG.candidates))

(270, 270)

In [10]:
# ===================================================================
#  evaluate
# ===================================================================
def train_lgb(CFG, lgb_param):
    oof_df = pd.DataFrame()
    for X_train, X_valid in zip([X_train0, X_train1, X_train2, X_train3, X_train4], [X_valid0, X_valid1, X_valid2, X_valid3, X_valid4]):
        # train
        categorical_features = [col for col in CFG.use_features if "_category" in col]
        lgb_train = lgb.Dataset(X_train[CFG.use_features], X_train["price"], categorical_feature = categorical_features,)
        lgb_valid = lgb.Dataset(X_valid[CFG.use_features], X_valid["price"], categorical_feature = categorical_features,)
        model = lgb.train(
                        lgb_param, 
                        lgb_train, 
                        valid_sets=[lgb_valid],
                        categorical_feature = categorical_features,
                        callbacks=[lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=False),],
                        )

        X_valid[f"pred"] = model.predict(X_valid[CFG.use_features], num_iteration=model.best_iteration)
        oof_df = pd.concat([oof_df, X_valid], ignore_index=True)
    score = get_score(oof_df["price"], oof_df["pred"])
    return score

In [11]:
# ===================================================================
#  simple greedy forward selection
# ===================================================================
lgb_param = {
    "task":"train",
    "objective": "mape",
    "boosting":"gbdt",
    "num_boost_round": CFG.num_boost_round,
    "learning_rate":0.1, # default: 0.1
    "num_leaves":31, # max number of leaves in one tree ###
    "max_depth":6, # default -1, int: limit the max depth for tree model ##
    "min_child_weight":1e-3, # double: minimal sum hessian in one leaf
    "min_data_in_leaf":20, # minimal number of data in one leaf
    "alpha":0.9, # double, constraints, alpha > 0.0: 
    "colsample_bytree":0.4, # 0 < "colsample_bytree" < 1
    #: LightGBM will randomly select a subset of features on each iteration (tree) if feature_fraction is smaller than 1.0
    "lambda": 0, #lambda_l2 >= 0.0: L2 regularization
    "subsample":1, #0.0 < bagging_fraction <= 1.0
    "num_threads": CFG.num_cores,
    "metric": 'mape',
    "seed" : CFG.seed,
    "verbosity": -1, 
}

print('\033[32m'+"start simple selection"+'\033[0m')
best_score = train_lgb(CFG, lgb_param)
print(f"[{0}/{len(CFG.candidates)}]Base Score: {best_score:.3f}")

start = time.time()
CFG.candidates = np.random.RandomState(CFG.greedy_seed).permutation(CFG.candidates)
for i, col in enumerate(CFG.candidates):
    CFG.use_features.append(col)
    score = train_lgb(CFG, lgb_param)
    if best_score > score:
        print('[{0}/{1}]'
              'Elapsed {remain:s} '
              'Score: {best_score:.4f} -> {score:.4f} '
              '&   selected: {feature}'
              .format(i+1, len(CFG.candidates),
                      remain=timeSince(start, float(i+1)/len(CFG.candidates) ),
                      best_score = best_score,
                      score = score,
                      feature = col
                     )
             )
        best_score = score
    else:
        print('[{0}/{1}]'
              'Elapsed {remain:s} '
              'Score: {best_score:.4f} -> {score:.4f} '
              '& unselected: {feature}'
              .format(i+1, len(CFG.candidates),
                      remain=timeSince(start, float(i+1)/len(CFG.candidates) ),
                      best_score = best_score,
                      score = score,
                      feature = col
                     )
             )
        CFG.use_features.remove(col)

[32mstart simple selection[0m
[0/270]Base Score: 49.505
[1/270]Elapsed 1m 11s (remain 318m 42s) Score: 49.5053 -> 49.8388 & unselected: paint_color_mean_odometer_encoding_diff
[2/270]Elapsed 2m 18s (remain 308m 21s) Score: 49.5053 -> 48.7136 &   selected: fuel_std_elapsed_years_encoding
[3/270]Elapsed 3m 38s (remain 324m 50s) Score: 48.7136 -> 46.8724 &   selected: type_median_encoding
[4/270]Elapsed 5m 0s (remain 333m 26s) Score: 46.8724 -> 46.6551 &   selected: fuel_median_odometer_encoding_diff
[5/270]Elapsed 6m 21s (remain 337m 21s) Score: 46.6551 -> 46.1967 &   selected: drive_median_odometer_encoding
[6/270]Elapsed 7m 49s (remain 344m 18s) Score: 46.1967 -> 45.7932 &   selected: type_median_odometer_encoding
[7/270]Elapsed 9m 14s (remain 347m 23s) Score: 45.7932 -> 45.7415 &   selected: size_min_odometer_encoding
[8/270]Elapsed 10m 37s (remain 348m 4s) Score: 45.7415 -> 45.7337 &   selected: size_category
[9/270]Elapsed 12m 15s (remain 355m 22s) Score: 45.7337 -> 45.9527 & unse

In [12]:
# ===================================================================
#  simple greedy forward selection
# ===================================================================
lgb_param = {
    "task":"train",
    "objective": "mape",
    "boosting":"gbdt",
    "num_boost_round": CFG.num_boost_round,
    "learning_rate":0.1, # default: 0.1
    "num_leaves":31, # max number of leaves in one tree ###
    "max_depth":6, # default -1, int: limit the max depth for tree model ###
    "min_child_weight":1e-3, # double: minimal sum hessian in one leaf
    "min_data_in_leaf":20, # minimal number of data in one leaf
    "alpha":0.9, # double, constraints, alpha > 0.0: 
    "colsample_bytree":0.4, # 0 < "colsample_bytree" < 1 ###
    #: LightGBM will randomly select a subset of features on each iteration (tree) if feature_fraction is smaller than 1.0
    "lambda": 0, #lambda_l2 >= 0.0: L2 regularization
    "subsample":1, #0.0 < bagging_fraction <= 1.0
    "num_threads": CFG.num_cores,
    "metric": 'mape',
    "seed" : CFG.seed,
    "verbosity": -1, 
}

print('\033[32m'+"start simple selection"+'\033[0m')
best_score = train_lgb(CFG, lgb_param)
print(f"[{0}/{len(CFG.candidates)}]Base Score: {best_score:.3f}")

CFG.greedy_seed += 10000
start = time.time()
CFG.candidates = np.random.RandomState(CFG.greedy_seed).permutation(CFG.candidates)
for i, col in enumerate(CFG.candidates):
    CFG.use_features.append(col)
    score = train_lgb(CFG, lgb_param)
    if best_score > score:
        print('[{0}/{1}]'
              'Elapsed {remain:s} '
              'Score: {best_score:.4f} -> {score:.4f} '
              '&   selected: {feature}'
              .format(i+1, len(CFG.candidates),
                      remain=timeSince(start, float(i+1)/len(CFG.candidates) ),
                      best_score = best_score,
                      score = score,
                      feature = col
                     )
             )
        best_score = score
    else:
        print('[{0}/{1}]'
              'Elapsed {remain:s} '
              'Score: {best_score:.4f} -> {score:.4f} '
              '& unselected: {feature}'
              .format(i+1, len(CFG.candidates),
                      remain=timeSince(start, float(i+1)/len(CFG.candidates) ),
                      best_score = best_score,
                      score = score,
                      feature = col
                     )
             )
        CFG.use_features.remove(col)

[32mstart simple selection[0m
[0/270]Base Score: 44.281
[1/270]Elapsed 1m 37s (remain 438m 15s) Score: 44.2813 -> 44.3817 & unselected: transmission_median_elapsed_years_encoding
[2/270]Elapsed 3m 22s (remain 451m 50s) Score: 44.2813 -> 44.3550 & unselected: cylinders_std_encoding
[3/270]Elapsed 5m 8s (remain 458m 13s) Score: 44.2813 -> 44.3414 & unselected: type_category
[4/270]Elapsed 6m 50s (remain 454m 42s) Score: 44.2813 -> 44.3994 & unselected: type_mean_elapsed_years_encoding_diff
[5/270]Elapsed 8m 30s (remain 450m 36s) Score: 44.2813 -> 44.3533 & unselected: size_std_odometer_encoding
[6/270]Elapsed 10m 18s (remain 453m 12s) Score: 44.2813 -> 44.3523 & unselected: title_status_std_elapsed_years_encoding
[7/270]Elapsed 12m 5s (remain 454m 17s) Score: 44.2813 -> 44.4035 & unselected: paint_color_mean_elapsed_years_encoding_diff
[8/270]Elapsed 13m 51s (remain 453m 55s) Score: 44.2813 -> 44.3718 & unselected: manufacturer_median_elapsed_years_encoding_diff
[9/270]Elapsed 15m 35s 

[LightGBM] [Fatal] Feature (cylinders_mean_odometer_encoding) appears more than one time.


LightGBMError: Feature (cylinders_mean_odometer_encoding) appears more than one time.