In [1]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_percentage_error
from typing import List


import multiprocessing
NUM_CORES = multiprocessing.cpu_count()

import warnings
warnings.simplefilter("ignore")

from tqdm.auto import tqdm
import optuna
import unicodedata
import lightgbm as lgb

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Notebook-wide settings; later cells attach further attributes (feature lists) to this class."""
    # Seed handed to seed_everything(), StratifiedKFold and the LightGBM params.
    seed = 42
    # Seed of the RandomState that shuffles the candidate features for greedy selection.
    random_greedy_seed = 42
    # Number of stratified cross-validation folds.
    n_splits = 5
    # Directory prefix that "train.csv" / "test.csv" are read from (string-concatenated, so keep the trailing slash).
    data_dir =  "G:/マイドライブ/signate_StudentCup2023/data/"
    # Number of equal-width price bins used as the stratification label.
    target_bins = 20
    # presumably the number of equal-width bins for `year` -- TODO confirm where it is consumed
    year_bins = 20
    # Boosting rounds passed to LightGBM.
    num_boost_round = 10
    # Patience (in rounds) of the LightGBM early-stopping callback.
    stopping_rounds = 1
    # Not referenced in the visible cells; presumably an Optuna trial budget -- TODO confirm.
    n_trials = 1500
    # Not referenced in the visible cells; presumably where artefacts are written -- TODO confirm.
    save_dir =  "G:/マイドライブ/signate_StudentCup2023/data/"

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """Seed Python's and NumPy's global random generators with ``seed``.

    ``PYTHONHASHSEED`` is also written to the process environment. NOTE(review):
    the interpreter reads that variable at start-up, so writing it here
    presumably only reaches child processes -- confirm if hash order matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Fix the global RNG state once, using the seed configured in CFG.
seed_everything(CFG.seed)
    

def get_score(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` scaled by 100 (i.e. in percent)."""
    return mean_absolute_percentage_error(y_true, y_pred) * 100

In [4]:
# ===================================================================
#  Data Loading
# ===================================================================
# Read both competition files from CFG.data_dir.
train = pd.read_csv(f"{CFG.data_dir}train.csv")
test = pd.read_csv(f"{CFG.data_dir}test.csv")

# Tag every row with its origin so the frames can be preprocessed together
# and separated again afterwards.
for frame, origin in ((train, "train"), (test, "test")):
    frame["flag"] = origin
all_data = pd.concat((train, test), ignore_index=True)

In [5]:
# ===================================================================
#  preprocessing
# ===================================================================
# year
def pre_year(df: pd.DataFrame):
    """Repair ``year`` values that are exactly 1000 years too large.

    Only the listed values are rewritten (each to itself minus 1000); every
    other value is left as is. The column is overwritten on ``df`` itself and
    the same frame is returned.
    """
    shifted_years = (2999, 3008, 3011, 3015, 3017, 3019)
    corrections = {wrong: wrong - 1000 for wrong in shifted_years}
    df["year"] = df["year"].replace(corrections)
    return df
all_data = pre_year(all_data)

# manufacturer
# Lower-case the names, then apply Unicode NFKC normalisation. The vectorised
# `.str.normalize` does the same per-element work as calling
# `unicodedata.normalize`, but passes missing values through as NaN instead of
# raising TypeError on them.
all_data["manufacturer"] = all_data["manufacturer"].str.lower().str.normalize("NFKC")
# Spellings that still differ from the plain name after normalisation: the keys
# contain non-ASCII look-alike characters and are mapped to the intended name.
manufacturer_map = {
    'niѕsan':'nissan',
    'nisѕan':'nissan',
    'subαru':'subaru',
    'toyotа':'toyota',
    'sαturn':'saturn',
    'аcura':'acura',
    'vоlkswagen':'volkswagen',
    'lexuѕ':'lexus',
    'ᴄhrysler':'chrysler',
}
all_data["manufacturer"] = all_data["manufacturer"].replace(manufacturer_map)

# size
def pre_size(df: pd.DataFrame):
    """Rewrite ``size`` labels spelled with a non-ASCII dash to the hyphenated form.

    Labels not listed in the mapping are left unchanged. The column is
    overwritten on ``df`` itself and the same frame is returned.
    """
    dash_variants = {
        "fullーsize": "full-size",
        "midーsize": "mid-size",
        "subーcompact": "sub-compact",
        "full−size": "full-size",
        "mid−size": "mid-size",
    }
    cleaned = df["size"].replace(dash_variants)
    df["size"] = cleaned
    return df
all_data = pre_size(all_data)

# title_status
## Missing values -- train: 456, test: 229
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `all_data["title_status"]` selection: the in-place form is chained
# assignment, which pandas deprecates and which leaves `all_data` untouched
# once Copy-on-Write is active.
all_data["title_status"] = all_data["title_status"].fillna("Unknown")
# Alternative that was tried: fill with the "clean" status instead.
#all_data["title_status"] = all_data["title_status"].fillna("clean")


# region -> state: find the regions whose state is uniquely determined.
# Every region seen in train starts as "unknown" (pd.NA); it receives a state
# only when its non-missing rows all agree on exactly one state. Regions with
# several states, or with no state information at all, stay pd.NA.
region_state = {region: pd.NA for region in train['region'].unique()}
states_by_region = (
    train.dropna(subset=['state'])
         .groupby('region')['state']
         .unique()
)
for region, states in states_by_region.items():
    if len(states) == 1:
        region_state[region] = states[0]

# Fill missing states from the region. `.get` keeps a region that never
# appears in train (possible for test rows) as missing instead of raising
# KeyError.
all_data['state'] = [
    region_state.get(region, pd.NA) if pd.isna(state) else state
    for region, state in zip(all_data['region'], all_data['state'])
]
# Manual assignments for three specific regions.
all_data.loc[all_data["region"] == "northwest KS", "state"] = "ks"
all_data.loc[all_data["region"] == "ashtabula", "state"] = "oh"
all_data.loc[all_data["region"] == "southern WV", "state"] = "wv"


# type
## Missing values -- train: 456, test: 229
# Assign the result back rather than using fillna(inplace=True) on the column
# selection; the in-place form is chained assignment and does not update
# `all_data` once pandas Copy-on-Write is active.
all_data["type"] = all_data["type"].fillna("Unknown")

In [6]:
# Distinct `fuel` values (NaN included) across train and test.
all_data["fuel"].unique()

array(['gas', nan, 'diesel', 'other', 'hybrid', 'electric'], dtype=object)

In [7]:
# Null counts per column, shown separately for the train rows and the test rows.
all_data[all_data["flag"] == "train"].isnull().sum(), all_data[all_data["flag"] == "test"].isnull().sum()

(id                 0
 region             0
 year               0
 manufacturer       0
 condition          0
 cylinders          0
 fuel            1239
 odometer           0
 title_status       0
 transmission       0
 drive              0
 size               0
 type               0
 paint_color        0
 state              0
 price              0
 flag               0
 dtype: int64,
 id                  0
 region              0
 year                0
 manufacturer        0
 condition           0
 cylinders           0
 fuel             1495
 odometer            0
 title_status        0
 transmission        0
 drive               0
 size                0
 type                0
 paint_color         0
 state               0
 price           27537
 flag                0
 dtype: int64)

In [8]:
# ===================================================================
#  Cross Validation
# ===================================================================
# Split the preprocessed frame back into its train and test parts.
train = all_data[all_data["flag"] == "train"].reset_index(drop=True)
test = all_data[all_data["flag"] == "test"].reset_index(drop=True)

# Order by id with a fresh 0..n-1 index; the fold loop below writes through
# positional indices with .loc, which relies on that index.
train = train.sort_values(by="id", ignore_index=True)

# Equal-width year bins: the edges are learned on train and reused for test.
# The bin count comes from CFG.year_bins rather than a repeated literal.
# pd.cut yields NaN for test years outside the train edges.
train["year_map"], bins = pd.cut(train["year"], bins=CFG.year_bins, labels=False, retbins=True)
test["year_map"] = pd.cut(test["year"], bins=bins, labels=False)
# Binned price is the stratification label, so each fold gets a similar price mix.
train["price_map"] = pd.cut(train["price"], bins=CFG.target_bins, labels=False)

skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
for i, (_, val) in enumerate(skf.split(X=train, y=train["price_map"])):
    train.loc[val, "fold"] = i
# Sanity checks: fold sizes, and the spread of the mean price across folds.
print(train["fold"].value_counts())
print(train.groupby("fold")["price"].mean().std())

fold
1.0    5507
0.0    5507
2.0    5506
4.0    5506
3.0    5506
Name: count, dtype: int64
8.784566908616718


In [9]:
# ===================================================================
#  use features
# =================================================================== 
## numerical: always part of the model input
CFG.base_features = ["year", "odometer"]

## category: raw categorical columns that the encodings are derived from
CFG.categorical_features = [
    "fuel",
    "title_status",
    "type",
    "state",
    "region",
    "manufacturer",
    "condition",
    "cylinders",
    "transmission",
    "drive",
    "size",
    "paint_color",
]

## use_features: starts empty and is filled by the encoding cells
CFG.use_features = list()

In [10]:
# Hold out one fold for validation and train on the rest.
fold = 0
X_train = train[train["fold"] != fold].reset_index(drop=True)
X_valid = train[train["fold"] == fold].reset_index(drop=True)
test_df = test.copy()

# Odometer correction
# Mean odometer per year bin, computed on the training fold only. The mean is
# taken over the same closed range [100, 400000] that replace_odometer() keeps
# as valid, so readings of exactly 100 or 400000 are no longer excluded here
# while being accepted there.
plausible_odometer = X_train["odometer"].between(100, 400000)
fillna_map = X_train[plausible_odometer].groupby(["year_map"])["odometer"].mean().reset_index()
def replace_odometer(df: pd.DataFrame, fillna_map: pd.DataFrame):
    """Replace implausible odometer readings with a per-``year_map`` value.

    A reading is kept only when it lies in the closed range [100, 400000].
    Every other reading -- too small, too large, or missing -- is replaced by
    the ``odometer`` value that ``fillna_map`` lists for the row's
    ``year_map``. Rows whose ``year_map`` has no entry in ``fillna_map`` end
    up with NaN.

    A missing reading fails every ``<``/``>``/``>=``/``<=`` comparison, so
    partitioning the frame with such masks would drop those rows; the single
    ``between`` mask used here keeps every input row.

    Args:
        df: frame with ``id``, ``year_map`` and ``odometer`` columns. It is
            not modified.
        fillna_map: frame with one row per ``year_map`` holding the
            replacement ``odometer`` value.

    Returns:
        A new frame with the same rows and columns as ``df``, sorted by
        ``id`` and carrying a fresh 0..n-1 index.
    """
    replacement_by_bin = fillna_map.set_index("year_map")["odometer"]
    plausible = df["odometer"].between(100, 400000)
    corrected = df.copy()
    # where() keeps plausible readings and takes the mapped value elsewhere.
    corrected["odometer"] = df["odometer"].where(plausible, df["year_map"].map(replacement_by_bin))
    return corrected.sort_values("id", ignore_index=True)
X_train = replace_odometer(X_train, fillna_map)
X_valid = replace_odometer(X_valid, fillna_map)
test_df = replace_odometer(test_df, fillna_map)


def _register_feature(name: str) -> None:
    """Add ``name`` to CFG.use_features unless it is already listed.

    Keeps the feature list free of duplicates when this cell is executed more
    than once in the same kernel session.
    """
    if name not in CFG.use_features:
        CFG.use_features.append(name)


# Count encoding: category frequency measured on the training fold.
# value_counts() skips NaN by default, so rows whose category is missing (and
# categories unseen in the training fold) receive NaN here.
for col in CFG.categorical_features:
    count_map = X_train[col].value_counts().to_dict()
    feature_name = col + "_count_encoding"
    for frame in (X_train, X_valid, test_df):
        frame[feature_name] = frame[col].map(count_map)
    _register_feature(feature_name)

# Target statistics per category, computed from the training fold's price.
# NOTE(review): X_train rows are encoded with statistics that include their
# own price -- confirm that this is intended.
for col in CFG.categorical_features:
    for agg_ in ["mean", "std", "max", "min", "median"]:
        # Named stat_map so the odometer `fillna_map` above is not overwritten.
        stat_map = X_train.groupby(col)["price"].agg(agg_)
        feature_name = col + f"_{agg_}_encoding"
        for frame in (X_train, X_valid, test_df):
            frame[feature_name] = frame[col].map(stat_map)
        _register_feature(feature_name)

In [11]:
# Integer-code every categorical column into a parallel "<name>_category"
# column. The encoder is fitted on the training fold and then applied
# unchanged to the validation fold and the test set.
CFG.categorical_features_ = [f"{feature}_category" for feature in CFG.categorical_features]
oe = OrdinalEncoder(
    categories="auto",
    handle_unknown="use_encoded_value",
    unknown_value=-2,  # categories not seen during fit are encoded as -2
    encoded_missing_value=-1,  # missing values are encoded as -1
)
X_train[CFG.categorical_features_] = oe.fit_transform(X_train[CFG.categorical_features].values)
for frame in (X_valid, test_df):
    frame[CFG.categorical_features_] = oe.transform(frame[CFG.categorical_features].values)

In [12]:
# Add the ordinal-encoded columns to the candidate list. dict.fromkeys drops
# repeated names while keeping first-seen order, so executing this cell more
# than once does not list any feature twice.
CFG.use_features = list(dict.fromkeys(CFG.use_features + CFG.categorical_features_))

In [13]:
# Display the candidate feature list assembled by the cells above.
CFG.use_features

['fuel_count_encoding',
 'title_status_count_encoding',
 'type_count_encoding',
 'state_count_encoding',
 'region_count_encoding',
 'manufacturer_count_encoding',
 'condition_count_encoding',
 'cylinders_count_encoding',
 'transmission_count_encoding',
 'drive_count_encoding',
 'size_count_encoding',
 'paint_color_count_encoding',
 'fuel_mean_encoding',
 'fuel_std_encoding',
 'fuel_max_encoding',
 'fuel_min_encoding',
 'fuel_median_encoding',
 'title_status_mean_encoding',
 'title_status_std_encoding',
 'title_status_max_encoding',
 'title_status_min_encoding',
 'title_status_median_encoding',
 'type_mean_encoding',
 'type_std_encoding',
 'type_max_encoding',
 'type_min_encoding',
 'type_median_encoding',
 'state_mean_encoding',
 'state_std_encoding',
 'state_max_encoding',
 'state_min_encoding',
 'state_median_encoding',
 'region_mean_encoding',
 'region_std_encoding',
 'region_max_encoding',
 'region_min_encoding',
 'region_median_encoding',
 'manufacturer_mean_encoding',
 'manufactu

In [25]:
# Training-fold rows whose fuel count encoding is NaN.
# presumably these are the rows where `fuel` itself is missing, since the count
# map is built with value_counts(), which skips NaN by default -- confirm.
X_train[X_train["fuel_count_encoding"].isnull()]["fuel_count_encoding"]

10      NaN
16      NaN
17      NaN
27      NaN
55      NaN
         ..
21984   NaN
21985   NaN
21986   NaN
22001   NaN
22006   NaN
Name: fuel_count_encoding, Length: 992, dtype: float64

In [133]:
# LightGBM parameters shared by every evaluate() call.
lgb_param = {
    "task":"train",
    "objective": "mape",
    "boosting":"gbdt",
    "num_boost_round": CFG.num_boost_round, # the same value is also passed to lgb.train() explicitly
    "learning_rate":0.1, # default: 0.1
    "num_leaves":31, # maximum number of leaves in one tree
    "max_depth":4, # default -1 (no limit); caps the depth of each tree
    "min_child_weight":1e-3, # double: minimal sum of hessians in one leaf
    "min_data_in_leaf":20, # minimal number of rows in one leaf
    "alpha":0.9, # double, constraint: alpha > 0.0. NOTE(review): LightGBM documents `alpha` as the huber/quantile loss parameter, so it is presumably unused with objective "mape" -- confirm
    "colsample_bytree":1.0, # 0 < colsample_bytree <= 1
    # LightGBM randomly selects a subset of features for each tree when feature_fraction is smaller than 1.0
    "lambda": 0, # lambda_l2 >= 0.0: L2 regularization
    "subsample":1, # 0.0 < bagging_fraction <= 1.0
    "num_threads": NUM_CORES,
    "metric": 'mape',
    "seed" : CFG.seed,
    "verbosity": -1,   
    "use_missing":False, # do not use missing-value information
}

In [134]:
def evaluate(fs: List[str]):
    """Train LightGBM on the feature subset ``fs`` and score it on the validation fold.

    Columns whose name contains "_category" are declared categorical. Training
    reads the module-level ``X_train`` / ``X_valid`` frames and ``lgb_param``,
    and early-stops on the validation fold. The predictions are written to
    ``X_valid["pred"]`` as a side effect.

    Args:
        fs: names of the columns to train on.

    Returns:
        The validation score from ``get_score``.
    """
    cat_cols = [name for name in fs if "_category" in name]

    def as_dataset(frame):
        # Same feature subset and categorical declaration for both folds.
        return lgb.Dataset(frame[fs], frame["price"], categorical_feature=cat_cols)

    train_set = as_dataset(X_train)
    valid_set = as_dataset(X_valid)
    stopper = lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=False)
    booster = lgb.train(
        lgb_param,
        train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[valid_set],
        categorical_feature=cat_cols,
        callbacks=[stopper],
    )

    # Predict with the best iteration found by early stopping.
    X_valid["pred"] = booster.predict(X_valid[fs], num_iteration=booster.best_iteration)
    return get_score(y_true=X_valid["price"], y_pred=X_valid["pred"])

In [135]:
# Greedy forward selection: visit the candidates once, in a seeded random
# order, and keep a feature only if it lowers the validation score.
candidates = np.random.RandomState(CFG.random_greedy_seed).permutation(CFG.use_features)
# Accepted features are kept in a list, not a set. Iterating a set of strings
# follows their hash order, which is fixed at interpreter start-up, so the
# column order handed to evaluate() could differ between kernel sessions. A
# list keeps the order of acceptance.
selected = []

base_features = CFG.base_features
print('\033[32m'+"start simple selection"+'\033[0m')
print("base features: ", base_features)
best_score = evaluate(base_features)
print(f"[{0}/{len(candidates)}]Base Score: {best_score:.3f}")

for i, feature in enumerate(candidates):
    feature = str(feature)  # permutation() yields NumPy strings; keep plain str
    fs = selected + [feature] + base_features
    score = evaluate(fs)
    if score < best_score:
        selected.append(feature)
        print(f"[{i+1}/{len(candidates)}]Score: {best_score:.3f} -> {score:.3f}"+f' &   selected: {feature}')
        best_score = score
    else:
        print(f"[{i+1}/{len(candidates)}]Score: {best_score:.3f} -> {score:.3f}"+f' & unselected: {feature}') 

print('\033[32m'+f'selected features: {selected}'+'\033[0m')
print('\033[32m'+f"best_score: {best_score}"+'\033[0m')
#https://github.com/ghmagazine/kagglebook/blob/3d8509d1c1b41a765e3f4744ba1fb226188e2b15/ch06/ch06-06-wrapper.py

[32mstart simple selection[0m
base features:  ['year', 'odometer']
[0/84]Base Score: 50.512
[1/84]Score: 50.512 -> 50.495 &   selected: title_status_category
[2/84]Score: 50.495 -> 50.379 &   selected: fuel_count_encoding
[3/84]Score: 50.379 -> 49.243 &   selected: drive_std_encoding
[4/84]Score: 49.243 -> 49.057 &   selected: type_mean_encoding
[5/84]Score: 49.057 -> 49.138 & unselected: fuel_mean_encoding
[6/84]Score: 49.057 -> 49.103 & unselected: paint_color_min_encoding
[7/84]Score: 49.057 -> 49.041 &   selected: size_count_encoding
[8/84]Score: 49.041 -> 49.035 &   selected: title_status_std_encoding
[9/84]Score: 49.035 -> 49.035 & unselected: region_count_encoding
[10/84]Score: 49.035 -> 49.075 & unselected: size_min_encoding
[11/84]Score: 49.035 -> 49.034 &   selected: manufacturer_min_encoding
[12/84]Score: 49.034 -> 49.034 & unselected: cylinders_mean_encoding
[13/84]Score: 49.034 -> 49.270 & unselected: region_min_encoding
[14/84]Score: 49.034 -> 49.034 & unselected: drive

fold0: 44.40705827859202  
fold1: 44.80749453396233  
fold2: 43.79303708762999  
fold3: 44.12593310955919  
fold4: 43.8818079259226  
====== CV score ======  
44.20309555007229