In [1]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd

from math import comb
from tqdm.auto import tqdm
from itertools import combinations

from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import unicodedata
import lightgbm as lgb

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Experiment-wide settings read by the other cells of this notebook."""

    # --- experiment identity / reproducibility ---
    filename = "exp042"
    seed = 42

    # --- storage locations ---
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"

    # --- cross validation / binning ---
    n_splits = 8
    year_bins = 20

    # --- training budget ---
    num_boost_round = 10000
    stopping_rounds = 100
    n_trials = 300
    num_cores = 4  # kept identical to the Kaggle environment

    # --- columns ---
    categorical_features = [
        "fuel",
        "title_status",
        "type",
        "state",
        "region",
        "manufacturer",
        "condition",
        "cylinders",
        "transmission",
        "drive",
        "size",
        "paint_color",
    ]
    use_features = ["odometer", "year"]

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: value handed to every seeding call.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Seed the random sources once with the configured seed.
seed_everything(CFG.seed)
    

def get_score(y_true, y_pred):
    """Return the mean absolute percentage error, scaled to percent.

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values.

    Returns:
        ``mean_absolute_percentage_error(y_true, y_pred)`` multiplied by 100.
    """
    return 100 * mean_absolute_percentage_error(y_true, y_pred)

In [4]:
# ===================================================================
#  Data Loading
# ===================================================================
# Competition tables, all read from CFG.data_dir.
train = pd.read_csv(f"{CFG.data_dir}train.csv")
test = pd.read_csv(f"{CFG.data_dir}test.csv")

# Coordinate lookup tables (joined on "region" / "state" during preprocessing).
region_coor = pd.read_csv(f"{CFG.data_dir}region_coordinate.csv")
state_coor = pd.read_csv(f"{CFG.data_dir}state_coordinate.csv")

# Tag every row with its origin so the stacked frame can be split again later.
train = train.assign(flag="train")
test = test.assign(flag="test")
all_data = pd.concat((train, test), ignore_index=True)

In [5]:
# ===================================================================
#  feature_engineering
# ===================================================================
def preprocessing(all_data: pd.DataFrame, region_coords=None, state_coords=None) -> pd.DataFrame:
    """Preprocessing shared by the train and test data.

    Steps, in order:
      * repair the mistyped ``year`` values (e.g. 2999 -> 1999)
      * unify the spelling of ``manufacturer`` (lower case, NFKC, look-alike letters)
      * unify the spelling of ``size``
      * fill missing ``state`` values from ``region``: a region is used only when
        the train rows pair it with exactly one state; three regions that cannot
        be resolved this way are assigned by hand
      * join the region / state coordinate tables
      * add ``elapsed_years`` and its log / square root
    Missing ``type``, ``title_status`` and ``fuel`` values are left as they are.

    The input frame is not modified; a new frame is returned.

    Args:
        all_data (pd.DataFrame): ``pd.concat([train, test], ignore_index=True)``
            with a ``flag`` column holding "train" / "test".
        region_coords (pd.DataFrame, optional): table joined on ``region``.
            Defaults to the notebook-level ``region_coor``.
        state_coords (pd.DataFrame, optional): table joined on ``state``.
            Defaults to the notebook-level ``state_coor``.

    Returns:
        pd.DataFrame: the preprocessed copy of ``all_data``.
    """
    # Fall back to the frames loaded in the data-loading cell when none are given.
    if region_coords is None:
        region_coords = region_coor
    if state_coords is None:
        state_coords = state_coor

    # Work on a copy so the caller's frame is never changed behind its back.
    all_data = all_data.copy()

    # year
    year_dict = {
        2999:1999,
        3008:2008,
        3011:2011,
        3015:2015,
        3017:2017,
        3019:2019,
    }
    all_data["year"] = all_data["year"].replace(year_dict)


    # manufacturer
    # `.str.normalize` leaves missing values untouched, unlike a bare
    # `unicodedata.normalize` call, which raises on NaN.
    all_data["manufacturer"] = all_data["manufacturer"].str.lower().str.normalize("NFKC")
    # Keys contain look-alike (non-ASCII) letters that NFKC does not fold.
    manufacturer_map = {
        'niѕsan':'nissan',
        'nisѕan':'nissan',
        'subαru':'subaru',
        'toyotа':'toyota',
        'sαturn':'saturn',
        'аcura':'acura',
        'vоlkswagen':'volkswagen',
        'lexuѕ':'lexus',
        'ᴄhrysler':'chrysler',
    }
    all_data["manufacturer"] = all_data["manufacturer"].replace(manufacturer_map)


    # size
    size_dict = {
        "fullーsize":"full-size",
        "midーsize":"mid-size",
        "subーcompact":"sub-compact",
        "full−size":"full-size",
        "mid−size":"mid-size"
    }
    all_data["size"] = all_data["size"].replace(size_dict)


    # location
    ## region -> state lookup built from the train rows only; a region is kept
    ## only when it appears with exactly one distinct state.
    train_rows = all_data[all_data["flag"] == "train"]
    known_pairs = train_rows.dropna(subset=["region", "state"])
    states_per_region = known_pairs.groupby("region")["state"].agg(["nunique", "first"])
    region_to_state = states_per_region.loc[states_per_region["nunique"] == 1, "first"]

    ## fill missing states from the lookup; regions that are ambiguous or unseen
    ## in train simply stay missing instead of raising a KeyError.
    missing_state = all_data["state"].isna()
    all_data.loc[missing_state, "state"] = all_data.loc[missing_state, "region"].map(region_to_state)

    ## regions assigned by hand
    all_data.loc[all_data["region"] == "northwest KS", "state"] = "ks"
    all_data.loc[all_data["region"] == "ashtabula", "state"] = "oh"
    all_data.loc[all_data["region"] == "southern WV", "state"] = "wv"

    all_data = pd.merge(all_data, region_coords, on="region", how="left")
    all_data = pd.merge(all_data, state_coords, on="state", how="left")


    # type
    ## missing values train: 456, test: 229

    # title_status
    ## missing values train: 456, test: 229

    # fuel
    ## missing values train: 1239, test: 1495


    # NOTE(review): a row with year == 2023 gives log(0) = -inf below, and a
    # year after 2023 gives NaN for both transforms - confirm neither occurs.
    all_data["elapsed_years"] = 2023 - all_data["year"]
    all_data["log_elapsed_years"] = np.log(all_data["elapsed_years"])
    all_data["sqrt_elapsed_years"] = np.sqrt(all_data["elapsed_years"])

    return all_data

# Run the shared preprocessing on the stacked train + test frame.
all_data = preprocessing(all_data)

In [6]:
# ===================================================================
#  Cross Validation
# ===================================================================
# Split the preprocessed frame back into its two origins.
train = all_data[all_data["flag"] == "train"].reset_index(drop=True)
test = all_data[all_data["flag"] == "test"].reset_index(drop=True)

# Order by id first so the price sort below always starts from the same row order.
train = train.sort_values(by="id", ignore_index=True)

# Deal the rows to the folds round-robin in ascending price order
# (row i of the price-sorted frame goes to fold i % n_splits), then restore id order.
train = train.sort_values(by="price", ignore_index=True)
train["fold"] = np.arange(len(train), dtype="int64") % CFG.n_splits
train = train.sort_values(by="id", ignore_index=True)

# `.std()` is a standard deviation, so the label says so (it used to say "variance").
print("The standard deviation of the fold means: ", train.groupby("fold")["price"].mean().std())

The variance of the mean of the folds:  7.232164844021037


In [7]:
def preprocessing_per_fold(CFG, train:pd.DataFrame, test: pd.DataFrame,  fold: int = 0):
    """Create the fold-dependent features for one cross-validation fold.

    Every statistic (odometer imputation values, count / aggregate encodings,
    bin edges, group price means, ordinal categories) is fitted on the training
    part of the fold only and then applied to the validation part and the test
    data, which keeps the validation target out of its own features.

    Args:
        CFG: config object. ``categorical_features`` is read, ``year_bins`` is
            read when present (20 otherwise), and ``categorical_features_`` is
            set on it as a side effect.
        train (pd.DataFrame): training data containing a ``fold`` column.
        test (pd.DataFrame): test data; it is copied, not modified.
        fold (int, optional): fold held out for validation. Defaults to 0.

    Returns:
        tuple: ``(X_train, X_valid, test_df)`` with all features added, each
        sorted by ``id``.
    """
    X_train = train[train["fold"] != fold].reset_index(drop=True)
    X_valid = train[train["fold"] == fold].reset_index(drop=True)
    test_df = test.copy()

    # ---------------------------------------------------------------
    # odometer repair
    # ---------------------------------------------------------------
    # Readings below 100 or above 400000 are treated as abnormal and replaced
    # by the mean of the plausible readings of the same region.
    # NOTE(review): the mean uses strict bounds (100 < x < 400000) while exactly
    # 100 / 400000 count as valid below; kept as in the original - confirm intent.
    plausible = (X_train["odometer"] > 100) & (X_train["odometer"] < 400000)
    fillna_map = X_train[plausible].groupby(["region"])["odometer"].mean().reset_index()

    def replace_odometer(df: pd.DataFrame, fillna_map: pd.DataFrame) -> pd.DataFrame:
        """Replace abnormal odometer readings with the per-region mean.

        Rows are never dropped: a reading that is already missing stays missing
        (the original split-and-concat version lost such rows because they
        matched neither mask) and is filled by the caller afterwards.

        Args:
            df (pd.DataFrame): data before the repair.
            fillna_map (pd.DataFrame): ``region`` / ``odometer`` replacement table.

        Returns:
            pd.DataFrame: repaired copy of ``df`` sorted by ``id``.
        """
        region_mean = fillna_map.set_index("region")["odometer"]
        fixed = df.copy()
        # Float dtype up front so the replacement means can be stored as they are.
        fixed["odometer"] = fixed["odometer"].astype("float64")
        abnormal = (fixed["odometer"] < 100) | (fixed["odometer"] > 400000)
        fixed.loc[abnormal, "odometer"] = fixed.loc[abnormal, "region"].map(region_mean)
        return fixed.sort_values("id", ignore_index=True)

    X_train = replace_odometer(X_train, fillna_map)
    X_valid = replace_odometer(X_valid, fillna_map)
    test_df = replace_odometer(test_df, fillna_map)

    # Whatever is still missing (no regional mean available) gets the train mean.
    # Assign the result back: `df[col].fillna(..., inplace=True)` is chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    train_odometer_mean = X_train["odometer"].mean()
    X_train["odometer"] = X_train["odometer"].fillna(train_odometer_mean)
    X_valid["odometer"] = X_valid["odometer"].fillna(train_odometer_mean)
    test_df["odometer"] = test_df["odometer"].fillna(train_odometer_mean)

    # ---------------------------------------------------------------
    # interactions
    # ---------------------------------------------------------------
    def apply_fe(df: pd.DataFrame) -> pd.DataFrame:
        """Add the odometer transforms and their products with elapsed years.

        Args:
            df (pd.DataFrame): frame to extend (modified in place).

        Returns:
            pd.DataFrame: the same frame with the new columns.
        """
        df["log_odometer"] = np.log(df["odometer"])
        df["sqrt_odometer"] = np.sqrt(df["odometer"])
        for years_col in ("elapsed_years", "log_elapsed_years", "sqrt_elapsed_years"):
            for odometer_col in ("odometer", "log_odometer", "sqrt_odometer"):
                df[f"{years_col}*{odometer_col}"] = df[years_col] * df[odometer_col]
        return df

    X_train = apply_fe(X_train)
    X_valid = apply_fe(X_valid)
    test_df = apply_fe(test_df)

    # The three frames are extended in place by the encoders below.
    frames = (X_train, X_valid, test_df)

    # ---------------------------------------------------------------
    # count encoding
    # ---------------------------------------------------------------
    for col in CFG.categorical_features:
        count_map = X_train[col].value_counts().to_dict()
        for frame in frames:
            frame[col+"_count_encoding"] = frame[col].map(count_map)

    # ---------------------------------------------------------------
    # aggregate encodings
    # ---------------------------------------------------------------
    def add_group_stats(value_col: str, tag: str, with_diff: bool) -> None:
        """Encode each categorical column by train statistics of ``value_col``.

        Adds ``{col}_{agg}{tag}_encoding`` for mean/std/max/min/median and, when
        ``with_diff`` is set, ``..._diff`` (statistic minus the row's own value)
        for mean and median.
        """
        for col in CFG.categorical_features:
            for agg_ in ["mean", "std", "max", "min", "median"]:
                stat_map = X_train.groupby(col)[value_col].agg(agg_)
                encoded = f"{col}_{agg_}{tag}_encoding"
                for frame in frames:
                    frame[encoded] = frame[col].map(stat_map)
                if with_diff and agg_ in ("median", "mean"):
                    for frame in frames:
                        frame[f"{encoded}_diff"] = frame[encoded] - frame[value_col]

    add_group_stats("price", "", with_diff=False)
    add_group_stats("odometer", "_odometer", with_diff=True)
    add_group_stats("elapsed_years", "_elapsed_years", with_diff=True)

    # ---------------------------------------------------------------
    # binned year / odometer (bin edges come from the training part)
    # ---------------------------------------------------------------
    # CFG.year_bins was declared but the literal 20 was hard-coded here.
    n_year_bins = getattr(CFG, "year_bins", 20)
    X_train["year_map"], year_edges = pd.cut(X_train["year"], bins=n_year_bins, labels=False, retbins=True)
    X_valid["year_map"] = pd.cut(X_valid["year"], bins=year_edges, labels=False)
    test_df["year_map"] = pd.cut(test_df["year"], bins=year_edges, labels=False)

    X_train["odometer_map"], odometer_edges = pd.cut(X_train["odometer"], bins=20, labels=False, retbins=True)
    X_valid["odometer_map"] = pd.cut(X_valid["odometer"], bins=odometer_edges, labels=False)
    test_df["odometer_map"] = pd.cut(test_df["odometer"], bins=odometer_edges, labels=False)

    # ---------------------------------------------------------------
    # mean price of every pair / triple of categorical columns
    # ---------------------------------------------------------------
    cross_features = [
        'region', 'year_map', 'manufacturer', 'condition', 'cylinders','fuel', 'odometer_map', 'title_status', 'transmission', 'drive', 'size',
        'type', 'paint_color', 'state'
    ]

    def add_price_mean(train_part, valid_part, test_part, group_cols):
        """Left-join the train mean price of ``group_cols`` onto all three frames."""
        feature = "*".join(group_cols) + "_price"
        price_mean = (
            train_part.groupby(group_cols)["price"].mean().reset_index().rename(columns={"price": feature})
        )
        return tuple(
            pd.merge(part, price_mean, on=group_cols, how="left")
            for part in (train_part, valid_part, test_part)
        )

    # `combinations` yields the pairs in the same order as the nested index loops did.
    for pair in combinations(cross_features, 2):
        X_train, X_valid, test_df = add_price_mean(X_train, X_valid, test_df, list(pair))

    for cols in tqdm(combinations(cross_features, 3), total=comb(len(cross_features), 3)):
        X_train, X_valid, test_df = add_price_mean(X_train, X_valid, test_df, list(cols))

    # ---------------------------------------------------------------
    # OrdinalEncoder: this one would not have to be fitted per fold
    # ---------------------------------------------------------------
    oe = OrdinalEncoder(categories="auto",
                        handle_unknown="use_encoded_value",
                        unknown_value=999,
                        encoded_missing_value=np.nan, # QUESTION: encode missing values as NaN rather than -1?
                        )
    CFG.categorical_features_ = [feature + "_category" for feature in CFG.categorical_features]
    X_train[CFG.categorical_features_] = oe.fit_transform(X_train[CFG.categorical_features].values)
    X_valid[CFG.categorical_features_] = oe.transform(X_valid[CFG.categorical_features].values)
    test_df[CFG.categorical_features_] = oe.transform(test_df[CFG.categorical_features].values)
    return X_train, X_valid, test_df

In [8]:
# ===================================================================
#  Training (LightGBM, per-fold)
# ===================================================================
def train_lgb(CFG, lgb_param, train_df=None, test_df=None):
    """Train one LightGBM model per fold and collect OOF / test predictions.

    For every fold the fold-specific features are rebuilt with
    ``preprocessing_per_fold``, a model is trained with early stopping on the
    validation part, and predictions are made for the validation part and the
    test data. Test predictions are averaged over the folds.

    Args:
        CFG: config object; ``n_splits``, ``use_features`` and
            ``stopping_rounds`` are read here.
        lgb_param (dict): parameters passed to ``lgb.train``.
        train_df (pd.DataFrame, optional): training data with a ``fold`` column.
            Defaults to the notebook-level ``train``.
        test_df (pd.DataFrame, optional): test data.
            Defaults to the notebook-level ``test``.

    Returns:
        tuple: ``(score, oof_df, test_out)`` - the overall out-of-fold score in
        percent, the concatenated validation frames with a ``pred`` column, and
        a copy of the test data (sorted by ``id``) with the averaged ``pred``.

    Raises:
        ValueError: if the per-fold test frame does not contain the same ids as
            the test data, i.e. the predictions could not be aligned.
    """
    # Fall back to the frames built in the cross-validation cell.
    train_data = train if train_df is None else train_df
    test_data = test if test_df is None else test_df

    # Columns produced by the OrdinalEncoder step carry the "_category" suffix.
    categorical_features = [col for col in CFG.use_features if "_category" in col]

    oof_parts = []
    preds = []
    for fold in range(CFG.n_splits):
        X_train, X_valid, fold_test = preprocessing_per_fold(CFG, train_data, test_data, fold)

        # train
        lgb_train = lgb.Dataset(X_train[CFG.use_features], X_train["price"], categorical_feature = categorical_features,)
        lgb_valid = lgb.Dataset(X_valid[CFG.use_features], X_valid["price"], categorical_feature = categorical_features,)
        # The categorical columns are already declared on both Datasets, so they
        # are not repeated here.
        # NOTE(review): the `categorical_feature` keyword of lgb.train appears to be
        # deprecated in LightGBM 4.x - confirm against the installed version.
        model = lgb.train(
                        lgb_param,
                        lgb_train,
                        valid_sets=[lgb_valid],
                        callbacks=[lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=False),],
                        )

        X_valid["pred"] = model.predict(X_valid[CFG.use_features], num_iteration=model.best_iteration)
        print(f"fold{fold}:", get_score(y_true=X_valid["price"], y_pred=X_valid["pred"]))
        oof_parts.append(X_valid)
        preds.append(model.predict(fold_test[CFG.use_features], num_iteration=model.best_iteration))

    # One concat at the end instead of growing the frame inside the loop.
    oof_df = pd.concat(oof_parts, ignore_index=True)

    # preprocessing_per_fold returns its test frame sorted by id, so the averaged
    # predictions are in id order. Attach them to an id-sorted copy (rather than
    # positionally to the caller's frame) and verify that the ids really line up.
    test_out = test_data.sort_values("id", ignore_index=True)
    if not np.array_equal(test_out["id"].to_numpy(), fold_test["id"].to_numpy()):
        raise ValueError("test ids changed during per-fold preprocessing; predictions cannot be aligned")
    test_out["pred"] = np.mean(preds, axis=0)

    score = get_score(oof_df["price"], oof_df["pred"])
    return score, oof_df, test_out

In [9]:
# ===================================================================
#  evaluate
# ===================================================================
# Feature columns handed to LightGBM by train_lgb.
CFG.use_features = [
    'odometer',
    'year',
    'type_mean_elapsed_years_encoding_diff',
    'title_status*transmission*size_price',
    'paint_color_category',
    'odometer_map*drive*size_price',
    'region*fuel_price',
    'manufacturer_max_odometer_encoding',
    'sqrt_elapsed_years*odometer',
    'cylinders*drive*size_price',
    'year_map*condition*odometer_map_price',
    'year_map*type_price',
    'condition_std_odometer_encoding',
    'manufacturer*drive*type_price',
    'condition*cylinders*fuel_price',
    'year_map*fuel*drive_price',
    'year_map*fuel*type_price',
    'type_mean_elapsed_years_encoding',
    'odometer_map*drive_price',
    'condition*transmission*paint_color_price',
    'condition*title_status*paint_color_price',
]


# LightGBM parameters; thread count and seed are taken from CFG.
lgb_param = dict(
    task="train",
    objective="mape",
    boosting="gbdt",
    n_estimators=4358,
    learning_rate=0.011020815051252329,
    max_depth=14,
    num_leaves=9096,
    min_data_in_leaf=223,
    max_bin=317,
    subsample=0.5793029095425193,
    subsample_freq=6,
    feature_fraction=0.19198340541599757,
    reg_lambda=0.00016294484253991218,
    reg_alpha=7.36743363005601e-05,
    scale_pos_weight=34.564814252410926,
    num_threads=CFG.num_cores,
    metric="mape",
    seed=CFG.seed,
    verbosity=-1,
)


# Run the cross-validated training and print the overall score in green.
best_score, oof_df, test_df = train_lgb(CFG, lgb_param)
print("\033[32m====== CV score ======\033[0m")
print(f"\033[32m{best_score}\033[0m")

  0%|          | 0/364 [00:00<?, ?it/s]

fold0: 43.24492287022692


  0%|          | 0/364 [00:00<?, ?it/s]

fold1: 44.83931308314574


  0%|          | 0/364 [00:00<?, ?it/s]

fold2: 43.66379093572097


  0%|          | 0/364 [00:00<?, ?it/s]

fold3: 44.90243827488906


  0%|          | 0/364 [00:00<?, ?it/s]

fold4: 43.149383102925015


  0%|          | 0/364 [00:00<?, ?it/s]

fold5: 43.179885007607496


  0%|          | 0/364 [00:00<?, ?it/s]

fold6: 44.77057050626627


  0%|          | 0/364 [00:00<?, ?it/s]

fold7: 43.937338857624766
[32m43.96098462821153[0m


In [10]:
# ===================================================================
#  Save OOF / test predictions
# ===================================================================
# Write the out-of-fold and the test predictions as header-less "id,pred" CSV files.
oof_df.loc[:, ["id", "pred"]].to_csv(f"{CFG.save_dir}oof_df_{CFG.filename}.csv", index=False, header=False)
test_df.loc[:, ["id", "pred"]].to_csv(f"{CFG.save_dir}{CFG.filename}.csv", index=False, header=False)
# Show the submitted columns as the cell output.
test_df.loc[:, ["id", "pred"]]

Unnamed: 0,id,pred
0,27532,9211.745829
1,27533,5638.017983
2,27534,5378.844623
3,27535,18263.872975
4,27536,4245.195615
...,...,...
27532,55064,10014.768978
27533,55065,8761.781589
27534,55066,6072.441981
27535,55067,5146.113453
