In [120]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd
import polars as pl

from math import comb
from tqdm.auto import tqdm
from itertools import combinations

from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_percentage_error

import warnings
#warnings.simplefilter("ignore")

import unicodedata
import lightgbm as lgb

In [121]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    filename = "exp040"
    seed = 42
    n_splits = 8
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    num_boost_round = 10000
    stopping_rounds = 100
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"
    num_cores = 4 # kaggleの方と統一
    categorical_features = [
        "fuel", "title_status", "type", "state", "region", "manufacturer", "condition", "cylinders", "transmission", "drive", "size", "paint_color"
        ]

In [122]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """fix random factors"""
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(CFG.seed)
    

def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [123]:
# ===================================================================
#  Data Loading
# ===================================================================
train = pl.read_csv(CFG.data_dir+"train.csv")
test = pl.read_csv(CFG.data_dir+"test.csv")

region_coor = pl.read_csv(CFG.data_dir+"region_coordinate.csv")
state_coor = pl.read_csv(CFG.data_dir+"state_coordinate.csv")

train = train.with_columns(pl.lit("train").alias("flag"))
test = test.with_columns(
    [
        pl.lit(None, dtype=pl.Int64).alias("price"),
        pl.lit("test").alias("flag"),
     ])
all_data = pl.concat([train, test])

In [124]:
# ===================================================================
#  feature_engineering
# ===================================================================
def preprocessing(all_data: pl.DataFrame) ->pl.DataFrame:
    """
    train, testデータで共通の前処理のコード
    
    ・yearの異常値を直す
    ・manufacturerの表記を統一する
    ・sizeの表記を統一する
    ・regionの欠損値をtrain dataの(state, region)の組み合わせから補完する。残った欠損値は調べて補完する。
    ・title_statusとtypeの欠損値処理はとりあえず放置
    ・year関係の特徴量を加える

    Args:
        all_data (pl.DataFrame): pl.concat([train, test])
    """
    # year
    year_dict = {
        2999:1999,
        3008:2008,
        3011:2011,
        3015:2015,
        3017:2017,
        3019:2019,
    }
    all_data = all_data.with_columns(
        pl.when(pl.col("year").is_in(list(year_dict.keys())))
        .then(pl.col("year").apply(lambda x: year_dict.get(x)))
        .otherwise(pl.col("year"))
        .alias("year")
    )
    
    # manufacturer
    all_data = all_data.with_columns(
        pl.col("manufacturer").apply(lambda x: unicodedata.normalize('NFKC', x.lower())).alias("manufacturer")
    )
    manufacturer_map = {
        'niѕsan':'nissan',
        'nisѕan':'nissan',
        'subαru':'subaru',
        'toyotа':'toyota',
        'sαturn':'saturn',
        'аcura':'acura',
        'vоlkswagen':'volkswagen',
        'lexuѕ':'lexus',
        'ᴄhrysler':'chrysler',
    }
    all_data = all_data.with_columns(
        [
            pl.when(pl.col("manufacturer").is_in(list(manufacturer_map.keys())))
            .then(pl.col("manufacturer").apply(lambda x: manufacturer_map.get(x)))
            .otherwise(pl.col("manufacturer"))
        ]
    )
    
    # size
    size_dict = {
        "fullーsize":"full-size",
        "midーsize":"mid-size",
        "subーcompact":"sub-compact",
        "full−size":"full-size",
        "mid−size":"mid-size"
    }
    all_data = all_data.with_columns([
        pl.when(pl.col("size").is_in(list(size_dict.keys())))
        .then(pl.col("size").apply(lambda x: size_dict.get(x)))
        .otherwise(pl.col("size"))
    ])
    
    
    # region
    region_state = {region:{} for region in all_data.filter(pl.col("flag") == "train").select(pl.col("region")).unique().get_columns()[0]}
    for row in all_data.filter(pl.col("flag") == "train").select(pl.col("state", "region")).rows():
        if row[0] is not None:
            if row[0] not in region_state[row[1]]:
                region_state[row[1]][row[0]] = 1
            else:
                region_state[row[1]][row[0]] += 1
    for region, state_dict in region_state.items():
        if len(state_dict) > 1 or state_dict == {}:
            region_state[region] = np.nan
        else:
            region_state[region] = list(state_dict.keys())[0]
    
    all_data = all_data.with_columns(
        [
            pl.when(pl.col("state").is_null())
            .then(pl.col("region").apply(lambda x: region_state.get(x)))
            .otherwise(pl.col("state"))
            .alias("state")
        ]
    )
    
    all_data = all_data.with_columns(
        pl.when(pl.col("region") == "northwest KS").then(pl.lit('ks'))
        .when(pl.col("region") == "ashtabula").then(pl.lit('oh'))
        .when(pl.col("region") == "southern WV").then(pl.lit('wv'))
        .otherwise(pl.col("state"))
        .alias("state")
    )
    
    # 緯度経度
    all_data = all_data.join(region_coor, on="region", how="left")
    all_data = all_data.join(state_coor, on="state", how="left")
    
    # type
    ## 欠損値 train: 456, test: 229
    
    # title_status
    ## 欠損値 train: 456, test: 229
    
    # fuel
    ## 欠損値 train: 1239, test: 1495
    
    all_data = all_data.with_columns(
        [
            (2023 - pl.col("year")).alias("elapsed_years"),
        ]
    )
    
    all_data = all_data.with_columns(
        [
            pl.col("elapsed_years").log().alias("log_elapsed_years"),
            pl.col("elapsed_years").sqrt().alias("sqrt_elapsed_years"),
        ]
    )
    return all_data
all_data = preprocessing(all_data)

In [125]:
# ===================================================================
#  Cross Validation
# ===================================================================
train = all_data.filter(pl.col("flag") == "train")
test = all_data.filter(pl.col("flag") == "test")

train = train.sort(by="price")
train = train.with_columns(
    [
        pl.Series([i for i in range(CFG.n_splits)] * (train.shape[0] // CFG.n_splits) + [i for i in range(train.shape[0] % CFG.n_splits)]).alias("fold")
    ]
)
train.sort(by="id")
print("The variance of the mean of the folds: ", train.groupby("fold").agg(pl.col("price").mean()).get_columns()[1].std())

The variance of the mean of the folds:  7.232164844021038


In [101]:
def preprocessing_per_fold(CFG, train:pd.DataFrame, test: pd.DataFrame,  fold: int = 0):
    """foldごとの前処理: leakageを防ぐ

    Args:
        CFG :config
        train (pd.DataFrame): 学習データ
        test (pd.DataFrame, optional): test data Defaults to None.
        fold (int, optional): Defaults to 0.
    """
    X_train = train.filter(
        pl.col("fold") != fold
    )
    X_valid = train.filter(
        pl.col("fold") == fold
    )
    test_df = test.clone()
    
    fillna_map = X_train.filter(
        (100 < pl.col("odometer")) | (pl.col("odometer") < 400_000) 
    ).groupby("region").agg(pl.col("odometer").mean())
    
    def replace_odometer(df: pl.DataFrame, fillna_map: pl.DataFrame)-> pl.DataFrame:
        df_1 = df.filter(
            (pl.col("odometer") < 100) | (pl.col("odometer") > 400_000)
        )
        df_2 = df.filter(
            (pl.col("odometer") > 100) & (pl.col("odometer") < 400_000)
        )
        df_1 = df_1.drop("odometer")
        df_1 = df_1.join(fillna_map, on="region", how="left")
        df_2 = df_2.with_columns(
            pl.col("odometer").cast(pl.Float64)
        )
        df = pl.concat([df_1, df_2], how="diagonal")
        return df
    X_train = replace_odometer(X_train, fillna_map)
    X_valid = replace_odometer(X_valid, fillna_map)
    test_df = replace_odometer(test_df, fillna_map)

    odometer_mean = X_train["odometer"].mean()

    X_train = X_train.with_columns(pl.col("odometer").fill_nan(odometer_mean))
    X_valid = X_valid.with_columns(pl.col("odometer").fill_nan(odometer_mean))
    test_df = test_df.with_columns(pl.col("odometer").fill_nan(odometer_mean))
    
    
    def apply_fe(df: pl.DataFrame) -> pl.DataFrame:
        df = df.with_columns(
            [
                pl.col("odometer").log().alias("log_odometer"),
                pl.col("odometer").sqrt().alias("sqrt_odometer"),
                (pl.col("elapsed_years") * pl.col("odometer")).alias("elapsed_years*odometer"),
                (pl.col("elapsed_years") * pl.col("odometer").log()).alias("elapsed_years*log_odometer"),
                (pl.col("elapsed_years") * pl.col("odometer").sqrt()).alias("elapsed_years*sqrt_odometer"),
                (pl.col("elapsed_years").log() * pl.col("odometer")).alias("log_elapsed_years*odometer"),
                (pl.col("elapsed_years").log() * pl.col("odometer").log()).alias("log_elapsed_years*log_odometer"),
                (pl.col("elapsed_years").log() * pl.col("odometer").sqrt()).alias("log_elapsed_years*sqrt_odometer"),
                (pl.col("elapsed_years").sqrt() * pl.col("odometer")).alias("sqrt_elapsed_years*odometer"),
                (pl.col("elapsed_years").sqrt() * pl.col("odometer").log()).alias("sqrt_elapsed_years*log_odometer"),
                (pl.col("elapsed_years").sqrt() * pl.col("odometer").sqrt()).alias("sqrt_elapsed_years*sqrt_odometer"),
                
            ]
        )
        return df
    X_train = apply_fe(X_train)
    X_valid = apply_fe(X_valid)
    test_df = apply_fe(test_df)
    
    
    for col in CFG.categorical_features:
        count_map = X_train[col].value_counts().rename({"counts":col+"_count_encoding"})
        X_train = X_train.join(count_map, on=col, how="left")
        X_valid = X_valid.join(count_map, on=col, how="left")
        test_df = test_df.join(count_map, on=col, how="left")
        
        
    for col in CFG.categorical_features:
        fillna_map =  X_train.groupby(col).agg(
            pl.col("price").mean().alias(col+"_mean_encoding"),
            pl.col("price").std().alias(col+"_std_encoding"),
            pl.col("price").max().alias(col+"_max_encoding"),
            pl.col("price").min().alias(col+"_min_encoding"),
            pl.col("price").median().alias(col+"_median_encoding"),
            
            pl.col("odometer").mean().alias(col+"_mean_odometer_encoding"),
            pl.col("odometer").std().alias(col+"_std_odometer_encoding"),
            pl.col("odometer").max().alias(col+"_max_odometer_encoding"),
            pl.col("odometer").min().alias(col+"_min_odometer_encoding"),
            pl.col("odometer").median().alias(col+"_median_odometer_encoding"),
            #(pl.col("odometer").median() - pl.col("odometer")).alias(col+"_median_odometer_encoding_diff"),
            #(pl.col("odometer").mean() - pl.col("odometer")).alias(col+"_mean_odometer_encoding_diff"),
            
            pl.col("year").mean().alias(col+"_mean_elapsed_years_encoding"),
            pl.col("year").std().alias(col+"_std_elapsed_years_encoding"),
            pl.col("year").max().alias(col+"_max_elapsed_years_encoding"),
            pl.col("year").min().alias(col+"_min_elapsed_years_encoding"),
            pl.col("year").median().alias(col+"_median_elapsed_years_encoding"),
            #(pl.col("year").median() - pl.col("year")).alias(col+"_median_year_encoding_diff"),
            #(pl.col("year").mean() - pl.col("year")).alias(col+"_mean_year_encoding_diff"),
            )
        X_train = X_train.join(fillna_map, on=col, how="left")
        X_train = X_train.with_columns(
            [
                (pl.col(col+"_median_odometer_encoding") - pl.col("odometer")).alias(col+"_median_odometer_encoding_diff"),
                (pl.col(col+"_mean_odometer_encoding") - pl.col("odometer")).alias(col+"_mean_odometer_encoding_diff"),
                (pl.col(col+"_median_elapsed_years_encoding") - pl.col("elapsed_years")).alias(col+"_median_elapsed_years_encoding_diff"),
                (pl.col(col+"_mean_elapsed_years_encoding") - pl.col("elapsed_years")).alias(col+"_mean_elapsed_years_encoding_diff"),
                
            ]
        )
        X_valid = X_valid.join(fillna_map, on=col, how="left")
        X_valid = X_valid.with_columns(
            [
                (pl.col(col+"_median_odometer_encoding") - pl.col("odometer")).alias(col+"_median_odometer_encoding_diff"),
                (pl.col(col+"_mean_odometer_encoding") - pl.col("odometer")).alias(col+"_mean_odometer_encoding_diff"),
                (pl.col(col+"_median_elapsed_years_encoding") - pl.col("elapsed_years")).alias(col+"_median_elapsed_years_encoding_diff"),
                (pl.col(col+"_mean_elapsed_years_encoding") - pl.col("elapsed_years")).alias(col+"_mean_elapsed_years_encoding_diff"),
                
            ]
        )
        test_df = test_df.join(fillna_map, on=col, how="left")
        test_df = test_df.with_columns(
            [
                (pl.col(col+"_median_odometer_encoding") - pl.col("odometer")).alias(col+"_median_odometer_encoding_diff"),
                (pl.col(col+"_mean_odometer_encoding") - pl.col("odometer")).alias(col+"_mean_odometer_encoding_diff"),
                (pl.col(col+"_median_elapsed_years_encoding") - pl.col("elapsed_years")).alias(col+"_median_elapsed_years_encoding_diff"),
                (pl.col(col+"_mean_elapsed_years_encoding") - pl.col("elapsed_years")).alias(col+"_mean_elapsed_years_encoding_diff"),
                
            ]
        )
        
        
    value, bins = pd.cut(X_train["year"], bins=20, labels=False, retbins=True)
    X_train = X_train.with_columns(
        pl.Series(value).cast(pl.Float64).alias("year_map")
    )
    value = pd.cut(X_valid["year"], bins=bins, labels=False)
    X_valid = X_valid.with_columns(
        pl.Series(value).cast(pl.Float64).alias("year_map")
    )
    value = pd.cut(test_df["year"], bins=bins, labels=False)
    test_df= test_df.with_columns(
        pl.Series(value).cast(pl.Float64).alias("year_map")
    )
    
    
    value, bins = pd.cut(X_train["odometer"], bins=20, labels=False, retbins=True)
    X_train = X_train.with_columns(
        pl.Series(value).cast(pl.Float64).alias("odometer_map")
    )
    value = pd.cut(X_valid["odometer"], bins=bins, labels=False)
    X_valid = X_valid.with_columns(
        pl.Series(value).cast(pl.Float64).alias("odometer_map")
    )
    value = pd.cut(test_df["odometer"], bins=bins, labels=False)
    test_df= test_df.with_columns(
        pl.Series(value).cast(pl.Float64).alias("odometer_map")
    )
    
    
    cross_features = [
        'region', 'year_map', 'manufacturer', 'condition', 'cylinders','fuel', 'odometer_map', 'title_status', 'transmission', 'drive', 'size',
        'type', 'paint_color', 'state'
    ]
    
    
    for i, col1 in enumerate(cross_features):
        for col2 in cross_features[i+1:]:
            tmp = X_train.groupby([col1, col2]).agg(pl.col("price").mean()).rename({"price":f"{col1}*{col2}_price"})
            X_train = X_train.join(tmp, on=[col1, col2], how="left")
            X_valid = X_valid.join(tmp, on=[col1, col2], how="left")
            test_df = test_df.join(tmp, on=[col1, col2], how="left")
            
            
    for cols in combinations(cross_features, 3):
        group_cols = list(cols)  # Convert the combination tuple to a list
        tmp = X_train.groupby(group_cols).agg(pl.col("price").mean()).rename({"price":f"{group_cols[0]}*{group_cols[1]}*{group_cols[2]}_price"})
        X_train = X_train.join(tmp, on=group_cols, how="left")
        X_valid = X_valid.join(tmp, on=group_cols, how="left")
        test_df = test_df.join(tmp, on=group_cols, how="left")
        
        
    # OrdinalEncoder: これはfoldごとではなくともよい
    oe = OrdinalEncoder(categories="auto",
                        handle_unknown="use_encoded_value",
                        unknown_value=-2,
                        encoded_missing_value=-1, # QUESTION: 欠損値は-1に変換する -> NaNに??
                        )
    
    X_train = pd.DataFrame(X_train, columns=X_train.columns, dtype=)
    X_valid = pd.DataFrame(X_valid, columns=X_valid.columns)
    test_df = pd.DataFrame(test_df, columns=test_df.columns)
    CFG.categorical_features_ = [feature + "_category" for feature in CFG.categorical_features]
    X_train[CFG.categorical_features_] = oe.fit_transform(X_train[CFG.categorical_features].values)
    X_valid[CFG.categorical_features_] = oe.transform(X_valid[CFG.categorical_features].values)
    test_df[CFG.categorical_features_] = oe.transform(test_df[CFG.categorical_features].values)
    return X_train, X_valid, test_df

In [None]:
X_train.

In [102]:
X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, 0)

In [103]:
CFG.use_features = [
    'odometer', 'year', 'drive*size*paint_color_price', 'title_status*transmission_price', 'manufacturer*condition_price', 'condition_max_encoding', 
    'condition_min_elapsed_years_encoding', 'year_map*cylinders*drive_price', 'fuel*title_status*size_price', 'manufacturer*odometer_map_price', 
    'transmission*drive*paint_color_price', 'condition_median_elapsed_years_encoding', 'transmission_max_encoding', 'size_category', 'size*type_price',
    'odometer_map*drive*type_price', 'year_map*title_status*drive_price', 'fuel*drive*paint_color_price', 'cylinders*drive*type_price',
    'condition*type_price', 'year_map*odometer_map_price', 'condition*cylinders_price', 'fuel*title_status*paint_color_price'
]

In [105]:
categorical_features = [col for col in CFG.use_features if "_category" in col]
lgb_train = lgb.Dataset(X_train[CFG.use_features], X_train["price"], categorical_feature = categorical_features,)
lgb_valid = lgb.Dataset(X_valid[CFG.use_features], X_valid["price"], categorical_feature = categorical_features,)

In [108]:
X_train

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24090 entries, 0 to 24089
Columns: 745 entries, id to paint_color_category
dtypes: float64(12), object(733)
memory usage: 136.9+ MB


In [106]:
model = lgb.train(
                lgb_param, 
                lgb_train, 
                valid_sets=[lgb_valid],
                categorical_feature = categorical_features,
                callbacks=[lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=False),],
                )



ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: odometer: object, year: object, drive*size*paint_color_price: object, title_status*transmission_price: object, manufacturer*condition_price: object, condition_max_encoding: object, condition_min_elapsed_years_encoding: object, year_map*cylinders*drive_price: object, fuel*title_status*size_price: object, manufacturer*odometer_map_price: object, transmission*drive*paint_color_price: object, condition_median_elapsed_years_encoding: object, transmission_max_encoding: object, size*type_price: object, odometer_map*drive*type_price: object, year_map*title_status*drive_price: object, fuel*drive*paint_color_price: object, cylinders*drive*type_price: object, condition*type_price: object, year_map*odometer_map_price: object, condition*cylinders_price: object, fuel*title_status*paint_color_price: object

In [37]:
# ===================================================================
#  evaluate
# ===================================================================
def train_lgb(CFG, lgb_param):
    oof_df = pl.DataFrame()
    preds = []
    for fold in range(CFG.n_splits):
        X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, fold)
        # train
        categorical_features = [col for col in CFG.use_features if "_category" in col]
        lgb_train = lgb.Dataset(X_train[CFG.use_features], X_train["price"], categorical_feature = categorical_features,)
        lgb_valid = lgb.Dataset(X_valid[CFG.use_features], X_valid["price"], categorical_feature = categorical_features,)
        model = lgb.train(
                        lgb_param, 
                        lgb_train, 
                        valid_sets=[lgb_valid],
                        categorical_feature = categorical_features,
                        callbacks=[lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=False),],
                        )
        X_valid = X_valid.with_columns(
            pl.Series(model.predict(X_valid[CFG.use_features], num_iteration=model.best_iteration)).alias("pred")
        )
        print(f"fold{fold}:", get_score(y_true=X_valid["price"], y_pred=X_valid["pred"]))
        oof_df = pl.concat([oof_df, X_valid])
        preds.append(model.predict(test_df[CFG.use_features], num_iteration=model.best_iteration))
    test_df = test_df.with_columns(
        pl.Series(np.mean(preds, axis=0))
    )
    score = get_score(oof_df["price"], oof_df["pred"])
    return score, oof_df, test_df

In [39]:
# ===================================================================
#  evaluate
# ===================================================================
CFG.use_features = [
    'odometer', 'year', 'drive*size*paint_color_price', 'title_status*transmission_price', 'manufacturer*condition_price', 'condition_max_encoding', 
    'condition_min_elapsed_years_encoding', 'year_map*cylinders*drive_price', 'fuel*title_status*size_price', 'manufacturer*odometer_map_price', 
    'transmission*drive*paint_color_price', 'condition_median_elapsed_years_encoding', 'transmission_max_encoding', 'size_category', 'size*type_price',
    'odometer_map*drive*type_price', 'year_map*title_status*drive_price', 'fuel*drive*paint_color_price', 'cylinders*drive*type_price',
    'condition*type_price', 'year_map*odometer_map_price', 'condition*cylinders_price', 'fuel*title_status*paint_color_price'
]

CFG.use_features = [col for col in CFG.use_features if "_category" not in col]


lgb_param = {
    "task":"train",
    "objective": "mape",
    "boosting":"gbdt",
    "n_estimators": 2506,
    "learning_rate": 0.011165706106193283,
    "max_depth": 13,
    "num_leaves": 4371,
    "min_data_in_leaf": 192,
    "max_bin": 378,
    "subsample": 0.9989161453457421,
    "subsample_freq": 1,
    "feature_fraction": 0.21529074165670292,
    "reg_lambda": 4.8708700129192055e-06,
    "reg_alpha": 1.9865145426739786e-06,
    "scale_pos_weight": 0.0010821916071735351,
    "num_threads":CFG.num_cores,
    "metric": 'mape',
    "seed" : CFG.seed,
    "verbosity": -1,   
}

best_score, oof_df, test_df = train_lgb(CFG, lgb_param)
print('\033[32m'+"====== CV score ======"+'\033[0m')
print('\033[32m'+f'{best_score}'+'\033[0m')



TypeError: Wrong type(Series) for label.
It should be list, numpy 1-D array or pandas Series