In [48]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import time


from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_percentage_error
from tqdm.auto import tqdm

import warnings
warnings.simplefilter("ignore")

import unicodedata
import lightgbm as lgb

In [49]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    filename = "exp012"
    seed = 42
    n_splits = 5
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    year_bins = 20
    num_boost_round = 10000
    stopping_rounds = 100
    n_trials = 300
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"
    num_cores = 4 # kaggleの方と統一
    categorical_features = [
        "fuel", "title_status", "type", "state", "region", "manufacturer", "condition", "cylinders", "transmission", "drive", "size", "paint_color"
        ]
    use_features = ["odometer", "year"]

In [50]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """fix random factors"""
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(CFG.seed)
    

def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [51]:
# ===================================================================
#  Data Loading
# ===================================================================
train = pd.read_csv(CFG.data_dir+"train.csv")
test = pd.read_csv(CFG.data_dir+"test.csv")

region_coor = pd.read_csv(CFG.data_dir+"region_coordinate.csv")
state_coor = pd.read_csv(CFG.data_dir+"state_coordinate.csv")

train["flag"] = "train"
test["flag"] = "test"
all_data = pd.concat([train, test], ignore_index=True)

In [52]:
# ===================================================================
#  feature_engineering
# ===================================================================
def preprocessing(all_data: pd.DataFrame):
    """
    train, testデータで共通の前処理のコード
    
    ・yearの異常値を直す
    ・manufacturerの表記を統一する
    ・sizeの表記を統一する
    ・regionの欠損値をtrain dataの(state, region)の組み合わせから補完する。残った欠損値は調べて補完する。
    ・title_statusとtypeの欠損値処理はとりあえず放置

    Args:
        all_data (pd.DataFrame): pd.concat([train, test], ignore_index=True)
    """
    # year
    year_dict = {
        2999:1999,
        3008:2008,
        3011:2011,
        3015:2015,
        3017:2017,
        3019:2019,
    }
    all_data["year"] = all_data["year"].replace(year_dict)
    
    
    # manufacturer
    all_data["manufacturer"] = all_data["manufacturer"].str.lower().apply(lambda x: unicodedata.normalize('NFKC', x))
    manufacturer_map = {
        'niѕsan':'nissan',
        'nisѕan':'nissan',
        'subαru':'subaru',
        'toyotа':'toyota',
        'sαturn':'saturn',
        'аcura':'acura',
        'vоlkswagen':'volkswagen',
        'lexuѕ':'lexus',
        'ᴄhrysler':'chrysler',
    }
    all_data["manufacturer"] = all_data["manufacturer"].replace(manufacturer_map)
    
    
    # size
    size_dict = {
        "fullーsize":"full-size",
        "midーsize":"mid-size",
        "subーcompact":"sub-compact",
        "full−size":"full-size",
        "mid−size":"mid-size"
    }
    all_data["size"] = all_data["size"].replace(size_dict)
        
    
    # 地域
    ## region -> stateが一意に定まることを確認
    region_state = {region:{} for region in all_data[all_data["flag"]=="train"]['region'].unique()}
    for row, value in all_data[all_data["flag"]=="train"].iterrows():
        if not pd.isna(value['state']):
            if value['state'] not in region_state[value['region']]:
                region_state[value['region']][value['state']] = 1
            else:
                region_state[value['region']][value['state']] += 1
    for region, state_dict in region_state.items():
        if len(state_dict) > 1 or state_dict == {}:
            region_state[region] = pd.NA
        else:
            region_state[region] = list(state_dict.keys())[0]

    ## regionからstateを決定
    all_data['state'] = [region_state[region] if pd.isna(state) else state for region, state in zip(all_data['region'], all_data['state'])]
    all_data.loc[all_data["region"] == "northwest KS", "state"] = "ks"
    all_data.loc[all_data["region"] == "ashtabula", "state"] = "oh"
    all_data.loc[all_data["region"] == "southern WV", "state"] = "wv"
    
    all_data = pd.merge(all_data, region_coor, on="region", how="left")
    all_data = pd.merge(all_data, state_coor, on="state", how="left")
    
    
    # type
    ## 欠損値 train: 456, test: 229
    
    # title_status
    ## 欠損値 train: 456, test: 229
    
    # fuel
    ## 欠損値 train: 1239, test: 1495
    
    
    #all_data["elapsed_years"] = 2023 - all_data["year"]
    #all_data["log_elapsed_years"] = np.log(all_data["elapsed_years"])
    #all_data["sqrt_elapsed_years"] = np.sqrt(all_data["elapsed_years"])
    
    return all_data

all_data = preprocessing(all_data)

In [53]:
column_lists = []
for i, col1 in enumerate(all_data.columns):
    for col2 in all_data.columns[i+1:]:
        if col1 not in ["id", "flag", "region_latitude", "region_longitude", "state_latitude", "state_longitude"] \
            and col2 not in ["id", "flag", "region_latitude", "region_longitude", "state_latitude", "state_longitude"]:
            column_lists.append(f"{col1}*{col2}")
len(column_lists)

105

In [54]:
column_lists = []
for i, col1 in enumerate(all_data.columns):
    for col2 in all_data.columns[i+1:]:
        for col3 in all_data.columns[i+2:]:
            if col1 not in ["id", "flag", "region_latitude", "region_longitude", "state_latitude", "state_longitude"] \
                and col2 not in ["id", "flag", "region_latitude", "region_longitude", "state_latitude", "state_longitude"] \
                and col3 not in ["id", "flag", "region_latitude", "region_longitude", "state_latitude", "state_longitude"]:
                column_lists.append(f"{col1}*{col2}*{col3}")

In [55]:
len(column_lists)

910

# EDA

In [56]:
train = all_data[all_data["flag"] == "train"].reset_index(drop=True)
test = all_data[all_data["flag"] == "test"].reset_index(drop=True)
train.head(2)

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,...,size,type,paint_color,state,price,flag,region_latitude,region_longitude,state_latitude,state_longitude
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148,clean,manual,...,mid-size,convertible,orange,tn,27587.0,train,36.162277,-86.774298,35.773008,-86.282008
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038,clean,automatic,...,full-size,sedan,silver,pa,4724.0,train,40.79445,-77.861639,40.969989,-77.727883


### region*year

In [57]:
train["year"].describe(), test["year"].describe()

(count    27532.000000
 mean      2007.687673
 std         10.123062
 min       1921.000000
 25%       2005.000000
 50%       2011.000000
 75%       2014.000000
 max       2022.000000
 Name: year, dtype: float64,
 count    27537.000000
 mean      2007.729963
 std         10.070629
 min       1918.000000
 25%       2005.000000
 50%       2011.000000
 75%       2014.000000
 max       2022.000000
 Name: year, dtype: float64)

In [58]:
2022 - 1918

104

In [59]:
train["year_map"] = pd.cut(train["year"], bins=20)

In [60]:
train.groupby(["region", "year_map"])["price"].mean().reset_index().head()

Unnamed: 0,region,year_map,price
0,SF bay area,"(1920.899, 1926.05]",
1,SF bay area,"(1926.05, 1931.1]",
2,SF bay area,"(1931.1, 1936.15]",
3,SF bay area,"(1936.15, 1941.2]",
4,SF bay area,"(1941.2, 1946.25]",


In [61]:
train[["region", "year", "price"]]

Unnamed: 0,region,year,price
0,nashville,1949,27587.0
1,state college,2013,4724.0
2,wichita,1998,10931.0
3,albany,2014,16553.0
4,redding,2005,5158.0
...,...,...,...
27527,williamsport,2008,32212.0
27528,tulsa,2007,5400.0
27529,rochester,2019,22227.0
27530,rochester,2007,3054.0


### region*manufacturer

In [62]:
train.groupby(["region", "manufacturer"])["price"].mean().reset_index()

Unnamed: 0,region,manufacturer,price
0,SF bay area,audi,24500.500000
1,SF bay area,bmw,12924.333333
2,SF bay area,buick,13430.250000
3,SF bay area,chevrolet,11825.833333
4,SF bay area,chrysler,12113.000000
...,...,...,...
4761,yuma,mercury,11351.000000
4762,yuma,nissan,18456.000000
4763,yuma,ram,14038.500000
4764,zanesville / cambridge,cadillac,7035.000000


In [63]:
train.columns

Index(['id', 'region', 'year', 'manufacturer', 'condition', 'cylinders',
       'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size',
       'type', 'paint_color', 'state', 'price', 'flag', 'region_latitude',
       'region_longitude', 'state_latitude', 'state_longitude', 'year_map'],
      dtype='object')

In [64]:
train["year_map"], bins = pd.cut(train["year"], bins=20, labels=False, retbins=True)
test["year_map"] = pd.cut(test["year"], bins=bins, labels=False)

train["odometer_map"], bins = pd.cut(train["odometer"], bins=20, labels=False, retbins=True)
test["odometer_map"] = pd.cut(test["odometer"], bins=bins, labels=False)

In [65]:
cross_features = [
    'region', 'year_map', 'manufacturer', 'condition', 'cylinders','fuel', 'odometer_map', 'title_status', 'transmission', 'drive', 'size','type', 'paint_color', 'state'
]

In [66]:
%%time
feature_lists = []
for i, col1 in enumerate(cross_features):
    for col2 in cross_features[i+1:]:
        tmp = train.groupby([col1, col2])["price"].mean().reset_index()
        train = pd.merge(train, tmp.rename(columns={"price":f"{col1}*{col2}_price"}), on=[col1, col2], how="left")
        test = pd.merge(test, tmp.rename(columns={"price":f"{col1}*{col2}_price"}), on=[col1, col2], how="left")
        feature_lists.append(f"{col1}*{col2}_price")
len(feature_lists)

CPU times: total: 1.02 s
Wall time: 3.56 s


91

In [67]:
train.shape

(27532, 114)

In [68]:
train

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,...,drive*size_price,drive*type_price,drive*paint_color_price,drive*state_price,size*type_price,size*paint_color_price,size*state_price,type*paint_color_price,type*state_price,paint_color*state_price
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148,clean,manual,...,16013.005004,14651.078125,13348.240000,17345.658824,14390.806452,12030.455882,13714.420513,12605.055556,14647.800000,15759.000000
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038,clean,automatic,...,16014.492322,13744.359005,13588.812030,15436.922330,10112.552864,13243.710119,15889.824627,8905.090805,10428.287154,11651.779412
2,2,wichita,1998,ford,good,6 cylinders,gas,152492,clean,automatic,...,10008.531190,10886.885397,8857.045902,9172.852459,14356.455067,13243.710119,12315.029586,11680.770390,13446.423729,13645.285714
3,3,albany,2014,ford,excellent,4 cylinders,gas,104118,clean,manual,...,10316.294803,10886.885397,10258.387225,10727.686623,13471.419145,12666.507825,13194.025678,14818.886775,14672.803894,14593.364780
4,4,redding,2005,ford,excellent,6 cylinders,gas,144554,clean,manual,...,10316.294803,9329.735034,8025.303290,10127.907615,10586.228855,10353.979550,12274.816759,8366.818942,10255.738824,11547.035971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,williamsport,2008,ford,good,6 cylinders,gas,26660,clean,automatic,...,12710.242424,18667.249761,16672.089356,15436.922330,10227.571429,9650.829907,9482.431250,22469.251661,21385.462069,15496.245000
27528,27528,tulsa,2007,ford,excellent,8 cylinders,gas,108072,clean,automatic,...,16014.492322,17171.089015,16672.089356,15503.544118,19430.294643,16606.119514,16546.134715,22077.809589,20500.125000,17193.436893
27529,27529,rochester,2019,jeep,like new,6 cylinders,gas,139908,clean,automatic,...,15951.936823,16077.483015,22431.067251,19652.651034,13471.419145,14119.694707,13194.025678,15261.641849,14672.803894,18549.196226
27530,27530,rochester,2007,jeep,excellent,6 cylinders,gas,112326,clean,automatic,...,15951.936823,13195.144869,22431.067251,19652.651034,10586.228855,14119.694707,13194.025678,11616.464578,10332.437194,18549.196226


In [69]:
feature_lists = []
for i, col1 in enumerate(tqdm(cross_features)):
    for col2 in cross_features[i+1:]:
        for col3 in cross_features[i+2:]:      
            tmp = train.groupby([col1, col2, col3])["price"].mean().reset_index()
            train = pd.merge(train, tmp.rename(columns={"price":f"{col1}*{col2}*{col3}_price"}), on=[col1, col2], how="left")
            test = pd.merge(test, tmp.rename(columns={"price":f"{col1}*{col2}*{col3}_price"}), on=[col1, col2], how="left")
            feature_lists.append(f"{col1}*{col2}*{col3}_price")

  0%|          | 0/14 [00:00<?, ?it/s]

In [72]:
from math import comb
from itertools import combinations

for cols in tqdm(combinations(cross_features, 3), len=comb(len(cross_features), 3)):
    group_cols = list(cols)  # Convert the combination tuple to a list
    
    tmp = train.groupby(group_cols)["price"].mean().reset_index()
    tmp.rename(columns={"price": f"{group_cols[0]}*{group_cols[1]}*{group_cols[2]}_price"}, inplace=True)
    
    train = pd.merge(train, tmp, on=group_cols, how="left")
    test = pd.merge(test, tmp, on=group_cols, how="left")

0it [00:00, ?it/s]

In [77]:
len(combinations(cross_features, 3))

TypeError: object of type 'itertools.combinations' has no len()

In [78]:
len(cross_features)

14

In [81]:

num_cross_feature_combinations = comb(len(cross_features), 3)

In [82]:
num_cross_feature_combinations

364

In [80]:
tmp = len(cross_features)
tmp * (tmp-1) * (tmp-2) / (3*2)
14 * 13 * 12 / (3*2)

364.0