In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import category_encoders as ce

import lightgbm as lgb

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

# load data
# NOTE(review): this is an absolute, machine-specific location; point DATA_DIR
# at the local copy of the competition data before running.
DATA_DIR = "G:/マイドライブ/signate_StudentCup2023/data"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
# Bug fix: the test frame used to be read from train.csv as well, so every
# "test" prediction was really a prediction on the training rows.
# Assumes the held-out file sits next to train.csv as test.csv -- TODO confirm.
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

# preprocessing
# Columns that are passed to the count encoder below.
cat_cols = ["region", "manufacturer", "condition", "fuel", "title_status", "cylinders",
            "transmission", "drive", "size", "type", "paint_color", "state"]

## cat -> count encoding
def count_encoder(df, cat_cols):
    """Count-encode the given categorical columns of ``df``.

    A fresh ``category_encoders.CountEncoder`` is fitted on ``df`` itself, so
    the encoded values are derived only from the rows of the frame passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``cat_cols``.
    cat_cols : list of str
        Names of the columns to encode; all other columns are handed to the
        encoder untouched.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``CountEncoder.fit_transform(df)``.
    """
    # The previous ``df = df.drop(columns=cat_cols)`` only rebound the local
    # name and its result was never used, so it has been removed.
    encoder = ce.CountEncoder(cols=cat_cols)
    return encoder.fit_transform(df)

## target log transform
def log_trainsform(df, cols):
    """Return the ``log(1 + x)`` transform of the selected column(s) of ``df``.

    The previous body returned ``df[cols]`` unchanged, i.e. no logarithm was
    applied despite the function's name and purpose.

    ``log1p`` is used instead of a plain ``log`` so that a value of 0 maps to
    0 rather than ``-inf``; invert with ``np.expm1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols : str or list of str
        Column label (yields a Series) or list of labels (yields a DataFrame).

    Returns
    -------
    pandas.Series or pandas.DataFrame
        ``np.log1p`` of ``df[cols]``, with the same index and labels.
    """
    return np.log1p(df[cols])

# Fit ONE count encoder on the training rows and reuse it for the test rows so
# that a given category is mapped to the same value in both frames. Encoding
# each frame with its own freshly fitted encoder would feed the model test-time
# values computed from a different set of rows than it was trained on.
# Only the categorical columns are handed to the encoder, so the two frames do
# not need to share the remaining columns (e.g. the target).
# Categories that never occur in the training rows are handled according to the
# encoder's ``handle_unknown`` setting (library default kept).
shared_encoder = ce.CountEncoder(cols=cat_cols)
train_encoded = shared_encoder.fit_transform(train_df[cat_cols])
test_encoded = shared_encoder.transform(test_df[cat_cols])

train_df = train_df.assign(**{col: train_encoded[col] for col in cat_cols})
test_df = test_df.assign(**{col: test_encoded[col] for col in cat_cols})

# model
# Every column except the row identifier and the target is used as a feature.
features = [col for col in train_df.columns if col not in ("id", "price")]

target = train_df["price"]

# LightGBM training parameters, passed unchanged to lgb.train in the CV loop.
# NOTE(review): the objective is 'regression' while the monitored metric is
# 'mape'; the captured logs show the validation MAPE bottoming out around
# iteration 55-60 and then rising -- consider aligning objective and metric.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    metric='mape',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='regression',
    verbosity=-1,
)

# Cross-validation splitter (shuffled, fixed seed) and result containers.
folds = KFold(n_splits=10, shuffle=True, random_state=44000)
oof = np.zeros(train_df.shape[0])          # out-of-fold predictions, one per training row
predictions = np.zeros(test_df.shape[0])   # fold-averaged predictions, one per test row
feature_importance_df = pd.DataFrame()     # filled with per-fold importances by the loop

# K-fold training. For each split: fit on the training part, store the
# out-of-fold predictions for the validation part, record feature importances
# and add this fold's share of the averaged test prediction.

# Upper bound on boosting rounds; early stopping picks the actual count.
# Hoisted out of the loop because it never changes between folds.
num_round = 1000000

# Per-fold importance frames are collected here and concatenated once after
# the loop instead of re-concatenating a growing DataFrame on every fold.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_+1))
    trn_features = train_df.iloc[trn_idx][features]
    val_features = train_df.iloc[val_idx][features]  # reused for the OOF prediction below
    trn_data = lgb.Dataset(trn_features, label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    rgl = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=3000, verbose=True),
                               lgb.log_evaluation(1000)]
                   )
    # Predict with the iteration chosen by early stopping, not the last one trained.
    oof[val_idx] = rgl.predict(val_features, num_iteration=rgl.best_iteration)

    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": rgl.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions += rgl.predict(test_df[features], num_iteration=rgl.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(mean_absolute_percentage_error(target, oof)))

Fold 1
Training until validation scores don't improve for 3000 rounds
[1000]	training's mape: 0.745707	valid_1's mape: 0.757366
[2000]	training's mape: 0.693864	valid_1's mape: 0.704871
[3000]	training's mape: 0.684957	valid_1's mape: 0.699572
Early stopping, best iteration is:
[55]	training's mape: 0.540518	valid_1's mape: 0.542869
Fold 2
Training until validation scores don't improve for 3000 rounds
[1000]	training's mape: 0.750086	valid_1's mape: 0.733298
[2000]	training's mape: 0.696792	valid_1's mape: 0.689048
[3000]	training's mape: 0.685935	valid_1's mape: 0.680045
Early stopping, best iteration is:
[60]	training's mape: 0.542047	valid_1's mape: 0.528498
Fold 3
Training until validation scores don't improve for 3000 rounds
[1000]	training's mape: 0.746993	valid_1's mape: 0.745189
[2000]	training's mape: 0.69496	valid_1's mape: 0.702459
[3000]	training's mape: 0.686266	valid_1's mape: 0.696175
Early stopping, best iteration is:
[60]	training's mape: 0.541718	valid_1's mape: 0.530

In [6]:
# Display the training frame as left by the preprocessing cell above, to
# inspect the count-encoded categorical columns.
train_df

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,0,229,1949,2736,15219,11504,23546,115148,22365,6156,4834,9371,418,144,3304,27587
1,1,12,2013,1570,2404,5727,23546,172038,22365,20497,4834,14585,9259,4300,983,4724
2,2,233,1998,6166,6009,11504,23546,152492,22365,20497,14602,14585,7311,4300,252,10931
3,3,504,2014,6166,15219,10071,23546,104118,22365,6156,14602,9371,7311,3765,1841,16553
4,4,153,2005,6166,15219,11504,23546,144554,22365,6156,14602,9371,9259,1419,2702,5158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,49,2008,6166,6009,11504,23546,26660,22365,20497,4834,3352,3837,10143,983,32212
27528,27528,166,2007,6166,15219,5727,23546,108072,22365,20497,4834,14585,2118,10143,3304,5400
27529,27529,522,2019,1156,3810,11504,23546,139908,22365,20497,8096,9371,7311,3624,1841,22227
27530,27530,522,2007,1156,15219,11504,23546,112326,22365,20497,8096,9371,9259,3624,1841,3054
