<a class="anchor" id="chapter1"></a>
## Install Packages and Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder, 
    StandardScaler, 
    MinMaxScaler,
    RobustScaler
)
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import optuna

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

RANDOM_STATE = 42
TEST_SIZE = 0.2

import warnings
warnings.filterwarnings("ignore")

<a class="anchor" id="chapter2"></a>
## Data Loading & Description

In [2]:
try:
    train_df = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv', index_col= 'id')
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv', index_col= 'id')
    submission_df = pd.read_csv('/kaggle/input/playground-series-s5e2/sample_submission.csv')
except:
    train_df = pd.read_csv('train.csv', index_col= 'id')
    test_df = pd.read_csv('test.csv', index_col= 'id')
    submission_df = pd.read_csv('sample_submission.csv')

In [3]:
def get_info(df):
    print(f"\n{type(df).__name__} shape: {df.shape}")
    print(f"\n{df.shape[0]:,.0f} rows")
    print(f"\n{df.shape[1]:,.0f} columns")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')
    print(f'\nData Types: \n{df.dtypes}')


display(train_df.head(5))
get_info(train_df)

Unnamed: 0_level_0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312



DataFrame shape: (300000, 10)

300,000 rows

10 columns

Missing Data: 
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

Duplicates: 0

Data Types: 
Brand                    object
Material                 object
Size                     object
Compartments            float64
Laptop Compartment       object
Waterproof               object
Style                    object
Color                    object
Weight Capacity (kg)    float64
Price                   float64
dtype: object


In [4]:
# Define features and target
numerical_features = ['compartments', 'weight_capacity']

categorical_features = ['brand', 'material', 'size', 'laptop_compartment',
                         'waterproof', 'style', 'color']

target_column = 'price'

def prepare_dataset(df):
    df = df.rename(
        columns={
            "Laptop Compartment": "laptop_compartment",
            "Weight Capacity (kg)": "weight_capacity",})
    df.columns = df.columns.str.lower()
    df[categorical_features] = df[categorical_features].fillna('Missing').astype('category')
    return df

train_df = prepare_dataset(train_df)
test_df = prepare_dataset(test_df)

## Split Data & Build Pipeline

In [5]:
train_copy = train_df.copy()
test_copy = test_df.copy()

X = train_df.drop(columns=[target_column])
y = train_df[target_column]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = TEST_SIZE, 
    random_state = RANDOM_STATE)

print(X_train.shape, X_val.shape)

(240000, 9) (60000, 9)


In [9]:
#Load the train and validation data into the LightGBM dataset object
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

def lgb_objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'boosting_type': "gbdt",
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', -1, 20),  
        "verbosity": -1,
        'random_state': RANDOM_STATE
    }
    # Train LightGBM model
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
    )

    # Predict on validation set
    y_val_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    return rmse


lgb_study = optuna.create_study(study_name="LGBM_Kaggle", direction='minimize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
lgb_study.optimize(lgb_objective, n_trials=30, show_progress_bar=True)

# Best parameters and RMSE
print("Best parameters:", lgb_study.best_params)
print("Best RMSE:", lgb_study.best_value)

[I 2025-02-08 16:21:30,994] A new study created in memory with name: LGBM_Kaggle


  0%|          | 0/30 [00:00<?, ?it/s]

Best parameters: {'n_estimators': 596, 'learning_rate': 0.028636022166460558, 'num_leaves': 101, 'max_depth': 3}
Best RMSE: 38.89825169899989


In [10]:
def cb_objective(trial):
    cat_params = {
        "n_estimators": trial.suggest_int('n_estimators', 100, 200),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = cb.CatBoostRegressor(**cat_params, silent=True, cat_features=categorical_features)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    rmse =mean_squared_error(y_val, predictions, squared=False)
    return rmse

cb_study = optuna.create_study(study_name="CatBoost_Kaggle", direction='minimize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
cb_study.optimize(cb_objective, n_trials=30, show_progress_bar=True)

# Best parameters and RMSE
print("Best parameters:", cb_study.best_params)
print("Best RMSE:", cb_study.best_value)

  0%|          | 0/30 [00:00<?, ?it/s]

Best parameters: {'n_estimators': 171, 'learning_rate': 0.08406160599410098, 'depth': 4, 'subsample': 0.6306540418843685, 'colsample_bylevel': 0.44623630424509697, 'min_data_in_leaf': 24}
Best RMSE: 38.907112539651294


In [11]:
def xgb_objective(trial):
    xgb_params = {
        "objective": "reg:squarederror",
        "n_estimators": trial.suggest_int('n_estimators', 100, 200),
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    model = xgb.XGBRegressor(**xgb_params, enable_categorical=True)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_val)
    rmse = mean_squared_error(y_val, predictions, squared=False)
    return rmse

xgb_study = optuna.create_study(study_name="CatBoost_Kaggle", direction='minimize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
xgb_study.optimize(xgb_objective, n_trials=30, show_progress_bar=True)

# Best parameters and RMSE
print("Best parameters:", xgb_study.best_params)
print("Best RMSE:", xgb_study.best_value)

  0%|          | 0/30 [00:00<?, ?it/s]

Best parameters: {'n_estimators': 183, 'learning_rate': 0.07647201108700992, 'max_depth': 5, 'subsample': 0.7711141818334715, 'colsample_bytree': 0.30804671866341454, 'min_child_weight': 3}
Best RMSE: 38.893023798717


In [12]:
print(lgb_study.best_params)
print(cb_study.best_params)
print(xgb_study.best_params)

In [27]:
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': 596,
    'learning_rate': 0.028636022166460558,
    'num_leaves': 101,
    'max_depth': 3,
    "verbosity": -1
}

cat_params = {
    'n_estimators': 171,
     'learning_rate': 0.08406160599410098,
     'depth': 4,
     'subsample': 0.6306540418843685,
     'colsample_bylevel': 0.44623630424509697,
     'min_data_in_leaf': 24,
    'loss_function': 'RMSE',
    'random_seed': 42
}

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'n_estimators': 183,
     'learning_rate': 0.07647201108700992,
     'max_depth': 5,
     'subsample': 0.7711141818334715,
     'colsample_bytree': 0.30804671866341454,
     'min_child_weight': 3,
    'seed': 42
}

In [None]:
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_catboost = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Fold {fold + 1}/{n_folds}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # LightGBM
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    oof_lgb[val_idx] = lgb_model.predict(X_val)

    # XGBoost
    xgb_model = xgb.XGBRegressor(**xgb_params, enable_categorical=True)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    oof_xgb[val_idx] = xgb_model.predict(X_val)

    # CatBoost
    cat_model = cb.CatBoostRegressor(**cat_params)
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False, cat_features=categorical_features)
    oof_catboost[val_idx] = cat_model.predict(X_val)


meta_features = np.column_stack([oof_lgb, oof_xgb, oof_catboost])

In [41]:
train_copy['price'] = pd.DataFrame(meta_features)[2]

In [38]:
from sklearn.linear_model import LinearRegression

# Обучение финальной модели (линейная регрессия)
blender = LinearRegression()
blender.fit(meta_features, y)

# Предсказания моделей на тестовых данных
test_lgb = lgb_model.predict(test_df)
test_xgb = xgb_model.predict(test_df)
test_catboost = cat_model.predict(test_df)

# Мета-признаки для тестовых данных
test_meta_features = np.column_stack([test_lgb, test_xgb, test_catboost])

# Предсказания блендера
final_predictions = blender.predict(test_meta_features)

In [43]:
pd.DataFrame({'id': submission_df['id'], 'Price': final_predictions}).to_csv("submission.csv", index=False)