In [86]:
import pandas as pd 
import numpy as np
# import grid search
from sklearn.model_selection import GridSearchCV
# import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
# import root mean squared error
from sklearn.metrics import make_scorer, mean_squared_error

In [2]:
# import decision tree regressors 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# import xgboost regressor
from xgboost import XGBRegressor

# import ada boost regressor
from sklearn.ensemble import AdaBoostRegressor

# import support vector regressor
from sklearn.svm import SVR

# import cat boost regressor
from catboost import CatBoostRegressor

# import voting regressor
from sklearn.ensemble import VotingRegressor

# import lightgbm regressor
from lightgbm import LGBMRegressor

# import optuna
import optuna


In [18]:
#load the data
train = pd.read_csv('data/train_cleaned.csv')
test = pd.read_csv('data/test_cleaned.csv')
submission = pd.read_csv('data/sample_submission.csv')



In [19]:
# train = train[['shell_weight',
#  'meat_to_shell_ratio',
#  'whole_weight',
#  'shell_thickness',
#  'shape_index',
#  'relative_shell_weight',
#  'length_to_diameter_ratio',
#  'density',
#  'viscera_to_meat_ratio',
#  'gut_density',
#  'meat_density',
#  'gut_weight',
#  'weight_of_meat', 'rings']]

# test = test[['shell_weight',
#     'meat_to_shell_ratio',
#     'whole_weight',
#     'shell_thickness',
#     'shape_index',
#     'relative_shell_weight',
#     'length_to_diameter_ratio',
#     'density',
#     'viscera_to_meat_ratio',
#     'gut_density',
#     'meat_density',
#     'gut_weight']]

In [20]:
from sklearn.metrics import make_scorer

def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Log Error (RMSLE) between true and predicted values.
    
    Parameters:
    y_true : array-like
        Array containing the true target values.
        
    y_pred : array-like
        Array containing the predicted target values.
        
    Returns:
    float
        RMSLE value.
    """
    # Ensure positive values
    y_true = np.maximum(0, y_true)
    y_pred = np.maximum(0, y_pred)
    
    # Compute RMSLE
    rmsle = np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))
    
    return rmsle

# Create RMSLE scorer for GridSearchCV
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [54]:

# create grid search function for random forest regressor

params = {
    'n_estimators': [1150,1200, 1275],
    'max_depth': np.arange(12, 13, 1),
    'min_samples_split': np.arange(5, 7, 1),
    'min_samples_leaf': np.arange(12, 13, 1)
}

rf_grid = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

rf_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1150; total time=11.8min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1150; total time=11.9min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1150; total time=11.9min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1200; total time=12.4min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1200; total time=12.5min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1200; total time=12.5min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1275; total time=13.3min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=5, n_estimators=1275; total time=13.3min
[CV] END max_depth=12, min_samples_leaf=12, min_samples_split=6, n_estimators=1150; total time=13.5min
[CV] END max_

In [55]:
# print the best parameters
print(rf_grid.best_params_)

# print the best score
print(rf_grid.best_score_)


{'max_depth': 12, 'min_samples_leaf': 12, 'min_samples_split': 6, 'n_estimators': 1200}
-0.15115600032728035


In [56]:
# create grid search function for gradient boosting regressor

params = {
    'n_estimators': [275, 300, 325],
    'max_depth': np.arange(5, 7,1),
    'min_samples_split': np.arange(5, 7,1),
    'min_samples_leaf': np.arange(8, 9,1)
}

gb_grid = GridSearchCV(GradientBoostingRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

gb_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=275; total time= 2.1min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=275; total time= 2.1min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=275; total time= 2.1min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time= 2.3min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time= 2.3min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=300; total time= 2.4min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=325; total time= 2.6min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=325; total time= 2.6min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=6, n_estimators=275; total time= 2.5min
[CV] END max_depth=5, min_samples_leaf=8, min_samples_split=6, n_estimators=275; total time= 2.5min


In [57]:
# print the best parameters
print(gb_grid.best_params_)

# print the best score
print(gb_grid.best_score_)


{'max_depth': 5, 'min_samples_leaf': 8, 'min_samples_split': 6, 'n_estimators': 300}
-0.15048456907087643


In [58]:
# create grid search function for xgboost regressor

params = {
    'n_estimators': [1075, 1100, 1125],
    'max_depth': np.arange(5, 6,1),
    'learning_rate': [0.0175, 0.02, 0.0225]
}

xgb_grid = GridSearchCV(XGBRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

xgb_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1075; total time=   5.6s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1075; total time=   5.6s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1075; total time=   5.6s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1100; total time=   5.7s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1100; total time=   5.8s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1100; total time=   5.8s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1125; total time=   5.8s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1125; total time=   5.8s
[CV] END .learning_rate=0.02, max_depth=5, n_estimators=1075; total time=   5.8s
[CV] END .learning_rate=0.02, max_depth=5, n_estimators=1075; total time=   5.8s
[CV] END .learning_rate=0.02, max_depth=5, n_estimators=1075; total time=   6.0s
[CV] END learning_rate=0.0175, max_depth=5, n_estimators=1125; total time=   6.2s
[CV] END .learning_

In [59]:
# print the best parameters
print(xgb_grid.best_params_)

# print the best score
print(xgb_grid.best_score_)


{'learning_rate': 0.0175, 'max_depth': 5, 'n_estimators': 1125}
-0.15051402758702237


In [60]:
# create grid search function for cat boost regressor

params = {
    'iterations': [2250,2500,2750],
    'learning_rate': [ 0.03, 0.035,0.04]
}

cat_grid = GridSearchCV(CatBoostRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

cat_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 9 candidates, totalling 27 fits
0:	learn: 3.1268873	total: 108ms	remaining: 4m 2s
0:	learn: 3.1089887	total: 116ms	remaining: 4m 20s
1:	learn: 3.0748740	total: 118ms	remaining: 2m 12s
1:	learn: 3.0410795	total: 122ms	remaining: 2m 16s
2:	learn: 3.0251483	total: 125ms	remaining: 1m 33s
2:	learn: 2.9768529	total: 129ms	remaining: 1m 36s
3:	learn: 2.9148366	total: 134ms	remaining: 1m 15s
3:	learn: 2.9766175	total: 131ms	remaining: 1m 13s
4:	learn: 2.8561379	total: 138ms	remaining: 1m 2s
4:	learn: 2.9299198	total: 135ms	remaining: 1m
5:	learn: 2.8853498	total: 144ms	remaining: 53.8s
5:	learn: 2.8008323	total: 147ms	remaining: 55s
0:	learn: 3.1061786	total: 85.1ms	remaining: 3m 11s
6:	learn: 2.8429370	total: 151ms	remaining: 48.4s
6:	learn: 2.7490915	total: 157ms	remaining: 50.2s
1:	learn: 3.0551453	total: 93.8ms	remaining: 1m 45s
7:	learn: 2.8023211	total: 163ms	remaining: 45.5s
0:	learn: 3.0971488	total: 77.2ms	remaining: 2m 53s
7:	learn: 2.7005670	total: 171ms

In [61]:
# print the best parameters
print(cat_grid.best_params_)

# print the best score
print(cat_grid.best_score_)


{'iterations': 2750, 'learning_rate': 0.03}
-0.15015616246405447


In [62]:


# create grid search function for lightgbm regressor
params = {
    'n_estimators': [700, 750, 800],
    'max_depth': np.arange(3, 5,1),
    'learning_rate': [0.6,0.07,.08]
}

lgbm_grid = GridSearchCV(LGBMRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

lgbm_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4144
[LightGBM] [Info] Number of data points in the train set: 60410, number of used features: 21
[LightGBM] [Info] Start training from score 9.692650
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4148
[LightGBM] [Info] Total Bins 4147
[LightGBM] [Info] Number of data points in the train set: 60410, number of used features: 21[LightGBM] [Info] Number of data points in the

In [63]:
# print the best parameters

print(lgbm_grid.best_params_)

# print the best score

print(lgbm_grid.best_score_)

{'learning_rate': 0.07, 'max_depth': 4, 'n_estimators': 750}
-0.15038262104512087


In [79]:
cv_estimators = [
    ('lgbm', LGBMRegressor(**lgbm_grid.best_params_)),
    ('xgboost', XGBRegressor(**xgb_grid.best_params_)),
    ('catboost', CatBoostRegressor(**cat_grid.best_params_)),
    ('gradientboost', GradientBoostingRegressor(**gb_grid.best_params_)),
    ('randomforest', RandomForestRegressor(**rf_grid.best_params_))
]

In [80]:
y = train['rings']
# Because RMSLE score, We make a conversion like below:
y_log = np.log(1+y)

train_features = train.drop('rings', axis=1)

FIND_BEST_PARAMS = False


In [88]:
def objective(trial):
    
    params = {
        'lgbm_weight': trial.suggest_float('lgbm_weight', 0.0, 5.0),
        'xgboost_weight': trial.suggest_float('xgboost_weight', 0.0, 5.0),
        'catboost_weight': trial.suggest_float('catboost_weight', 0.0, 5.0),
        'gradientboost_weight': trial.suggest_float('gradientboost_weight', 0.0, 5.0),
        'randomforest_weight': trial.suggest_float('randomforest_weight', 0.0, 5.0)

    }


    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for _, (train_index, valid_index) in enumerate(cv.split(train_features, y)):
        X_train, y_train = train_features.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train_features.iloc[valid_index], y_log.iloc[valid_index]
        voting_regressor = VotingRegressor(
            estimators=cv_estimators,
            weights=[params['lgbm_weight'], params['xgboost_weight'], params['catboost_weight'], params['gradientboost_weight'], params['randomforest_weight']]
        )
        voting_regressor.fit(X_train, y_train)
        y_pred = voting_regressor.predict(X_valid)  
        scores = mean_squared_error(y_valid, y_pred)  
    return np.mean(scores)


study = optuna.create_study(direction='minimize', study_name="voting_regressor_optuna")
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=100)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")

# run study 
study.optimize(objective, n_trials=100)

[I 2024-04-08 11:13:26,401] A new study created in memory with name: voting_regressor_optuna


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4151
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 21
[LightGBM] [Info] Start training from score 2.328973
0:	learn: 0.2801297	total: 5.8ms	remaining: 15.9s
1:	learn: 0.2748409	total: 13.9ms	remaining: 19s
2:	learn: 0.2698115	total: 17.5ms	remaining: 16s
3:	learn: 0.2649485	total: 21ms	remaining: 14.4s
4:	learn: 0.2602425	total: 24.3ms	remaining: 13.3s
5:	learn: 0.2557853	total: 27.8ms	remaining: 12.7s
6:	learn: 0.2515181	total: 31.2ms	remaining: 12.2s
7:	learn: 0.2474065	total: 34.6ms	remaining: 11.8s
8:	learn: 0.2434665	total: 37.9ms	remaining: 11.5s
9:	learn: 0.2395637	total: 41.1ms	remaining: 11.3s
10:	learn: 0.2359377	total: 44.5ms	remaining: 11.1s
11:	learn: 0.2324441	total: 47.8ms	remaining: 10.9s
12:	learn: 0.2290969	total: 51.2ms	remaining: 10.8s

[I 2024-04-08 12:14:04,412] Trial 0 finished with value: 0.021787391711552154 and parameters: {'lgbm_weight': 3.1884048152167903, 'xgboost_weight': 2.5132421002172505, 'catboost_weight': 3.3460575403051696, 'gradientboost_weight': 2.4440507634488733, 'randomforest_weight': 4.496705921889042}. Best is trial 0 with value: 0.021787391711552154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4151
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 21
[LightGBM] [Info] Start training from score 2.328973
0:	learn: 0.2801297	total: 5.32ms	remaining: 14.6s
1:	learn: 0.2748409	total: 9.1ms	remaining: 12.5s
2:	learn: 0.2698115	total: 12.6ms	remaining: 11.6s
3:	learn: 0.2649485	total: 16.4ms	remaining: 11.3s
4:	learn: 0.2602425	total: 19.8ms	remaining: 10.9s
5:	learn: 0.2557853	total: 23.2ms	remaining: 10.6s
6:	learn: 0.2515181	total: 26.6ms	remaining: 10.4s
7:	learn: 0.2474065	total: 29.8ms	remaining: 10.2s
8:	learn: 0.2434665	total: 33.2ms	remaining: 10.1s
9:	learn: 0.2395637	total: 36.4ms	remaining: 9.96s
10:	learn: 0.2359377	total: 40ms	remaining: 9.95s
11:	learn: 0.2324441	total: 43.2ms	

[I 2024-04-08 13:28:05,346] Trial 1 finished with value: 0.021851124758554143 and parameters: {'lgbm_weight': 0.4492954223640089, 'xgboost_weight': 3.62363915894746, 'catboost_weight': 2.5352582391484546, 'gradientboost_weight': 3.4187854099162296, 'randomforest_weight': 4.549762536360818}. Best is trial 0 with value: 0.021787391711552154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4151
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 21
[LightGBM] [Info] Start training from score 2.328973
0:	learn: 0.2801297	total: 14.7ms	remaining: 40.4s
1:	learn: 0.2748409	total: 18.2ms	remaining: 25.1s
2:	learn: 0.2698115	total: 21.7ms	remaining: 19.9s
3:	learn: 0.2649485	total: 24.8ms	remaining: 17s
4:	learn: 0.2602425	total: 28.1ms	remaining: 15.4s
5:	learn: 0.2557853	total: 31.5ms	remaining: 14.4s
6:	learn: 0.2515181	total: 34.8ms	remaining: 13.6s
7:	learn: 0.2474065	total: 38.1ms	remaining: 13.1s
8:	learn: 0.2434665	total: 41.5ms	remaining: 12.6s
9:	learn: 0.2395637	total: 44.6ms	remaining: 12.2s
10:	learn: 0.2359377	total: 47.8ms	remaining: 11.9s
11:	learn: 0.2324441	total: 51.1ms	remaining: 11.7s
12:	learn: 0.2290969	total: 54.6ms	remaining: 

[I 2024-04-08 14:31:23,694] Trial 2 finished with value: 0.021840451826203713 and parameters: {'lgbm_weight': 0.3191036537743741, 'xgboost_weight': 2.5572214787550034, 'catboost_weight': 2.2327925320498343, 'gradientboost_weight': 3.7196540145270607, 'randomforest_weight': 3.6536726597584055}. Best is trial 0 with value: 0.021787391711552154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4151
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 21
[LightGBM] [Info] Start training from score 2.328973
0:	learn: 0.2801297	total: 8.34ms	remaining: 22.9s
1:	learn: 0.2748409	total: 11.9ms	remaining: 16.3s
2:	learn: 0.2698115	total: 15.3ms	remaining: 14s
3:	learn: 0.2649485	total: 18.5ms	remaining: 12.7s
4:	learn: 0.2602425	total: 21.8ms	remaining: 12s
5:	learn: 0.2557853	total: 25ms	remaining: 11.4s
6:	learn: 0.2515181	total: 28.3ms	remaining: 11.1s
7:	learn: 0.2474065	total: 31.7ms	remaining: 10.9s
8:	learn: 0.2434665	total: 34.9ms	remaining: 10.6s
9:	learn: 0.2395637	total: 38.2ms	remaining: 10.5s
10:	learn: 0.2359377	total: 41.6ms	remaining: 10.4s
11:	learn: 0.2324441	total: 44.9ms	remaining: 10.2s
12:	learn: 0.2290969	total: 48.1ms	remaining: 10.1

In [None]:
weight_best_params = {
    'lgbm_weight': 3.0860711610688636, 
    'xgboost_weight': 1.793424750707662, 
    'catboost_weight': 4.59273791580418
}


voting_regressor = VotingRegressor(
    estimators=cv_estimators,
    weights=[ weight_best_params['lgbm_weight'], 
              weight_best_params['xgboost_weight'], 
              weight_best_params['catboost_weight']
    ]
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
y_pred_test = []
for fold_i, (train_index, valid_index) in enumerate(cv.split(train, y)):
    X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
    X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
    voting_regressor.fit(X_train, y_train)
    y_pred = voting_regressor.predict(X_valid)  
    scores = rmsle_scorer(y_valid, y_pred)  
    y_pred_test.append(voting_regressor.predict(test))
    print(f"FOLD {fold_i} Done. RMSE : {scores}")
print(f"All FOLD. Mean RMSE : {np.mean(scores)}")