In [1]:
import pandas as pd 
import numpy as np
# import grid search
from sklearn.model_selection import GridSearchCV

In [2]:
# import decision tree regressors 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# import xgboost regressor
from xgboost import XGBRegressor

# import ada boost regressor
from sklearn.ensemble import AdaBoostRegressor

# import support vector regressor
from sklearn.svm import SVR

# import cat boost regressor
from catboost import CatBoostRegressor

# import voting regressor
from sklearn.ensemble import VotingRegressor

# import optuna
import optuna


In [3]:
#load the data
train = pd.read_csv('data/train_cleaned.csv')
test = pd.read_csv('data/test_cleaned.csv')
submission = pd.read_csv('data/sample_submission.csv')



In [4]:
from sklearn.metrics import make_scorer

def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Log Error (RMSLE) between true and predicted values.
    
    Parameters:
    y_true : array-like
        Array containing the true target values.
        
    y_pred : array-like
        Array containing the predicted target values.
        
    Returns:
    float
        RMSLE value.
    """
    # Ensure positive values
    y_true = np.maximum(0, y_true)
    y_pred = np.maximum(0, y_pred)
    
    # Compute RMSLE
    rmsle = np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))
    
    return rmsle

# Create RMSLE scorer for GridSearchCV
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [5]:

# create grid search function for random forest regressor

params = {
    'n_estimators': [500, 600, 700, 800],
    'max_depth': np.arange(8, 16, 4),
    'min_samples_split': np.arange(2, 8, 2),
    'min_samples_leaf': np.arange(8, 16, 4)
}

rf_grid = GridSearchCV(RandomForestRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

rf_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 3.7min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 3.7min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 3.7min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=600; total time= 4.4min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=600; total time= 4.4min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=600; total time= 4.4min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=700; total time= 5.2min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=2, n_estimators=700; total time= 5.2min
[CV] END max_depth=8, min_samples_leaf=8, min_samples_split=4, n_estimators=500; total time= 3.7min
[CV] END max_depth=8, min_samples_leaf

In [None]:
# print the best parameters
print(rf_grid.best_params_)

# print the best score
print(rf_grid.best_score_)


{'max_depth': 9, 'min_samples_leaf': 9, 'min_samples_split': 4, 'n_estimators': 500}
-0.15181639257059806


In [None]:
# create grid search function for gradient boosting regressor

params = {
    'n_estimators': [100, 200, 300, 350],
    'max_depth': np.arange(2, 10,2),
    'min_samples_split': np.arange(2, 10,2),
    'min_samples_leaf': np.arange(2, 10,2)
}

gb_grid = GridSearchCV(GradientBoostingRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

gb_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  56.7s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  56.9s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  57.0s
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 1.2min
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 1.2min
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 1.2min
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 1.5min
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 1.5min
[CV] END max_depth=2, min_samples_leaf=2, min_samples_split=4, n_estimators=300; total time=  53.1s
[CV] END max_depth=2, min_samples_leaf

In [None]:
# print the best parameters
print(gb_grid.best_params_)

# print the best score
print(gb_grid.best_score_)


{'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 300}
-0.15081923346306483


In [None]:
# create grid search function for xgboost regressor

params = {
    'n_estimators': [500, 600, 700, 800],
    'max_depth': np.arange(2, 10,2),
    'learning_rate': [0.001, 0.01, 0.1]
}

xgb_grid = GridSearchCV(XGBRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

xgb_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=100; total time=   0.5s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=100; total time=   0.5s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=100; total time=   0.5s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.7s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.7s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=300; total time=   0.9s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=400; total time=   1.1s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=400; total time=   1.1s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=200; total time=   0.6s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=400; total time=   1.1s
[CV] END .learning_rate=0.001, max_depth=2, n_estimators=500; total time=   1.2s
[CV] END .learning_rate=0.001, max_depth=2, n_

In [None]:
# print the best parameters
print(xgb_grid.best_params_)

# print the best score
print(xgb_grid.best_score_)


{'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 500}
-0.15086746538311258


In [None]:
# create grid search function for cat boost regressor

params = {
    'iterations': [450, 500, 600, 700, 800],
    'learning_rate': [ 0.01, 0.1, 0.2]
}

cat_grid = GridSearchCV(CatBoostRegressor(), param_grid=params, cv=3, n_jobs=-1, verbose=2, scoring=rmsle_scorer)

cat_grid.fit(train.drop('rings', axis=1), train['rings'])

Fitting 3 folds for each of 25 candidates, totalling 75 fits
0:	learn: 2.9816824	total: 95.6ms	remaining: 9.46s
0:	learn: 3.1848796	total: 95.3ms	remaining: 9.44s
0:	learn: 3.1684963	total: 95.3ms	remaining: 9.43s
0:	learn: 3.1424983	total: 95.8ms	remaining: 9.48s
0:	learn: 3.1792504	total: 96.3ms	remaining: 9.53s
0:	learn: 3.1629283	total: 106ms	remaining: 10.5s
0:	learn: 3.1589447	total: 110ms	remaining: 10.9s
0:	learn: 3.0033946	total: 111ms	remaining: 11s
1:	learn: 3.1506904	total: 111ms	remaining: 5.43s
1:	learn: 3.1830527	total: 111ms	remaining: 5.45s
1:	learn: 3.1774514	total: 114ms	remaining: 5.59s
1:	learn: 3.1249533	total: 118ms	remaining: 5.78s
1:	learn: 3.1451872	total: 122ms	remaining: 5.96s
1:	learn: 3.1571662	total: 122ms	remaining: 5.97s
1:	learn: 2.8303489	total: 122ms	remaining: 5.99s
2:	learn: 3.1330732	total: 123ms	remaining: 3.98s
1:	learn: 2.8499328	total: 124ms	remaining: 6.07s
2:	learn: 3.1812442	total: 124ms	remaining: 4.01s
2:	learn: 3.1756338	total: 132ms	rem

In [None]:
# print the best parameters
print(cat_grid.best_params_)

# print the best score
print(cat_grid.best_score_)


{'iterations': 500, 'learning_rate': 0.1}
-0.15059435623518969


In [None]:
cv_estimators = [
    ('lgbm', LGBMRegressor(**lgbm_params)),
    ('xgboost', XGBRegressor(**xgboost_params)),
    ('catboost', CatBoostRegressor(**catboost_params))
]

In [None]:
def objective(trial):
    
    params = {
        'lgbm_weight': trial.suggest_float('lgbm_weight', 0.0, 5.0),
        'xgboost_weight': trial.suggest_float('xgboost_weight', 0.0, 5.0),
        'catboost_weight': trial.suggest_float('catboost_weight', 0.0, 5.0),
    }


    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    scores = []
    for _, (train_index, valid_index) in enumerate(cv.split(train, y)):
        X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
        X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
        voting_regressor = VotingRegressor(
            estimators=cv_estimators,
            weights=[params['lgbm_weight'], params['xgboost_weight'], params['catboost_weight']]
        )
        voting_regressor.fit(X_train, y_train)
        y_pred = voting_regressor.predict(X_valid)  
        scores = root_mean_squared_error(y_valid, y_pred)  
    return np.mean(scores)


study = optuna.create_study(direction='minimize', study_name="voting_regressor_optuna")
if FIND_BEST_PARAMS:
    study.optimize(objective, n_trials=100)
    print(f"Best trial average RMSE: {study.best_value:.4f}")
    for key, value in study.best_params.items():
        print(f"{key}: {value}")

In [None]:
weight_best_params = {
    'lgbm_weight': 3.0860711610688636, 
    'xgboost_weight': 1.793424750707662, 
    'catboost_weight': 4.59273791580418
}


voting_regressor = VotingRegressor(
    estimators=cv_estimators,
    weights=[ weight_best_params['lgbm_weight'], 
              weight_best_params['xgboost_weight'], 
              weight_best_params['catboost_weight']
    ]
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
scores = []
y_pred_test = []
for fold_i, (train_index, valid_index) in enumerate(cv.split(train, y)):
    X_train, y_train = train.iloc[train_index], y_log.iloc[train_index]
    X_valid, y_valid = train.iloc[valid_index], y_log.iloc[valid_index]
    voting_regressor.fit(X_train, y_train)
    y_pred = voting_regressor.predict(X_valid)  
    scores = root_mean_squared_error(y_valid, y_pred)  
    y_pred_test.append(voting_regressor.predict(test))
    print(f"FOLD {fold_i} Done. RMSE : {scores}")
print(f"All FOLD. Mean RMSE : {np.mean(scores)}")