In [4]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import os
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("dark") # Theme for plots as Dark
sns.set_palette("viridis")
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, mean_squared_log_error
from sklearn.svm import OneClassSVM
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor, HistGradientBoostingRegressor, IsolationForest
from sklearn.model_selection import RepeatedKFold
import optuna
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from catboost import Pool, CatBoostRegressor, cv
import sys
from tqdm import tqdm


In [5]:
train_data = pd.read_csv("train.csv", index_col="ID")
test_data = pd.read_csv("test.csv", index_col="ID")
sub = pd.read_csv("sample_submission.csv", index_col="ID")

In [6]:
seed = np.random.seed(6)

X = train_data.drop(["y"],axis=1)
y = train_data["y"]

In [7]:
lgbmmodel = LGBMRegressor(random_state=seed, verbose=-1)
xgbmodel = XGBRegressor(random_state=seed)
catmodel = CatBoostRegressor(random_state=seed, verbose=0)

In [None]:
print("CV RMSLE score of LGBM is ",np.sqrt(-cross_val_score(lgbmmodel,X,y,cv=3, scoring = 'neg_mean_squared_log_error').mean()))
print("CV RMSLE score of XGB is ",np.sqrt(-cross_val_score(xgbmodel,X,y,cv=3, scoring = 'neg_mean_squared_log_error').mean()))
print("CV RMSLE score of CAT is ",np.sqrt(-cross_val_score(catmodel,X,y,cv=3, scoring = 'neg_mean_squared_log_error').mean()))

In [8]:
cols = test_data.columns

for fold in range(10):
    print(f"> Generating Fold {fold+1}")
    print(f"  Initial Size = {train_data.shape[0]}",end=" | ")
    
    lgbmmodel.fit(X,y)
    xgbmodel.fit(X,y)
    catmodel.fit(X,y)
    
    extra_train = test_data.copy()
    extra_train["LGBM"] = lgbmmodel.predict(extra_train[cols])
    extra_train["XGB"] = xgbmodel.predict(extra_train[cols])
    extra_train["CAT"] = catmodel.predict(extra_train[cols])
    extra_train["STD"] = np.std(extra_train[["LGBM","XGB","CAT"]],axis=1)
    extra_train["MEAN"] = np.mean(extra_train[["LGBM","XGB","CAT"]],axis=1)

    STD_THRESHOLD = extra_train["STD"].quantile(0.5)
    extra_train = extra_train[extra_train["STD"]<=STD_THRESHOLD]
    
    MEAN_THRESHOLD = 0.25
    extra_train = pd.concat([extra_train[extra_train["MEAN"]%1<MEAN_THRESHOLD],extra_train[extra_train["MEAN"]%1>(1-MEAN_THRESHOLD)]])
    extra_train["y"] = np.round(extra_train["MEAN"])

    train_data = pd.concat([train_data,extra_train[train_data.columns]])
    train_data.drop_duplicates(inplace=True)
    train_data.reset_index(inplace=True,drop=True)
    print(f"Final Size = {train_data.shape[0]}\n")

    X = train_data.drop(["y"],axis=1)
    y = train_data["y"]

> Generating Fold 1
  Initial Size = 40118 | Final Size = 41374

> Generating Fold 2
  Initial Size = 41374 | Final Size = 41868

> Generating Fold 3
  Initial Size = 41868 | Final Size = 42179

> Generating Fold 4
  Initial Size = 42179 | Final Size = 42401

> Generating Fold 5
  Initial Size = 42401 | Final Size = 42551

> Generating Fold 6
  Initial Size = 42551 | Final Size = 42661

> Generating Fold 7
  Initial Size = 42661 | Final Size = 42749

> Generating Fold 8
  Initial Size = 42749 | Final Size = 42839

> Generating Fold 9
  Initial Size = 42839 | Final Size = 42917

> Generating Fold 10
  Initial Size = 42917 | Final Size = 42998



In [9]:
train_data = train_data.sample(frac=1.0)
X = train_data.drop(["y"],axis=1)
y = train_data["y"]

In [None]:
print("New CV RMSLE score of LGBM is ",np.sqrt(-cross_val_score(lgbmmodel,X,y,cv=3, scoring = 'neg_mean_squared_log_error').mean()))
print("New CV RMSLE score of XGB is ",np.sqrt(-cross_val_score(xgbmodel,X,y,cv=3, scoring = 'neg_mean_squared_log_error').mean()))
print("New CV RMSLE score of CAT is ",np.sqrt(-cross_val_score(catmodel,X,y,cv=3, scoring = 'neg_mean_squared_log_error').mean()))

In [8]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_log_error
import numpy as np
import optuna
from tqdm import tqdm
import sys

In [None]:
def objective(trial):
    lgbm_params = {
        "random_state": seed,
        'n_estimators': 5000,        
        "max_depth": trial.suggest_int('max_depth', 2, 11),
        "learning_rate": trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        "min_child_weight": trial.suggest_float('min_child_weight', 0.5, 4),
        "min_child_samples": trial.suggest_int('min_child_samples', 1, 250),
        "subsample": trial.suggest_float('subsample', 0.2, 1),
        "subsample_freq": trial.suggest_int('subsample_freq', 0, 5),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.2, 1),
        'num_leaves': trial.suggest_int('num_leaves', 8, 64),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        "metric": "rmse",  # Changed to always use RMSE
        "boosting_type": "gbdt",    
        "objective": 'regression',
        "device": "cpu",
        "verbose": -1,
        "early_stopping_rounds": 25,
    }
    
    cv = RepeatedKFold(n_splits=4, n_repeats=1, random_state=seed)
    scores = []
    
    for i, (tr, val) in tqdm(enumerate(cv.split(X, y)), total=4):
        X_train, X_test = X.iloc[tr, :], X.iloc[val, :]
        y_train, y_test = y.iloc[tr], y.iloc[val]

        lgbmmodel = LGBMRegressor(**lgbm_params)
        lgbmmodel.fit(
            X_train, y_train, 
            eval_set=[(X_test, y_test)],
            eval_metric='rmse',
        )
        
        y_pred = lgbmmodel.predict(X_test)
        msle = mean_squared_log_error(y_test, y_pred)
        rmsle = np.sqrt(msle)
        scores.append(rmsle)
    
    mean_rmsle = np.mean(scores)
    print(f" > Mean RMSLE of LGBM = {mean_rmsle:.4f}", file=sys.stderr)
    return mean_rmsle

# Create and run the study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=5000)

# Print the best parameters and score
print("Best parameters:", study.best_params)
print("Best RMSLE:", study.best_value)

Best parameters: {'max_depth': 14, 'learning_rate': 0.0016646618260675332, 'min_child_weight': 2.3502628015430296, 'min_child_samples': 86, 'subsample': 0.6188009687544263, 'subsample_freq': 1, 'colsample_bytree': 0.36345733100814603, 'num_leaves': 34, 'lambda_l1': 2.407605586973264e-05, 'lambda_l2': 0.005280242664440079}
Best RMSLE: 0.020562794021959464

Best parameters: {'max_depth': 8, 'learning_rate': 0.08510271028056993, 'min_child_weight': 3.770439629993889, 'min_child_samples': 84, 'subsample': 0.7900530607404135, 'subsample_freq': 3, 'colsample_bytree': 0.4825271131189338, 'num_leaves': 11, 'lambda_l1': 5.3464140299853975, 'lambda_l2': 3.891889594418509e-06}
Best RMSLE: 0.02055562526899524

Best parameters: {'max_depth': 4, 'learning_rate': 0.003853343265751353, 'min_child_weight': 1.8238667242311086, 'min_child_samples': 88, 'subsample': 0.20172390039981303, 'subsample_freq': 3, 'colsample_bytree': 0.4391384113015727, 'num_leaves': 21, 'lambda_l1': 0.011023766128262906, 'lambda_l2': 0.6331521226723741}
Best RMSLE: 0.020565202328586323

In [18]:
lgbm_params = {
    'n_estimators' : 8000,  
    "random_state": seed,
    "boosting_type": "gbdt",    
    "objective":'regression',
    "device": "cpu",
    "verbose": -1,
    "early_stopping_rounds" : 100,
    'max_depth': 14,
    'learning_rate': 0.0016646618260675332,
    'min_child_weight': 2.3502628015430296,
    'min_child_samples': 86,
    'subsample': 0.6188009687544263,
    'subsample_freq': 1,
    'colsample_bytree': 0.36345733100814603,
    'num_leaves': 34,
    'lambda_l1': 2.407605586973264e-05,
    'lambda_l2': 0.005280242664440079,
    'metric': 'huber'
}

In [None]:
def objective(trial):
    xgb_params = {
        'n_estimators': 5000,
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        "max_bin": trial.suggest_int('max_bin', 128, 512),
        'subsample': trial.suggest_float('subsample', 0.2, 1),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'gamma': trial.suggest_float("gamma", 1e-4, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 2, 4),
        "learning_rate": trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        "colsample_bytree": trial.suggest_float('colsample_bytree', 0.2, 1),
        "colsample_bylevel": trial.suggest_float('colsample_bylevel', 0.2, 1),
        "colsample_bynode": trial.suggest_float('colsample_bynode', 0.2, 1),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "objective": trial.suggest_categorical("objective", ["reg:quantileerror", "reg:squaredlogerror", "reg:squarederror"]),
        #"tree_method": "gpu_hist",
        "early_stopping_rounds": 100,
        "random_state": seed,
        "eval_metric": "rmsle",
        "verbosity": 0,
        "device": "cpu"
    }

    if xgb_params["objective"] == "reg:quantileerror":
        xgb_params["quantile_alpha"] = trial.suggest_float('quantile_alpha', 0.1, 1.0, log=True)

    score = []
    rkf = RepeatedKFold(n_splits=4, n_repeats=1, random_state=seed)
    for i, (tr, val) in tqdm(enumerate(rkf.split(X, y)), total=4):
        X_train, X_test, y_train, y_test = X.iloc[tr, :], X.iloc[val, :], y.iloc[tr], y.iloc[val]

        xgbmodel = XGBRegressor(**xgb_params)
        xgbmodel.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            verbose=0
        )

        msle = mean_squared_log_error(y_test, xgbmodel.predict(X_test))
        rmsle = np.sqrt(msle)
        score.append(rmsle)

    print(f" > RMSLE of XGB =", score, file=sys.stderr)
    return np.mean(score)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=5000)

In [14]:
xgb_params = {
    'n_estimators' : 8000,
    'max_depth': 4,
    'max_bin': 371,
    'subsample': 0.44067836416916173,
    'alpha': 0.0006151068205422344,
    'gamma': 0.0003007261920999099,
    'lambda': 2.640618902919003,
    'min_child_weight': 3.672380942972272,
    'learning_rate': 0.014120863675708872,
    'colsample_bytree': 0.8969088050616272,
    'colsample_bylevel': 0.5646491700100473,
    'colsample_bynode': 0.21803093929408546,
    'grow_policy': 'depthwise',
    'objective': 'reg:squarederror',
    "tree_method" : "gpu_hist",
    "early_stopping_rounds" : 1000,
    "random_state" : seed,
    "eval_metric": "rmsle",
    "verbosity" :  0,
}

 Trial 6 finished with value: 0.02058957683499799 and parameters: {'max_depth': 4, 'max_bin': 371, 'subsample': 0.44067836416916173, 'alpha': 0.0006151068205422344, 'gamma': 0.0003007261920999099, 'lambda': 2.640618902919003, 'min_child_weight': 3.672380942972272, 'learning_rate': 0.014120863675708872, 'colsample_bytree': 0.8969088050616272, 'colsample_bylevel': 0.5646491700100473, 'colsample_bynode': 0.21803093929408546, 'grow_policy': 'depthwise', 'objective': 'reg:squarederror'}.

[I 2024-08-27 14:04:47,042] Trial 31 finished with value: 0.02058445996801202 and parameters: {'max_depth': 5, 'max_bin': 313, 'subsample': 0.6508269563647683, 'alpha': 7.351826666415423e-08, 'gamma': 0.900513980759424, 'lambda': 2.0746562001247155, 'min_child_weight': 3.7800724402378445, 'learning_rate': 0.011107654821623757, 'colsample_bytree': 0.9277848447908962, 'colsample_bylevel': 0.9125515253916135, 'colsample_bynode': 0.40599231187525964, 'grow_policy': 'lossguide', 'objective': 'reg:squarederror'}. Best is trial 31 with value: 0.02058445996801202.
100%|██████████| 4/4 [00:08<00:00,  2.19s/it]
 > RMSLE of XGB = [0.01854416913947817, 0.023020430986385277, 0.023611248547836135, 0.018377042499863803]

In [13]:
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_log_error
import optuna
import numpy as np
import sys
from tqdm import tqdm

def objective(trial):
    cat_params = {
        "iterations": 3000,
        "verbose": False,
        'depth': trial.suggest_int('depth', 6, 12), 
        'max_bin': trial.suggest_int("max_bin", 20, 256), 
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.5, 8.0), 
        "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 1, 100),         
        'random_strength': trial.suggest_float('random_strength', 0.5, 5.0), 
        "learning_rate": trial.suggest_float('learning_rate', 1e-2, 0.2, log=True), 
        "max_leaves": trial.suggest_int('max_leaves', 8, 256), 
        "eval_metric": trial.suggest_categorical("eval_metric",["RMSE","Quantile","MSLE"]),
        "loss_function": trial.suggest_categorical("loss_function",["RMSE","Quantile"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bernoulli", "Poisson"]),
        "task_type": "GPU",
        "od_type": "Iter",
        "random_state": seed,
        "early_stopping_rounds": 30,
        "grow_policy": 'Lossguide' 
    }
        
    score = []
    for i, (tr, val) in tqdm(enumerate(RepeatedKFold(n_splits=5, n_repeats=1, random_state=seed).split(X, y)), total=5):
        X_train, X_test, y_train, y_test = X.iloc[tr,:], X.iloc[val,:], y.iloc[tr], y.iloc[val]
        
        train_dataset = Pool(data=X.iloc[tr,:], label=y.iloc[tr])
        eval_dataset = Pool(data=X.iloc[val,:], label=y.iloc[val])
    
        catmodel = CatBoostRegressor(**cat_params)
        catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
        
        msle = mean_squared_log_error(y.iloc[val], catmodel.predict(X.iloc[val,:]))
        rmsle = np.sqrt(msle)
        score.append(rmsle)

    print(f" > RMSLE of CAT =", score, file=sys.stderr)
    return np.mean(score)
    
study = optuna.create_study(direction='minimize') 
study.optimize(objective, n_trials=100, timeout=8000)


[I 2024-09-06 21:17:42,937] A new study created in memory with name: no-name-82a427e9-b295-4856-b309-973cf632345e
  0%|          | 0/5 [00:19<?, ?it/s]
[W 2024-09-06 21:18:02,507] Trial 0 failed with parameters: {'depth': 11, 'max_bin': 244, 'l2_leaf_reg': 3.172537495334227, 'min_data_in_leaf': 68, 'random_strength': 4.681983482122428, 'learning_rate': 0.03518971818134924, 'max_leaves': 191, 'eval_metric': 'Quantile', 'loss_function': 'RMSE', 'bootstrap_type': 'Poisson'} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/home/ubuntu01/anaconda3/lib/python3.11/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_1369/3053619220.py", line 38, in objective
    catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
  File "/home/ubuntu01/anaconda3/lib/python3.11/site-packages/catboost/core.py", line 5827, in fit
    return 

KeyboardInterrupt: 

In [13]:
cat_params = {
    "iterations": 8000,
    "verbose": False,
    'depth': 6,
    'max_bin': 92,
    'l2_leaf_reg': 3.5831191004106717,
    'min_data_in_leaf': 37,
    'random_strength': 1.2543270199229692,
    'learning_rate': 0.035712453455385845,
    'max_leaves': 118,
    'eval_metric': 'RMSE',
    'loss_function': 'RMSE',
    'bootstrap_type': 'Poisson',
    "grow_policy": 'Lossguide',
    "task_type": "GPU",
    "random_state": seed,
    "early_stopping_rounds": 1000
}

Trial 89 finished with value: 0.020468911093357866 and parameters: {'depth': 6, 'max_bin': 92, 'l2_leaf_reg': 3.5831191004106717, 'min_data_in_leaf': 37, 'random_strength': 1.2543270199229692, 'learning_rate': 0.035712453455385845, 'max_leaves': 118, 'eval_metric': 'MSLE', 'loss_function': 'RMSE', 'bootstrap_type': 'Poisson'}. Best is trial 89 with value: 0.020468911093357866.

In [20]:
def objective(trial):

    xgb_wt =  trial.suggest_float('xgb_wt',0,10)
    lgbm_wt = trial.suggest_float('lgbm_wt',0,10)
    cat_wt = trial.suggest_float('cat_wt',0,10)
    RMSLE = []

    for i,(tr,val) in tqdm(enumerate(RepeatedKFold(n_splits=4, n_repeats=1,random_state=seed).split(X,y)),total = 4):

        X_train, X_test, y_train, y_test = X.iloc[tr,:],X.iloc[val,:],y.iloc[tr],y.iloc[val]
        
        print(f"\nLGBM_{i+1}",end=" | ", file = sys.stderr)
        lgbmmodel = LGBMRegressor(**lgbm_params)
        lgbmmodel.fit(X_train,y_train, eval_set=[(X_test,y_test)], eval_names=["valid"],eval_metric=['MSLE'])

        print(f"CAT_{i+1}",end=" | ", file = sys.stderr)
        train_dataset = Pool(data=X.iloc[tr,:],label=y.iloc[tr])
        eval_dataset = Pool(data=X.iloc[val,:],label=y.iloc[val])
        catmodel = CatBoostRegressor(**cat_params)
        catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)

        print(f"XGB_{i+1}", end = "", file = sys.stderr)
        xgbmodel = XGBRegressor(**xgb_params)
        xgbmodel.fit(X_train,y_train, eval_set=[(X_test,y_test)],verbose = 0)

        xgb_preds = xgbmodel.predict(X_test)
        lgbm_preds = lgbmmodel.predict(X_test)
        cat_preds = catmodel.predict(X_test)

        preds = ((xgb_wt*xgb_preds)+(lgbm_wt*lgbm_preds)+(cat_wt*cat_preds))/(xgb_wt+cat_wt+lgbm_wt)
        msle = mean_squared_log_error(y_test, preds)

        RMSLE.append(np.sqrt(msle))
    return np.mean(RMSLE)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200,timeout=5000)

[I 2024-09-10 21:19:33,126] A new study created in memory with name: no-name-4e97fb55-b7b5-40ab-a460-a35d623efc3a
  0%|          | 0/4 [00:00<?, ?it/s]
 25%|██▌       | 1/4 [00:31<01:35, 31.74s/it]
 50%|█████     | 2/4 [01:02<01:02, 31.29s/it]
 75%|███████▌  | 3/4 [01:31<00:30, 30.04s/it]
100%|██████████| 4/4 [01:58<00:00, 29.72s/it]
[I 2024-09-10 21:21:32,013] Trial 0 finished with value: 0.02084475751062633 and parameters: {'xgb_wt': 8.014485973509426, 'lgbm_wt': 2.5444905238673243, 'cat_wt': 1.152603522489306}. Best is trial 0 with value: 0.02084475751062633.
  0%|          | 0/4 [00:00<?, ?it/s]
 25%|██▌       | 1/4 [00:27<01:21, 27.02s/it]
 50%|█████     | 2/4 [00:53<00:52, 26.44s/it]
 75%|███████▌  | 3/4 [01:27<00:30, 30.03s/it]
100%|██████████| 4/4 [01:53<00:00, 28.34s/it]
[I 2024-09-10 21:23:25,369] Trial 1 finished with value: 0.020821467379592763 and parameters: {'xgb_wt': 4.6094788125823865, 'lgbm_wt': 0.2314903485932751, 'cat_wt': 4.325269006042918}. Best is trial 1 with va

KeyboardInterrupt: 