In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dm-dataset-2/classesup.npy
/kaggle/input/dm-dataset-2/model_dataset.csv
/kaggle/input/dm-dataset-2/k_fold_10.json
/kaggle/input/dm-dataset-2/classesdown.npy
/kaggle/input/dm-dataset-2/classes.npy


In [2]:
import json
import re
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  StandardScaler
from sklearn.metrics import mean_absolute_error
# Load the saved fold indices from the JSON file
with open('/kaggle/input/dm-dataset-2/k_fold_10.json', 'r') as file:
    fold_indices = json.load(file)
    
df=pd.read_csv('/kaggle/input/dm-dataset-2/model_dataset.csv')
score=df['score']
    
# Change columns names ([LightGBM] Do not support special JSON characters in feature name.)
new_names = {col: re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns}
new_n_list = list(new_names.values())
# [LightGBM] Feature appears more than one time.
new_names = {col: f'{new_col}_{i}' if new_col in new_n_list[:i] else new_col for i, (col, new_col) in enumerate(new_names.items())}
df = df.rename(columns=new_names)

# 'fold_indices' now contains the loaded fold indices
Y=score
X=df.drop(columns=['id','score'])
scaler = StandardScaler()
X = scaler.fit_transform(X)

# optuna hyperparameter optimization

def objective(trial,X,Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=42)
    
    param = {
              'random_state': 42 ,
            'n_estimators': 10000,
            'reg_alpha': trial.suggest_loguniform('reg_alpha',0.005,0.007),
            'reg_lambda': trial.suggest_loguniform('reg_lambda',0.25, 0.5 ),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6,0.7,0.8]),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.05,0.01,0.01]),
        'max_depth': trial.suggest_categorical('max_depth', [10,25,30]),
        'eta': trial.suggest_float('eta', 0.007, 0.013),
        'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e4),
            }
    
    model=xgb.XGBRegressor(predictor='gpu_predictor',
        n_jobs=4,eval_metric=mean_absolute_error,**param) 
    model.fit(x_train,y_train, eval_set=[(x_test,y_test)],verbose=False)
    
    preds=model.predict(x_test)
    
    mae=mean_absolute_error(y_test,preds)
    
    return mae
    
# studying the parameter
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial,X,Y), n_trials=15)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-11-07 23:49:47,841] A new study created in memory with name: no-name-1005bb72-9d8f-404a-a594-94d446f509a7
  'reg_alpha': trial.suggest_loguniform('reg_alpha',0.005,0.007),
  'reg_lambda': trial.suggest_loguniform('reg_lambda',0.25, 0.5 ),
  'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e4),
[I 2023-11-07 23:52:54,620] Trial 0 finished with value: 0.5353005528450012 and parameters: {'reg_alpha': 0.005436604958228731, 'reg_lambda': 0.4492973632526505, 'colsample_bytree': 0.6, 'subsample': 0.8, 'learning_rate': 0.05, 'max_depth': 25, 'eta': 0.011261950207626305, 'gamma': 5.442512957265412}. Best is trial 0 with value: 0.5353005528450012.
  'reg_alpha': trial.suggest_loguniform('reg_alpha',0.005,0.007),
  'reg_lambda': trial.suggest_loguniform('reg_lambda',0.25, 0.5 ),
  'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e4),
[I 2023-11-07 23:54:20,531] Trial 1 finished with value: 0.5333800156589462 and parameters: {'reg_alpha': 0.005243348077832719, 'reg_lambda': 0.27557353119

Number of finished trials: 15
Best trial: {'reg_alpha': 0.005267890553504806, 'reg_lambda': 0.274995940556105, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 10, 'eta': 0.01176581662520899, 'gamma': 5.460322647793291}


In [5]:
params=study.best_params 
params['random_state'] = 42
params['n_estimators'] = 10000 
mae=[]

import xgboost as xgb
for fold_number, fold_data in enumerate(fold_indices):
    train_index = fold_data['train_indices']
    test_index = fold_data['test_indices']
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    model_x = xgb.XGBRegressor(predictor='gpu_predictor',
        n_jobs=4,eval_metric=mean_absolute_error,**params )
    valid_check = xgb.callback.EvaluationMonitor(period=10000)
    model_x.fit(X_train,y_train, eval_set=[(X_test,y_test)] ,callbacks=[valid_check] , verbose = False )
    mae.append(mean_absolute_error(y_test,model_x.predict(X_test)))
    print(f'Mean absolute error is fold is {mae[-1]}')
    
                               
                               

[0]	validation_0-rmse:3.27383	validation_0-mean_absolute_error:3.09753




[9999]	validation_0-rmse:0.68908	validation_0-mean_absolute_error:0.54395
Mean absolute error is fold is 0.5439491561549877
[0]	validation_0-rmse:3.30865	validation_0-mean_absolute_error:3.15467




[9999]	validation_0-rmse:0.66098	validation_0-mean_absolute_error:0.51072
Mean absolute error is fold is 0.5107218982719699
[0]	validation_0-rmse:3.29416	validation_0-mean_absolute_error:3.12392




[9999]	validation_0-rmse:0.65008	validation_0-mean_absolute_error:0.49610
Mean absolute error is fold is 0.49610333404077694




[0]	validation_0-rmse:3.34041	validation_0-mean_absolute_error:3.16577
[9999]	validation_0-rmse:0.69803	validation_0-mean_absolute_error:0.54118
Mean absolute error is fold is 0.5411760922868242
[0]	validation_0-rmse:3.28771	validation_0-mean_absolute_error:3.13262




[9999]	validation_0-rmse:0.61623	validation_0-mean_absolute_error:0.47354
Mean absolute error is fold is 0.47354286209291774
[0]	validation_0-rmse:3.34990	validation_0-mean_absolute_error:3.21256




[9999]	validation_0-rmse:0.62731	validation_0-mean_absolute_error:0.47674
Mean absolute error is fold is 0.47674272079699437
[0]	validation_0-rmse:3.38185	validation_0-mean_absolute_error:3.22926




[9999]	validation_0-rmse:0.63770	validation_0-mean_absolute_error:0.49721
Mean absolute error is fold is 0.49720516292060296
[0]	validation_0-rmse:3.44785	validation_0-mean_absolute_error:3.30270




[9999]	validation_0-rmse:0.62936	validation_0-mean_absolute_error:0.48838
Mean absolute error is fold is 0.4883750158596814
[0]	validation_0-rmse:3.35149	validation_0-mean_absolute_error:3.21376




[9999]	validation_0-rmse:0.60520	validation_0-mean_absolute_error:0.46851
Mean absolute error is fold is 0.46851005011457736
[0]	validation_0-rmse:3.38065	validation_0-mean_absolute_error:3.22433




[9999]	validation_0-rmse:0.63561	validation_0-mean_absolute_error:0.48721
Mean absolute error is fold is 0.48720671587843234


In [6]:
Final_mae=np.mean(mae)
Final_mae

0.49835330084177654

# Hence, the Final Mean Absolute Error is 0.49835