In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/linking-cleaned/classesup.npy
/kaggle/input/linking-cleaned/classesdown.npy
/kaggle/input/linking-cleaned/model_train_final_withencoding2.csv
/kaggle/input/linking-cleaned/submission (2).csv
/kaggle/input/linking-cleaned/classes.npy
/kaggle/input/linking-cleaned/k_fold_indices_10.json
/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv
/kaggle/input/dm-dataset-2/classesup.npy
/kaggle/input/dm-dataset-2/model_dataset.csv
/kaggle/input/dm-dataset-2/k_fold_10.json
/kaggle/input/dm-dataset-2/classesdown.npy
/kaggle/input/dm-dataset-2/classes.npy


In [17]:
import json
import lightgbm as lgb
import re
import optuna
import sklearn.metrics as met
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  StandardScaler

# Load the saved fold indices from the JSON file
with open('/kaggle/input/dm-dataset-2/k_fold_10.json', 'r') as file:
    fold_indices = json.load(file)
    
df=pd.read_csv('/kaggle/input/dm-dataset-2/model_dataset.csv')
score=df['score']
    
# Change columns names ([LightGBM] Do not support special JSON characters in feature name.)
new_names = {col: re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns}
new_n_list = list(new_names.values())
# [LightGBM] Feature appears more than one time.
new_names = {col: f'{new_col}_{i}' if new_col in new_n_list[:i] else new_col for i, (col, new_col) in enumerate(new_names.items())}
df = df.rename(columns=new_names)

# 'fold_indices' now contains the loaded fold indices
Y=score
X=df.drop(columns=['id','score'])
scaler = StandardScaler()
X = scaler.fit_transform(X)

# optuna hyperparameter optimization

def objective(trial,X,Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=42)
    
    param = {
            'metric':'mae' , 'random_state': 42 , 'objective':'regression', "verbosity": -1,
            'n_estimators': 10000,
            'reg_alpha': trial.suggest_loguniform('reg_alpha',0.005,0.007),
            'reg_lambda': trial.suggest_loguniform('reg_lambda',0.25, 0.5 ),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6,0.7,0.8]),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.05,0.01,0.01]),
        'max_depth': trial.suggest_categorical('max_depth', [10,25,50]),
        'num_leaves' : trial.suggest_int('num_leaves', 10, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 20),
         'verbosity': -1
            }
    
    model=lgb.LGBMRegressor(**param) 
    early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False )
    model.fit(x_train,y_train, eval_set=[(x_test,y_test)] , callbacks=[early_stopping_callback])
    
    preds=model.predict(x_test)
    
    mae=met.mean_absolute_error(y_test,preds)
    
    return mae
    
# studying the parameter
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial,X,Y), n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-11-07 22:37:59,867] A new study created in memory with name: no-name-84aad709-d59a-4d65-8c6b-ae8b2a63014b
  'reg_alpha': trial.suggest_loguniform('reg_alpha',0.005,0.007),
  'reg_lambda': trial.suggest_loguniform('reg_lambda',0.25, 0.5 ),
[I 2023-11-07 22:38:02,907] Trial 0 finished with value: 0.5255852467430251 and parameters: {'reg_alpha': 0.0059595830294934815, 'reg_lambda': 0.2797002393021912, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 25, 'num_leaves': 10, 'min_child_samples': 15}. Best is trial 0 with value: 0.5255852467430251.
  'reg_alpha': trial.suggest_loguniform('reg_alpha',0.005,0.007),
  'reg_lambda': trial.suggest_loguniform('reg_lambda',0.25, 0.5 ),
[I 2023-11-07 22:38:05,673] Trial 1 finished with value: 0.5279087616275437 and parameters: {'reg_alpha': 0.005889047709657216, 'reg_lambda': 0.4351135990442131, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 50, 'num_leaves': 19, 'min_child_samples'

Number of finished trials: 50
Best trial: {'reg_alpha': 0.00696289257129596, 'reg_lambda': 0.3111656149412144, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 25, 'num_leaves': 16, 'min_child_samples': 11}


In [20]:
params

{'reg_alpha': 0.00696289257129596,
 'reg_lambda': 0.3111656149412144,
 'colsample_bytree': 0.7,
 'subsample': 0.7,
 'learning_rate': 0.01,
 'max_depth': 25,
 'num_leaves': 16,
 'min_child_samples': 11}

In [23]:
params=study.best_params 
params['random_state'] = 42
params['n_estimators'] = 10000 
params['metric'] = 'mae'
params['objective']='regression'
mae=[]
# You can access the train and test indices for each fold as needed
for fold_number, fold_data in enumerate(fold_indices):
    train_index = fold_data['train_indices']
    test_index = fold_data['test_indices']
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    model = lgb.LGBMRegressor(objective='regression',metric='mae',  n_estimators =12000)
    early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False )
    verbose_callback = lgb.log_evaluation(100)
    model.fit(X_train,y_train, eval_set=[(X_test,y_test)] , callbacks=[early_stopping_callback, verbose_callback])
    
    mae.append(met.mean_absolute_error(y_test,model.predict(X_test)))
    print(f'Mean absolute error is fold is {mae[-1]}')
    

[100]	valid_0's l1: 0.544975
[200]	valid_0's l1: 0.551734
Mean absolute error is fold is 0.5413048550488309
[100]	valid_0's l1: 0.537137
[200]	valid_0's l1: 0.536744
Mean absolute error is fold is 0.5154181073518406
[100]	valid_0's l1: 0.502714
[200]	valid_0's l1: 0.511754
Mean absolute error is fold is 0.5016119538496592
[100]	valid_0's l1: 0.557064
[200]	valid_0's l1: 0.55882
Mean absolute error is fold is 0.5410043606952623
[100]	valid_0's l1: 0.495809
[200]	valid_0's l1: 0.50976
Mean absolute error is fold is 0.48268266003908733
[100]	valid_0's l1: 0.465566
[200]	valid_0's l1: 0.464359
[300]	valid_0's l1: 0.467902
[400]	valid_0's l1: 0.467656
Mean absolute error is fold is 0.4635156591273206
[100]	valid_0's l1: 0.512295
[200]	valid_0's l1: 0.516994
Mean absolute error is fold is 0.49889080531414665
[100]	valid_0's l1: 0.499173
[200]	valid_0's l1: 0.50708
Mean absolute error is fold is 0.4963818971555343
[100]	valid_0's l1: 0.479044
[200]	valid_0's l1: 0.486332
Mean absolute error i

In [26]:
Final_mae=np.mean(mae)
Final_mae

0.5005340321563364

# Hence, the final MAE is 0.500