# Importing data and libraries

In [297]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import lightgbm
import catboost
import xgboost as xg

import warnings
warnings.filterwarnings("ignore")

# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [298]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


# Reducing memory usage

In [299]:
# remove unnecessary 'id' column
train_df=train_df.drop(['id'],axis=1)
test_df=test_df.drop(['id'],axis=1)

print('Unique in Sex: {}'.format(list(train_df.Sex.unique())))

Unique in Sex: ['I', 'M', 'F']


In [300]:
def reduce_memory_usage(df):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ', '_')
    mem_usage_before = df.memory_usage().sum() / 1024**2
    for i in list(df.columns):
        if df[i].dtype == 'object':
            df[i] = df[i].map({'I':1, 'F':2, 'M':3})
            df[i] = df[i].astype('int8')
        elif df[i].dtype == 'float64':
            df[i] = df[i].astype('float32')
        elif df[i].dtype == 'int64':
            df[i] = df[i].astype('int8')
        elif df[i].dtype == 'category':
            df[i] = df[i].astype('int8')
    
    mem_usage_after = df.memory_usage().sum() / 1024**2
    
    print('Before reducing memory usage: {:.2f} Mb'.format(mem_usage_before))
    print('After reducing memory usage: {:.2f} Mb'.format(mem_usage_after))
    print('Difference: {:.2f} Mb that is by {:.2f}% less'.format((mem_usage_after - mem_usage_before),(100* ((mem_usage_before - mem_usage_after)/mem_usage_before))))
    print('New df types: \n{}'.format(df.dtypes))

In [301]:
reduce_memory_usage(train_df)

Before reducing memory usage: 5.08 Mb
After reducing memory usage: 2.12 Mb
Difference: -2.97 Mb that is by 58.33% less
New df types: 
sex                  int8
length            float32
diameter          float32
height            float32
weight            float32
shucked_weight    float32
viscera_weight    float32
shell_weight      float32
age                  int8
dtype: object


In [302]:
reduce_memory_usage(test_df)

Before reducing memory usage: 3.01 Mb
After reducing memory usage: 1.37 Mb
Difference: -1.65 Mb that is by 54.69% less
New df types: 
sex                  int8
length            float32
diameter          float32
height            float32
weight            float32
shucked_weight    float32
viscera_weight    float32
shell_weight      float32
dtype: object


In [303]:
train_df.head()

Unnamed: 0,sex,length,diameter,height,weight,shucked_weight,viscera_weight,shell_weight,age
0,1,1.525,1.175,0.375,28.973188,12.728926,6.647958,8.348927,9
1,1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,3,1.3875,1.1125,0.375,24.777464,11.3398,5.556502,6.662132,9
3,2,1.7,1.4125,0.5,50.660557,20.35494,10.991838,14.996885,11
4,1,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


# Model selection

In [304]:
X = list(train_df.drop(columns = ['age']).columns)
y = ['age']

train, test = train_test_split(train_df,test_size=0.2,random_state=42, shuffle=True)

def cross_val(model):
    pred = cross_val_score(model, train[X], train[y], cv=2)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mape = metrics.mean_absolute_percentage_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MAPE:', mape)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mape = metrics.mean_absolute_percentage_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_score = metrics.r2_score(true, predicted)
    return mae, mape, mse, rmse, r2_score

# XGBoost

In [340]:
xgb_model_baseline = xg.XGBRegressor(n_estimators=350 , 
                                     # disable_default_eval_metric=True,
                            # eval_metric='mae',
                          learning_rate=0.01 ,
                          subsample=0.5,
                          colsample_bytree=0.75,
                          gamma=0.5,
                          refresh_leaf=0,
                          grow_policy='lossguide',
                          max_depth=8).fit(train[X], train[y])

test_pred = xgb_model_baseline.predict(test[X])
train_pred = xgb_model_baseline.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df = pd.DataFrame(data=[["XGB", *evaluate(test[y], test_pred)]], 
                          columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])

# results_df_temp = pd.DataFrame(data=[["XGBoost-Baseline", *evaluate(test[y], test_pred)]], 
#                             columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
# results_df = results_df.append(results_df_temp, ignore_index=True)

# 1.37184
results_df

Test set evaluation:
_____________________________________
MAE: 1.3718402
MAPE: 0.13008562226476292
MSE: 4.2270117
RMSE: 2.0559697
R2 Square 0.5852441861791899
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.3034861
MAPE: 0.12260635227791322
MSE: 3.8026378
RMSE: 1.9500353
R2 Square 0.6217858149295916
__________________________________


Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,XGB,1.37184,0.130086,4.227012,2.05597,0.585244


In [70]:
cross_val(xgb_model_baseline)

0.5714842449331268

# LightGBM

In [341]:
light_gbm_model = lightgbm.LGBMRegressor(
    objective='regression_l1',
    device_type='cpu',
    metrics='mae',
    n_estimators=1200 , 
    learning_rate=0.03 ,
    num_leaves=64,
    colsample_bytree=0.75,
    min_split_gain=0.3,
    max_depth=10).fit(train[X], train[y])


test_pred = light_gbm_model.predict(test[X])
train_pred = light_gbm_model.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df_temp = pd.DataFrame(data=[["LightGBM", *evaluate(test[y], test_pred)]], 
                            columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
results_df = results_df.append(results_df_temp, ignore_index=True)

# 1.3543
results_df

Test set evaluation:
_____________________________________
MAE: 1.354312797248433
MAPE: 0.12763432663184454
MSE: 4.285927766757646
RMSE: 2.07024823795545
R2 Square 0.5794633129041176
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.2199786162111796
MAPE: 0.11421837384534882
MSE: 3.786010012702472
RMSE: 1.9457672041388898
R2 Square 0.6234396293682823
__________________________________


Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,XGB,1.37184,0.130086,4.227012,2.05597,0.585244
1,LightGBM,1.354313,0.127634,4.285928,2.070248,0.579463


Подобрать сабсемпл еще лучше

In [45]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [93]:
xgb1 = xg.XGBRegressor()

parameters = {
              # 'nthread':[4], #when use hyperthread, xgboost may become slower
              # 'objective':['reg:squarederror'],
                # 'eval_metric'='rmse',
              'learning_rate': [.01, .02, .03],
              'max_depth': [5, 6, 7, 8, 9, 10],
              # 'min_child_weight': [2,3,4],
              # 'silent': [1],
              # 'subsample': [0.7, 0.8],
              # 'colsample_bytree': [0.7],
              'n_estimators': [300, 400, 500, 600]}

xgb_model_gridsearchcv = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        # n_jobs = -1,
                        scoring='neg_mean_absolute_error',
                        verbose=10
                                     )

xgb_model_gridsearchcv.fit(train[X], train[y])

Fitting 2 folds for each of 72 candidates, totalling 144 fits
[CV 1/2; 1/72] START learning_rate=0.01, max_depth=5, n_estimators=300..........
[CV 1/2; 1/72] END learning_rate=0.01, max_depth=5, n_estimators=300;, score=-1.382 total time=   2.4s
[CV 2/2; 1/72] START learning_rate=0.01, max_depth=5, n_estimators=300..........
[CV 2/2; 1/72] END learning_rate=0.01, max_depth=5, n_estimators=300;, score=-1.400 total time=   2.4s
[CV 1/2; 2/72] START learning_rate=0.01, max_depth=5, n_estimators=400..........
[CV 1/2; 2/72] END learning_rate=0.01, max_depth=5, n_estimators=400;, score=-1.385 total time=   3.1s
[CV 2/2; 2/72] START learning_rate=0.01, max_depth=5, n_estimators=400..........
[CV 2/2; 2/72] END learning_rate=0.01, max_depth=5, n_estimators=400;, score=-1.400 total time=   3.2s
[CV 1/2; 3/72] START learning_rate=0.01, max_depth=5, n_estimators=500..........
[CV 1/2; 3/72] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=-1.396 total time=   4.0s
[CV 2/2; 3/72] STA

In [95]:
xgb_model_gridsearchcv.best_params_

{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}

In [52]:
best_params_xgb = {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}

In [96]:
xgb_model_gridsearchcv = xg.XGBRegressor(**best_params_xgb).fit(train[X], train[y])

test_pred = xgb_model_gridsearchcv.predict(test[X])
train_pred = xgb_model_gridsearchcv.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df_temp = pd.DataFrame(data=[["XGB-GSCV", *evaluate(test[y], test_pred)]], 
                            columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
results_df = results_df.append(results_df_temp, ignore_index=True)

Test set evaluation:
_____________________________________
MAE: 1.3839899
MAPE: 0.129012792492191
MSE: 4.428327
RMSE: 2.1043591
R2 Square 0.5654910452215556
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.3551793
MAPE: 0.12509559734639394
MSE: 4.2693768
RMSE: 2.066247
R2 Square 0.5753634753844339
__________________________________


In [97]:
results_df.sort_values(by='MAE', ascending = True)

Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,XGB,1.378179,0.131256,4.231293,2.057011,0.584824
1,XGB-GSCV,1.38399,0.129013,4.428327,2.104359,0.565491


## LightGBM optuna

In [337]:
import optuna

def objective(trial):
    param = {
        'objective': trial.suggest_categorical('objective', ['regression_l1']),
        'metrics': trial.suggest_categorical('metrics', ['mae']),
        'n_estimators': trial.suggest_int('n_estimators', 1000,1200,1400),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.03),
        'num_leaves': trial.suggest_int('num_leaves', 64, 96),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.75, 0.85),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.3, 0.4),
        'max_depth': trial.suggest_int('max_depth', 10, 10)
    }
    
    lgbm_optuna = lightgbm.LGBMRegressor(**param).fit(train[X], train[y])
    y_pred = lgbm_optuna.predict(test[X])
    return metrics.mean_absolute_error(test[y], y_pred)

In [339]:
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=50)

[I 2023-06-03 20:02:48,090] A new study created in memory with name: regression
[I 2023-06-03 20:03:13,037] Trial 0 finished with value: 1.3571277701309936 and parameters: {'objective': 'regression_l1', 'metrics': 'mae', 'n_estimators': 1000, 'learning_rate': 0.014758341859480922, 'num_leaves': 74, 'colsample_bytree': 0.8164919832077873, 'min_split_gain': 0.3278405337927693, 'max_depth': 10}. Best is trial 0 with value: 1.3571277701309936.
[I 2023-06-03 20:03:35,958] Trial 1 finished with value: 1.3590352123541465 and parameters: {'objective': 'regression_l1', 'metrics': 'mae', 'n_estimators': 1000, 'learning_rate': 0.013610188645523985, 'num_leaves': 72, 'colsample_bytree': 0.843699873010578, 'min_split_gain': 0.3177480177468518, 'max_depth': 10}. Best is trial 0 with value: 1.3571277701309936.
[I 2023-06-03 20:03:54,659] Trial 2 finished with value: 1.3557991817857058 and parameters: {'objective': 'regression_l1', 'metrics': 'mae', 'n_estimators': 1000, 'learning_rate': 0.02938680422

In [342]:
print('Best parameters: \n{}'.format(study.best_params))

Best parameters: 
{'objective': 'regression_l1', 'metrics': 'mae', 'n_estimators': 1000, 'learning_rate': 0.0238465224775758, 'num_leaves': 86, 'colsample_bytree': 0.7677037213450317, 'min_split_gain': 0.37891101491189083, 'max_depth': 10}


In [343]:
lgbm_optuna = lightgbm.LGBMRegressor(**study.best_params).fit(train[X],train[y])

test_pred = lgbm_optuna.predict(test[X])
train_pred = lgbm_optuna.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df_temp = pd.DataFrame(data=[["lgbm-optuna", *evaluate(test[y], test_pred)]], 
                            columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
results_df = results_df.append(results_df_temp, ignore_index=True)

results_df

Test set evaluation:
_____________________________________
MAE: 1.3531485241509682
MAPE: 0.12753429042784425
MSE: 4.287570982557014
RMSE: 2.0706450643596583
R2 Square 0.5793020800121848
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.2396688729281162
MAPE: 0.11555440762310211
MSE: 3.8756186433560766
RMSE: 1.968659097801363
R2 Square 0.614527064674182
__________________________________


Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,XGB,1.37184,0.130086,4.227012,2.05597,0.585244
1,LightGBM,1.354313,0.127634,4.285928,2.070248,0.579463
2,lgbm-optuna,1.353149,0.127534,4.287571,2.070645,0.579302


# Submission result

In [344]:
submission = pd.read_csv('sample_submission.csv')

result_for_submission = lgbm_optuna.predict(test_df)
submission['Age'] = result_for_submission
submission['Age'] = submission['Age'].round()

submission.to_csv('submission_result_lgbm_optuna.csv', index=False)
# import os
# os.chdir(r'../working')
# from IPython.display import FileLink
# FileLink(r'submission_result.csv')

In [346]:
check_df = pd.read_csv('submission_result_lgbm_optuna.csv')
check_df

Unnamed: 0,id,Age
0,74051,7.0
1,74052,8.0
2,74053,10.0
3,74054,9.0
4,74055,7.0
...,...,...
49363,123414,9.0
49364,123415,8.0
49365,123416,13.0
49366,123417,9.0
