# Importing data and libraries

In [58]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import lightgbm
import catboost
import xgboost as xg

import warnings
warnings.filterwarnings("ignore")

# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [81]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [82]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,74051.0,37025.0,21376.826729,0.0,18512.5,37025.0,55537.5,74050.0
Length,74051.0,1.31746,0.287757,0.1875,1.15,1.375,1.5375,2.012815
Diameter,74051.0,1.024496,0.237396,0.1375,0.8875,1.075,1.2,1.6125
Height,74051.0,0.348089,0.092034,0.0,0.3,0.3625,0.4125,2.825
Weight,74051.0,23.385217,12.648153,0.056699,13.437663,23.799405,32.162508,80.101512
Shucked Weight,74051.0,10.10427,5.618025,0.028349,5.712424,9.90815,14.033003,42.184056
Viscera Weight,74051.0,5.058386,2.792729,0.042524,2.8633,4.989512,6.988152,21.54562
Shell Weight,74051.0,6.72387,3.584372,0.042524,3.96893,6.931453,9.07184,28.491248
Age,74051.0,9.967806,3.175189,1.0,8.0,10.0,11.0,29.0


# Reducing memory usage

In [83]:
# remove unnecessary 'id' column
train_df=train_df.drop(['id'],axis=1)
test_df=test_df.drop(['id'],axis=1)

train_df.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Weight            float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
Age                 int64
dtype: object

In [84]:
print('Unique in Sex: {}'.format(list(train_df.Sex.unique())))

Unique in Sex: ['I', 'M', 'F']


In [85]:
def reduce_memory_usage(df):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' ', '_')
    mem_usage_before = df.memory_usage().sum() / 1024**2
    for i in list(df.columns):
        if df[i].dtype == 'object':
            df[i] = df[i].map({'I':1, 'F':2, 'M':3})
            df[i] = df[i].astype('int8')
        elif df[i].dtype == 'float64':
            df[i] = df[i].astype('float32')
        elif df[i].dtype == 'int64':
            df[i] = df[i].astype('int8')
    
    mem_usage_after = df.memory_usage().sum() / 1024**2
    
    print('Before reducing memory usage: {:.2f} Mb'.format(mem_usage_before))
    print('After reducing memory usage: {:.2f} Mb'.format(mem_usage_after))
    print('Difference: {:.2f} Mb that is by {:.2f}% less'.format((mem_usage_after - mem_usage_before),(100* ((mem_usage_before - mem_usage_after)/mem_usage_before))))
    print('New df types: \n{}'.format(df.dtypes))

In [86]:
reduce_memory_usage(train_df)

Before reducing memory usage: 5.08 Mb
After reducing memory usage: 2.12 Mb
Difference: -2.97 Mb that is by 58.33% less
New df types: 
sex                  int8
length            float32
diameter          float32
height            float32
weight            float32
shucked_weight    float32
viscera_weight    float32
shell_weight      float32
age                  int8
dtype: object


In [87]:
reduce_memory_usage(test_df)

Before reducing memory usage: 3.01 Mb
After reducing memory usage: 1.37 Mb
Difference: -1.65 Mb that is by 54.69% less
New df types: 
sex                  int8
length            float32
diameter          float32
height            float32
weight            float32
shucked_weight    float32
viscera_weight    float32
shell_weight      float32
dtype: object


# Model selection

In [88]:
X = list(train_df.drop(columns = ['age']).columns)
y = ['age']

train, test = train_test_split(train_df,test_size=0.2,random_state=42, shuffle=True)

In [89]:
def cross_val(model):
    pred = cross_val_score(model, X, y, cv=10)
    return pred.mean()

def print_evaluate(true, predicted):  
    mae = metrics.mean_absolute_error(true, predicted)
    mape = metrics.mean_absolute_percentage_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_square = metrics.r2_score(true, predicted)
    print('MAE:', mae)
    print('MAPE:', mape)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('__________________________________')
    
def evaluate(true, predicted):
    mae = metrics.mean_absolute_error(true, predicted)
    mape = metrics.mean_absolute_percentage_error(true, predicted)
    mse = metrics.mean_squared_error(true, predicted)
    rmse = np.sqrt(metrics.mean_squared_error(true, predicted))
    r2_score = metrics.r2_score(true, predicted)
    return mae, mape, mse, rmse, r2_score

# XGBoost

In [103]:
xgb_model_baseline = xg.XGBRegressor(n_estimators=360 , 
                          learning_rate=0.01 ,
                          subsample=0.5,
                          max_depth=8).fit(train[X], train[y])

test_pred = xgb_model_baseline.predict(test[X])
train_pred = xgb_model_baseline.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df = pd.DataFrame(data=[["XGB", *evaluate(test[y], test_pred)]], 
                          columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])

# results_df_temp = pd.DataFrame(data=[["XGBoost-Baseline", *evaluate(test[y], test_pred)]], 
#                             columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
# results_df = results_df.append(results_df_temp, ignore_index=True)

Test set evaluation:
_____________________________________
MAE: 1.373804
MAPE: 0.13073809372192835
MSE: 4.2163496
RMSE: 2.0533752
R2 Square 0.5862903710412601
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.2970244
MAPE: 0.12244116335525224
MSE: 3.7364366
RMSE: 1.9329865
R2 Square 0.6283702717806297
__________________________________


In [104]:
results_df

Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,XGB,1.373804,0.130738,4.21635,2.053375,0.58629


## XGBoost GridSearchCV

Подобрать сабсемпл еще лучше

In [45]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [93]:
xgb1 = xg.XGBRegressor()

parameters = {
              # 'nthread':[4], #when use hyperthread, xgboost may become slower
              # 'objective':['reg:squarederror'],
              'learning_rate': [.01, .02, .03],
              'max_depth': [5, 6, 7, 8, 9, 10],
              # 'min_child_weight': [2,3,4],
              # 'silent': [1],
              # 'subsample': [0.7, 0.8],
              # 'colsample_bytree': [0.7],
              'n_estimators': [300, 400, 500, 600]}

xgb_model_gridsearchcv = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        # n_jobs = -1,
                        scoring='neg_mean_absolute_error',
                        verbose=10
                                     )

xgb_model_gridsearchcv.fit(train[X], train[y])

Fitting 2 folds for each of 72 candidates, totalling 144 fits
[CV 1/2; 1/72] START learning_rate=0.01, max_depth=5, n_estimators=300..........
[CV 1/2; 1/72] END learning_rate=0.01, max_depth=5, n_estimators=300;, score=-1.382 total time=   2.4s
[CV 2/2; 1/72] START learning_rate=0.01, max_depth=5, n_estimators=300..........
[CV 2/2; 1/72] END learning_rate=0.01, max_depth=5, n_estimators=300;, score=-1.400 total time=   2.4s
[CV 1/2; 2/72] START learning_rate=0.01, max_depth=5, n_estimators=400..........
[CV 1/2; 2/72] END learning_rate=0.01, max_depth=5, n_estimators=400;, score=-1.385 total time=   3.1s
[CV 2/2; 2/72] START learning_rate=0.01, max_depth=5, n_estimators=400..........
[CV 2/2; 2/72] END learning_rate=0.01, max_depth=5, n_estimators=400;, score=-1.400 total time=   3.2s
[CV 1/2; 3/72] START learning_rate=0.01, max_depth=5, n_estimators=500..........
[CV 1/2; 3/72] END learning_rate=0.01, max_depth=5, n_estimators=500;, score=-1.396 total time=   4.0s
[CV 2/2; 3/72] STA

In [95]:
xgb_model_gridsearchcv.best_params_

{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}

In [52]:
best_params_xgb = {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}

In [96]:
xgb_model_gridsearchcv = xg.XGBRegressor(**best_params_xgb).fit(train[X], train[y])

test_pred = xgb_model_gridsearchcv.predict(test[X])
train_pred = xgb_model_gridsearchcv.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df_temp = pd.DataFrame(data=[["XGB-GSCV", *evaluate(test[y], test_pred)]], 
                            columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
results_df = results_df.append(results_df_temp, ignore_index=True)

Test set evaluation:
_____________________________________
MAE: 1.3839899
MAPE: 0.129012792492191
MSE: 4.428327
RMSE: 2.1043591
R2 Square 0.5654910452215556
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.3551793
MAPE: 0.12509559734639394
MSE: 4.2693768
RMSE: 2.066247
R2 Square 0.5753634753844339
__________________________________


In [97]:
results_df.sort_values(by='MAE', ascending = True)

Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,XGB,1.378179,0.131256,4.231293,2.057011,0.584824
1,XGB-GSCV,1.38399,0.129013,4.428327,2.104359,0.565491


## XGBoost optuna

In [180]:
import optuna

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.03),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 8),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1)
    }
    xgb_model_optuna = xg.XGBRegressor(**param).fit(train[X], train[y])
    y_pred = xgb_model_optuna.predict(test[X])
    return metrics.r2_score(test[y], y_pred)

In [122]:
study = optuna.create_study(direction='maximize', study_name='regression')
study.optimize(objective, n_trials=10)

[I 2023-06-02 22:25:42,048] A new study created in memory with name: regression
[I 2023-06-02 22:25:54,223] Trial 0 finished with value: 0.5831307381517229 and parameters: {'max_depth': 10, 'learning_rate': 0.023925628199091643, 'n_estimators': 500, 'min_child_weight': 2, 'gamma': 0.9776100481451038, 'subsample': 0.587166592590955, 'colsample_bytree': 0.40685663560491714, 'reg_alpha': 0.8409000749645817, 'reg_lambda': 0.061792026188733494, 'random_state': 1}. Best is trial 0 with value: 0.5831307381517229.
[I 2023-06-02 22:26:23,183] Trial 1 finished with value: 0.5807553717847537 and parameters: {'max_depth': 9, 'learning_rate': 0.014264249709000582, 'n_estimators': 1500, 'min_child_weight': 2, 'gamma': 0.8674350305642768, 'subsample': 0.6840886532340226, 'colsample_bytree': 0.3329106100385388, 'reg_alpha': 0.4419242231129774, 'reg_lambda': 0.9307559471453095, 'random_state': 1}. Best is trial 0 with value: 0.5831307381517229.
[I 2023-06-02 22:26:36,638] Trial 2 finished with value: 0

In [123]:
print('Best parameters', study.best_params)

Best parameters {'max_depth': 9, 'learning_rate': 0.022163529860008142, 'n_estimators': 500, 'min_child_weight': 2, 'gamma': 0.4041403193905385, 'subsample': 0.7106353708474683, 'colsample_bytree': 0.4166802252135955, 'reg_alpha': 0.34194260072050764, 'reg_lambda': 0.7949584813814371, 'random_state': 1}


In [113]:
params = {'max_depth': 9, 'learning_rate': 0.022163529860008142, 'n_estimators': 500, 'min_child_weight': 2, 'gamma': 0.4041403193905385, 'subsample': 0.7106353708474683, 'colsample_bytree': 0.4166802252135955, 'reg_alpha': 0.34194260072050764, 'reg_lambda': 0.7949584813814371, 'random_state': 1}

In [124]:
xgb_model_optuna_ver1 = xg.XGBRegressor(**params).fit(train[X],train[y])

test_pred = xgb_model_optuna_ver1.predict(test[X])
train_pred = xgb_model_optuna_ver1.predict(train[X])

print('Test set evaluation:\n_____________________________________')
print_evaluate(test[y], test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(train[y], train_pred)

results_df_temp = pd.DataFrame(data=[["XGBoost-optuna-ver2", *evaluate(test[y], test_pred)]], 
                            columns=['Model', 'MAE', 'MAPE', 'MSE', 'RMSE', 'R2_Score'])
results_df = results_df.append(results_df_temp, ignore_index=True)

results_df

Test set evaluation:
_____________________________________
MAE: 1.6111729
MAPE: 0.15729478952109044
MSE: 5.418746
RMSE: 2.32782
R2 Square 0.4661269954013275
__________________________________
Train set evaluation:
_____________________________________
MAE: 1.6066928
MAPE: 0.15598058055578307
MSE: 5.427621
RMSE: 2.3297255
R2 Square 0.46006387376842983
__________________________________


Unnamed: 0,Model,MAE,MAPE,MSE,RMSE,R2_Score
0,Auto_ML_XGBoost,1.393461,0.135415,4.145107,2.035954,0.59161
1,LightGBM,1.405033,0.137025,4.164706,2.040761,0.589679
2,catboost,1.433476,0.127666,5.274112,2.296543,0.480377
3,XGBoost-Baseline,1.386107,0.131317,4.311231,2.07635,0.575243
4,XGBoost-GridSearchCV-ver1,1.397549,0.135907,4.139693,2.034624,0.592144
5,XGBoost-optuna-ver1,1.611173,0.157295,5.418746,2.32782,0.466127
6,XGBoost-optuna-ver2,1.611173,0.157295,5.418746,2.32782,0.466127


# Submission result

In [105]:
submission = pd.read_csv('sample_submission.csv')

result_for_submission = xgb_model_baseline.predict(test_df)
submission['Age'] = result_for_submission
submission['Age'] = submission['Age'].round()

submission.to_csv('submission_result.csv', index=False)
# import os
# os.chdir(r'../working')
# from IPython.display import FileLink
# FileLink(r'submission_result.csv')

In [106]:
check_df = pd.read_csv('submission_result.csv')
check_df

Unnamed: 0,id,Age
0,74051,7.0
1,74052,8.0
2,74053,11.0
3,74054,9.0
4,74055,7.0
...,...,...
49363,123414,9.0
49364,123415,8.0
49365,123416,13.0
49366,123417,10.0
