In [None]:
!pip install catboost
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Col

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

import optuna

import warnings
warnings.filterwarnings("ignore")

In [None]:
# All input files are read from the same public GitHub folder.
DATA_URL = 'https://raw.githubusercontent.com/AkemiRiemann/Crab/main'

train = pd.read_csv(f'{DATA_URL}/train.csv')
test = pd.read_csv(f'{DATA_URL}/test.csv')
sub = pd.read_csv(f'{DATA_URL}/sample_submission.csv')
ori = pd.read_csv(f'{DATA_URL}/synthetic_data_100.csv')

# Append the extra rows to the training data. ignore_index=True rebuilds a unique
# 0..n-1 row index; without it the row labels of both frames would be duplicated.
train = pd.concat([train, ori], ignore_index=True)

# Split off the regression target, then keep only the feature columns.
target = train['Age']
train = train.drop(columns=['id', 'Age'])
test = test.drop(columns='id')

In [None]:
# Learn the category -> integer code mapping for 'Sex' from the training rows only,
# then apply that same fitted encoder to both the train and test frames.
enc = OrdinalEncoder().fit(train[['Sex']])
train['Sex'] = enc.transform(train[['Sex']])
test['Sex'] = enc.transform(test[['Sex']])

In [None]:
# Fixed seed for the randomized estimators so repeated runs give comparable scores.
MODEL_SEED = 42

# Candidate estimators for the baseline comparison, keyed by a short label.
models = {
    # NOTE(review): LogisticRegression is a classifier, so it treats each distinct
    # Age value as a class rather than a number — confirm this entry is intended.
    'logistic_regression': linear_model.LogisticRegression(),
    'lasso': linear_model.LassoCV(),
    'ridge': linear_model.RidgeCV(),
    # NOTE(review): the label says 'svc' but the estimator is the regressor SVR;
    # the key is left unchanged so existing lookups keep working.
    'svc': svm.SVR(),
    'randomforest': ensemble.RandomForestRegressor(random_state=MODEL_SEED),
    'gradientboosting': ensemble.GradientBoostingRegressor(random_state=MODEL_SEED),
    'xgboost': xgb.XGBRegressor(random_state=MODEL_SEED),
    'lightgbm': lgbm.LGBMRegressor(objective='MAE', random_state=MODEL_SEED),
    'catboost' : cb.CatBoostRegressor(iterations=300, depth=6, silent=True, objective='MAE', random_seed=MODEL_SEED)
}

In [None]:
# Baseline comparison: 5-fold cross-validated MAE for every candidate estimator.
for name, model in models.items():
    # The scorer returns negated MAE, so flip the sign to get one positive error per fold.
    fold_mae = -cross_val_score(model, train, target, scoring="neg_mean_absolute_error", cv=5)
    print(name, ':', fold_mae)

lightgbm : [1.35807453 1.36977945 1.33825585 1.31314448 1.3155421 ]


In [None]:
def obj(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of a LightGBM regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        Mean of the per-fold mean absolute errors (lower is better).

    Notes
    -----
    Reads the notebook-level ``train`` and ``target`` variables.
    """
    params = {
        # --- fixed settings (not tuned) ---
        #"device": "gpu",
        "n_jobs": -1,
        "verbose": -1,
        "boosting_type": 'gbdt',
        "objective": "regression_l1",
        "metric" : "mae",
        # --- tuned settings ---
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1),
        # The regularisation ranges span eight orders of magnitude; sampling on a log
        # scale lets the search actually visit the small values instead of almost
        # always drawing numbers close to 1.
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "num_leaves": trial.suggest_int("num_leaves", 40, 50),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.8, 0.9),
        # NOTE(review): per the LightGBM docs, row subsampling is only applied when
        # subsample_freq (bagging_freq) is non-zero; it is not set here, so this
        # value is likely ignored — confirm before relying on it.
        "subsample": trial.suggest_float("subsample", 0.8, 0.9),
        #"subsample_freq": 100,
    }
    model = lgbm.LGBMRegressor(**params)
    # The scorer returns negated MAE per fold; flip the sign before averaging.
    fold_mae = -cross_val_score(model, train, target, cv=5, scoring="neg_mean_absolute_error")
    return np.mean(fold_mae)

# Minimise the cross-validated MAE returned by obj. The sampler is seeded so that
# a Restart & Run All proposes the same sequence of hyperparameters.
study_lgbm = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm.optimize(obj, n_trials=50)

[I 2023-06-08 07:04:02,341] A new study created in memory with name: no-name-f6d43cbb-3390-4507-a607-3c2ce891536c
[I 2023-06-08 07:06:25,212] Trial 0 finished with value: 1.335123500664326 and parameters: {'n_estimators': 1225, 'learning_rate': 0.003866138962948717, 'reg_alpha': 0.7656580730832833, 'reg_lambda': 0.12948162423312284, 'max_depth': 13, 'num_leaves': 46, 'colsample_bytree': 0.8757233458268641, 'subsample': 0.8959472135663327}. Best is trial 0 with value: 1.335123500664326.
[I 2023-06-08 07:07:45,067] Trial 1 finished with value: 1.3215941566024374 and parameters: {'n_estimators': 1185, 'learning_rate': 0.04929185108192221, 'reg_alpha': 0.9961738292344691, 'reg_lambda': 0.07030662789121869, 'max_depth': 20, 'num_leaves': 46, 'colsample_bytree': 0.8125412117050667, 'subsample': 0.8289063160429617}. Best is trial 1 with value: 1.3215941566024374.
[I 2023-06-08 07:08:40,652] Trial 2 finished with value: 1.322493294970767 and parameters: {'n_estimators': 714, 'learning_rate': 0

In [None]:
# Pairwise correlations between the feature columns, drawn on an explicit Axes
# with a title so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(train.corr(), ax=ax)
ax.set_title('Feature correlation matrix')
plt.show()

In [None]:
# Three hand-recorded LightGBM hyperparameter sets; the keys match the values
# sampled in obj. Presumably copied from earlier tuning runs — TODO confirm.
best1 = dict(
    n_estimators=660,
    learning_rate=0.0336812735699011,
    reg_alpha=0.6735574755211416,
    reg_lambda=0.3736800180213783,
    max_depth=8,
    num_leaves=48,
    colsample_bytree=0.857142570499423,
    subsample=0.8821756060849634,
)
best2 = dict(
    n_estimators=988,
    learning_rate=0.037034603118878906,
    reg_alpha=0.4617411543255769,
    reg_lambda=0.5456571982169995,
    max_depth=7,
    num_leaves=45,
    colsample_bytree=0.8836512335903922,
    subsample=0.8953646491942437,
)
best3 = dict(
    n_estimators=887,
    learning_rate=0.03256128999320696,
    reg_alpha=0.46223695959902733,
    reg_lambda=0.7288219044766407,
    max_depth=6,
    num_leaves=46,
    colsample_bytree=0.8838637325098199,
    subsample=0.8978416952275811,
)

In [None]:
# One zero per test row; serves as the starting value for the summed predictions.
preds = [0] * len(test)

In [None]:
# Start from zeros on every execution so re-running this cell cannot double-count
# predictions that were already accumulated.
preds = np.zeros(len(test))
for tuned_params in (best1, best2, best3):
    # The best* dicts hold only the tuned values; the objective is not among them,
    # so pass the L1 objective (the one used in obj) explicitly instead of silently
    # falling back to LightGBM's default loss.
    preds += lgbm.LGBMRegressor(objective='regression_l1', **tuned_params).fit(train, target).predict(test)

In [None]:
# Convert the sum over the three fitted models into their mean prediction.
preds = preds / 3
preds

In [None]:
# Round every averaged prediction with the built-in round().
preds = list(map(round, preds))
preds

In [None]:
# Overwrite the 'Age' column of the submission frame with the values in preds.
sub['Age'] = preds

In [None]:
# Display the submission frame to eyeball the values before writing it to disk.
sub

In [None]:
# Save only the id and Age columns, without the DataFrame index.
sub[['id', 'Age']].to_csv('lgbm_optuna_3.csv', index=False)

In [None]:
# study_lgbm.best_params contains only the values suggested inside obj; fixed settings
# such as the objective are not part of it. Pass the L1 objective explicitly so the
# final model is trained with the same loss that was used during the search.
lgbm_best = lgbm.LGBMRegressor(objective='regression_l1', **study_lgbm.best_params).fit(train, target).predict(test)

In [None]:
# Display the raw (not yet rounded) predictions of the model fitted above.
lgbm_best

array([ 7.58508057,  7.71856808, 10.80000511, ..., 13.40378946,
       10.06944522, 12.36206653])

In [None]:
# Round every prediction with the built-in round(); the result is a plain list.
lgbm_best = list(map(round, lgbm_best))

In [None]:
# Show the first five rounded predictions as a quick sanity check.
lgbm_best[:5]

In [None]:
# Overwrite the 'Age' column of the submission frame with the values in lgbm_best.
sub['Age'] = lgbm_best

In [None]:
# Save only the id and Age columns, without the DataFrame index.
sub[['id', 'Age']].to_csv('lgbm_synthesis_100_tuning.csv', index=False)

In [None]:
# Display the submission frame after 'Age' was replaced with the lgbm_best values.
sub

Unnamed: 0,id,Age
0,74051,8
1,74052,8
2,74053,11
3,74054,10
4,74055,7
...,...,...
49363,123414,10
49364,123415,8
49365,123416,13
49366,123417,10
