## Градиентный бустинг

Решение задачи предсказания зарплаты data scientist-ов в зависимости от ряда факторов с помощью градиентного бустинга.

work_year: The calendar year in which the salary was paid (e.g. 2022, 2023).

experience_level: The level of experience, such as Junior, Senior, or Lead.

employment_type: The type of employment, such as Full-time or Contract.

job_title: The specific job title or role, such as Data Analyst or Data Scientist.

salary: The salary amount for the given job.

salary_currency: The currency in which the salary is denoted.

salary_in_usd: The equivalent salary amount converted to US dollars (USD) for comparison purposes.

employee_residence: The country or region where the employee resides.

remote_ratio: The percentage of remote work offered in the job.

company_location: The location of the company or organization.

company_size: The company’s size is categorized as Small, Medium, or Large.

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("09_boosting_data.csv")

In [None]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


#### Подготовка данных

In [None]:
import numpy as np

In [None]:
# Target: the salary converted to USD. Both salary columns are removed from
# the features, because the raw `salary` would leak the target into the model.
y = df['salary_in_usd'].to_numpy(copy=True)
df = df.drop(columns=['salary_in_usd', 'salary'])

In [None]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,EUR,ES,100,ES,L
1,2023,MI,CT,ML Engineer,USD,US,100,US,S
2,2023,MI,CT,ML Engineer,USD,US,100,US,S
3,2023,SE,FT,Data Scientist,USD,CA,100,CA,M
4,2023,SE,FT,Data Scientist,USD,CA,100,CA,M


In [None]:
df.shape

(3755, 9)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows; the remaining 80% form the training set.
X_train, X_rest, y_train, y_rest = train_test_split(
    df, y, test_size=0.2, random_state=42
)

In [None]:
# Split the held-out 20% evenly into a validation part and a test part.
X_val, X_test, y_val, y_test = train_test_split(
    X_rest, y_rest, test_size=0.5, random_state=42
)

In [None]:
X_train.shape

(3004, 9)

In [None]:
X_val.shape

(375, 9)

In [None]:
X_test.shape

(376, 9)

#### Обучение модели линейной регрессии

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline: linear regression fitted on the two numeric features only.
numeric_features = ['work_year', 'remote_ratio']
lr = LinearRegression().fit(X_train[numeric_features], y_train)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
mean_absolute_percentage_error(y_test, lr.predict(X_test[['work_year', 'remote_ratio']]))

0.6103673078891431

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Test-set RMSE of the numeric-only linear baseline, computed as sqrt(MSE):
# the `squared=False` shortcut was deprecated in scikit-learn 1.4 and
# removed in 1.6.
mean_squared_error(y_test, lr.predict(X_test[['work_year', 'remote_ratio']])) ** 0.5

62197.64955394882

#### Обучение модели бустинга (CatBoostRegressor)

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import catboost as cb

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost (50 trees) on the same two numeric features as the linear baseline.
numeric_features = ['work_year', 'remote_ratio']
cat = CatBoostRegressor(n_estimators=50)
cat = cat.fit(X_train[numeric_features], y_train)

Learning rate set to 0.5
0:	learn: 60948.3941993	total: 46.6ms	remaining: 2.28s
1:	learn: 60394.8332622	total: 47.2ms	remaining: 1.13s
2:	learn: 60176.2257295	total: 47.9ms	remaining: 750ms
3:	learn: 60117.9950696	total: 49.5ms	remaining: 570ms
4:	learn: 60106.8817286	total: 52.3ms	remaining: 470ms
5:	learn: 60104.3937532	total: 53.5ms	remaining: 392ms
6:	learn: 60098.2161304	total: 54.8ms	remaining: 337ms
7:	learn: 60096.7874529	total: 56.3ms	remaining: 296ms
8:	learn: 60096.6242825	total: 57.4ms	remaining: 261ms
9:	learn: 60096.6155603	total: 58.5ms	remaining: 234ms
10:	learn: 60096.5300106	total: 59.5ms	remaining: 211ms
11:	learn: 60096.5053989	total: 60.5ms	remaining: 192ms
12:	learn: 60096.4874410	total: 61.7ms	remaining: 175ms
13:	learn: 60096.4487925	total: 62.8ms	remaining: 162ms
14:	learn: 60095.9397628	total: 64ms	remaining: 149ms
15:	learn: 60095.9351465	total: 65.2ms	remaining: 139ms
16:	learn: 60095.9319477	total: 66.2ms	remaining: 128ms
17:	learn: 60095.9286157	total: 67.

In [None]:
mean_absolute_percentage_error(y_test, cat.predict(X_test[['work_year', 'remote_ratio']]))

0.6178580405816663

In [None]:
# Test-set RMSE of the numeric-only CatBoost model, computed as sqrt(MSE):
# the `squared=False` shortcut was deprecated in scikit-learn 1.4 and
# removed in 1.6.
mean_squared_error(y_test, cat.predict(X_test[['work_year', 'remote_ratio']])) ** 0.5

63245.15835370137

#### Обработка категориальных признаков и обучение моделей линейной регрессии и бустинга

In [None]:
X_train.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,employee_residence,remote_ratio,company_location,company_size
2238,2022,SE,FT,Data Engineer,EUR,ES,0,ES,M
485,2023,MI,FT,Research Scientist,USD,US,100,US,M
2177,2022,SE,FT,Data Analyst,USD,US,0,US,M
3305,2022,SE,FT,Data Engineer,USD,US,100,US,M
1769,2023,SE,FT,Data Engineer,USD,US,100,US,M


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# experience_level, employment_type and company_size take only a few distinct
# values, so one-hot encoding them adds just a handful of columns.
low_cardinality = ['experience_level', 'employment_type', 'company_size']

for column in low_cardinality:
  # The encoder is fitted on the training part only and then applied to all
  # three parts; the new columns are named after the category values.
  ohe_encoder = OneHotEncoder().fit(X_train[[column]])
  for part in (X_train, X_val, X_test):
    part[ohe_encoder.categories_[0]] = ohe_encoder.transform(part[[column]]).toarray()


# The original string columns are no longer needed once they are encoded.
X_train = X_train.drop(columns=low_cardinality)
X_test = X_test.drop(columns=low_cardinality)
X_val = X_val.drop(columns=low_cardinality)

X_train.head()

Unnamed: 0,work_year,job_title,salary_currency,employee_residence,remote_ratio,company_location,EN,EX,MI,SE,CT,FL,FT,PT,L,M,S
2238,2022,Data Engineer,EUR,ES,0,ES,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
485,2023,Research Scientist,USD,US,100,US,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2177,2022,Data Analyst,USD,US,0,US,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3305,2022,Data Engineer,USD,US,100,US,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1769,2023,Data Engineer,USD,US,100,US,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
# job_title, employee_residence and company_location have many categories,
# so they are mapped to integer codes instead of being one-hot encoded.
# The mapping is learned from the full feature frame `df`, not only from the
# training part, so every category seen in val/test has a code.
high_cardinality = ['job_title', 'employee_residence', 'company_location']

for column in high_cardinality:
  label_encoder = LabelEncoder().fit(df[column])
  X_train[column] = label_encoder.transform(X_train[column])
  X_val[column] = label_encoder.transform(X_val[column])
  X_test[column] = label_encoder.transform(X_test[column])

X_train.head()

Unnamed: 0,work_year,job_title,salary_currency,employee_residence,remote_ratio,company_location,EN,EX,MI,SE,CT,FL,FT,PT,L,M,S
2238,2022,33,EUR,26,0,25,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
485,2023,89,USD,75,100,70,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2177,2022,25,USD,75,0,70,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3305,2022,33,USD,75,100,70,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1769,2023,33,USD,75,100,70,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from category_encoders.target_encoder import TargetEncoder

In [None]:
# Target-encode salary_currency. The encoder is fitted on the training part
# only and then applied unchanged to all three parts.
targetencoder = TargetEncoder(cols=['salary_currency']).fit(X_train, y_train)
X_train, X_val, X_test = (
    targetencoder.transform(part) for part in (X_train, X_val, X_test)
)

In [None]:
# Linear regression on the full encoded feature set, scored on the test part.
lr = LinearRegression().fit(X_train, y_train)
lr_test_pred = lr.predict(X_test)  # predict once, reuse for both metrics
print('MAPE linreg:', mean_absolute_percentage_error(y_test, lr_test_pred))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE linreg:', mean_squared_error(y_test, lr_test_pred) ** 0.5)

MAPE linreg: 0.385516828923877
RMSE linreg: 52114.03939530348


In [None]:
# CatBoost (50 trees) on the full encoded feature set.
cat = CatBoostRegressor(n_estimators=50)
cat = cat.fit(X_train, y_train)

Learning rate set to 0.5
0:	learn: 55825.2148944	total: 6.74ms	remaining: 330ms
1:	learn: 51730.6492512	total: 15.3ms	remaining: 366ms
2:	learn: 50177.3712530	total: 20.3ms	remaining: 318ms
3:	learn: 48953.7946521	total: 21.7ms	remaining: 250ms
4:	learn: 48122.2821876	total: 31.3ms	remaining: 282ms
5:	learn: 47759.2068213	total: 32.7ms	remaining: 240ms
6:	learn: 47454.6243836	total: 45.1ms	remaining: 277ms
7:	learn: 47038.9994287	total: 46.5ms	remaining: 244ms
8:	learn: 46923.4097714	total: 53ms	remaining: 242ms
9:	learn: 46634.0513645	total: 58.5ms	remaining: 234ms
10:	learn: 46461.3806314	total: 63ms	remaining: 223ms
11:	learn: 46261.0309868	total: 66.2ms	remaining: 209ms
12:	learn: 46151.2954820	total: 67.5ms	remaining: 192ms
13:	learn: 46078.3939500	total: 68.9ms	remaining: 177ms
14:	learn: 45945.1032444	total: 69.8ms	remaining: 163ms
15:	learn: 45895.6923367	total: 70.8ms	remaining: 150ms
16:	learn: 45845.5633487	total: 71.7ms	remaining: 139ms
17:	learn: 45758.8704884	total: 72.6m

In [None]:
# Test-set metrics of CatBoost trained on the encoded features.
cat_test_pred = cat.predict(X_test)  # predict once, reuse for both metrics
print('MAPE boost:', mean_absolute_percentage_error(y_test, cat_test_pred))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE boost:', mean_squared_error(y_test, cat_test_pred) ** 0.5)

# best model so far (lower MAPE and RMSE than the linear regression)

MAPE boost: 0.3353155242336402
RMSE boost: 50206.295834023105


#### Подбор оптимальных гиперпараметров модели бустинга

In [None]:
from catboost import Pool

In [None]:
# Repeat the 80/10/10 split (same random_state) on the feature frame `df`,
# whose categorical columns are still raw strings, for use with catboost.Pool.
pool_train, pool_rest, y_pool_train, y_pool_rest = train_test_split(
    df, y, test_size=0.2, random_state=42
)
pool_val, pool_test, y_pool_val, y_pool_test = train_test_split(
    pool_rest, y_pool_rest, test_size=0.5, random_state=42
)

In [None]:
# Positions of the categorical columns inside the feature frame, resolved by
# name so the list stays correct if the column order ever changes
# (for the current layout this evaluates to [1, 2, 3, 4, 5, 7, 8]).
categorical_columns = [
    'experience_level', 'employment_type', 'job_title', 'salary_currency',
    'employee_residence', 'company_location', 'company_size',
]
cat_features = [df.columns.get_loc(name) for name in categorical_columns]

In [None]:
# Wrap each part in a catboost Pool; `cat_features` tells the model which
# columns are categorical. The names are rebound from DataFrames to Pools.
pool_train, pool_val, pool_test = (
    Pool(data=features, label=target, cat_features=cat_features)
    for features, target in (
        (pool_train, y_pool_train),
        (pool_val, y_pool_val),
        (pool_test, y_pool_test),
    )
)

In [None]:
# CatBoost with 50 trees and otherwise default hyperparameters, trained on
# the Pool that marks the categorical columns via `cat_features`.
model = CatBoostRegressor(random_state=42, n_estimators=50)
model.fit(pool_train)

Learning rate set to 0.5
0:	learn: 55768.1307694	total: 9.39ms	remaining: 460ms
1:	learn: 53225.8173462	total: 31.3ms	remaining: 751ms
2:	learn: 50694.8262880	total: 42.7ms	remaining: 669ms
3:	learn: 49736.5225324	total: 48.7ms	remaining: 561ms
4:	learn: 49019.4843433	total: 57ms	remaining: 513ms
5:	learn: 48344.1794804	total: 64.6ms	remaining: 474ms
6:	learn: 47904.9079249	total: 71.2ms	remaining: 437ms
7:	learn: 47453.4258185	total: 79.7ms	remaining: 419ms
8:	learn: 47245.8326948	total: 88.3ms	remaining: 402ms
9:	learn: 47031.6160472	total: 98.8ms	remaining: 395ms
10:	learn: 46760.8410885	total: 107ms	remaining: 378ms
11:	learn: 46642.0991206	total: 118ms	remaining: 375ms
12:	learn: 46597.4033199	total: 128ms	remaining: 365ms
13:	learn: 46476.9205844	total: 156ms	remaining: 402ms
14:	learn: 46312.2238932	total: 175ms	remaining: 407ms
15:	learn: 46303.8398899	total: 176ms	remaining: 375ms
16:	learn: 46175.8404274	total: 191ms	remaining: 371ms
17:	learn: 46076.0060912	total: 201ms	rema

<catboost.core.CatBoostRegressor at 0x7f04c9fa5180>

In [None]:
# Test-set metrics of the untuned CatBoost model trained on the Pool.
pool_test_pred = model.predict(pool_test)  # predict once, reuse for both metrics
print('MAPE cat:', mean_absolute_percentage_error(y_pool_test, pool_test_pred))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE cat:', mean_squared_error(y_pool_test, pool_test_pred) ** 0.5)

MAPE cat: 0.4011957226266806
RMSE cat: 51321.41739737924


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# CatBoost's built-in grid search over learning rate, tree depth and L2 leaf
# regularisation, run on the training Pool.
cat = CatBoostRegressor(random_state=42, n_estimators=50)

grid = {
    'learning_rate': [0.03, 0.1],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

grid_search_result = cat.grid_search(grid, pool_train)

0:	learn: 148121.9966996	test: 146908.1498383	best: 146908.1498383 (0)	total: 6.94ms	remaining: 340ms
1:	learn: 144336.4270759	test: 143154.0724309	best: 143154.0724309 (1)	total: 8.99ms	remaining: 216ms
2:	learn: 140686.4338252	test: 139514.2078347	best: 139514.2078347 (2)	total: 17.1ms	remaining: 268ms
3:	learn: 137156.8284015	test: 136031.9844975	best: 136031.9844975 (3)	total: 18.9ms	remaining: 217ms
4:	learn: 133823.5869389	test: 132733.1848517	best: 132733.1848517 (4)	total: 22.1ms	remaining: 199ms
5:	learn: 130541.2191868	test: 129458.6988120	best: 129458.6988120 (5)	total: 23.7ms	remaining: 174ms
6:	learn: 127475.1906500	test: 126411.3301115	best: 126411.3301115 (6)	total: 27.7ms	remaining: 170ms
7:	learn: 124424.7651069	test: 123366.0016758	best: 123366.0016758 (7)	total: 32.9ms	remaining: 173ms
8:	learn: 121589.4071524	test: 120537.2276899	best: 120537.2276899 (8)	total: 34.4ms	remaining: 157ms
9:	learn: 118734.9989071	test: 117688.9051096	best: 117688.9051096 (9)	total: 40.1

In [None]:
grid_search_result['params']

{'depth': 10, 'l2_leaf_reg': 1, 'learning_rate': 0.1}

In [None]:
# Build a fresh model from the best grid-search parameters. Assigning
# `cat.learning_rate = ...` and friends only creates plain Python attributes,
# which are not part of the parameters CatBoost trains with; hyperparameters
# have to be passed through the constructor (or `set_params`).
cat = CatBoostRegressor(
    random_state=42,
    n_estimators=50,
    **grid_search_result['params'],
)

In [None]:
%%time
cat = cat.fit(pool_train)

0:	learn: 61313.4490378	total: 5.32ms	remaining: 261ms
1:	learn: 59568.3813430	total: 23.9ms	remaining: 574ms
2:	learn: 58046.7395382	total: 38.6ms	remaining: 605ms
3:	learn: 56837.2215045	total: 50.2ms	remaining: 577ms
4:	learn: 55711.5216134	total: 61.6ms	remaining: 555ms
5:	learn: 54802.8932952	total: 79.2ms	remaining: 581ms
6:	learn: 54068.8742315	total: 99.1ms	remaining: 609ms
7:	learn: 53498.5420580	total: 104ms	remaining: 548ms
8:	learn: 52861.1827655	total: 130ms	remaining: 594ms
9:	learn: 52380.0948113	total: 147ms	remaining: 590ms
10:	learn: 52006.1308376	total: 155ms	remaining: 550ms
11:	learn: 51256.7856371	total: 171ms	remaining: 543ms
12:	learn: 50859.4065578	total: 175ms	remaining: 498ms
13:	learn: 50311.0835016	total: 190ms	remaining: 489ms
14:	learn: 49958.6559628	total: 193ms	remaining: 451ms
15:	learn: 49566.2337730	total: 210ms	remaining: 446ms
16:	learn: 49223.9309399	total: 229ms	remaining: 444ms
17:	learn: 48888.9108647	total: 256ms	remaining: 454ms
18:	learn: 48

In [None]:
%%time
predictions = cat.predict(pool_test)

CPU times: user 1.3 ms, sys: 12 µs, total: 1.31 ms
Wall time: 1.32 ms


In [None]:
# Test-set metrics of the tuned CatBoost model.
print('MAPE cat:', mean_absolute_percentage_error(y_pool_test, predictions))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE cat:', mean_squared_error(y_pool_test, predictions) ** 0.5)

MAPE cat: 0.36161631145639245
RMSE cat: 49741.31563365187


#### Использование XGBRegressor: обучение модели и подбор оптимальных гиперпараметров

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# First, train XGBoost without any hyperparameter tuning.
xgb = XGBRegressor(random_state=42, n_estimators=50).fit(X_train, y_train)
xgb_test_pred = xgb.predict(X_test)  # predict once, reuse for both metrics
print('MAPE xgb:', mean_absolute_percentage_error(y_test, xgb_test_pred))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE xgb:', mean_squared_error(y_test, xgb_test_pred) ** 0.5)

MAPE xgb: 0.3299072657164567
RMSE xgb: 50496.86686111962


In [None]:
model_xgb = XGBRegressor(random_state=42, n_estimators=50)

In [None]:
# Tuning step 1: max_depth and min_child_weight, 5-fold CV.
# No `scoring` is passed, so candidates are ranked by the estimator's
# default `score` method.
param_test1 = {
    'max_depth': range(1, 10),
    'min_child_weight': range(1, 10),
}
gsearch1 = GridSearchCV(model_xgb, param_test1, cv=5).fit(X_train, y_train)

gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 9}, 0.429706162378238)

In [None]:
# Carry the best max_depth / min_child_weight over to the base estimator
# that the following tuning steps start from.
model_xgb = model_xgb.set_params(**gsearch1.best_params_)

In [None]:
# Tuning step 2: `gamma` over ten evenly spaced values in [0, 1], 5-fold CV.
param_test2 = {'gamma': np.linspace(0, 1, 10)}
gsearch2 = GridSearchCV(model_xgb, param_test2, cv=5).fit(X_train, y_train)

gsearch2.best_params_, gsearch2.best_score_

({'gamma': 0.0}, 0.429706162378238)

In [None]:
# Carry the best gamma over to the base estimator.
model_xgb = model_xgb.set_params(**gsearch2.best_params_)

In [None]:
# Tuning step 3: row and column subsampling ratios, 5-fold CV.
sampling_ratios = [0.6, 0.7, 0.8, 0.9, 1.]
param_test3 = {
    'subsample': list(sampling_ratios),
    'colsample_bytree': list(sampling_ratios),
}
gsearch3 = GridSearchCV(model_xgb, param_test3, cv=5).fit(X_train, y_train)

gsearch3.best_params_, gsearch3.best_score_

({'colsample_bytree': 0.8, 'subsample': 0.7}, 0.43253907402571884)

In [None]:
# Carry the best subsample / colsample_bytree over to the base estimator.
model_xgb = model_xgb.set_params(**gsearch3.best_params_)

In [None]:
# Tuning step 4: `reg_alpha` over a coarse logarithmic-style grid, 5-fold CV.
param_test4 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
gsearch4 = GridSearchCV(model_xgb, param_test4, cv=5).fit(X_train, y_train)

gsearch4.best_params_, gsearch4.best_score_

({'reg_alpha': 100}, 0.432539319260324)

In [None]:
# Carry the best reg_alpha over to the base estimator.
model_xgb = model_xgb.set_params(**gsearch4.best_params_)

In [None]:
# теперь обучу с подобранными гиперпараметрами
%%time
model_xgb = model_xgb.fit(X_train, y_train)

CPU times: user 168 ms, sys: 972 µs, total: 168 ms
Wall time: 88.3 ms


In [None]:
%%time
predictions = model_xgb.predict(X_test)

CPU times: user 6.14 ms, sys: 4 µs, total: 6.14 ms
Wall time: 6.17 ms


In [None]:
# Test-set metrics of the tuned XGBoost model.
print('MAPE xgb:', mean_absolute_percentage_error(y_test, predictions))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE xgb:', mean_squared_error(y_test, predictions) ** 0.5)

MAPE xgb: 0.3379300922162153
RMSE xgb: 50161.35347533137


#### Использование LGBMRegressor: обучение модели и подбор оптимальных гиперпараметров

In [None]:
# Shell command that installs libomp through Homebrew under the arm64
# architecture (macOS-only). In the environment recorded below it fails with
# `arch: invalid option`. Presumably it is needed for LightGBM on Apple
# Silicon only — TODO confirm and skip elsewhere.
!arch -arm64 brew install libomp

arch: invalid option -- 'a'
Try 'arch --help' for more information.


In [None]:
import lightgbm as lgb

In [None]:
# LightGBM with 50 trees and otherwise default hyperparameters.
gbm = lgb.LGBMRegressor(random_state=42, n_estimators=50)
gbm.fit(X_train, y_train)
gbm_test_pred = gbm.predict(X_test)  # predict once, reuse for both metrics
print('MAPE gbm:', mean_absolute_percentage_error(y_test, gbm_test_pred))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE gbm:', mean_squared_error(y_test, gbm_test_pred) ** 0.5)

MAPE gbm: 0.32065867701756245
RMSE gbm: 49057.97620671302


In [None]:
# Grid search over learning rate, number of trees and leaves per tree with
# 3-fold CV. The grid's `n_estimators` values replace the constructor's 50
# for every candidate. Note that `gbm` is rebound to the search object here.
estimator = lgb.LGBMRegressor(random_state=42, n_estimators=50)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40],
    'num_leaves': [10, 20, 30],
}

gbm = GridSearchCV(estimator, param_grid, cv=3).fit(X_train, y_train)

gbm.best_params_, gbm.best_score_

({'learning_rate': 0.1, 'n_estimators': 40, 'num_leaves': 20},
 0.43540218608409803)

In [None]:
# Carry the best learning_rate / n_estimators / num_leaves over to the
# estimator that is refitted and timed next.
estimator = estimator.set_params(**gbm.best_params_)

In [None]:
%%time
estimator = estimator.fit(X_train, y_train)

CPU times: user 50.2 ms, sys: 1.98 ms, total: 52.2 ms
Wall time: 31.2 ms


In [None]:
%%time
predictions = estimator.predict(X_test)

CPU times: user 10 ms, sys: 963 µs, total: 11 ms
Wall time: 11.3 ms


In [None]:
# Test-set metrics of the tuned LightGBM model.
print('MAPE gbm:', mean_absolute_percentage_error(y_test, predictions))
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
print('RMSE gbm:', mean_squared_error(y_test, predictions) ** 0.5)

MAPE gbm: 0.3249798938405584
RMSE gbm: 49006.798841964875


#### Выводы

In [None]:
# MAPE cat: 0.36161631145639245, RMSE cat: 49741.31563365187
# MAPE xgb: 0.3379300922162153 RMSE xgb: 50161.35347533137
# MAPE gbm: 0.3249798938405584, RMSE gbm: 49006.798841964875

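Лучшее качество (MAPE ≈ 0.325, RMSE ≈ 49 007) показала модель LightGBM; обучается она быстрее, чем XGBoost (≈ 31 мс против ≈ 88 мс). Однако предсказания она делает медленнее остальных: ≈ 11 мс против ≈ 6 мс у XGBoost и ≈ 1 мс у CatBoost.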

Самой долгой моделью при обучении оказался CatBoost. Он также хуже всего работает без подбора гиперпараметров.